# Feature Manipulation

Glenn Louis Opitz, Alexandre Violleau

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

#sns.set_theme()

In [12]:
data = pd.read_parquet(Path("data") / "train.parquet")
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [13]:
def _encode_dates(X):
    X = X.copy()  # Ensure we're working on a copy
    # Encode the date information
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday  # 0=Monday, 6=Sunday
    X["hour"] = X["date"].dt.hour
    # Keep the rest of the columns as they are
    return X

# Apply the encoding function to the dataset
data = data.copy()  # Ensure we're working on a copy
data = _encode_dates(data)

data["weekend"] = (data["weekday"] > 4).astype(int)  # 1 stands for weekend, 0 stands for no weekend
data.head()


Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count,year,month,day,weekday,hour,weekend
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,2,0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147,2020,9,1,1,3,0
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0,2020,9,1,1,4,0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438,2020,9,1,1,15,0
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585,2020,9,1,1,18,0


In [20]:
import holidays

# Define French holidays
FR_holidays = holidays.FR(years=range(2019, 2022))

data["FR_holidays"] = data["date"].dt.date.isin(FR_holidays).astype(int)
print(f"Number of rows marked as holidays: {data['FR_holidays'].sum()}")

Number of rows marked as holidays: 14688


# Adding Weather Data

In [33]:
weather_data = pd.read_csv(Path("data") / "external_data.csv")

weather_data["date"] = pd.to_datetime(weather_data["date"], errors="coerce")
print(weather_data["date"].isna().sum())
weather_data = _encode_dates(weather_data)

# Drop the duplicate rows based on the 'date' column
weather_data = weather_data.drop_duplicates(subset="date")

# Verify that the duplicate is removed
duplicate_rows = weather_data[weather_data["date"].duplicated(keep=False)]
print(f"Number of duplicate rows after dropping: {len(duplicate_rows)}")

# Interpolate linearly to get from 3 hour data to 1 hour data
weather_data.set_index("date", inplace=True)  # Set date as the index
weather_data = weather_data.resample("H").interpolate(method="linear")  # Interpolate missing values
weather_data.reset_index(inplace=True)  # Reset index

# Check the new shape of the weather data
print(f"Resampled Weather Data Shape: {weather_data.shape}")

# Merge bike data with weather data using a left join
merged_data = pd.merge(data, weather_data, on="date", how="left")

# Check the merged dataset
print(f"Merged Data Shape: {merged_data.shape}")


0
Number of duplicate rows after dropping: 0
Resampled Weather Data Shape: (9973, 64)


  weather_data = weather_data.resample("H").interpolate(method="linear")  # Interpolate missing values


Merged Data Shape: (496827, 82)


In [34]:
from sklearn.preprocessing import FunctionTransformer

date_encoder = FunctionTransformer(_encode_dates, validate=False)
sample_encoded = date_encoder.fit_transform(merged_data[["date"]]).head()
sample_encoded

Unnamed: 0,date,year,month,day,weekday,hour
0,2020-09-01 02:00:00,2020,9,1,1,2
1,2020-09-01 03:00:00,2020,9,1,1,3
2,2020-09-01 04:00:00,2020,9,1,1,4
3,2020-09-01 15:00:00,2020,9,1,1,15
4,2020-09-01 18:00:00,2020,9,1,1,18


Total changes in features extraction:
1. Encode the date information to single columns (year, month, day, weekday, hour)
2. Add a column indicating weekends (1 = weekend, 0 = no weekend)
3. Add a column of French holidays (1 = French holidays, 0 = no French holidays)
4. Encode a "date" column to match scikit learn requirements.

In [35]:
# Save the processed_data file in the data folder
merged_data.to_parquet(Path("data") / "processed_data.parquet")