In [47]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import seaborn as sns
from scipy import stats
import folium
from folium.plugins import HeatMap
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.ticker as ticker

In [48]:
# Location of the pre-processed hourly checkout features (relative to the notebook)
FEATURES_CSV = 'data/bike_checkout_processed_features.csv'
df = pd.read_csv(FEATURES_CSV)

In [49]:
# Index the rows by their hourly timestamp.
# Rebinding (instead of inplace=True) together with the guard makes this cell
# safe to re-run: once 'timestamp_hour' is already the index, a second
# set_index call would raise KeyError because the column no longer exists.
if df.index.name != 'timestamp_hour':
    df = df.set_index('timestamp_hour')

In [50]:
# Discard the leftover 'Unnamed: 0' column (presumably the row index written
# out when the CSV was saved - confirm against the export step).
# errors='ignore' keeps the cell re-runnable and tolerant of a CSV that was
# saved without that column; rebinding avoids mutating the frame in place.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [51]:
# Preview the first rows to check the timestamp index and the column layout
df.head()

Unnamed: 0_level_0,station_id,num_bikes_available_mean,mechanical_available,ebike_available,num_docks_available_mean,num_docks_available_max,is_charging_station,status,is_installed,is_renting,...,checkouts_hour_station,temperature,precipitation,wind_speed,weather_category,day_of_week,hour_of_day,month,checkouts_hour_station_lag1,checkouts_hour_station_lag24
timestamp_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-07-31 21:00:00,24.0,18.0,13.0,5.0,17.0,17.0,True,IN_SERVICE,1.0,1.0,...,0.0,,,,,2,21,7,,
2024-07-31 21:00:00,26.0,7.0,5.0,2.0,9.0,9.0,True,IN_SERVICE,1.0,1.0,...,0.0,,,,,2,21,7,,
2024-07-31 21:00:00,29.0,3.0,0.0,3.0,9.0,9.0,True,IN_SERVICE,1.0,1.0,...,0.0,,,,,2,21,7,,
2024-07-31 21:00:00,41.0,20.0,19.0,1.0,3.0,3.0,True,IN_SERVICE,1.0,1.0,...,0.0,,,,,2,21,7,,
2024-07-31 21:00:00,42.0,3.0,0.0,3.0,9.0,9.0,True,IN_SERVICE,1.0,1.0,...,0.0,,,,,2,21,7,,


In [52]:
# Summarise column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217208 entries, 2024-07-31 21:00:00 to 2025-01-31 23:00:00
Data columns (total 22 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   station_id                    217208 non-null  float64
 1   num_bikes_available_mean      217208 non-null  float64
 2   mechanical_available          217208 non-null  float64
 3   ebike_available               217208 non-null  float64
 4   num_docks_available_mean      217208 non-null  float64
 5   num_docks_available_max       217208 non-null  float64
 6   is_charging_station           217208 non-null  bool   
 7   status                        217208 non-null  object 
 8   is_installed                  217208 non-null  float64
 9   is_renting                    217208 non-null  float64
 10  is_returning                  217208 non-null  float64
 11  ttl                           217208 non-null  float64
 12  checkouts_hour_sta

In [53]:
# Model the target on a log scale. log1p(x) = log(1 + x) stays finite for
# hours with zero checkouts (log1p(0) == 0).
hourly_checkouts = df['checkouts_hour_station']
df['log_checkouts'] = np.log1p(hourly_checkouts)

In [54]:
# Re-inspect dtypes and non-null counts now that 'log_checkouts' has been added
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217208 entries, 2024-07-31 21:00:00 to 2025-01-31 23:00:00
Data columns (total 23 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   station_id                    217208 non-null  float64
 1   num_bikes_available_mean      217208 non-null  float64
 2   mechanical_available          217208 non-null  float64
 3   ebike_available               217208 non-null  float64
 4   num_docks_available_mean      217208 non-null  float64
 5   num_docks_available_max       217208 non-null  float64
 6   is_charging_station           217208 non-null  bool   
 7   status                        217208 non-null  object 
 8   is_installed                  217208 non-null  float64
 9   is_renting                    217208 non-null  float64
 10  is_returning                  217208 non-null  float64
 11  ttl                           217208 non-null  float64
 12  checkouts_hour_sta

In [55]:
# Remove the text-valued 'status' column from the modelling frame.
# errors='ignore' makes the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns='status', errors='ignore')

In [56]:
# Frequency of each weather category. value_counts() skips NaN by default,
# so rows without weather data are not included in these counts.
df['weather_category'].value_counts()

weather_category
clear     112235
cloudy     87466
rain       12480
storm       4877
Name: count, dtype: int64

## **Splitting the dataset into train, validation, and test sets**

We'll use a time-based split: 

70% of the data for training (earliest dates)

15% for validation (middle dates)

15% for testing (most recent dates)

In [57]:
# Order rows chronologically so that positional slices map to time ranges
df = df.sort_index()

# Row counts for the training and validation portions of the split
TRAIN_FRACTION = 0.7
VAL_FRACTION = 0.15

n_samples = df.shape[0]
train_size = int(TRAIN_FRACTION * n_samples)
val_size = int(VAL_FRACTION * n_samples)


In [58]:
# Cut the chronologically sorted frame into three consecutive positional slices:
# [0, train_size) -> train, [train_size, val_end) -> validation, [val_end, end) -> test
val_end = train_size + val_size
train_data, val_data, test_data = (
    df.iloc[:train_size],
    df.iloc[train_size:val_end],
    df.iloc[val_end:],
)

In [59]:
# Report how many rows landed in each split
split_report = [
    f"Training data: {len(train_data)} samples",
    f"Validation data: {len(val_data)} samples",
    f"Test data: {len(test_data)} samples",
]
print("\n".join(split_report))

Training data: 152045 samples
Validation data: 32581 samples
Test data: 32582 samples


In [60]:
# Separate features and target
def _features_and_target(split_frame):
    """Return (X, y) for one split.

    X is the split without the raw and log-transformed checkout columns;
    y is the log-transformed checkout count.
    """
    features = split_frame.drop(['checkouts_hour_station', 'log_checkouts'], axis=1)
    target = split_frame['log_checkouts']
    return features, target


X_train, y_train = _features_and_target(train_data)
X_val, y_val = _features_and_target(val_data)
X_test, y_test = _features_and_target(test_data)

## **Handling missing values, one-hot encoding categorical features, and scaling numerical variables**

In [62]:
# Bucket every training column by dtype in one pass:
# object/category -> categorical, bool -> boolean, everything else -> numerical
cat_cols, num_cols, bool_cols = [], [], []
for col_name, col_dtype in X_train.dtypes.items():
    if col_dtype == 'object' or col_dtype.name == 'category':
        cat_cols.append(col_name)
    elif col_dtype == 'bool':
        bool_cols.append(col_name)
    else:
        num_cols.append(col_name)

print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")
print(f"Boolean columns: {bool_cols}")

Categorical columns: ['weather_category']
Numerical columns: ['station_id', 'num_bikes_available_mean', 'mechanical_available', 'ebike_available', 'num_docks_available_mean', 'num_docks_available_max', 'is_installed', 'is_renting', 'is_returning', 'ttl', 'temperature', 'precipitation', 'wind_speed', 'day_of_week', 'hour_of_day', 'month', 'checkouts_hour_station_lag1', 'checkouts_hour_station_lag24']
Boolean columns: ['is_charging_station']


In [63]:
# Missing-value handling for the numerical features
from sklearn.impute import SimpleImputer

# Medians are learned from the training split only, so the validation and
# test splits are filled with training statistics.
train_numeric = X_train[num_cols]
num_imputer = SimpleImputer(strategy='median')
num_imputer.fit(train_numeric)

In [64]:
# Impute numerical columns
def _impute_numeric(split_frame):
    """Fill missing numerical values in one split using the fitted median imputer.

    The imputer returns a bare array, so the column names and the original
    index are re-attached.
    """
    filled = num_imputer.transform(split_frame[num_cols])
    return pd.DataFrame(filled, columns=num_cols, index=split_frame.index)


X_train_num_imputed = _impute_numeric(X_train)
X_val_num_imputed = _impute_numeric(X_val)
X_test_num_imputed = _impute_numeric(X_test)

In [65]:
# Categorical columns: fill gaps with the most frequent training value
if cat_cols:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    cat_imputer.fit(X_train[cat_cols])

    def _impute_categorical(split_frame):
        """Fill missing categorical values in one split, keeping names and index."""
        filled = cat_imputer.transform(split_frame[cat_cols])
        return pd.DataFrame(filled, columns=cat_cols, index=split_frame.index)

    X_train_cat_imputed = _impute_categorical(X_train)
    X_val_cat_imputed = _impute_categorical(X_val)
    X_test_cat_imputed = _impute_categorical(X_test)

In [66]:
# Boolean columns are carried over unchanged (no imputation); independent
# copies are taken of each split's boolean columns.
if bool_cols:
    X_train_bool, X_val_bool, X_test_bool = (
        split_frame[bool_cols].copy() for split_frame in (X_train, X_val, X_test)
    )

In [67]:
# One-hot encoding for the categorical variables
from sklearn.preprocessing import OneHotEncoder

if cat_cols:
    # The encoder is fitted on the training split only; drop='if_binary'
    # collapses a two-level feature into a single 0/1 column.
    ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
    ohe.fit(X_train_cat_imputed)

    # Take the output column names from the fitted encoder instead of
    # rebuilding them from ohe.categories_: with drop='if_binary' a two-level
    # feature yields one column, so hand-built names would outnumber the
    # encoded columns and the DataFrame constructor below would raise.
    # The naming scheme is unchanged: "<column>_<category>".
    cat_feature_names = ohe.get_feature_names_out(cat_cols).tolist()

    def _encode_categorical(imputed_frame):
        """One-hot encode one split, re-attaching column names and the index."""
        return pd.DataFrame(
            ohe.transform(imputed_frame),
            columns=cat_feature_names,
            index=imputed_frame.index
        )

    # Transform data
    X_train_cat_encoded = _encode_categorical(X_train_cat_imputed)
    X_val_cat_encoded = _encode_categorical(X_val_cat_imputed)
    X_test_cat_encoded = _encode_categorical(X_test_cat_imputed)


In [69]:
# Standardise the numerical features

from sklearn.preprocessing import StandardScaler

# Scaling statistics are learned from the imputed training split only
scaler = StandardScaler()
scaler.fit(X_train_num_imputed)


def _scale_numeric(imputed_frame, source_frame):
    """Apply the fitted scaler to one split.

    The result keeps the numerical column names and takes its index from
    source_frame.
    """
    scaled = scaler.transform(imputed_frame)
    return pd.DataFrame(scaled, columns=num_cols, index=source_frame.index)


X_train_num_scaled = _scale_numeric(X_train_num_imputed, X_train)
X_val_num_scaled = _scale_numeric(X_val_num_imputed, X_val)
X_test_num_scaled = _scale_numeric(X_test_num_imputed, X_test)

In [70]:
# Assemble the final feature matrices. Column order: scaled numerical
# features, then one-hot columns (if any), then boolean flags (if any).
train_parts = [X_train_num_scaled]
val_parts = [X_val_num_scaled]
test_parts = [X_test_num_scaled]

if cat_cols:
    train_parts.append(X_train_cat_encoded)
    val_parts.append(X_val_cat_encoded)
    test_parts.append(X_test_cat_encoded)

if bool_cols:
    train_parts.append(X_train_bool)
    val_parts.append(X_val_bool)
    test_parts.append(X_test_bool)

if len(train_parts) > 1:
    X_train_processed = pd.concat(train_parts, axis=1)
    X_val_processed = pd.concat(val_parts, axis=1)
    X_test_processed = pd.concat(test_parts, axis=1)
else:
    # Only numerical features exist: reuse the scaled frames directly
    X_train_processed = X_train_num_scaled
    X_val_processed = X_val_num_scaled
    X_test_processed = X_test_num_scaled

print(f"Final processed training data shape: {X_train_processed.shape}")
print(f"Final processed validation data shape: {X_val_processed.shape}")
print(f"Final processed test data shape: {X_test_processed.shape}")

Final processed training data shape: (152045, 23)
Final processed validation data shape: (32581, 23)
Final processed test data shape: (32582, 23)


## **Building a baseline model**