# Modeling - Feature Engineering - Bike Share System in the SF Bay Area

Author: Owen Hsu

## Table of content

1. Loading and Setup
2. Assessment
3. Spliting Training Set into Training and Validation Sets
4. Baseline Modeling
   * Logistic Regression
   * Decision Tree
   * Random Forest
   * K-Nearest Neighbors (KNN)
   * Support Vector Machine (SVM)
   * XGBoost


## Loading and Setup

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import time
import pickle
import os
# Filter warnings
from warnings import filterwarnings
filterwarnings('ignore')

  from pandas.core import (


In [30]:
# Load the datasets
train_df = pd.read_parquet('data/train_dataset.parquet')
test_df = pd.read_parquet('data/test_dataset.parquet')

## Assessment

In [31]:
# Set display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#### Train Data

In [32]:
# Print the shape of the train data
train_df.shape

(11831433, 17)

In [33]:
# Look at the first 5 rows of the train dataset
train_df.head()

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,weekend,holiday,month,day,hour,minute,usage_rate_category
0,-1.375345,-0.667162,-0.099799,-0.428081,0.38837,0.306077,-1.406683,-0.436337,-1.01458,1.820021,1.582616,-0.168116,-1.310568,-0.195224,1.228122,-1.303354,2.0
1,1.125367,0.336991,0.776477,0.410568,-0.15374,0.306077,1.015281,-0.436337,0.779536,-0.548136,1.582616,-0.168116,0.428313,0.14541,0.216731,-0.434267,2.0
2,1.125367,0.336991,-1.122121,0.410568,2.0147,-0.455251,-0.195701,-0.436337,-0.566051,-0.548136,-0.631865,-0.168116,-1.600382,0.713132,-1.372598,-0.723963,1.0
3,0.083403,-0.667162,-0.537937,1.715132,0.543259,-2.739235,-1.103938,-0.436337,0.779536,-0.548136,1.582616,-0.168116,-1.600382,-0.535857,0.939153,-0.434267,2.0
4,0.125082,-0.667162,0.776477,1.06285,-0.463517,0.306077,0.712536,-0.436337,0.331007,-0.548136,-0.631865,-0.168116,1.00794,-0.762946,0.650184,1.593602,2.0


In [34]:
# Verify the train dataset
train_df.sample(10)

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,weekend,holiday,month,day,hour,minute,usage_rate_category
3107110,-1.291988,0.336991,-0.099799,-0.428081,0.38837,0.306077,-1.406683,-0.436337,-1.01458,1.820021,1.582616,-0.168116,-1.310568,-0.195224,-0.79466,-1.013658,2.0
9864376,0.188993,-0.667162,-2.290489,-0.241714,1.859811,0.306077,-1.406683,-0.436337,-1.01458,-0.548136,1.582616,-0.168116,-1.600382,-1.444213,0.5057,-1.303354,3.0
11720339,-1.125274,-0.667162,0.776477,1.621949,1.627478,-0.455251,-0.498447,-0.436337,0.779536,1.820021,-0.631865,-0.168116,1.587567,0.713132,0.795065,0.145124,3.0
2559446,0.75026,2.345298,0.046247,1.621949,0.310926,-2.739235,-0.498447,-0.436337,0.779536,-0.548136,-0.631865,-0.168116,-1.310568,-1.330668,-0.072238,0.43482,1.0
2193818,-0.041632,-0.667162,0.630431,-0.148531,-2.244735,-0.455251,1.015281,-0.436337,-1.01458,-0.548136,-0.631865,-0.168116,-0.151314,-0.762946,-1.228114,1.593602,2.0
11113675,1.375438,0.336991,0.630431,0.503751,-0.463517,0.306077,1.620772,2.291809,1.228065,-0.548136,1.582616,-0.168116,0.428313,-0.762946,-1.055776,-0.378421,3.0
9677937,-0.083311,-0.667162,-0.830029,-0.334897,0.853036,0.306077,0.40979,2.291809,0.331007,-0.548136,-0.631865,-0.168116,-0.730941,-0.87649,1.661576,0.017525,3.0
7451148,1.093027,1.341144,-0.099799,-0.334897,-0.463517,0.306077,2.831755,-0.436337,0.779536,-0.548136,-0.631865,-0.168116,-0.441128,0.372499,1.228122,-1.013658,1.0
3095733,-0.875203,1.843221,-0.245845,0.410568,2.092144,0.306077,-0.801192,-0.436337,0.331007,-0.651713,-0.631865,-0.168116,-1.020755,0.940221,0.650184,-0.723963,2.0
843176,-1.25031,-0.667162,-3.896996,-1.639461,1.007924,0.306077,1.923518,-0.436337,-1.01458,1.820021,-0.631865,-0.168116,1.587567,1.621488,1.083638,0.145124,2.0


In [35]:
# Get a quick overview of dataset variables
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11831433 entries, 0 to 11831432
Data columns (total 17 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   station_id                      float64
 1   dock_count                      float64
 2   mean_dew_point_f                float64
 3   mean_humidity                   float64
 4   mean_sea_level_pressure_inches  float64
 5   mean_visibility_miles           float64
 6   mean_wind_speed_mph             float64
 7   precipitation_inches            float64
 8   cloud_cover                     float64
 9   zip_code                        float64
 10  weekend                         float64
 11  holiday                         float64
 12  month                           float64
 13  day                             float64
 14  hour                            float64
 15  minute                          float64
 16  usage_rate_category             float64
dtypes: float64(17)
memory usa

In [36]:
# Get a statistical summary of the dataset
train_df.describe()

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,weekend,holiday,month,day,hour,minute,usage_rate_category
count,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0
mean,0.08732592,0.0453991,0.0006204326,0.02849624,0.01324608,-0.03006019,0.03542715,0.02947676,0.04194627,-0.05911522,-0.02994011,-0.01487793,-0.005402367,0.001451161,-0.008357365,0.0002343803,2.0
std,0.9862928,1.019173,0.9762112,0.9795185,1.004865,0.9955866,1.016665,1.02656,1.002547,0.9581601,0.9852097,0.9559194,1.012743,0.9917042,1.006497,0.9962533,0.8164966
min,-1.708774,-1.671316,-5.357456,-4.155406,-3.019177,-4.261891,-2.012174,-0.4363365,-1.463109,-0.703502,-0.6318653,-0.168116,-1.600382,-1.671302,-1.661567,-1.593049,1.0
25%,-0.6668103,-0.6671623,-0.5379373,-0.4280805,-0.6958494,0.3060765,-0.8011921,-0.4363365,-1.01458,-0.5481358,-0.6318653,-0.168116,-1.020755,-0.8764904,-0.9302618,-0.873044,1.0
50%,0.2084391,-0.6671623,0.04624675,0.1310183,-0.1537396,0.3060765,-0.1957009,-0.4363365,-0.117522,-0.5481358,-0.6318653,-0.168116,0.1384995,0.03186537,-0.0670432,0.004509445,2.0
75%,0.9586529,0.336991,0.7764768,0.596934,0.7755915,0.3060765,0.7125358,-0.4363365,0.7795363,-0.09145356,1.582616,-0.168116,1.00794,0.8266767,0.9138567,0.868847,3.0
max,1.708867,2.345298,2.236937,2.55378,3.021475,7.919356,4.950974,2.291809,2.125124,1.820021,1.582616,5.948273,1.587567,1.735032,1.661576,1.593602,3.0


#### Test Data

In [37]:
# Print the shape of the test data
test_df.shape

(1467439, 17)

In [38]:
# Look at the first 5 rows of the test dataset
test_df.head()

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,weekend,holiday,month,day,hour,minute,usage_rate_category
0,0.75026,2.345298,0.922523,0.596934,0.001149,0.306077,-0.801192,2.291809,0.779536,-0.548136,-0.631865,-0.168116,1.00794,0.826677,-1.083629,0.724515,3.0
1,-0.416739,-0.667162,0.776477,1.249216,-0.540961,-0.455251,0.107045,2.291809,-0.117522,-0.703502,-0.631865,-0.168116,1.587567,-0.649401,-1.661567,-0.434267,2.0
2,-0.37506,1.341144,0.192293,-0.800813,0.853036,0.306077,0.40979,-0.436337,-0.117522,-0.091454,1.582616,-0.168116,-1.020755,1.394399,-0.216723,-1.303354,3.0
3,1.33376,0.336991,0.922523,0.317385,-0.540961,0.306077,0.712536,-0.436337,0.779536,-0.548136,-0.631865,-0.168116,0.138499,1.735032,1.661576,0.724515,2.0
4,-1.667095,-0.667162,-0.537937,-0.893996,-0.463517,0.306077,0.712536,-0.436337,0.779536,1.820021,-0.631865,-0.168116,-0.441128,0.372499,-0.361207,0.145124,2.0


In [39]:
# Verify the test dataset
test_df.sample(10)

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,weekend,holiday,month,day,hour,minute,usage_rate_category
123524,-0.916882,-0.667162,0.484385,-0.70763,-1.547736,0.306077,-1.103938,-0.436337,-1.01458,-0.651713,-0.631865,-0.168116,-0.151314,1.621488,-0.361207,-1.013658,2.0
332866,-1.708774,2.345298,-0.099799,-0.70763,-0.773294,0.306077,-0.195701,-0.436337,-0.117522,1.820021,1.582616,-0.168116,-0.151314,0.599588,0.794669,1.014211,3.0
131771,-0.875203,1.843221,0.630431,0.410568,-0.15374,0.306077,-0.195701,-0.436337,-0.566051,-0.651713,-0.631865,-0.168116,0.138499,-0.535857,0.650184,0.145124,2.0
787402,-0.625132,1.341144,-1.122121,-1.266729,0.93048,0.306077,-0.498447,-0.436337,-0.117522,-0.703502,-0.631865,-0.168116,-0.730941,-0.87649,-1.228114,0.724515,2.0
24633,0.083403,-0.667162,0.192293,0.317385,1.550034,0.306077,-0.498447,-0.436337,-0.117522,-0.548136,-0.631865,-0.168116,-1.020755,-0.422313,-1.083629,-0.144572,2.0
287979,-0.291703,-0.667162,1.652753,-0.241714,-2.399623,4.112716,0.40979,-0.436337,-1.463109,-0.091454,-0.631865,-0.168116,-0.151314,-0.762946,0.361215,0.724515,2.0
1265003,-0.791846,-0.667162,0.922523,0.7833,0.078593,0.306077,-0.801192,-0.436337,-0.566051,-0.651713,-0.631865,-0.168116,0.428313,1.394399,1.372607,1.303906,2.0
1147449,-1.625417,-1.671316,-1.122121,0.131018,-0.850738,-0.455251,0.712536,-0.436337,0.779536,1.820021,-0.631865,-0.168116,-0.730941,-0.990035,0.650184,-1.013658,2.0
399135,-0.541775,-0.667162,-0.245845,-1.732644,-0.386072,0.306077,-0.195701,-0.436337,-1.463109,-0.703502,-0.631865,-0.168116,-0.151314,-0.308768,-0.072238,-1.303354,2.0
91465,0.291796,1.341144,-0.976075,0.037835,1.240257,-2.739235,-1.103938,-0.436337,-0.566051,-0.548136,-0.631865,-0.168116,-1.600382,-0.308768,-0.505691,-1.013658,1.0


In [40]:
# Get a quick overview of dataset variables
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467439 entries, 0 to 1467438
Data columns (total 17 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   station_id                      1467439 non-null  float64
 1   dock_count                      1467439 non-null  float64
 2   mean_dew_point_f                1467439 non-null  float64
 3   mean_humidity                   1467439 non-null  float64
 4   mean_sea_level_pressure_inches  1467439 non-null  float64
 5   mean_visibility_miles           1467439 non-null  float64
 6   mean_wind_speed_mph             1467439 non-null  float64
 7   precipitation_inches            1467439 non-null  float64
 8   cloud_cover                     1467439 non-null  float64
 9   zip_code                        1467439 non-null  float64
 10  weekend                         1467439 non-null  float64
 11  holiday                         1467439 non-null  float64
 12  

In [41]:
# Get a statistical summary of the dataset
test_df.describe()

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,weekend,holiday,month,day,hour,minute,usage_rate_category
count,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0
mean,-0.001110399,-0.0009088503,-0.0002253745,0.00110414,8.589169e-05,0.0006632739,0.001124246,0.0005678869,0.002040834,0.0002704992,0.0003843302,-0.0003428524,-3.142766e-05,-0.0008078574,2.174742e-05,0.0012127,2.071246
std,1.000178,0.9984484,1.001204,1.000368,1.000515,1.000051,0.9999969,1.000527,0.9995878,1.000361,1.000183,0.9990089,0.9997948,0.9996474,0.9993584,1.000312,0.5689108
min,-1.708774,-1.671316,-5.357456,-4.155406,-3.019177,-4.261891,-2.012174,-0.4363365,-1.463109,-0.703502,-0.6318653,-0.168116,-1.600382,-1.671302,-1.661567,-1.593049,1.0
25%,-0.7918459,-0.6671623,-0.5379373,-0.5212637,-0.6958494,0.3060765,-0.8011921,-0.4363365,-1.01458,-0.5481358,-0.6318653,-0.168116,-0.7309411,-0.8764904,-0.7946604,-0.7239626,2.0
50%,-0.04163215,-0.6671623,0.04624675,0.03783516,-0.1537396,0.3060765,-0.1957009,-0.4363365,-0.117522,-0.5481358,-0.6318653,-0.168116,0.1384995,0.03186537,0.07224641,0.145124,2.0
75%,0.8752958,0.336991,0.7764768,0.596934,0.6981473,0.3060765,0.7125358,-0.4363365,0.7795363,-0.09145356,1.582616,-0.168116,1.00794,0.8266767,0.7946688,1.014211,2.0
max,1.708867,2.345298,2.236937,2.55378,3.021475,7.919356,4.950974,2.291809,2.125124,1.820021,1.582616,5.948273,1.587567,1.735032,1.661576,1.593602,3.0


## Spliting Training Set into Training and Validation Sets

In [42]:
# Separate features and target variable from the train set
X_train_full = train_df.drop(columns=['usage_rate_category'])
y_train_full = train_df['usage_rate_category']

# Split the training dataset into training and validation subsets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Separate features and target variable from the test set
X_test = test_df.drop(columns=['usage_rate_category'])
y_test = test_df['usage_rate_category']

# Check the shapes of the datasets
print(f'Shape of test set: {X_train.shape}')
print(f'Shape of validation set: {X_val.shape}')
print(f'Shape of train set: {X_test.shape}')

Shape of test set: (9465146, 16)
Shape of validation set: (2366287, 16)
Shape of train set: (1467439, 16)


## Baseline Modeling

#### Logistic Regression

Let's fit a logistic regression model on the data and analyze the test and train accuracy.

In [43]:
# Start the timer
start_time = time.time()

# Initialize the model
logreg = LogisticRegression(random_state=42)

# Fitting a model
logreg.fit(X_train, y_train)

# Get class predictions
y_pred_logreg = logreg.predict(X_val)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Training and test score
print('Logistic Regression')
print(f"Train score: {logreg.score(X_train, y_train)}")
print(f"Test score: {logreg.score(X_val, y_val)}")

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_logreg))

# Classification report 
print("\nClassification Report:")
print(classification_report(y_val, y_pred_logreg))

# Print the elapsed time
print(f"\nElapsed time: {elapsed_time:.2f} seconds")

Logistic Regression
Train score: 0.39916795789520837
Test score: 0.39969623295906204

Confusion Matrix:
[[287923 248509 251837]
 [211851 357588 219460]
 [248069 240765 300285]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.39      0.37      0.37    788269
         2.0       0.42      0.45      0.44    788899
         3.0       0.39      0.38      0.38    789119

    accuracy                           0.40   2366287
   macro avg       0.40      0.40      0.40   2366287
weighted avg       0.40      0.40      0.40   2366287


Elapsed time: 26.92 seconds


#### Decision Tree

In [44]:
# Start the timer
start_time = time.time()

# Initialize the model
dt_model = DecisionTreeClassifier(random_state=42)

# Fitting a model
dt_model.fit(X_train, y_train);

# Get class predictions
y_pred_dt_model = dt_model.predict(X_val)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Evaluate its classification accuracy
print('Decision Tree:')
print(f'Train score: {dt_model.score(X_train, y_train)}')
print(f'Test score: {dt_model.score(X_val, y_val)}')

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_dt_model))

# Classification report 
print("\nClassification Report:")
print(classification_report(y_val, y_pred_dt_model))

# Print the elapsed time
print(f"\nElapsed time: {elapsed_time:.2f} seconds")

Decision Tree:
Train score: 1.0
Test score: 0.980181609415933

Confusion Matrix:
[[774722   7458   6089]
 [  8086 771209   9604]
 [  6324   9335 773460]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.98      0.98      0.98    788269
         2.0       0.98      0.98      0.98    788899
         3.0       0.98      0.98      0.98    789119

    accuracy                           0.98   2366287
   macro avg       0.98      0.98      0.98   2366287
weighted avg       0.98      0.98      0.98   2366287


Elapsed time: 232.04 seconds


#### Random Forest

In [45]:
# Start the timer
start_time = time.time()

# Initialize the model
random_forest_model = RandomForestClassifier(random_state=42)

# Fitting a model
random_forest_model.fit(X_train, y_train)

# Evaluate its classification accuracy
print(f'Train score: {random_forest_model.score(X_train, y_train)}')
print(f'Test score: {random_forest_model.score(X_val, y_val)}')

# Get class predictions
y_pred_random_forest_model = random_forest_model.predict(X_val)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_random_forest_model))

# Classification report 
print("\nClassification Report:")
print(classification_report(y_val, y_pred_random_forest_model))

# Print the elapsed time
print(f"\nElapsed time: {elapsed_time:.2f} seconds")

Train score: 0.999997886984522
Test score: 0.9848467240026252

Confusion Matrix:
[[779375   5119   3775]
 [  7462 773460   7977]
 [  3584   7940 777595]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.99      0.99      0.99    788269
         2.0       0.98      0.98      0.98    788899
         3.0       0.99      0.99      0.99    789119

    accuracy                           0.98   2366287
   macro avg       0.98      0.98      0.98   2366287
weighted avg       0.98      0.98      0.98   2366287


Elapsed time: 9548.81 seconds


#### K-Nearest Neighbors (KNN)

In [None]:
# Start the timer
start_time = time.time()

# Initialize the model
KNN_model = KNeighborsClassifier(n_neighbors=3)

# Fitting a model
KNN_model.fit(X_train, y_train)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Evaluate its classification accuracy
print(f'Train score: {KNN_model.score(X_train, y_train)}')
print(f'Test score: {KNN_model.score(X_val, y_val)}')

# Get class predictions
y_pred_KNN_model = KNN_model.predict(X_val)

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_KNN_model))

# Classification report 
print("\nClassification Report:")
print(classification_report(y_val, y_pred_KNN_model))

# Print the elapsed time
print(f"\nElapsed time: {elapsed_time:.2f} seconds")

#### Support Vector Machine (SVM)

In [None]:
# Start the timer
start_time = time.time()

# Initialize the model
svm = SVC(random_state=42)

# Fitting a model
svm.fit(X_train, y_train)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Evaluate its classification accuracy
print(f'Train score: {svm.score(X_train, y_train)}')
print(f'Test score: {svm.score(X_val, y_val)}')

# Get class predictions
y_pred_svm = svm.predict(X_val)

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_svm))

# Classification report 
print("\nClassification Report:")
print(classification_report(y_val, y_pred_svm))

# Print the elapsed time
print(f"\nElapsed time: {elapsed_time:.2f} seconds")

#### XGBoost

In [None]:
# Start the timer
start_time = time.time()

# Initialize the model
xgbc = XGBClassifier()

# Fitting a model
xgbc.fit(X_train, y_train)

# Stop the timer
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Evaluate its classification accuracy
print(f'Train score: {xgbc.score(X_train, y_train)}')
print(f'Test score: {xgbc.score(X_val, y_val)}')

# Get class predictions
y_pred_xgbc = xgbc.predict(X_val)

# Generate confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_xgbc))

# Classification report 
print("\nClassification Report:")
print(classification_report(y_val, y_pred_xgbc))

# Print the elapsed time
print(f"\nElapsed time: {elapsed_time:.2f} seconds")