# Modeling - Feature Engineering - Bike Share System in the SF Bay Area

Author: Owen Hsu

## Table of contents

1. Loading and Setup
2. Assessment
3. Splitting Training Set into Training and Validation Sets
4. Baseline Modeling
   * Logistic Regression
   * Decision Tree
   * Random Forest
   * K-Nearest Neighbors (KNN)
   * Support Vector Machine (SVM)
   * XGBoost


## Loading and Setup

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import time
import pickle
import os
# Filter warnings
from warnings import filterwarnings
filterwarnings('ignore')

  from pandas.core import (


In [51]:
# Read the train and test splits from their parquet files (paths are relative to the notebook)
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('data/train_dataset.parquet', 'data/test_dataset.parquet')
)

## Assessment

In [52]:
# Lift pandas' display truncation for both columns and rows.
# `set_option` accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

#### Train Data

In [53]:
# Display the (rows, columns) shape of the train data as the cell's output
train_df.shape

(11831433, 19)

In [54]:
# Look at the first 5 rows of the train dataset (`head()` defaults to 5 rows)
train_df.head()

Unnamed: 0,station_id,bikes_available,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute,usage_rate_category
0,-1.375345,-0.789945,-0.667162,-0.099799,-0.428081,0.38837,0.306077,-1.406683,-0.436337,-1.01458,1.820021,-0.168116,1.482137,1.427242,-1.310568,-0.195224,1.228122,-1.303354,2.0
1,1.125367,0.647661,0.336991,0.776477,0.410568,-0.15374,0.306077,1.015281,-0.436337,0.779536,-0.548136,-0.168116,1.482137,-0.700652,0.428313,0.14541,0.216731,-0.434267,2.0
2,1.125367,1.366463,0.336991,-1.122121,0.410568,2.0147,-0.455251,-0.195701,-0.436337,-0.566051,-0.548136,-0.168116,-0.674701,1.427242,-1.600382,0.713132,-1.372598,-0.723963,1.0
3,0.083403,0.168459,-0.667162,-0.537937,1.715132,0.543259,-2.739235,-1.103938,-0.436337,0.779536,-0.548136,-0.168116,1.482137,1.427242,-1.600382,-0.535857,0.939153,-0.434267,2.0
4,0.125082,-0.310743,-0.667162,0.776477,1.06285,-0.463517,0.306077,0.712536,-0.436337,0.331007,-0.548136,-0.168116,-0.674701,-0.700652,1.00794,-0.762946,0.650184,1.593602,2.0


In [55]:
# Spot-check 10 random rows of the train dataset.
# The sample is seeded so a fresh "Restart & Run All" shows the same rows every time.
train_df.sample(10, random_state=42)

Unnamed: 0,station_id,bikes_available,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute,usage_rate_category
1942511,0.75026,1.845665,2.345298,0.484385,0.503751,-0.773294,0.306077,-0.195701,-0.436337,0.779536,-0.548136,-0.168116,-0.674701,-0.700652,1.00794,1.621488,0.650184,0.145124,2.0
6175911,-0.333382,-0.071142,-1.671316,1.360661,0.131018,-1.392848,0.306077,0.712536,-0.436337,-0.117522,-0.091454,-0.168116,-0.674701,-0.700652,0.428313,0.372499,-0.74032,-1.412307,1.0
4300025,-1.583738,-0.550344,0.336991,-0.391891,-1.173546,0.853036,0.306077,-0.195701,-0.436337,-0.566051,1.820021,-0.168116,1.482137,1.427242,-1.020755,1.394399,0.794669,1.303906,2.0
11180396,1.62551,-1.269147,-0.667162,1.214615,0.410568,-0.386072,0.306077,1.318027,-0.436337,1.228065,-0.548136,-0.168116,-0.674701,-0.700652,0.138499,-0.081679,-1.071216,1.279017,3.0
11256701,1.708867,-1.269147,-0.667162,0.922523,-0.334897,-0.618405,0.306077,0.107045,-0.436337,-1.01458,1.820021,-0.168116,-0.674701,-0.700652,0.138499,0.940221,0.072246,-1.485883,3.0
10168407,-1.125274,-1.269147,-0.667162,-0.830029,0.503751,0.310926,0.306077,-0.498447,2.291809,0.779536,1.820021,-0.168116,1.482137,-0.700652,1.587567,-0.195224,-0.778723,-0.434267,3.0
11055743,0.75026,-0.643562,2.345298,0.776477,0.224201,-0.308628,0.306077,0.712536,2.291809,1.228065,-0.548136,-0.168116,1.482137,-0.700652,0.428313,-0.649401,0.272943,-0.434267,3.0
764696,0.583546,-0.550344,-0.667162,-0.391891,1.342399,0.93048,-1.216579,-1.103938,-0.436337,0.331007,-0.548136,-0.168116,-0.674701,1.427242,-1.310568,-1.557757,-1.228114,1.014211,2.0
8217357,0.291796,2.725167,1.341144,0.338339,-1.266729,-0.773294,0.306077,-0.498447,-0.436337,-1.01458,-0.548136,-0.168116,1.482137,-0.700652,1.00794,-1.217124,0.986733,-0.434267,1.0
3705682,1.250403,-1.269147,-0.667162,0.484385,1.156033,-0.773294,0.306077,-0.801192,-0.436337,-1.01458,-0.548136,-0.168116,1.482137,-0.700652,1.297754,-0.762946,0.650184,-0.144572,3.0


In [56]:
# Get a quick overview of the train dataset's columns and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11831433 entries, 0 to 11831432
Data columns (total 19 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   station_id                      float64
 1   bikes_available                 float64
 2   dock_count                      float64
 3   mean_dew_point_f                float64
 4   mean_humidity                   float64
 5   mean_sea_level_pressure_inches  float64
 6   mean_visibility_miles           float64
 7   mean_wind_speed_mph             float64
 8   precipitation_inches            float64
 9   cloud_cover                     float64
 10  zip_code                        float64
 11  holiday                         float64
 12  is_weekend_or_holiday           float64
 13  year                            float64
 14  month                           float64
 15  day                             float64
 16  hour                            float64
 17  minute                   

In [57]:
# Get a statistical summary (count, mean, std, min, quartiles, max) of the train dataset
train_df.describe()

Unnamed: 0,station_id,bikes_available,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute,usage_rate_category
count,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0,11831430.0
mean,0.08728673,0.1407248,0.04538602,0.0006191241,0.02849037,0.0132462,-0.03006021,0.03542641,0.02947676,0.04194595,-0.05911484,-0.01487793,-0.03440722,0.0375204,-0.005401867,0.001451822,-0.008437618,0.0003482028,2.0
std,0.9863458,1.296435,1.019171,0.9762055,0.9795221,1.004863,0.9955864,1.016661,1.02656,1.002546,0.9581599,0.9559194,0.9854109,1.012845,1.012741,0.9917044,1.006911,0.9949198,0.8164966
min,-1.708774,-1.98795,-1.671316,-5.357456,-4.155406,-3.019177,-4.261891,-2.012174,-0.4363365,-1.463109,-0.703502,-0.168116,-0.6747013,-0.7006522,-1.600382,-1.671302,-1.661567,-1.593049,1.0
25%,-0.6668103,-1.029546,-0.6671623,-0.5379373,-0.4280805,-0.6958494,0.3060765,-0.8011921,-0.4363365,-1.01458,-0.5481358,-0.168116,-0.6747013,-0.7006522,-1.020755,-0.8764904,-0.9391449,-0.8729036,1.0
50%,0.2084391,-0.07114224,-0.6671623,0.04624675,0.1310183,-0.1537396,0.3060765,-0.1957009,-0.4363365,-0.117522,-0.5481358,-0.168116,-0.6747013,-0.7006522,0.1384995,0.03186537,-0.06895456,0.008465121,2.0
75%,0.9586529,0.8872615,0.336991,0.7764768,0.596934,0.7755915,0.3060765,0.7125358,-0.4363365,0.7795363,-0.09145356,-0.168116,1.482137,1.427242,1.00794,0.8266767,0.9187592,0.8672886,3.0
max,1.708867,4.481276,2.345298,2.236937,2.55378,3.021475,7.919356,4.950974,2.291809,2.125124,1.820021,5.948273,1.482137,1.427242,1.587567,1.735032,1.661576,1.593602,3.0


#### Test Data

In [58]:
# Display the (rows, columns) shape of the test data as the cell's output
test_df.shape

(1467439, 19)

In [59]:
# Look at the first 5 rows of the test dataset (`head()` defaults to 5 rows)
test_df.head()

Unnamed: 0,station_id,bikes_available,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute,usage_rate_category
0,0.75026,-1.029546,2.345298,0.922523,0.596934,0.001149,0.306077,-0.801192,2.291809,0.779536,-0.548136,-0.168116,-0.674701,-0.700652,1.00794,0.826677,-1.083629,0.724515,3.0
1,-0.416739,-0.071142,-0.667162,0.776477,1.249216,-0.540961,-0.455251,0.107045,2.291809,-0.117522,-0.703502,-0.168116,-0.674701,-0.700652,1.587567,-0.649401,-1.661567,-0.434267,2.0
2,-0.37506,-1.269147,1.341144,0.192293,-0.800813,0.853036,0.306077,0.40979,-0.436337,-0.117522,-0.091454,-0.168116,1.482137,1.427242,-1.020755,1.394399,-0.216723,-1.303354,3.0
3,1.33376,-0.550344,0.336991,0.922523,0.317385,-0.540961,0.306077,0.712536,-0.436337,0.779536,-0.548136,-0.168116,-0.674701,-0.700652,0.138499,1.735032,1.661576,0.724515,2.0
4,-1.667095,-0.550344,-0.667162,-0.537937,-0.893996,-0.463517,0.306077,0.712536,-0.436337,0.779536,1.820021,-0.168116,-0.674701,-0.700652,-0.441128,0.372499,-0.361207,0.145124,2.0


In [60]:
# Spot-check 10 random rows of the test dataset.
# The sample is seeded so a fresh "Restart & Run All" shows the same rows every time.
test_df.sample(10, random_state=42)

Unnamed: 0,station_id,bikes_available,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute,usage_rate_category
1028328,-0.500096,0.168459,-0.667162,1.068569,-0.241714,-0.231184,0.306077,0.712536,-0.436337,-1.463109,-0.703502,-0.168116,-0.674701,-0.700652,0.138499,0.14541,0.072246,0.145124,2.0
560458,-0.291703,-0.071142,-0.667162,0.338339,-2.384926,0.310926,0.170572,-0.195701,-0.436337,0.331007,-0.091454,-0.168116,-0.674701,-0.700652,-0.441128,-1.671302,0.939153,-0.723963,2.0
908766,1.375438,-1.269147,0.336991,0.776477,0.969667,-0.231184,0.306077,1.923518,2.291809,1.676595,-0.548136,-0.168116,-0.674701,-0.700652,-0.151314,1.053766,1.228122,-1.593049,3.0
632807,1.000331,1.126862,2.345298,-0.537937,0.410568,1.859811,-0.455251,-1.103938,-0.436337,-0.566051,-0.548136,-0.168116,-0.674701,-0.700652,1.297754,1.16731,-0.216723,1.593602,2.0
1137048,0.791939,-0.310743,0.336991,-0.976075,0.037835,-0.695849,-0.455251,1.318027,2.291809,0.779536,-0.548136,-0.168116,1.482137,1.427242,-0.730941,-1.217124,1.517091,1.014211,2.0
864556,-1.208631,0.647661,0.336991,0.922523,1.715132,2.0147,0.306077,-0.195701,2.291809,1.676595,1.820021,-0.168116,1.482137,-0.700652,1.587567,0.599588,-0.216723,1.014211,2.0
494204,1.083688,2.564468,1.341144,-0.245845,0.7833,0.38837,-1.216579,0.107045,-0.436337,-0.117522,-0.548136,-0.168116,1.482137,1.427242,-1.020755,-0.87649,-0.650176,1.303906,1.0
756455,0.333475,1.126862,0.336991,-0.099799,-0.055348,1.937255,0.306077,-0.801192,-0.436337,-0.566051,-0.548136,-0.168116,-0.674701,-0.700652,1.297754,-1.330668,-1.517083,1.593602,2.0
1309375,-1.500381,1.606064,-0.667162,-0.391891,-1.173546,0.853036,0.306077,-0.195701,-0.436337,-0.566051,1.820021,-0.168116,1.482137,1.427242,-1.020755,1.394399,-1.083629,1.593602,1.0
1311516,1.292081,2.564468,1.341144,-0.537937,1.715132,1.550034,-1.977907,-1.406683,-0.436337,-0.117522,-0.548136,-0.168116,-0.674701,1.427242,-1.600382,-0.422313,1.372607,0.43482,1.0


In [61]:
# Get a quick overview of the test dataset's columns, non-null counts and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467439 entries, 0 to 1467438
Data columns (total 19 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   station_id                      1467439 non-null  float64
 1   bikes_available                 1467439 non-null  float64
 2   dock_count                      1467439 non-null  float64
 3   mean_dew_point_f                1467439 non-null  float64
 4   mean_humidity                   1467439 non-null  float64
 5   mean_sea_level_pressure_inches  1467439 non-null  float64
 6   mean_visibility_miles           1467439 non-null  float64
 7   mean_wind_speed_mph             1467439 non-null  float64
 8   precipitation_inches            1467439 non-null  float64
 9   cloud_cover                     1467439 non-null  float64
 10  zip_code                        1467439 non-null  float64
 11  holiday                         1467439 non-null  float64
 12  

In [62]:
# Get a statistical summary (count, mean, std, min, quartiles, max) of the test dataset
test_df.describe()

Unnamed: 0,station_id,bikes_available,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute,usage_rate_category
count,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0,1467439.0
mean,-0.001110399,-0.0007235799,-0.0009088503,-0.0002253745,0.00110414,8.589169e-05,0.0006632739,0.001124246,0.0005678869,0.002040834,0.0002704992,-0.0003428524,0.0002534252,0.0001767896,-3.142766e-05,-0.0008078574,2.174742e-05,0.0012127,2.071246
std,1.000178,0.9994657,0.9984484,1.001204,1.000368,1.000515,1.000051,0.9999969,1.000527,0.9995878,1.000361,0.9990089,1.000103,1.000065,0.9997948,0.9996474,0.9993584,1.000312,0.5689108
min,-1.708774,-1.98795,-1.671316,-5.357456,-4.155406,-3.019177,-4.261891,-2.012174,-0.4363365,-1.463109,-0.703502,-0.168116,-0.6747013,-0.7006522,-1.600382,-1.671302,-1.661567,-1.593049,1.0
25%,-0.7918459,-0.7899451,-0.6671623,-0.5379373,-0.5212637,-0.6958494,0.3060765,-0.8011921,-0.4363365,-1.01458,-0.5481358,-0.168116,-0.6747013,-0.7006522,-0.7309411,-0.8764904,-0.7946604,-0.7239626,2.0
50%,-0.04163215,-0.07114224,-0.6671623,0.04624675,0.03783516,-0.1537396,0.3060765,-0.1957009,-0.4363365,-0.117522,-0.5481358,-0.168116,-0.6747013,-0.7006522,0.1384995,0.03186537,0.07224641,0.145124,2.0
75%,0.8752958,0.6476606,0.336991,0.7764768,0.596934,0.6981473,0.3060765,0.7125358,-0.4363365,0.7795363,-0.09145356,-0.168116,1.482137,1.427242,1.00794,0.8266767,0.7946688,1.014211,2.0
max,1.708867,4.481276,2.345298,2.236937,2.55378,3.021475,7.919356,4.950974,2.291809,2.125124,1.820021,5.948273,1.482137,1.427242,1.587567,1.735032,1.661576,1.593602,3.0


## Splitting Training Set into Training and Validation Sets

In [63]:
# Separate features and target variable from the train set
X_train_full = train_df.drop(columns=['usage_rate_category'])
y_train_full = train_df['usage_rate_category']

# Hold out 20% of the training data as a validation subset (seeded for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Separate features and target variable from the test set
X_test = test_df.drop(columns=['usage_rate_category'])
y_test = test_df['usage_rate_category']

# Check the shapes of the datasets.
# Each label names the frame actually printed: X_train is the train set, X_test the test set.
print(f'Shape of train set: {X_train.shape}')
print(f'Shape of validation set: {X_val.shape}')
print(f'Shape of test set: {X_test.shape}')

Shape of test set: (9465146, 18)
Shape of validation set: (2366287, 18)
Shape of train set: (1467439, 18)


## Baseline Modeling

#### Logistic Regression

Let's fit a logistic regression model on the training data and compare its accuracy on the training and validation sets.

In [64]:
# Initialize the model
logreg = LogisticRegression(random_state=42)

# Fit on the training subset
logreg.fit(X_train, y_train)

# Predict the validation subset once; the predictions are reused for every metric below
y_pred_logreg = logreg.predict(X_val)

# Training and validation accuracy.
# The held-out data here is X_val/y_val (not X_test/y_test), so it is reported as the validation score.
# accuracy_score on the stored predictions avoids a second prediction pass over X_val.
print('Logistic Regression')
print(f"Train score: {logreg.score(X_train, y_train)}")
print(f"Validation score: {accuracy_score(y_val, y_pred_logreg)}")

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_logreg))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_logreg))

Logistic Regression
Train score: 0.9877043629332289
Test score: 0.9876071668398635

Confusion Matrix:
[[781460   6559    250]
 [  5808 773991   9100]
 [     0   7608 781511]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.99      0.99      0.99    788269
         2.0       0.98      0.98      0.98    788899
         3.0       0.99      0.99      0.99    789119

    accuracy                           0.99   2366287
   macro avg       0.99      0.99      0.99   2366287
weighted avg       0.99      0.99      0.99   2366287



Test: refit the logistic regression model without the `bikes_available` feature.

In [66]:
# Training features without `bikes_available`.
# `DataFrame.drop` already returns a new frame and leaves X_train untouched,
# so no separate `.copy()` of the full training frame is needed.
X_train_2 = X_train.drop(columns=['bikes_available'])
X_train_2.head()

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute
3814887,1.125367,0.336991,-0.099799,-0.800813,0.310926,0.306077,0.107045,-0.436337,-1.01458,-0.548136,-0.168116,-0.674701,-0.700652,1.00794,0.599588,0.361215,0.43482
8633336,-0.166668,0.336991,0.046247,0.131018,0.543259,0.306077,-0.195701,-0.436337,-1.01458,-0.548136,-0.168116,-0.674701,-0.700652,1.00794,1.394399,0.908514,-1.303354
4123750,-0.333382,-1.671316,0.630431,-1.080363,-0.463517,7.919356,0.107045,-0.436337,0.331007,-0.091454,-0.168116,-0.674701,-0.700652,-0.151314,0.486043,-0.361207,1.593602
3925458,0.166761,0.336991,0.192293,-0.334897,0.310926,0.306077,1.620772,-0.436337,0.779536,-0.548136,-0.168116,-0.674701,-0.700652,-0.441128,0.031865,0.939153,1.593602
2080432,-0.416739,-0.667162,-0.830029,0.503751,0.38837,0.306077,-0.801192,2.291809,-0.117522,-0.703502,-0.168116,1.482137,-0.700652,1.587567,-0.195224,0.794669,1.014211


In [67]:
# Validation features without `bikes_available`.
# `DataFrame.drop` already returns a new frame and leaves X_val untouched,
# so no separate `.copy()` of the validation frame is needed.
X_val_2 = X_val.drop(columns=['bikes_available'])
X_val_2.head()

Unnamed: 0,station_id,dock_count,mean_dew_point_f,mean_humidity,mean_sea_level_pressure_inches,mean_visibility_miles,mean_wind_speed_mph,precipitation_inches,cloud_cover,zip_code,holiday,is_weekend_or_holiday,year,month,day,hour,minute
11686255,0.75026,2.345298,-0.391891,-1.266729,-0.618405,0.306077,0.40979,-0.436337,-1.01458,-0.548136,-0.168116,-0.674701,-0.700652,-0.441128,1.507944,1.083638,0.818771
2200011,-0.791846,-0.667162,0.776477,0.410568,-0.850738,0.306077,-0.801192,-0.436337,-0.117522,-0.651713,-0.168116,1.482137,-0.700652,0.428313,0.826677,-1.372598,-1.593049
549345,0.75026,2.345298,0.776477,0.410568,-1.005626,-0.455251,-0.498447,-0.436337,-1.01458,-0.548136,-0.168116,-0.674701,-0.700652,1.00794,-0.87649,-0.216723,-0.434267
3255008,-0.708489,-0.667162,-0.391891,1.621949,0.465814,-0.455251,-1.709429,2.291809,1.228065,-0.651713,-0.168116,1.482137,-0.700652,1.587567,-0.195224,-0.072238,0.145124
6006242,-0.208346,-0.667162,1.506707,-0.428081,-1.857513,0.306077,0.107045,-0.436337,-0.566051,-0.091454,-0.168116,-0.674701,-0.700652,0.718126,0.14541,0.505822,0.434576


In [68]:
# Initialize the model.
# NOTE: this rebinds `logreg` / `y_pred_logreg`, replacing the model fitted on the full feature set.
logreg = LogisticRegression(random_state=42)

# Fit on the training features that exclude `bikes_available`
logreg.fit(X_train_2, y_train)

# Predict the validation subset once; the predictions are reused for every metric below
y_pred_logreg = logreg.predict(X_val_2)

# Training and validation accuracy.
# The held-out data here is X_val_2/y_val (not the test set), so it is reported as the validation score.
# accuracy_score on the stored predictions avoids a second prediction pass over X_val_2.
print('Logistic Regression')
print(f"Train score: {logreg.score(X_train_2, y_train)}")
print(f"Validation score: {accuracy_score(y_val, y_pred_logreg)}")

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_logreg))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_logreg))

Logistic Regression
Train score: 0.40004676103252923
Test score: 0.4007438658117126

Confusion Matrix:
[[292757 266410 229102]
 [216754 381596 190549]
 [251265 263932 273922]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.38      0.37      0.38    788269
         2.0       0.42      0.48      0.45    788899
         3.0       0.39      0.35      0.37    789119

    accuracy                           0.40   2366287
   macro avg       0.40      0.40      0.40   2366287
weighted avg       0.40      0.40      0.40   2366287



#### Decision Tree

In [65]:
# Initialize the model
dt_model = DecisionTreeClassifier(random_state=42)

# Fit on the training subset (trailing `;` suppresses the estimator repr)
dt_model.fit(X_train, y_train);

# Predict the validation subset once; the predictions are reused for every metric below
y_pred_dt_model = dt_model.predict(X_val)

# Training and validation accuracy.
# The held-out data here is X_val/y_val (not X_test/y_test), so it is reported as the validation score.
# accuracy_score on the stored predictions avoids a second prediction pass over X_val.
print('Decision Tree:')
print(f'Train score: {dt_model.score(X_train, y_train)}')
print(f'Validation score: {accuracy_score(y_val, y_pred_dt_model)}')

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_dt_model))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_dt_model))

Decision Tree:
Train score: 1.0
Test score: 0.9996450134746968

Confusion Matrix:
[[788098    168      3]
 [   207 788466    226]
 [     3    233 788883]]

Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00    788269
         2.0       1.00      1.00      1.00    788899
         3.0       1.00      1.00      1.00    789119

    accuracy                           1.00   2366287
   macro avg       1.00      1.00      1.00   2366287
weighted avg       1.00      1.00      1.00   2366287



Test: refit the decision tree model without the `bikes_available` feature.

In [69]:
# Initialize the model.
# NOTE: this rebinds `dt_model` / `y_pred_dt_model`, replacing the tree fitted on the full feature set.
dt_model = DecisionTreeClassifier(random_state=42)

# Fit on the training features that exclude `bikes_available` (trailing `;` suppresses the estimator repr)
dt_model.fit(X_train_2, y_train);

# Predict the validation subset once; the predictions are reused for every metric below
y_pred_dt_model = dt_model.predict(X_val_2)

# Training and validation accuracy.
# The held-out data here is X_val_2/y_val (not the test set), so it is reported as the validation score.
# accuracy_score on the stored predictions avoids a second prediction pass over X_val_2.
print('Decision Tree:')
print(f'Train score: {dt_model.score(X_train_2, y_train)}')
print(f'Validation score: {accuracy_score(y_val, y_pred_dt_model)}')

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_dt_model))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_dt_model))

Decision Tree:
Train score: 1.0
Test score: 0.9789252106781637

Confusion Matrix:
[[773904   8150   6215]
 [  8644 769758  10497]
 [  6522   9841 772756]]

Classification Report:
              precision    recall  f1-score   support

         1.0       0.98      0.98      0.98    788269
         2.0       0.98      0.98      0.98    788899
         3.0       0.98      0.98      0.98    789119

    accuracy                           0.98   2366287
   macro avg       0.98      0.98      0.98   2366287
weighted avg       0.98      0.98      0.98   2366287



#### Random Forest

In [None]:
# Initialize the model
random_forest_model = RandomForestClassifier(random_state=42)

# Fit on the training subset
random_forest_model.fit(X_train, y_train)

# Predict the validation subset once; the predictions feed the accuracy,
# the confusion matrix and the classification report, so X_val is not scored twice.
y_pred_random_forest_model = random_forest_model.predict(X_val)

# Training and validation accuracy.
# The held-out data here is X_val/y_val (not X_test/y_test), so it is reported as the validation score.
print(f'Train score: {random_forest_model.score(X_train, y_train)}')
print(f'Validation score: {accuracy_score(y_val, y_pred_random_forest_model)}')

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_random_forest_model))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_random_forest_model))

#### K-Nearest Neighbors (KNN)

In [None]:
# Initialize the model (3 nearest neighbours)
KNN_model = KNeighborsClassifier(n_neighbors=3)

# Fit on the training subset
KNN_model.fit(X_train, y_train)

# Predict the validation subset once; the predictions feed the accuracy,
# the confusion matrix and the classification report, so X_val is not scored twice.
y_pred_KNN_model = KNN_model.predict(X_val)

# Training and validation accuracy.
# The held-out data here is X_val/y_val (not X_test/y_test), so it is reported as the validation score.
# NOTE(review): scoring KNN on the full training subset runs a neighbour search for every
# training row and is likely very slow at this data size - consider scoring a sample instead.
print(f'Train score: {KNN_model.score(X_train, y_train)}')
print(f'Validation score: {accuracy_score(y_val, y_pred_KNN_model)}')

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_KNN_model))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_KNN_model))

#### Support Vector Machine (SVM)

In [None]:
# Initialize the model
svm = SVC(random_state=42)

# Fit on the training subset.
# NOTE(review): scikit-learn documents kernel SVC fit time as scaling at least quadratically
# with the number of samples; on the full training subset this may not finish in practical
# time - confirm feasibility or fit on a subsample / use a linear SVM instead.
svm.fit(X_train, y_train)

# Predict the validation subset once; the predictions feed the accuracy,
# the confusion matrix and the classification report, so X_val is not scored twice.
y_pred_svm = svm.predict(X_val)

# Training and validation accuracy.
# The held-out data here is X_val/y_val (not X_test/y_test), so it is reported as the validation score.
print(f'Train score: {svm.score(X_train, y_train)}')
print(f'Validation score: {accuracy_score(y_val, y_pred_svm)}')

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_svm))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_svm))

#### XGBoost

In [None]:
# Initialize the model (seeded like the other baselines)
xgbc = XGBClassifier(random_state=42)

# XGBoost expects class labels encoded as consecutive integers starting at 0, but the
# target holds 1.0 / 2.0 / 3.0. Encode the labels as indices into `xgb_classes`
# (the sorted unique original labels) so predictions can be mapped back afterwards.
xgb_classes, y_train_xgb = np.unique(y_train.to_numpy(), return_inverse=True)

# Fit on the training subset with the encoded labels
xgbc.fit(X_train, y_train_xgb)

# Predict once per subset and decode the class indices back to the original labels
y_pred_train_xgbc = xgb_classes[xgbc.predict(X_train).astype(int)]
y_pred_xgbc = xgb_classes[xgbc.predict(X_val).astype(int)]

# Training and validation accuracy, computed against the original (un-encoded) labels.
# The held-out data here is X_val/y_val (not X_test/y_test), so it is reported as the validation score.
print(f'Train score: {accuracy_score(y_train, y_pred_train_xgbc)}')
print(f'Validation score: {accuracy_score(y_val, y_pred_xgbc)}')

# Generate confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_xgbc))

# Classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred_xgbc))