In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [86]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [94]:
df = pd.read_csv("../Data/ttc_subway_delay_2018_2019.csv", encoding='unicode_escape')
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.head()

Unnamed: 0,id,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_info,latitude,longitude,line_name,month,time_range,month_number,hour,year
0,20,2019-01-01,18:49,Tuesday,BATHURST STATION,MUI,5,9,W,BD,5126,Injured or ill Customer (On Train) - Transported,43.3958,-79.244,Bloor Danforth,January,6-9PM,1,18,2019
1,74,2019-01-04,17:46,Friday,BATHURST STATION,MUI,6,9,E,BD,5193,Injured or ill Customer (On Train) - Transported,43.3958,-79.244,Bloor Danforth,January,3-6PM,1,17,2019
2,147,2019-01-09,20:04,Wednesday,BATHURST STATION,EUDO,4,7,W,BD,5025,Door Problems - Faulty Equipment,43.3958,-79.244,Bloor Danforth,January,6-9PM,1,20,2019
3,417,2019-01-22,20:05,Tuesday,BATHURST STATION,EUNT,6,9,W,BD,5322,Equipment - No Trouble Found,43.3958,-79.244,Bloor Danforth,January,6-9PM,1,20,2019
4,736,2019-02-04,7:50,Monday,BATHURST STATION,MUIR,3,5,E,BD,5358,Injured or ill Customer (On Train) - Medical A...,43.3958,-79.244,Bloor Danforth,February,5-9AM,2,7,2019


# Select your features (columns)

In [95]:
# Drop the null rows, and everything that is not a float except station
df = df.dropna().drop(['date','time','day','code','bound','line','code_info','line_name','month','time_range'], axis=1)
df.head()

Unnamed: 0,id,station,min_delay,min_gap,vehicle,latitude,longitude,month_number,hour,year
0,20,BATHURST STATION,5,9,5126,43.3958,-79.244,1,18,2019
1,74,BATHURST STATION,6,9,5193,43.3958,-79.244,1,17,2019
2,147,BATHURST STATION,4,7,5025,43.3958,-79.244,1,20,2019
3,417,BATHURST STATION,6,9,5322,43.3958,-79.244,1,20,2019
4,736,BATHURST STATION,3,5,5358,43.3958,-79.244,2,7,2019


# Create a Train Test Split



In [96]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
X = df.drop("station", axis=1)
y = df["min_delay"]
print(X.shape, y.shape)

(13517, 9) (13517,)


In [97]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [98]:
X_train.head()

Unnamed: 0,id,min_delay,min_gap,vehicle,latitude,longitude,month_number,hour,year
3959,9267,4,7,5566,43.4232,79.2627,5,6,2018
9730,867,4,6,6666,43.4002,-79.2414,2,7,2019
1367,9178,4,8,5151,43.4103,-79.1923,5,19,2018
4843,13649,3,5,5137,43.392,-79.2735,12,7,2018
6028,8237,10,12,5305,43.3814,-79.321,3,16,2018


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [99]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [100]:
# Encode Data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

ValueError: y contains previously unseen labels: [62, 79, 87, 107]

In [9]:
# Encode Data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# categorize
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# create model
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=40))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

Train on 5243 samples
Epoch 1/50
5243/5243 - 1s - loss: 0.5413 - accuracy: 0.7265
Epoch 2/50
5243/5243 - 0s - loss: 0.3679 - accuracy: 0.8041
Epoch 3/50
5243/5243 - 0s - loss: 0.3558 - accuracy: 0.8133
Epoch 4/50
5243/5243 - 0s - loss: 0.3402 - accuracy: 0.8327
Epoch 5/50
5243/5243 - 0s - loss: 0.3399 - accuracy: 0.8249
Epoch 6/50
5243/5243 - 0s - loss: 0.3330 - accuracy: 0.8373
Epoch 7/50
5243/5243 - 0s - loss: 0.3219 - accuracy: 0.8432
Epoch 8/50
5243/5243 - 0s - loss: 0.3236 - accuracy: 0.8352
Epoch 9/50
5243/5243 - 0s - loss: 0.3146 - accuracy: 0.8522
Epoch 10/50
5243/5243 - 0s - loss: 0.3160 - accuracy: 0.8491
Epoch 11/50
5243/5243 - 0s - loss: 0.3069 - accuracy: 0.8552
Epoch 12/50
5243/5243 - 0s - loss: 0.3023 - accuracy: 0.8623
Epoch 13/50
5243/5243 - 0s - loss: 0.2974 - accuracy: 0.8632
Epoch 14/50
5243/5243 - 0s - loss: 0.3029 - accuracy: 0.8636
Epoch 15/50
5243/5243 - 0s - loss: 0.2943 - accuracy: 0.8682
Epoch 16/50
5243/5243 - 0s - loss: 0.2948 - accuracy: 0.8674
Epoch 17/50

<tensorflow.python.keras.callbacks.History at 0x17993d572b0>

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               4100      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 14,503
Trainable params: 14,503
Non-trainable params: 0
_________________________________________________________________


# Train the Model



In [11]:
# Create the SVC Model
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
# fit the model
model2.fit(X_train_scaled, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.8439824527942018
Testing Data Score: 0.8415331807780321


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.0005, 0.001]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [15]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.856, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.846, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.839, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.841, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.825, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.856, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.846, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.839, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.841, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   24.5s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.0005, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [16]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.8680138845428944


# Save the Model

In [19]:
# using results above to save the best model
best_model=SVC(kernel='linear', C=10, gamma=0.0001)

import joblib

filename = 'best_model.sav'
joblib.dump(best_model, filename)

['best_model.sav']