In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [86]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [101]:
df = pd.read_csv("../Data/ttc_subway_delay_2018_2019_for_machine_learning.csv", encoding='unicode_escape')
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.head()

Unnamed: 0,id,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_info,latitude,longitude,line_name,month,time_range,month_number,hour,year
0,7720.0,2018-02-13,6:57,Tuesday,BROADVIEW STATION,EUNT,2.0,4.0,W,BD,5285.0,Equipment - No Trouble Found,43.4037,-79.213,Bloor Danforth,February,5-9AM,2.0,6.0,2018.0
1,3147.0,2019-06-12,11:54,Wednesday,COXWELL STATION,TUNIP,2.0,5.0,W,BD,5350.0,Operator Not In Position,43.4103,-79.1923,Bloor Danforth,June,9AM-12PM,6.0,11.0,2019.0
2,11036.0,2018-07-31,17:05,Tuesday,COXWELL STATION,TUNOA,2.0,4.0,E,BD,0.0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7.0,17.0,2018.0
3,11037.0,2018-07-31,17:33,Tuesday,COXWELL STATION,TUNOA,2.0,4.0,E,BD,0.0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7.0,17.0,2018.0
4,11038.0,2018-07-31,17:40,Tuesday,COXWELL STATION,TUNOA,2.0,4.0,W,BD,0.0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7.0,17.0,2018.0


# Select your features (columns)

In [102]:
# Drop the null rows, and everything that is not a float except station
df = df.dropna().drop(['date','time','day','code','bound','line','code_info','line_name','month','time_range'], axis=1)
df.head()

Unnamed: 0,id,station,min_delay,min_gap,vehicle,latitude,longitude,month_number,hour,year
0,7720.0,BROADVIEW STATION,2.0,4.0,5285.0,43.4037,-79.213,2.0,6.0,2018.0
1,3147.0,COXWELL STATION,2.0,5.0,5350.0,43.4103,-79.1923,6.0,11.0,2019.0
2,11036.0,COXWELL STATION,2.0,4.0,0.0,43.4103,-79.1923,7.0,17.0,2018.0
3,11037.0,COXWELL STATION,2.0,4.0,0.0,43.4103,-79.1923,7.0,17.0,2018.0
4,11038.0,COXWELL STATION,2.0,4.0,0.0,43.4103,-79.1923,7.0,17.0,2018.0


# Create a Train Test Split



In [103]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
X = df.drop("station", axis=1)
y = df["min_delay"]
print(X.shape, y.shape)

(13315, 9) (13315,)


In [104]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [105]:
X_train.head()

Unnamed: 0,id,min_delay,min_gap,vehicle,latitude,longitude,month_number,hour,year
1331,9690.0,3.0,6.0,5621.0,43.7814,-79.415,6.0,18.0,2018.0
2779,431.0,3.0,5.0,5220.0,43.3944,-79.2536,1.0,8.0,2019.0
5875,2688.0,4.0,8.0,5205.0,43.3814,-79.321,5.0,22.0,2019.0
9439,6703.0,5.0,9.0,5089.0,43.4111,-79.1846,12.0,13.0,2019.0
8117,7764.0,5.0,10.0,5243.0,43.4357,-79.1549,2.0,5.0,2018.0


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [106]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [107]:
# Encode Data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [108]:
# categorize
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [132]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# create model
model = Sequential()
model.add(Dense(units=30, activation='relu', input_dim=9))
model.add(Dense(units=30, activation='relu'))
model.add(Dense(units=29, activation='softmax'))

In [133]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [134]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

Train on 8921 samples
Epoch 1/50
8921/8921 - 1s - loss: 2.2931 - accuracy: 0.3167
Epoch 2/50
8921/8921 - 0s - loss: 1.6748 - accuracy: 0.3704
Epoch 3/50
8921/8921 - 0s - loss: 1.3779 - accuracy: 0.4745
Epoch 4/50
8921/8921 - 0s - loss: 1.1189 - accuracy: 0.6030
Epoch 5/50
8921/8921 - 0s - loss: 0.9097 - accuracy: 0.7307
Epoch 6/50
8921/8921 - 0s - loss: 0.7513 - accuracy: 0.8115
Epoch 7/50
8921/8921 - 0s - loss: 0.6244 - accuracy: 0.8508
Epoch 8/50
8921/8921 - 0s - loss: 0.5173 - accuracy: 0.8880
Epoch 9/50
8921/8921 - 0s - loss: 0.4344 - accuracy: 0.9083
Epoch 10/50
8921/8921 - 0s - loss: 0.3688 - accuracy: 0.9240
Epoch 11/50
8921/8921 - 0s - loss: 0.3180 - accuracy: 0.9334
Epoch 12/50
8921/8921 - 0s - loss: 0.2798 - accuracy: 0.9399
Epoch 13/50
8921/8921 - 0s - loss: 0.2485 - accuracy: 0.9466
Epoch 14/50
8921/8921 - 0s - loss: 0.2246 - accuracy: 0.9505
Epoch 15/50
8921/8921 - 0s - loss: 0.2051 - accuracy: 0.9533
Epoch 16/50
8921/8921 - 0s - loss: 0.1880 - accuracy: 0.9568
Epoch 17/50

<tensorflow.python.keras.callbacks.History at 0x12d720ed0f0>

In [135]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 30)                300       
_________________________________________________________________
dense_13 (Dense)             (None, 30)                930       
_________________________________________________________________
dense_14 (Dense)             (None, 29)                899       
Total params: 2,129
Trainable params: 2,129
Non-trainable params: 0
_________________________________________________________________


# Train the Model



In [136]:
# Create the SVC Model
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [137]:
# fit the model
model2.fit(X_train_scaled, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [138]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.7372491873108395
Testing Data Score: 0.7289485662266727


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [14]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.0005, 0.001]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [15]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.856, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.846, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.839, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.841, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.825, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.856, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.846, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.839, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.841, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   24.5s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.0005, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [16]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.8680138845428944


# Save the Model

In [19]:
# using results above to save the best model
best_model=SVC(kernel='linear', C=10, gamma=0.0001)

import joblib

filename = 'best_model.sav'
joblib.dump(best_model, filename)

['best_model.sav']