In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# the dataset used contains all delays under 30 mins. 
df = pd.read_csv("../Data/ttc_subway_delay_2018_2019_for_machine_learning.csv", encoding='unicode_escape')
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.head()
# one hot encoding for stations - 1 or 0. will create many columns
# min_delay
# time series model

Unnamed: 0,id,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_info,latitude,longitude,line_name,month,time_range,month_number,hour,year
0,7720,2018-02-13,6:57,Tuesday,BROADVIEW STATION,EUNT,2,4,W,BD,5285,Equipment - No Trouble Found,43.4037,-79.213,Bloor Danforth,February,5-9AM,2,6,2018
1,3147,2019-06-12,11:54,Wednesday,COXWELL STATION,TUNIP,2,5,W,BD,5350,Operator Not In Position,43.4103,-79.1923,Bloor Danforth,June,9AM-12PM,6,11,2019
2,11036,2018-07-31,17:05,Tuesday,COXWELL STATION,TUNOA,2,4,E,BD,0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7,17,2018
3,11037,2018-07-31,17:33,Tuesday,COXWELL STATION,TUNOA,2,4,E,BD,0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7,17,2018
4,11038,2018-07-31,17:40,Tuesday,COXWELL STATION,TUNOA,2,4,W,BD,0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7,17,2018


# Select your features (columns)

In [3]:
# Drop the null rows, and everything that is not a float except station
df = df.dropna().drop(['id','date','time','day','code','bound','line','code_info','line_name','month','time_range',"latitude","longitude"], axis=1)
df.head()

Unnamed: 0,station,min_delay,min_gap,vehicle,month_number,hour,year
0,BROADVIEW STATION,2,4,5285,2,6,2018
1,COXWELL STATION,2,5,5350,6,11,2019
2,COXWELL STATION,2,4,0,7,17,2018
3,COXWELL STATION,2,4,0,7,17,2018
4,COXWELL STATION,2,4,0,7,17,2018


# Create a Train Test Split



In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
X = df.drop("station", axis=1)
y = df["min_delay"]
print(X.shape, y.shape)

(13315, 6) (13315,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [6]:
X_train.head()

Unnamed: 0,min_delay,min_gap,vehicle,month_number,hour,year
1331,3,6,5621,6,18,2018
2779,3,5,5220,1,8,2019
5875,4,8,5205,5,22,2019
9439,5,9,5089,12,13,2019
8117,5,10,5243,2,5,2018


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Encode Data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [9]:
# categorize
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# create model
model = Sequential()
model.add(Dense(units=10, activation='relu')) #how many units should we have?
model.add(Dense(units=10, activation='relu'))
model.add(Dense(units=29, activation='softmax'))

In [21]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=30, #did I over train?
    shuffle=True,
    verbose=2
)

Train on 8921 samples
Epoch 1/30
8921/8921 - 1s - loss: 2.5189 - accuracy: 0.2586
Epoch 2/30
8921/8921 - 0s - loss: 1.9853 - accuracy: 0.3504
Epoch 3/30
8921/8921 - 0s - loss: 1.8242 - accuracy: 0.3530
Epoch 4/30
8921/8921 - 0s - loss: 1.6430 - accuracy: 0.3714
Epoch 5/30
8921/8921 - 0s - loss: 1.4436 - accuracy: 0.4298
Epoch 6/30
8921/8921 - 0s - loss: 1.2957 - accuracy: 0.5068
Epoch 7/30
8921/8921 - 0s - loss: 1.1818 - accuracy: 0.5823
Epoch 8/30
8921/8921 - 0s - loss: 1.0882 - accuracy: 0.6577
Epoch 9/30
8921/8921 - 0s - loss: 1.0063 - accuracy: 0.7146
Epoch 10/30
8921/8921 - 0s - loss: 0.9312 - accuracy: 0.7608
Epoch 11/30
8921/8921 - 0s - loss: 0.8616 - accuracy: 0.7912
Epoch 12/30
8921/8921 - 0s - loss: 0.7964 - accuracy: 0.8147


In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  70        
_________________________________________________________________
dense_4 (Dense)              multiple                  110       
_________________________________________________________________
dense_5 (Dense)              multiple                  319       
Total params: 499
Trainable params: 499
Non-trainable params: 0
_________________________________________________________________


In [None]:
# prediction result



# Train the Model



In [152]:
# Create the SVC Model
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [153]:
# fit the model
model2.fit(X_train_scaled, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [154]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.7372491873108395
Testing Data Score: 0.7289485662266727


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [155]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.0005, 0.001]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [156]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.719, total=   2.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.724, total=   2.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.732, total=   2.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.729, total=   2.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.724, total=   2.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.719, total=   2.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.724, total=   2.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.732, total=   2.0s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.729, total=   2.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  1.0min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.0005, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [157]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.9641292032508071


# Save the Model

In [158]:
# using results above to save the best model
best_model=SVC(kernel='linear', C=10, gamma=0.0001)

import joblib

filename = 'best_model.sav'
joblib.dump(best_model, filename)

['best_model.sav']