In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# the dataset used contains all delays under 30 mins. 
df = pd.read_csv("../Data/ttc_subway_delay_2018_2019_for_machine_learning.csv", encoding='unicode_escape')
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.head()
# one hot encoding for stations - 1 or 0. will create many columns
# min_delay
# time series model

Unnamed: 0,id,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_info,latitude,longitude,line_name,month,time_range,month_number,hour,year
0,1,2019-01-01,3:03,Tuesday,DUPONT STATION,MUATC,11,16,N,YU,6061,ATC Project,43.674584,-79.40683,Yonge University Spadina,January,9PM-1:30AM,1,3,2019
1,2,2019-01-01,3:08,Tuesday,EGLINTON WEST STATION,EUATC,11,16,S,YU,5656,ATC RC&S Equipment,43.699209,-79.435819,Yonge University Spadina,January,9PM-1:30AM,1,3,2019
2,3,2019-01-01,3:09,Tuesday,DUPONT STATION,EUATC,6,11,N,YU,5381,ATC RC&S Equipment,43.674584,-79.40683,Yonge University Spadina,January,9PM-1:30AM,1,3,2019
3,4,2019-01-01,3:26,Tuesday,ST CLAIR WEST STATION,EUATC,4,9,N,YU,5571,ATC RC&S Equipment,43.683888,-79.415113,Yonge University Spadina,January,9PM-1:30AM,1,3,2019
4,5,2019-01-01,8:04,Tuesday,DAVISVILLE STATION,MUNOA,5,10,S,YU,0,No Operator Immediately Available - Not E.S.A....,43.697778,-79.397222,Yonge University Spadina,January,5-9AM,1,8,2019


# Select your features (columns)

In [3]:
# Drop the null rows, and everything that is not a float except station
df = df.dropna().drop(['id','date','time','day','code','bound','line','code_info','line_name','month','time_range',"latitude","longitude"], axis=1)
df.head()

Unnamed: 0,station,min_delay,min_gap,vehicle,month_number,hour,year
0,DUPONT STATION,11,16,6061,1,3,2019
1,EGLINTON WEST STATION,11,16,5656,1,3,2019
2,DUPONT STATION,6,11,5381,1,3,2019
3,ST CLAIR WEST STATION,4,9,5571,1,3,2019
4,DAVISVILLE STATION,5,10,0,1,8,2019


# Create a Train Test Split



In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
X = df.drop("station", axis = 1)
y = df["min_delay"]
print(X.shape, y.shape)

(13320, 6) (13320,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [6]:
X_train.head()

Unnamed: 0,min_delay,min_gap,vehicle,month_number,hour,year
2907,2,4,5871,6,16,2019
6201,5,8,5426,12,19,2019
5817,7,9,5916,11,14,2019
9511,3,5,6056,6,8,2018
9433,4,8,5135,6,21,2018


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Encode Data
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [9]:
# categorize
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# create model
model = Sequential()
model.add(Dense(units=10, activation='relu')) #how many units should we have?
model.add(Dense(units=10, activation='relu'))
model.add(Dense(units=29, activation='softmax'))

In [11]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [12]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=30, #did I over train?
    shuffle=True,
    verbose=2
)

Train on 8924 samples
Epoch 1/30
8924/8924 - 1s - loss: 2.6701 - accuracy: 0.1927
Epoch 2/30
8924/8924 - 0s - loss: 1.9938 - accuracy: 0.3421
Epoch 3/30
8924/8924 - 0s - loss: 1.8408 - accuracy: 0.3421
Epoch 4/30
8924/8924 - 0s - loss: 1.7019 - accuracy: 0.3422
Epoch 5/30
8924/8924 - 0s - loss: 1.5350 - accuracy: 0.3659
Epoch 6/30
8924/8924 - 0s - loss: 1.3617 - accuracy: 0.4748
Epoch 7/30
8924/8924 - 0s - loss: 1.2156 - accuracy: 0.5511
Epoch 8/30
8924/8924 - 0s - loss: 1.0939 - accuracy: 0.6348
Epoch 9/30
8924/8924 - 0s - loss: 0.9847 - accuracy: 0.7015
Epoch 10/30
8924/8924 - 0s - loss: 0.8834 - accuracy: 0.7698
Epoch 11/30
8924/8924 - 0s - loss: 0.7908 - accuracy: 0.8097
Epoch 12/30
8924/8924 - 0s - loss: 0.7104 - accuracy: 0.8464
Epoch 13/30
8924/8924 - 0s - loss: 0.6385 - accuracy: 0.8637
Epoch 14/30
8924/8924 - 0s - loss: 0.5750 - accuracy: 0.8910
Epoch 15/30
8924/8924 - 0s - loss: 0.5203 - accuracy: 0.8956
Epoch 16/30
8924/8924 - 0s - loss: 0.4740 - accuracy: 0.9062
Epoch 17/30

<tensorflow.python.keras.callbacks.History at 0x24603ed1320>

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  70        
_________________________________________________________________
dense_1 (Dense)              multiple                  110       
_________________________________________________________________
dense_2 (Dense)              multiple                  319       
Total params: 499
Trainable params: 499
Non-trainable params: 0
_________________________________________________________________


In [14]:
# prediction result



# Train the Model



In [15]:
# Create the SVC Model
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [16]:
# fit the model
model2.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [17]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.7167189601075751
Testing Data Score: 0.7238398544131028


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [18]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.0005, 0.001]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [19]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.703, total=   0.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.706, total=   0.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.713, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.703, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.706, total=   0.9s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.713, total=   0.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.703, total=   0.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.706, total=   0.9s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.713, total=   0.9s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   15.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.0005, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [20]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.9541685342895563


# Save the Model

In [22]:
# using results above to save the best model
best_model=SVC(kernel='linear', C=10, gamma=0.0001)

import joblib

filename = 'best_model_nad.sav'
joblib.dump(best_model, filename)

['best_model_nad.sav']