In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# the dataset used contains all delays under 30 mins. 
df = pd.read_csv("../Data/ttc_subway_delay_2018_2019_for_machine_learning.csv", encoding='unicode_escape')

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.head()

Unnamed: 0,id,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_info,latitude,longitude,line_name,month,time_range,month_number,hour,year
0,7720,2018-02-13,6:57,Tuesday,BROADVIEW STATION,EUNT,2,4,W,BD,5285,Equipment - No Trouble Found,43.4037,-79.213,Bloor Danforth,February,5-9AM,2,6,2018
1,3147,2019-06-12,11:54,Wednesday,COXWELL STATION,TUNIP,2,5,W,BD,5350,Operator Not In Position,43.4103,-79.1923,Bloor Danforth,June,9AM-12PM,6,11,2019
2,11036,2018-07-31,17:05,Tuesday,COXWELL STATION,TUNOA,2,4,E,BD,0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7,17,2018
3,11037,2018-07-31,17:33,Tuesday,COXWELL STATION,TUNOA,2,4,E,BD,0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7,17,2018
4,11038,2018-07-31,17:40,Tuesday,COXWELL STATION,TUNOA,2,4,W,BD,0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7,17,2018


# Select your features (columns)

In [5]:
# Keep select columns
df = df.drop(['id','date','time','code','min_gap','vehicle','code_info','line_name','month','time_range',"latitude","longitude","year"], axis=1)
# df = df.drop(['id','date','time','code','min_gap','vehicle','code_info','line_name','month_number','hour',"latitude","longitude","year"], axis=1)
df.head()

Unnamed: 0,day,station,min_delay,bound,line,month_number,hour
0,Tuesday,BROADVIEW STATION,2,W,BD,2,6
1,Wednesday,COXWELL STATION,2,W,BD,6,11
2,Tuesday,COXWELL STATION,2,E,BD,7,17
3,Tuesday,COXWELL STATION,2,E,BD,7,17
4,Tuesday,COXWELL STATION,2,W,BD,7,17


In [6]:
# Label encoding for categorical variables
df["day_number"] = df["day"].astype("category").cat.codes
df["station_number"] = df["station"].astype("category").cat.codes
df["bound_number"] = df["bound"].astype("category").cat.codes
df["line_number"] = df["line"].astype("category").cat.codes
df.head()

Unnamed: 0,day,station,min_delay,bound,line,month_number,hour,day_number,station_number,bound_number,line_number
0,Tuesday,BROADVIEW STATION,2,W,BD,2,6,5,8,4,0
1,Wednesday,COXWELL STATION,2,W,BD,6,11,6,14,4,0
2,Tuesday,COXWELL STATION,2,E,BD,7,17,5,14,1,0
3,Tuesday,COXWELL STATION,2,E,BD,7,17,5,14,1,0
4,Tuesday,COXWELL STATION,2,W,BD,7,17,5,14,4,0


In [7]:
# Only keep numerical columns
df = df.drop(["day","station","bound","line"], axis=1)
df.head()

Unnamed: 0,min_delay,month_number,hour,day_number,station_number,bound_number,line_number
0,2,2,6,5,8,4,0
1,2,6,11,6,14,4,0
2,2,7,17,5,14,1,0
3,2,7,17,5,14,1,0
4,2,7,17,5,14,4,0


# Select X and y

In [8]:
X = df.drop("min_delay", axis=1)
# X = pd.get_dummies(df.loc[:,df.columns!='min_delay'])
y = df["min_delay"]

In [9]:
X.head()

Unnamed: 0,month_number,hour,day_number,station_number,bound_number,line_number
0,2,6,5,8,4,0
1,6,11,6,14,4,0
2,7,17,5,14,1,0
3,7,17,5,14,1,0
4,7,17,5,14,4,0


In [10]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: min_delay, dtype: int64

# Create a Train Test Split



In [11]:
from sklearn.model_selection import train_test_split
print(X.shape, y.shape)

(13563, 6) (13563,)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42)

In [13]:
X_train.head()

Unnamed: 0,month_number,hour,day_number,station_number,bound_number,line_number
1877,10,7,5,41,4,0
513,1,13,0,17,3,3
6657,2,20,3,84,4,0
5030,10,21,1,14,4,0
8695,10,12,4,71,2,3


In [14]:
y_train.head()

1877    3
513     3
6657    4
5030    4
8695    5
Name: min_delay, dtype: int64

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [15]:
# Scale data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [16]:
# Create a linear regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [17]:
# Fitting our model with all of our features in X
model.fit(X_train_scaled, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Print the model score
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.003053321838729972
Testing Data Score: -0.00037575814674206676


# Make Predictions

In [19]:
# Make predictions with the model
predictions = model.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
6325,6.000533,4
2435,5.662834,3
8040,5.454432,5
12361,5.617672,10
7937,5.983989,5
...,...,...
12663,5.884751,12
9166,5.599402,5
1699,5.659743,3
13479,5.430037,24


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [20]:
# Create the GridSearchCV model
# from sklearn.model_selection import GridSearchCV
# param_grid = {'C': [1, 5, 10],
#               'gamma': [0.0001, 0.0005, 0.001]}
# grid = GridSearchCV(model2, param_grid, verbose=3)

In [21]:
# Train the model with GridSearch
# grid.fit(X_train_scaled, y_train)

In [22]:
# print(grid.best_params_)
# print(grid.best_score_)

# Save the Model

In [23]:
# using results above to save the best model
# best_model=SVC(kernel='linear', C=10, gamma=0.0001)

# import joblib

# filename = 'best_model.sav'
# joblib.dump(best_model, filename)