In [1]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [2]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# the dataset used contains all delays under 30 mins. 
df = pd.read_csv("../Data/ttc_subway_delay_2018_2019_for_machine_learning.csv", encoding='unicode_escape')
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.head()
# one hot encoding for stations - 1 or 0. will create many columns
# min_delay
# time series model

Unnamed: 0,id,date,time,day,station,code,min_delay,min_gap,bound,line,vehicle,code_info,latitude,longitude,line_name,month,time_range,month_number,hour,year
0,7720.0,2018-02-13,6:57,Tuesday,BROADVIEW STATION,EUNT,2.0,4.0,W,BD,5285.0,Equipment - No Trouble Found,43.4037,-79.213,Bloor Danforth,February,5-9AM,2.0,6.0,2018.0
1,3147.0,2019-06-12,11:54,Wednesday,COXWELL STATION,TUNIP,2.0,5.0,W,BD,5350.0,Operator Not In Position,43.4103,-79.1923,Bloor Danforth,June,9AM-12PM,6.0,11.0,2019.0
2,11036.0,2018-07-31,17:05,Tuesday,COXWELL STATION,TUNOA,2.0,4.0,E,BD,0.0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7.0,17.0,2018.0
3,11037.0,2018-07-31,17:33,Tuesday,COXWELL STATION,TUNOA,2.0,4.0,E,BD,0.0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7.0,17.0,2018.0
4,11038.0,2018-07-31,17:40,Tuesday,COXWELL STATION,TUNOA,2.0,4.0,W,BD,0.0,No Operator Immediately Available,43.4103,-79.1923,Bloor Danforth,July,3-6PM,7.0,17.0,2018.0


# Select your features (columns)

In [4]:
# Drop the null rows, and everything that is not a float except station
df = df.dropna().drop(['id','date','time','day','code','bound','line','code_info','line_name','month','time_range',"latitude","longitude"], axis=1)
df.head()

Unnamed: 0,station,min_delay,min_gap,vehicle,month_number,hour,year
0,BROADVIEW STATION,2.0,4.0,5285.0,2.0,6.0,2018.0
1,COXWELL STATION,2.0,5.0,5350.0,6.0,11.0,2019.0
2,COXWELL STATION,2.0,4.0,0.0,7.0,17.0,2018.0
3,COXWELL STATION,2.0,4.0,0.0,7.0,17.0,2018.0
4,COXWELL STATION,2.0,4.0,0.0,7.0,17.0,2018.0


# Create a Train Test Split



In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [6]:
X = df.drop("station", axis=1)
y = df["min_delay"].values.reshape(-1, 1)
print(X.shape, y.shape)

(13315, 6) (13315, 1)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=42)

In [8]:
X_train.head()

Unnamed: 0,min_delay,min_gap,vehicle,month_number,hour,year
12243,10.0,15.0,5856.0,11.0,0.0,2019.0
2421,3.0,8.0,3026.0,5.0,10.0,2019.0
11114,7.0,13.0,5102.0,11.0,1.0,2019.0
4884,4.0,6.0,5047.0,10.0,19.0,2019.0
8624,5.0,8.0,5596.0,3.0,15.0,2018.0


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [9]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_scaler = MinMaxScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [10]:
print(X_train_scaled.shape, y_train_scaled.shape)

(9986, 6) (9986, 1)


In [11]:
y_train_scaled

array([[0.28571429],
       [0.03571429],
       [0.17857143],
       ...,
       [0.07142857],
       [0.03571429],
       [0.10714286]])

In [12]:
print(X_test_scaled.shape, y_test_scaled.shape)

(3329, 6) (3329, 1)


In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# create model
model = Sequential()
model.add(Dense(units=12, activation='relu',kernel_initializer='normal')) #how many units should we have?
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=8, activation='linear'))

In [14]:
model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])

In [19]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [20]:
model.fit(
    X_train_scaled,
    y_train_scaled,
    epochs=40, #did I over train?
    shuffle=True,
    verbose=2
)

Train on 9986 samples
Epoch 1/40
9986/9986 - 2s - loss: 5.4213 - accuracy: 0.0000e+00
Epoch 2/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 3/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 4/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 5/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 6/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 7/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 8/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 9/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 10/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 11/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 12/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 13/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 14/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+00
Epoch 15/40
9986/9986 - 1s - loss: 10.2099 - accuracy: 0.0000e+0

<tensorflow.python.keras.callbacks.History at 0x2b37cac1eb8>

In [21]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_scaled, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

3329/3329 - 0s - loss: 10.2402 - accuracy: 0.0000e+00
Normal Neural Network - Loss: 10.240244282441441, Accuracy: 0.0


In [66]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             multiple                  35        
_________________________________________________________________
dense_31 (Dense)             multiple                  60        
_________________________________________________________________
dense_32 (Dense)             multiple                  165       
Total params: 260
Trainable params: 260
Non-trainable params: 0
_________________________________________________________________


In [37]:
# prediction result


AttributeError: 'Sequential' object has no attribute 'score'

# Train the Model



In [None]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

# Save the Model

In [158]:
# using results above to save the best model
best_model=SVC(kernel='linear', C=10, gamma=0.0001)

import joblib

filename = 'best_model.sav'
joblib.dump(best_model, filename)

['best_model.sav']

In [None]:
Xnew = np.array([[40, 0, 26, 9000, 8000]])

In [None]:
Xnew= x_scaler.transform(Xnew)
ynew= model2.predict(Xnew)
#invert normalize
ynew = y_scaler.inverse_transform(ynew) 
Xnew = x_scaler.inverse_transform(Xnew)
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))