In [285]:
import pandas as pd
import numpy as np

# Load the cleaned one-week dataset and keep only the columns used for modelling
# (the stop identifier, the target duration, and the day/time features).
train_df = pd.read_csv("../data-retriever/data/cleaned_1_week_transformed.csv")
train_df = train_df.drop(columns=['current_time', 'hour', 'minute', 'round'])
# reset_index() returns a new frame, so its result has to be assigned to take
# effect; drop=True keeps the old index from being inserted as an extra column
# that would otherwise end up among the model inputs.
train_df = train_df.reset_index(drop=True)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204330 entries, 0 to 204329
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   bus_stop_code  204330 non-null  int64
 1   duration       204330 non-null  int64
 2   day            204330 non-null  int64
 3   time           204330 non-null  int64
dtypes: int64(4)
memory usage: 6.2 MB


In [286]:
# Preview the first rows of the trimmed frame as a quick sanity check
train_df.head()

Unnamed: 0,bus_stop_code,duration,day,time
0,4653946529,80,4,1
1,4652946491,91,4,1
2,4653946529,80,4,2
3,4652946491,91,4,2
4,4652946491,91,4,2


In [287]:
# Model inputs: every column except the target
train_X = train_df.drop('duration', axis=1)
print('Training Input Features:')
print(train_X.columns.tolist())

# Regression target, kept as a single-column frame so it stays 2-D
train_Y = train_df.loc[:, ['duration']]
print('Training Output Features:')
print(train_Y.columns.tolist())

Training Input Features:
['bus_stop_code', 'day', 'time']
Training Output Features:
['duration']


In [288]:
# Encode the categorical bus stop codes as consecutive integer labels
# ('day' is already stored as an integer code, so it needs no encoding).
# The column is read from train_df rather than train_X because train_X is
# rebound to a NumPy array in a later cell; indexing it by column name would
# fail if this cell were re-run afterwards.
# NOTE(review): integer labels impose an arbitrary ordering on the stops when
# fed to the network as a numeric input — confirm this is intended.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_bus_stops = encoder.fit_transform(train_df['bus_stop_code'])

encoded_bus_stops.shape

(204330,)

In [289]:
# Pull the numeric feature (time) and the target (duration) out as float32 arrays.
# Both are taken straight from train_df so the cell is idempotent: train_X and
# train_Y are rebound to NumPy arrays further down, and calling `.values` or
# indexing by column name on those would raise if this cell were run again.
time_X = train_df['time'].values.astype('float32')

# Double brackets keep the target 2-D, i.e. shape (n_samples, 1)
train_Y = train_df[['duration']].values.astype('float32')

time_X.shape

(204330,)

In [290]:
# Robust-scale the numeric input (time) and the target (duration)
from sklearn.preprocessing import RobustScaler

# Separate scaler objects are kept for the input and for the output
input_scalar = RobustScaler()
output_scalar = RobustScaler()

# NOTE(review): both scalers are fitted on every row, before the train/test
# split performed in a later cell.
# The scaler needs 2-D input, hence the reshape of time_X to a single column.
time_X = input_scalar.fit_transform(time_X.reshape(-1, 1))
train_Y = output_scalar.fit_transform(train_Y)

In [291]:
# After scaling, time_X is a 2-D single-column array
time_X.shape

(204330, 1)

In [292]:
# Assemble the feature matrix with one column each for the encoded stop,
# the day, and the scaled time.
# 'day' is read from train_df instead of train_X: this statement rebinds
# train_X to a NumPy array, so indexing it by column name would fail if the
# cell were executed a second time.
# ravel() flattens the (n, 1) scaled-time column to 1-D; unlike np.squeeze it
# still yields a 1-D array when there is only a single row.
train_X = np.stack(
    (encoded_bus_stops, train_df['day'].values, time_X.ravel()),
    axis=1,
)

train_X.shape

(204330, 3)

In [293]:
# The scaled target stays 2-D (one column), matching the row count of train_X
train_Y.shape

(204330, 1)

In [294]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the shuffled rows; the fixed random_state keeps the split repeatable.
# NOTE(review): train_X / train_Y are rebound to the training subset here, so
# re-running only this cell splits the already-split data again.
train_X, test_X, train_Y, test_Y = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    shuffle=True,
    random_state=88,
)

# Number of feature columns, used to size the network's input
n_cols = train_X.shape[1]

In [295]:
# Show the number of input features the network will receive
n_cols

3

In [309]:
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense

# Network size: number of additional hidden layers and the width of every hidden layer
num_layers = 10
num_neurons = 20


def create_model(num_layers, num_neurons, n_cols):
    """Build an uncompiled fully connected regression network.

    The network consists of a first ReLU Dense layer that declares the input
    shape, followed by ``num_layers`` further ReLU Dense layers of the same
    width, and a final single-unit Dense layer with no activation.

    Args:
        num_layers: Number of Dense layers added after the first one.
        num_neurons: Number of units in every ReLU layer.
        n_cols: Number of input features per sample.

    Returns:
        The assembled ``Sequential`` model (not yet compiled).
    """
    network = Sequential()

    # First layer: also tells the model how many features each sample has
    network.add(Dense(num_neurons, activation='relu', input_shape=(n_cols,)))

    # Stack of identical hidden layers
    for _ in range(num_layers):
        network.add(Dense(num_neurons, activation='relu'))

    # Single linear unit for the regression output
    network.add(Dense(1))

    return network

In [310]:
# Build the network, then compile it with the Adam optimizer,
# mean squared error (L2) as the loss, and mean absolute error as a tracked metric
model = create_model(num_layers, num_neurons, n_cols)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 20)                80        
_________________________________________________________________
dense_52 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_53 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_54 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_55 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_56 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_57 (Dense)             (None, 20)              

In [311]:
# Train for 50 epochs in mini-batches of 32, reporting metrics on the
# held-out split after every epoch
model.fit(
    train_X,
    train_Y,
    epochs=50,
    batch_size=32,
    verbose=1,
    validation_data=(test_X, test_Y),
)

Train on 163464 samples, validate on 40866 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f9b966120d0>

In [None]:
# Score the fitted model on the training split (targets are in scaled units)
model.evaluate(train_X, train_Y, verbose=0)

In [None]:
# Score the fitted model on the held-out split (targets are in scaled units)
model.evaluate(test_X, test_Y, verbose=0)