In [1]:
from sys import version
import tensorflow
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from keras.callbacks import EarlyStopping
from os import makedirs
makedirs("final_nn", exist_ok=True)

print(version)

Using TensorFlow backend.
3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [42]:
# Load and prepare training and xval data
TRAINING_FILE, XVAL_FILE, TEST_FILE = "../combined_data/15min/train.tsv.gz", "../combined_data/15min/xval.tsv.gz", "../combined_data/15min/test.tsv.gz"
train, xval, test = pd.read_csv(TRAINING_FILE, sep='\t'), pd.read_csv(XVAL_FILE, sep='\t'), pd.read_csv(TEST_FILE, sep='\t')
print(f'Training dimension: {train.shape}')
print(f'Xval dimension: {xval.shape}')
print(f'Test dimension: {test.shape}')
#print('\n'.join(train.columns))
#train.head(n=2).T

Training dimension: (542954, 20)
Xval dimension: (67876, 20)
Test dimension: (67699, 20)


In [43]:
X_NUM_COLS = [
    'orca_total', 
    'frac_disabled', 
    'frac_youth', 
    'frac_senior', 
    'frac_li', 
    'frac_uw'
]
X_CAT_COLS = [
    'is_ns', 
    'is_rapid', 
    'is_weekend', 
    'trip_start_hr_15', 
    'rte', 
    'dir', 
    'day_of_week', 
    'region', 
    'start', 
    'end', 
    'summer'
]

#label_encoders = {col: LabelEncoder() for col in X_CAT_COLS}
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()

print("Creating X_train")
X_train = np.concatenate((
    scaler.fit_transform(train[X_NUM_COLS]),
    one_hot_encoder.fit_transform(train[X_CAT_COLS]).todense()
), axis=1)

print("Creating X_xval")
X_xval = np.concatenate((
    scaler.transform(xval[X_NUM_COLS]),
    one_hot_encoder.transform(xval[X_CAT_COLS]).todense()
), axis=1)

print("Creating X_test")
X_test = np.concatenate((
    scaler.transform(test[X_NUM_COLS]),
    one_hot_encoder.transform(test[X_CAT_COLS]).todense()
), axis=1)

y_train = train['ons']
y_xval = xval['ons']
y_test = test['ons']

np.save("final_nn/preprocessed_15m_X_train.npy", X_train)
np.save("final_nn/preprocessed_15m_X_xval.npy", X_xval)
np.save("final_nn/preprocessed_15m_X_test.npy", X_test)
np.save("final_nn/y_15m_train.npy", y_train)
np.save("final_nn/y_15m_xval.npy", y_xval)
np.save("final_nn/y_15m_test.npy", y_test)

Creating X_train
Creating X_xval
Creating X_test


In [27]:
one_hot_encoder.categories_

column_labels = list()
for lab in X_NUM_COLS:
    column_labels.append(f'num: {lab}')

for i, cat in enumerate(X_CAT_COLS):
    for cat_val in one_hot_encoder.categories_[i]:
        column_labels.append(f'{cat}: {cat_val}')

assert len(column_labels) == X_train.shape[1], f"Len of column labels {len(column_labels)} matches dimension of training set {X_train.shape}"

import pickle
with open('final_nn/15m_one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder, f)

with open('final_nn/15m_standard_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('final_nn/15m_column_labels.pkl', 'wb') as f:
    pickle.dump(column_labels, f)


In [46]:
model = Sequential()
model.add(Dense(600, activation="sigmoid", input_dim=426)) # 700: 6.59 / 6.62
model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.2, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_absolute_error', optimizer=sgd)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_xval, y_xval), epochs=25, batch_size=256, callbacks=[es])

Train on 542954 samples, validate on 67876 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Restoring model weights from the end of the best epoch
Epoch 00010: early stopping


<keras.callbacks.History at 0x1a476bbd30>

In [47]:
print(f'Test perf: {model.evaluate(X_test, y_test, batch_size=128)}')

Test perf: 6.5460960049349515


In [7]:
model_json = model.to_json()
with open("final_nn/model_15min.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("final_nn/model_15min_weights_train.h5")

In [10]:
# Train a version on the training and xval sets combined, using the test set as validation.

X_train = np.load("final_nn/preprocessed_X_train.npy")
X_xval = np.load("final_nn/preprocessed_X_xval.npy")
X_test = np.load("final_nn/preprocessed_X_test.npy")
y_train = np.load("final_nn/y_train.npy")
y_xval = np.load("final_nn/y_xval.npy")
y_test = np.load("final_nn/y_test.npy")

X_bigtrain = np.concatenate([X_train,X_xval])
print(X_bigtrain.shape)
y_bigtrain = np.concatenate([y_train,y_xval])

model = Sequential()
model.add(Dense(800, activation="sigmoid", input_dim=426))
model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.2, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_absolute_error', optimizer=sgd)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)
model.fit(X_bigtrain, y_bigtrain, validation_data=(X_test, y_test), epochs=25, batch_size=256, callbacks=[es])

(610830, 426)
Train on 610830 samples, validate on 67699 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Restoring model weights from the end of the best epoch
Epoch 00006: early stopping


<keras.callbacks.History at 0x1a48c9bc88>

In [11]:
model.save_weights("final_nn/model_15min_weights_train_and_xval.h5")

In [17]:
# Create test set predictions out of the better performing model
from keras.models import model_from_json

with open("final_nn/model_15min.json", "r") as json_file:
    model = model_from_json(json_file.read())

model.load_weights("final_nn/model_15min_weights_train.h5")

with open("../predictions/final_nn_15min_test.txt", 'wt') as f:
    for val in model.predict(X_test).squeeze():
        f.write(f'{val:.15f}\n')

with open("../predictions/final_nn_15min_xval.txt", 'wt') as f:
    for val in model.predict(X_xval).squeeze():
        f.write(f'{val:.15f}\n')

In [18]:
# Fit another model on 30m data.
# Load and prepare training and xval data
TRAINING_FILE, XVAL_FILE, TEST_FILE = "../combined_data/30min/train.tsv.gz", "../combined_data/30min/xval.tsv.gz", "../combined_data/30min/test.tsv.gz"
train, xval, test = pd.read_csv(TRAINING_FILE, sep='\t'), pd.read_csv(XVAL_FILE, sep='\t'), pd.read_csv(TEST_FILE, sep='\t')
print(f'Training dimension: {train.shape}')
print(f'Xval dimension: {xval.shape}')
print(f'Test dimension: {test.shape}')
X_NUM_COLS = [
    'orca_total', 
    'frac_disabled', 
    'frac_youth', 
    'frac_senior', 
    'frac_li', 
    'frac_uw'
]
X_CAT_COLS = [
    'is_ns', 
    'is_rapid', 
    'is_weekend', 
    'trip_start_hr_30',
    'rte', 
    'dir', 
    'day_of_week', 
    'region', 
    'start', 
    'end', 
    'summer'
]

#label_encoders = {col: LabelEncoder() for col in X_CAT_COLS}
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()

print("Creating X_train")
X_train = np.concatenate((
    scaler.fit_transform(train[X_NUM_COLS]),
    one_hot_encoder.fit_transform(train[X_CAT_COLS]).todense()
), axis=1)

print("Creating X_xval")
X_xval = np.concatenate((
    scaler.transform(xval[X_NUM_COLS]),
    one_hot_encoder.transform(xval[X_CAT_COLS]).todense()
), axis=1)

print("Creating X_test")
X_test = np.concatenate((
    scaler.transform(test[X_NUM_COLS]),
    one_hot_encoder.transform(test[X_CAT_COLS]).todense()
), axis=1)

y_train = train['ons']
y_xval = xval['ons']
y_test = test['ons']

np.save("final_nn/preprocessed_30m_X_train.npy", X_train)
np.save("final_nn/preprocessed_30m_X_xval.npy", X_xval)
np.save("final_nn/preprocessed_30m_X_test.npy", X_test)
np.save("final_nn/y_30m_train.npy", y_train)
np.save("final_nn/y_30m_xval.npy", y_xval)
np.save("final_nn/y_30m_test.npy", y_test)

Training dimension: (424373, 20)
Xval dimension: (53071, 20)
Test dimension: (53169, 20)
Creating X_train
Creating X_xval
Creating X_test


In [21]:
column_labels = list()
for lab in X_NUM_COLS:
    column_labels.append(f'num: {lab}')

for i, cat in enumerate(X_CAT_COLS):
    for cat_val in one_hot_encoder.categories_[i]:
        column_labels.append(f'{cat}: {cat_val}')

assert len(column_labels) == X_train.shape[1], f"Len of column labels {len(column_labels)} matches dimension of training set {X_train.shape}"

import pickle
with open('final_nn/30m_one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder, f)

with open('final_nn/30m_standard_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('final_nn/30m_column_labels.pkl', 'wb') as f:
    pickle.dump(column_labels, f)


In [20]:
model = Sequential()
model.add(Dense(500, activation="sigmoid", input_dim=378))
model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.2, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_absolute_error', optimizer=sgd)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_xval, y_xval), epochs=25, batch_size=256, callbacks=[es])

Train on 424373 samples, validate on 53071 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Restoring model weights from the end of the best epoch
Epoch 00011: early stopping


<keras.callbacks.History at 0x1a464ab160>

In [22]:
model_json = model.to_json()
with open("final_nn/model_30min.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("final_nn/model_30min_weights_train.h5")

In [25]:
X_bigtrain = np.concatenate([X_train,X_xval])
print(X_bigtrain.shape)
y_bigtrain = np.concatenate([y_train,y_xval])

model = Sequential()
model.add(Dense(600, activation="sigmoid", input_dim=378))
model.add(Dropout(0.1))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.4, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_absolute_error', optimizer=sgd)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=25, batch_size=256, callbacks=[es])

(477444, 378)
Train on 424373 samples, validate on 53169 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Restoring model weights from the end of the best epoch
Epoch 00003: early stopping


<keras.callbacks.History at 0x1a4751c438>

In [26]:
# Create test set predictions out of the better performing model
from keras.models import model_from_json

with open("final_nn/model_30min.json", "r") as json_file:
    model = model_from_json(json_file.read())

model.load_weights("final_nn/model_30min_weights_train.h5")

with open("../predictions/final_nn_30min_test.txt", 'wt') as f:
    for val in model.predict(X_test).squeeze():
        f.write(f'{val:.15f}\n')

with open("../predictions/final_nn_30min_xval.txt", 'wt') as f:
    for val in model.predict(X_xval).squeeze():
        f.write(f'{val:.15f}\n')

In [28]:
# And for 1hr

TRAINING_FILE, XVAL_FILE, TEST_FILE = "../combined_data/hr/train.tsv.gz", "../combined_data/hr/xval.tsv.gz", "../combined_data/hr/test.tsv.gz"
train, xval, test = pd.read_csv(TRAINING_FILE, sep='\t'), pd.read_csv(XVAL_FILE, sep='\t'), pd.read_csv(TEST_FILE, sep='\t')
print(f'Training dimension: {train.shape}')
print(f'Xval dimension: {xval.shape}')
print(f'Test dimension: {test.shape}')
X_NUM_COLS = [
    'orca_total', 
    'frac_disabled', 
    'frac_youth', 
    'frac_senior', 
    'frac_li', 
    'frac_uw'
]
X_CAT_COLS = [
    'is_ns', 
    'is_rapid', 
    'is_weekend', 
    'trip_start_hr',
    'rte', 
    'dir', 
    'day_of_week', 
    'region', 
    'start', 
    'end', 
    'summer'
]

#label_encoders = {col: LabelEncoder() for col in X_CAT_COLS}
one_hot_encoder = OneHotEncoder()
scaler = StandardScaler()

print("Creating X_train")
X_train = np.concatenate((
    scaler.fit_transform(train[X_NUM_COLS]),
    one_hot_encoder.fit_transform(train[X_CAT_COLS]).todense()
), axis=1)

print("Creating X_xval")
X_xval = np.concatenate((
    scaler.transform(xval[X_NUM_COLS]),
    one_hot_encoder.transform(xval[X_CAT_COLS]).todense()
), axis=1)

print("Creating X_test")
X_test = np.concatenate((
    scaler.transform(test[X_NUM_COLS]),
    one_hot_encoder.transform(test[X_CAT_COLS]).todense()
), axis=1)

y_train = train['ons']
y_xval = xval['ons']
y_test = test['ons']

np.save("final_nn/preprocessed_hr_X_train.npy", X_train)
np.save("final_nn/preprocessed_hr_X_xval.npy", X_xval)
np.save("final_nn/preprocessed_hr_X_test.npy", X_test)
np.save("final_nn/y_hr_train.npy", y_train)
np.save("final_nn/y_hr_xval.npy", y_xval)
np.save("final_nn/y_hr_test.npy", y_test)

Training dimension: (272646, 20)
Xval dimension: (34382, 20)
Test dimension: (34258, 20)
Creating X_train
Creating X_xval
Creating X_test


In [29]:
column_labels = list()
for lab in X_NUM_COLS:
    column_labels.append(f'num: {lab}')

for i, cat in enumerate(X_CAT_COLS):
    for cat_val in one_hot_encoder.categories_[i]:
        column_labels.append(f'{cat}: {cat_val}')

assert len(column_labels) == X_train.shape[1], f"Len of column labels {len(column_labels)} matches dimension of training set {X_train.shape}"

import pickle
with open('final_nn/hr_one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder, f)

with open('final_nn/hr_standard_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('final_nn/hr_column_labels.pkl', 'wb') as f:
    pickle.dump(column_labels, f)


In [35]:
model = Sequential()
model.add(Dense(450, activation="sigmoid", input_dim=354))
model.add(Dropout(0.15))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.15, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_absolute_error', optimizer=sgd)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_xval, y_xval), epochs=25, batch_size=256, callbacks=[es])

Train on 272646 samples, validate on 34382 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Restoring model weights from the end of the best epoch
Epoch 00005: early stopping


<keras.callbacks.History at 0x1a47191f98>

In [36]:
model.evaluate(X_test, y_test)



11.88932574933197

In [38]:
model = Sequential()
model.add(Dense(500, activation="sigmoid", input_dim=354))
model.add(Dropout(0.15))
model.add(Dense(1, activation='linear'))
sgd = SGD(lr=0.15, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mean_absolute_error', optimizer=sgd)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_xval, y_xval), epochs=25, batch_size=256, callbacks=[es])

Train on 272646 samples, validate on 34382 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Restoring model weights from the end of the best epoch
Epoch 00008: early stopping


<keras.callbacks.History at 0x13db63c18>

In [39]:
model.evaluate(X_test, y_test)



11.195544252101104

In [40]:
model_json = model.to_json()
with open("final_nn/model_hr.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("final_nn/model_hr_weights_train.h5")

In [41]:
# Create test set predictions out of the better performing model
from keras.models import model_from_json

with open("final_nn/model_hr.json", "r") as json_file:
    model = model_from_json(json_file.read())

model.load_weights("final_nn/model_hr_weights_train.h5")

with open("../predictions/final_nn_hr_test.txt", 'wt') as f:
    for val in model.predict(X_test).squeeze():
        f.write(f'{val:.15f}\n')

with open("../predictions/final_nn_hr_xval.txt", 'wt') as f:
    for val in model.predict(X_xval).squeeze():
        f.write(f'{val:.15f}\n')