In [2]:
import os
import pickle
import datetime as dt
import itertools

import pandas as pd
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import RMSprop
from keras.callbacks import CSVLogger, EarlyStopping
from keras.optimizers import Adam

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['axes.labelcolor'] = 'k'
plt.rcParams['axes.edgecolor'] = 'k'
plt.rcParams['ytick.color'] = 'k'
plt.rcParams['xtick.color'] = 'k'
plt.rcParams['grid.color'] = (.7, .7, .7, 0)
plt.rcParams['figure.figsize'] = (16, 10)

print('numpy ver.: ' + np.__version__)
print('pandas ver.: ' + pd.__version__)
print('tensorflow ver.: ' + tf.__version__) 
print('keras ver.: ' + keras.__version__)


numpy ver.: 1.25.1
pandas ver.: 1.5.3
tensorflow ver.: 2.12.0
keras ver.: 2.12.0


In [3]:
# Load the pickled dataframe.
# NOTE: pickle.load can execute arbitrary code -- only use trusted files.
with open('../data/VMs.pickle', 'rb') as vms_file:
    VMs_df = pickle.load(vms_file)

In [6]:
# Display the column index of the loaded dataframe.
VMs_df.columns

Index(['TripId', 'OPERATOR_ID', 'CLUSTER_ID', 'License_Plate',
       'LINE_SHORT_NAME', 'OriginAimedDepartureTime', 'LINE_DESC', 'RouteId',
       'Direction', 'Alternative', 'stopOrder', 'actualArrivalTime_x',
       'actualDepartureTime_x', 'Linkref_x', 'linkTime', 'time_first_stop(s)',
       'Link_travel_time(s)', 'Trip_End', 'timestamp1', 'timestamp2', 'date',
       'Arrival_time', 'Departure_time', 'K-1_Travel_Time', 'K-2_Travel_Time',
       'K-3_Travel_Time', 'Headway_Time', 'K-1_Headway_Time',
       'K-2_Headway_Time', 'K-3_Headway_Time', 'Time_Period',
       'average_free_flow_time', 'Average_Bus_Dwelling_Time', 'Bus_Delay',
       'Delay_Ratio', 'Delay_Level', 'LinkrefID', 'direction', 'StopSequence',
       'DayInWeek_friday', 'DayInWeek_monday', 'DayInWeek_saturday',
       'DayInWeek_sunday', 'DayInWeek_thursday', 'DayInWeek_tuesday',
       'DayInWeek_wednesday', 'preD1', 'preD2', 'timeCategory_Unknown',
       'timeCategory_d1', 'timeCategory_d2', 'timeCategory_d3',

Preprocess the VMs dataframe by adding 'Day' and 'time' columns, sorting the data, and filtering out rows with TripId equal to 0

In [7]:
# Parse the departure timestamp once and derive both helper columns from it
# (the column was previously parsed twice).
origin_departure = pd.to_datetime(VMs_df['OriginAimedDepartureTime'])
VMs_df['Day'] = origin_departure.dt.date
VMs_df['time'] = origin_departure.dt.strftime('%H:%M:%S')

# Order rows by trip / line / day / departure / stop, then drop rows whose
# TripId is 0.
VMs_df = VMs_df.sort_values(by=['TripId', 'ID', 'Day', 'OriginAimedDepartureTime', 'time', 'stopOrder'])
VMs_df = VMs_df[VMs_df['TripId'] != 0]

# Sanity checks: later cells assume there are no nulls and no zero Linkref.
has_nulls = VMs_df.isnull().any().any()
has_zeros = (VMs_df['Linkref'] == 0).any()
assert not has_nulls, 'VMs_df contains null values'
assert not has_zeros, 'VMs_df contains rows with Linkref == 0'

Group VMs dataframe by 'ID' and create DataHolder objects for each line

In [4]:
class DataHolder:
    """Plain container bundling everything kept for one line.

    The constructor stores its arguments unchanged. Note that the ``X`` and
    ``Y`` arguments are exposed as the lower-case attributes ``x`` and ``y``.

    Attributes:
        line_name: identifier of the line.
        links: links of the line.
        routs: routes of the line.
        links_defaults: default values per link (may be None until filled).
        x: model inputs (may be None until filled).
        y: model targets (may be None until filled).
        models: trained models / weights (may be None until filled).
    """

    def __init__(self, line_name, links, routs, links_defaults, X, Y, models):
        # Identity and topology of the line.
        self.line_name, self.links, self.routs = line_name, links, routs
        self.links_defaults = links_defaults
        # Training data and the resulting models.
        self.x, self.y = X, Y
        self.models = models

# One dataframe and one DataHolder per line; the two lists stay index-aligned.
lines_dfs = []
lines_info = []

for _, line_df in VMs_df.groupby('ID'):
    line_ids = line_df['ID'].unique()
    # Each group must belong to exactly one line.
    assert len(line_ids) == 1

    lines_dfs.append(line_df)
    lines_info.append(
        DataHolder(
            line_ids[0],
            line_df['Linkref'].unique(),   # links of this line
            line_df['time'].unique(),      # departure times ("routs") of this line
            None, None, None, None,        # filled in by later cells
        )
    )

Prepare a tensor with default values per line

In [5]:
# Maximum number of links a trip may be missing and still be used.
# Defined unconditionally: the link-time cell that follows also reads it,
# even when this cell's result comes from the cache.
VALID_MISS = 5
DEFAULT_TENSORS_PATH = '../data/default_tensors.pickle'

# Try the on-disk cache first. Only a missing / unreadable / corrupt cache
# triggers a rebuild; other errors (e.g. KeyboardInterrupt) are not swallowed.
# NOTE: pickle.load can execute arbitrary code -- only use trusted files.
try:
    with open(DEFAULT_TENSORS_PATH, 'rb') as file:
        default_tensors = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError):
    default_tensors = None

if default_tensors is None:
    default_tensors = []
    for line_df, info in zip(lines_dfs, lines_info):
        # One default value per link of the line, shape (n_links, 1).
        default_tensor = np.zeros((len(info.links), 1))
        min_rows = len(info.links) - VALID_MISS

        for _, day_df in line_df.groupby('Day'):
            for _, group_df in day_df.groupby('time'):
                # Skip trips that are missing too many links (checked once
                # per trip rather than once per row).
                if len(group_df) <= min_rows:
                    continue

                for _, row in group_df.iterrows():
                    # row['time'] is always in info.routs here, because both
                    # come from the same line dataframe, so only the link
                    # lookup is needed.
                    link_index = np.where(info.links == row['Linkref'])[0]
                    if link_index.size == 0:
                        continue

                    # FIXME: (old + new) / 2 is not an arithmetic mean. It
                    # halves the running value on every update, starting
                    # from 0.
                    if row['linkTime'] > 0:
                        default_tensor[link_index, 0] = (default_tensor[link_index, 0] + row['linkTime']) / 2

        default_tensors.append(default_tensor)

    with open(DEFAULT_TENSORS_PATH, 'wb') as file:
        pickle.dump(default_tensors, file)

# A stale cache would silently attach defaults to the wrong lines.
assert len(default_tensors) == len(lines_info), 'default_tensors cache does not match lines_info'
for info, default_tensor in zip(lines_info, default_tensors):
    info.links_defaults = default_tensor

save link time info for every line

In [8]:
LINK_TIME_INFO_PATH = '../data/link_time_info.pickle'

# Try the on-disk cache first. Only a missing / unreadable / corrupt cache
# triggers a rebuild; other errors (e.g. KeyboardInterrupt) are not swallowed.
# NOTE: pickle.load can execute arbitrary code -- only use trusted files.
try:
    with open(LINK_TIME_INFO_PATH, 'rb') as file:
        link_time_info = pickle.load(file)
except (OSError, EOFError, pickle.UnpicklingError):
    link_time_info = None

if link_time_info is None:
    link_time_info = []

    for line_df, info in zip(lines_dfs, lines_info):
        line_link_time = []
        n_links = len(info.links)

        for _, day_df in line_df.groupby('Day'):
            for _, group_df in day_df.groupby('time'):
                # One slot per link; 0 means "no observation yet".
                all_links_time = [0] * len(info.links_defaults)

                # Only trips with enough rows contribute observed times.
                # VALID_MISS is assigned in the default-tensor cell above.
                if len(group_df) > n_links - VALID_MISS:
                    for _, row in group_df.iterrows():
                        link_index = np.where(info.links == row['Linkref'])[0]
                        if link_index.size and row['linkTime'] > 0:
                            all_links_time[link_index[0]] = row['linkTime']

                # Fill every unobserved link with the line's default value.
                for link in range(n_links):
                    if all_links_time[link] == 0:
                        all_links_time[link] = info.links_defaults[link, 0]

                line_link_time.append(all_links_time)

        link_time_info.append(line_link_time)

    with open(LINK_TIME_INFO_PATH, 'wb') as file:
        pickle.dump(link_time_info, file)

Prepare input (X) and output (Y) data for each DataHolder object (one per line)

In [126]:
for line_idx, info in enumerate(lines_info):
    # Empty route records are ignored.
    routes = [rout for rout in link_time_info[line_idx] if rout]

    X = []
    Y = []
    if routes:
        # Link 0 has no preceding links, so the first sample set is for link 1:
        # X[k] holds the first k+1 link times, Y[k] the time of link k+1.
        for j in range(1, len(info.links)):
            X.append([rout[:j] for rout in routes])
            Y.append([rout[j] for rout in routes])

    info.x = X
    info.y = Y

Define a function to build a neural network model

In [124]:
def build_model(num_of_stops):
    """Build and compile a fully connected network with one output.

    Args:
        num_of_stops: number of input features, used as the input shape
            ``(num_of_stops,)`` of the first layer.

    Returns:
        A compiled ``Sequential`` model: a 64-unit input layer (no activation
        argument passed), ReLU hidden layers of 128-256-512-256-128-64 units,
        and a single ReLU output unit. Compiled with Adam and the "MAE" loss,
        without extra metrics.
    """
    hidden_widths = (128, 256, 512, 256, 128, 64)

    model = Sequential()
    model.add(Dense(units=64, input_shape=(num_of_stops,)))
    for width in hidden_widths:
        model.add(Dense(width, activation='relu'))
    model.add(Dense(1, activation='relu'))

    model.compile(optimizer=Adam(), loss="MAE", metrics=[])
    return model

Train neural network models for each DataHolder object and save the models' weights

In [None]:
for i, info in enumerate(lines_info):
    tmp_models = []
    print(i, info.line_name)

    # info.x[j] holds inputs of length j+1 and info.y[j] the matching target.
    # The last link index is skipped: we can't predict a link after the last one.
    for j in range(len(info.links) - 1):
        # Drop state left by the previous model. Without this, building many
        # models in a loop keeps growing Keras' global state and memory use.
        keras.backend.clear_session()
        model = build_model(j + 1)

        # Pass explicit 2-D / 1-D float arrays instead of nested Python lists,
        # which Keras could otherwise misread as a list of separate inputs.
        features = np.asarray(info.x[j], dtype='float32')
        targets = np.asarray(info.y[j], dtype='float32')

        # NOTE(review): the held-out 20% (data_test / targets_test) is not
        # used in this cell.
        data_train, data_test, targets_train, targets_test = train_test_split(
            features, targets, test_size=0.2, random_state=42)

        history = model.fit(data_train, targets_train,
                            epochs=100,
                            shuffle=True,
                            verbose=0)

        # Only the weights are kept; the architecture can be rebuilt with
        # build_model(j + 1).
        tmp_models.append(model.get_weights())

    info.models = tmp_models
    # Checkpoint after every line so an interrupted run keeps finished lines.
    with open('../data/lines_info.pickle', 'wb') as file:
        pickle.dump(lines_info, file)