# TTC Delay Forecasting

In [None]:
from config import data_path

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from dateutil.parser import parse

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split


##import libraries

: 

In [None]:
# Helper functions for parsing and standardizing time and date strings

def standardize_time(time_str):
    """Append a ``:00`` seconds field to an ``h:m`` style time string.

    A string containing exactly one ``:`` separator is treated as
    hours and minutes only and gets ``:00`` appended; every other string is
    returned unchanged.
    """
    only_hours_and_minutes = time_str.count(':') == 1
    return time_str + ':00' if only_hours_and_minutes else time_str

def standardize_time_format(time_str):
    """Convert a time-like value to ``HH:MM:SS`` text.

    The value is stringified, handed to ``parse`` and only the time-of-day
    component of the result is kept. Any failure is reported with ``print``
    and ``None`` is returned instead of raising (best-effort conversion).
    """
    try:
        # Parse, keep only the clock time, and render it zero-padded.
        return parse(str(time_str)).time().strftime('%H:%M:%S')
    except Exception as e:
        print(f"Could not parse '{time_str}': {e}")
    return None
    
def standardize_date_format(date_str):
    """Convert a date-like value to ``YYYY-MM-DD`` text.

    The value is stringified and handed to ``parse``. Any failure is
    reported with ``print`` and ``None`` is returned instead of raising
    (best-effort conversion).
    """
    try:
        # Parse and render as an ISO-style calendar date.
        return parse(str(date_str)).strftime('%Y-%m-%d')
    except Exception as e:
        print(f"Could not parse '{date_str}': {e}")
    return None


: 

In [None]:

def loadRawData(type="bus",start_year = 2014, end_year = 2015,targets = None, features = None,file_path = 'a'):
    """
    Load every monthly sheet of the delay workbooks for one transit type.

    Workbooks are looked up in ``<data_path>/<type>`` and must be named
    ``ttc-<type>-delay-data-<year>.xlsx``. Every sheet of every workbook
    whose year lies in ``[start_year, end_year]`` is read and stacked into a
    single frame with a fresh integer index.

    Parameters:
    - type: sub-folder / file-name token, e.g. bus, subway, streetcar.
    - start_year: first year to include (inclusive).
    - end_year: last year to include (inclusive).
    - targets, features, file_path: accepted for backward compatibility
      only; this function does not read them.

    Returns:
    - DataFrame with all matching sheets concatenated. The columns
      'Report Date' and 'Delay' are renamed to 'Date' and 'Min Delay' so
      that differently formatted workbooks line up. An empty DataFrame is
      returned when no workbook matches.

    Raises:
    - ValueError: if ``<data_path>/<type>`` is not a directory.
    """
    subfolder_path = os.path.join(data_path, type)

    if not os.path.isdir(subfolder_path):
        raise ValueError(f"Subfolder '{type}' does not exist in {data_path}.") #making sure path is correct

    prefix = f"ttc-{type}-delay-data"
    # Some workbooks use different header names for the same columns.
    column_aliases = {'Report Date': 'Date', 'Delay': 'Min Delay'}
    frames = []

    # os.listdir order is arbitrary; sorting keeps the row order reproducible.
    for filename in sorted(os.listdir(subfolder_path)):
        if not (filename.endswith(".xlsx") and filename.startswith(prefix)):
            continue
        try:
            year = int(filename.split("-")[-1].split(".")[0])
        except ValueError:
            # Matching prefix but no numeric year suffix: not a data workbook.
            continue
        if not start_year <= year <= end_year:
            continue

        workbook_path = os.path.join(subfolder_path, filename)
        # Open each workbook once and make sure the handle is released.
        with pd.ExcelFile(workbook_path) as workbook:
            for sheet_name in workbook.sheet_names:
                sheet = pd.read_excel(workbook, sheet_name=sheet_name)
                frames.append(sheet.rename(columns=column_aliases))

    if not frames:
        return pd.DataFrame()
    # Single concat at the end avoids re-copying the accumulated data per sheet.
    return pd.concat(frames, ignore_index=True)

def process_data(df,targets,features,start="-01-01"):
    '''
    Select, clean, encode and scale the raw delay data.

    Steps, in order:
    1. keep only the ``targets + features`` columns and drop rows where all
       of them are null;
    2. merge the 'Date' and 'Time' columns into a single 'Datetime' column;
    3. reduce 'Direction' to its lower-cased first letter (n, s, e, w, b)
       or 'unknown';
    4. one-hot encode every remaining object-typed column (first level
       dropped);
    5. index by 'Datetime' in chronological order and add year / month /
       day / hour / minute columns;
    6. standardise 'Min Delay' and drop every row that still contains a
       null.

    Parameters:
    - df: raw frame; must contain 'Date', 'Time', 'Direction' and
      'Min Delay' among the selected columns.
    - targets: list of target column names.
    - features: list of feature column names.
    - start: unused; kept for backward compatibility.

    Returns:
    - (processed DataFrame, fitted StandardScaler for 'Min Delay').

    NOTE(review): the scaler is fitted on the whole frame, i.e. before any
    train/test split performed by the caller - confirm this is intended.
    '''
    # Explicit copy so the column assignments below never write into a view
    # of the caller's frame.
    df = df.sort_index()[targets + features].copy()
    df = df.dropna(axis=0, how='all') #drops where all are null

    # Normalise the clock time to HH:MM:SS text; the round trip through
    # to_datetime turns the None returned for unparseable values into NaN.
    time_text = df['Time'].apply(standardize_time_format)
    time_text = pd.to_datetime(time_text, format='%H:%M:%S').dt.strftime('%H:%M:%S')
    # Coerce so that a 'Date' column read as text is handled as well.
    date_text = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['Datetime'] = pd.to_datetime(date_text + ' ' + time_text) #combining into one column

    #dropping the unnecessary columns:
    df = df.drop(columns=['Time', 'Date'])

    # preprocessing the direction to make consistent 4 + 1 directions
    valid_directions = ['n','s','e','w','b'] #should only have n,e,s,w, b - both ways
    initials = df['Direction'].str[0].str.lower()
    df['Direction'] = initials.where(initials.isin(valid_directions), 'unknown')

    #one hot encoding
    categorical_features = df.select_dtypes(include=['object']).columns # only categorical features selected
    if len(categorical_features) > 0:
        for column in categorical_features:
            df[column] = df[column].astype(str)

        encoder = OneHotEncoder(sparse_output=False, drop='first')
        encoded_features = encoder.fit_transform(df[categorical_features])

        # Reuse df's index: after dropna it can have gaps, and a fresh
        # RangeIndex would misalign the encoded rows in the concat below.
        encoded_df = pd.DataFrame(
            encoded_features,
            columns=encoder.get_feature_names_out(categorical_features),
            index=df.index,
        )
        df = pd.concat([df.drop(columns=categorical_features), encoded_df], axis=1)

    # Chronological order; a stable sort keeps the original order of rows
    # that share a timestamp.
    df = df.set_index('Datetime').sort_index(kind='mergesort')

    # Extract year, month, day, hour, and minute from the Datetime index
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['day'] = df.index.day
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute

    scaler = StandardScaler() #standard because we expect standard deviation

    df['Min Delay'] = scaler.fit_transform(df[['Min Delay']]).ravel()
    # Removes rows with a missing delay, a missing timestamp (the derived
    # date parts are null) or any other missing value.
    df = df.dropna(axis=0, how='any')

    return df,scaler

In [None]:
from data_load import *

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Libraries used for building the model, splitting the data and computing error metrics


def getXandY(type="bus",start_year=2014,end_year=2015,n_steps=10,n_outputs=1,test_size=0.2,random_state=42,shuffle=True):
    """
    Load, preprocess and window the delay data, then split it for training.

    Parameters:
    - type: transit type passed to ``loadRawData`` (bus, subway, streetcar).
    - start_year, end_year: inclusive year range passed to ``loadRawData``.
    - n_steps: number of rows in each input window.
    - n_outputs: number of target values predicted per window.
    - test_size: fraction of the windows held out for testing.
    - random_state: seed used by ``train_test_split``.
    - shuffle: whether windows are shuffled before splitting. The default
      (True) keeps the historical behaviour; pass False for a chronological
      split so that overlapping windows do not end up on both sides.

    Returns:
    - X_train, X_test, y_train, y_test, scaler, where ``scaler`` is the
      scaler returned by ``process_data``.
    """
    raw = loadRawData(type,start_year,end_year)

    targets = ["Min Delay"]
    features = ["Date", "Time","Direction"]

    df,scaler = process_data(raw,targets=targets,features=features)

    print(df.head())

    X, y = create_sliding_windows(df,n_steps,n_outputs,target_column=targets[0])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)

    # Sanity check: the training arrays should contain no NaN / inf values.
    print(np.isnan(X_train).any(), np.isinf(X_train).any())
    print(np.isnan(y_train).any(), np.isinf(y_train).any())

    return X_train, X_test, y_train, y_test, scaler

def create_sliding_windows(df, n_steps, n_outputs, target_column):
    """
    Converts a DataFrame into overlapping sliding windows.

    Window ``i`` uses rows ``i .. i + n_steps - 1`` (all columns, the target
    column included) as input and the target values of the following
    ``n_outputs`` rows as output.

    Parameters:
    - df: Input DataFrame with features and target variable.
    - n_steps: Number of time steps in the input sequence.
    - n_outputs: Number of time steps in the output sequence.
    - target_column: Name or positional index of the target column.

    Returns:
    - X: Numpy array of shape (num_samples, n_steps, num_columns)
    - y: Numpy array of shape (num_samples, n_outputs)

    When ``df`` has fewer than ``n_steps + n_outputs`` rows, both arrays are
    empty but keep the shapes above with ``num_samples == 0``.
    """
    if isinstance(target_column, str):
        target_index = df.columns.get_loc(target_column)  # Get column index
    else:
        target_index = target_column

    data = df.to_numpy()
    num_samples = len(data) - n_steps - n_outputs + 1
    if num_samples <= 0:
        # Keep the rank of the outputs so callers can still read X.shape[2].
        return (
            np.empty((0, n_steps, data.shape[1]), dtype=data.dtype),
            np.empty((0, n_outputs), dtype=data.dtype),
        )

    # Row i of each index matrix lists the source rows of window i.
    window_starts = np.arange(num_samples)[:, None]
    input_rows = window_starts + np.arange(n_steps)
    output_rows = window_starts + n_steps + np.arange(n_outputs)

    X = data[input_rows]                  # every column, target included
    y = data[output_rows, target_index]   # target column only
    return X, y

In [None]:
# LSTM model
## Creating models

# Data parameters
type = 'bus'
start_year = 2014 #min
end_year = 2023 #max

# Training hyper-parameters
adam_lr = 0.00001
num_neurons = 128
batch_size = 128
num_epochs = 10

X_train, X_test, y_train, y_test, scaler = getXandY(type,start_year,end_year)
# X_train is (num samples, time steps, num features). A Keras input shape
# excludes the sample axis, so it is (time steps, num features); using
# shape[0] here would fix the sequence length to the number of samples and
# make the model incompatible with the training windows.
input_shape = (X_train.shape[1],X_train.shape[2])

lstm_model = models.Sequential()
lstm_model.add(layers.LSTM(num_neurons,
                           activation='relu',
                           input_shape=input_shape))
lstm_model.add(layers.Dense(1))  # for the final output layer since its only 1 output

optimizer = Adam(learning_rate=adam_lr)

lstm_model.compile(optimizer=optimizer, metrics = ['mae'], loss='mse')  #use mae and mse since regression

# Print the layer / parameter overview of the compiled model
lstm_model.summary()

#fitting model
history = lstm_model.fit(X_train,y_train,
                    epochs = num_epochs,
                    batch_size = batch_size,
                    validation_split=0.2, #0.2 of the training set to be used for validation
                    verbose=1)


In [None]:
#Testing model:
def test_model(history,model,X_train,X_test,y_train,y_test):
    """
    Report train / test error of a fitted model and plot its loss curves.

    Parameters:
    - history: object returned by ``model.fit``; its ``history`` dict must
      contain 'loss' and 'val_loss'.
    - model: fitted model exposing ``predict`` and ``evaluate``.
    - X_train, X_test: input windows.
    - y_train, y_test: matching targets.

    Prints the target / prediction shapes, the MSE and MAE on both splits,
    and shows a plot of training vs. validation loss per epoch.

    NOTE(review): indexes the result of ``evaluate`` as [loss, metric],
    i.e. assumes the model was compiled with MSE loss and MAE as its only
    metric - confirm against the compile call.
    """
    # getting predictions of model (only used for the shape report below):
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print(y_test.shape,y_test_pred.shape)
    print(y_train.shape,y_train_pred.shape)

    # evaluate() takes (inputs, true targets) and runs the model itself;
    # feeding it targets and predictions would present the targets as inputs.
    test_loss = model.evaluate(X_test, y_test)
    train_loss = model.evaluate(X_train, y_train)

    print(f"MSE training: {train_loss[0]}")
    print(f"MSE testing: {test_loss[0]}")

    print(f'MAE train: {train_loss[1]}')
    print(f'MAE test: {test_loss[1]}')

    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Report the fitted LSTM's error on the train and test splits and plot its loss curves.
test_model(history,lstm_model,X_train, X_test, y_train, y_test)
