# TTC Delay Forecasting

In [1]:
from config import data_path

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from dateutil.parser import parse

import re #for parsing
import time

##import libraries
# from data_load import *

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, GRU, Dense, Concatenate, Input, Flatten
import numpy as np


from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split

2024-12-03 01:47:14.607637: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-03 01:47:14.625709: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-03 01:47:14.631709: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-03 01:47:14.644741: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Defining parsing function
def standardize_time(time_str):
    # Add seconds if missing
    if len(time_str.split(':')) == 2:  # Format is h:m
        time_str += ':00'
    return time_str

def standardize_time_format(time_str):
    try:
        # Parse the time string
        parsed_time = parse(str(time_str)).time()  # Extract only the time
        # Format to HH:MM:SS
        return parsed_time.strftime('%H:%M:%S')
    except Exception as e:
        print(f"Could not parse '{time_str}': {e}")
        return None
    
def standardize_date_format(date_str):
    try:
        # Parse the date string
        parsed_date = parse(str(date_str)) 
        # Format to YYYY-MM-DD
        return parsed_date.strftime('%Y-%m-%d')
    except Exception as e:
        print(f"Could not parse '{date_str}': {e}")
        return None
    
def parse_string(s):
    try:
        s = s.lower()                   # convert to lowercase
        s = s.replace("station", "stn") # replace "station" with "stn"
        s = s.replace("&", "and")       # replace "&" with "and"
        s = s.replace("centre","center") #other spellings
        s = re.sub(r'\bu\b', 'university', s)
        s = s.replace('stn.','stn')
        s = s.replace("st. clair", "st clair")
        s = s.replace("sta", "station")

        return s
    except Exception as e:
        print(f"Could not parse '{s}': {e}")
        return None



#Defining data loading
def loadRawData(vehicle_type="bus",start_year = 2014, end_year = 2015, data_path = data_path):
    """
    vehicle_type = bus, subway, streetcar
    start_year = start of year range
    end_year = end of year range
    targets = targets of dataset
    features = features of dataset
    loads data, based off given parameters
    """
    # print('subfolder path', data_path)
    subfolder_path = os.path.join(data_path, vehicle_type)

    print('subfolder path', subfolder_path)
    if not os.path.isdir(subfolder_path):
            print("error")
            raise ValueError(f"Subfolder '{vehicle_type}' does not exist in {data_path}.") #making sure path is correct

    all_data = pd.DataFrame()

    for filename in os.listdir(subfolder_path):
        
        print("On filename:",filename)

        if (
            filename.endswith(".xlsx") and
            filename.startswith(f"ttc-{vehicle_type}-delay-data") and
            start_year <= int(filename.split("-")[-1].split(".")[0]) <= end_year
        ):
                file_path = os.path.join(subfolder_path, filename)
                sheet_names = pd.ExcelFile(file_path).sheet_names
                for month in sheet_names:
                    data = pd.read_excel(file_path,sheet_name=month)
                    # print(data.columns)
                    # accounting for inconsistent data formatting
                    if 'Report Date' in data.columns:
                        pass  # Column is already named "Date"
                        
                    elif 'Date' in data.columns:
                        # print("Renamed date to report date")
                        data.rename(columns={'Date': 'Report Date'}, inplace=True)

                    if 'Delay' in data.columns:
                        # print("Renamed Delay to Min Delay")
                        data.rename(columns={'Delay': 'Min Delay'}, inplace=True)
                    elif 'Min Delay' in data.columns:
                        pass  # Column is already named Min Delay
                    
                    all_data = pd.concat([all_data, data], ignore_index=True)    
    return all_data

In [3]:
#Defining sliding window function

def create_sliding_windows(df, n_steps, n_outputs, target_column):
    """
    Converts a DataFrame into overlapping sliding windows.

    Parameters:
    - df: Input DataFrame with features and target variable.
    - n_steps: Number of time steps in the input sequence.
    - n_outputs: Number of time steps in the output sequence.
    - target_column: Name or index of the target column.

    Returns:
    - X: Numpy array of shape (num_samples, n_steps, num_features)
    - y: Numpy array of shape (num_samples, n_outputs)
    """
    X, y = [], []
    if isinstance(target_column, str):
        target_index = df.columns.get_loc(target_column)  # Get column index
    else:
        target_index = target_column

    data = df.to_numpy()  # Convert to NumPy for efficiency
    for i in range(len(data) - n_steps - n_outputs + 1):
        # Include all columns except the target in X
        X.append(data[i:i + n_steps, :])
        # Use only the target column for y
        y.append(data[i + n_steps:i + n_steps + n_outputs, target_index])
    X = np.array(X)
    y = np.array(y)

    # Exclude the target column from X (optional if the target is among features)
    # X = np.delete(X, target_index, axis=-1)
    return X, y

In [4]:
''' #code to investigate into the direction column / cardinality
description = df['Direction'].describe()
print("Summary Statistics:\n", description)
unique_values_count = df['Direction'].nunique()
print(len(df['Direction']))
print(f"Number of unique values in 'Location': {unique_values_count}")

import matplotlib.pyplot as plt


# Plot the top 10 most common labels
top_labels = df['Direction'].value_counts().head(50)
top_labels_list = top_labels.index.tolist()
print(top_labels_list)
# for label in top_labels_list:
#      print(label)

top_labels.plot(kind='bar', figsize=(10, 5))

# Set title and labels
plt.title('Direction vs Frequency')
plt.xlabel('Direction')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()
'''
''' #code to investigate the Location column / cardinality:
description = df['Location'].describe()
print("Summary Statistics:\n", description)
unique_values_count = df['Location'].nunique()
print(len(df['Location']))
print(f"Number of unique values in 'Location': {unique_values_count}")

import matplotlib.pyplot as plt


# Plot the top 10 most common labels
top_labels = df['Location'].value_counts().head(1000)
top_labels_list = top_labels.index.tolist()
print(top_labels_list)
# for label in top_labels_list:
#      print(label)

top_labels.plot(kind='bar', figsize=(100, 20))

# Set title and labels
plt.title('Top 500 Most Frequent Labels')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()
'''

' #code to investigate the Location column / cardinality:\ndescription = df[\'Location\'].describe()\nprint("Summary Statistics:\n", description)\nunique_values_count = df[\'Location\'].nunique()\nprint(len(df[\'Location\']))\nprint(f"Number of unique values in \'Location\': {unique_values_count}")\n\nimport matplotlib.pyplot as plt\n\n\n# Plot the top 10 most common labels\ntop_labels = df[\'Location\'].value_counts().head(1000)\ntop_labels_list = top_labels.index.tolist()\nprint(top_labels_list)\n# for label in top_labels_list:\n#      print(label)\n\ntop_labels.plot(kind=\'bar\', figsize=(100, 20))\n\n# Set title and labels\nplt.title(\'Top 500 Most Frequent Labels\')\nplt.xlabel(\'Label\')\nplt.ylabel(\'Frequency\')\nplt.grid(axis=\'y\', linestyle=\'--\', alpha=0.7)\n\n# Show the plot\nplt.show()\n'

In [6]:
#getting data
vehicle_type = 'bus'
start_year = 2014 #min
end_year = 2024 #max

#number of outputs: (how many points ahead to forecast)s

targets = ["Min Delay"]
features = ["Report Date", "Time","Direction", "Location", "Route"]

print("Loading data")
df_raw = loadRawData(vehicle_type=vehicle_type,start_year=start_year,end_year=end_year)

print("Finished loading data")

Loading data
subfolder path /media/jadenh/SSD2/TTC_DATA/bus
On filename: .~lock.ttc-bus-delay-data-2022.xlsx#
On filename: ttc-bus-delay-data-2014.xlsx
On filename: ttc-bus-delay-data-2015.xlsx
On filename: ttc-bus-delay-data-2016.xlsx
On filename: ttc-bus-delay-data-2017.xlsx
On filename: ttc-bus-delay-data-2018.xlsx
On filename: ttc-bus-delay-data-2019.xlsx
On filename: ttc-bus-delay-data-2020.xlsx
On filename: ttc-bus-delay-data-2021.xlsx
On filename: ttc-bus-delay-data-2022.xlsx
On filename: ttc-bus-delay-data-2023.xlsx
On filename: ttc-bus-delay-data-2024.xlsx
Finished loading data


In [None]:
## Defining parameters:
df = df_raw.copy()

n_outputs = 8 #firecastubforecasting 1 hours of delays(4 |delays / hour * 2 hours)
n_steps = 50 # how many points to use # assuming about 4 delays per hour * 24 hours / day ~ 100 points of data
location_embedding_dim = 25 #< -should adjust
route_embedding_dim = 25 #< -should adjust

## Processing Data

#first, sort index
df = df.sort_index() 

print("Using features:\n",features,"\nTargets:",targets)

df = df[targets+features] #only using necessary data
df.dropna(axis=0, how='any', inplace=True) #drops empty rows where any are null  
print("target + features + dropped empty:\n",df.head())


# applying parsing functions to the time and date (due to different formats)
df['Time'] = df['Time'].apply(standardize_time_format)
df['Report Date'] = df['Report Date'].apply(standardize_date_format)
# df['Location_encode'] = df['Location'].apply(parse_string) #may not need if we use embedding ?

df['Datetime'] = pd.to_datetime((df['Report Date'] + ' ' + df['Time']),format='%Y-%m-%d %H:%M:%S') #combining into one column
df.drop(columns = ['Time','Report Date'], inplace = True)

# preprocessing the DIRECTION to make consistent 4 + 1 directions 
valid_directions = ['n','s','e','w','b'] #should only have n,e,s,w, b - both ways
df['Direction'] = df['Direction'].str[0].str.lower()
df['Direction'] = df['Direction'].apply(lambda x: x if x in valid_directions else 'unknown')

df['Route'] = df['Route'].astype(str)
## one hot encoding 
# the only one hot encoded feature will be: 'Direction', as determined by the data exploration

# df['Direction'] = df['Direction'].astype(str) #formatting necessary????

one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = one_hot_encoder.fit_transform(df[['Direction']])

one_hot_encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(['Direction']))

df = pd.concat([df, one_hot_encoded_df], axis=1)
df.drop(df[['Direction']], axis=1, inplace=True) #removing unecessary columns


# defining encoders for embedding
location_encoder = LabelEncoder()
route_encoder = LabelEncoder()

## Using label encodings and embeddings: #
df['Location_encoded'] = location_encoder.fit_transform(df['Location'])
df['Route_encoded'] = route_encoder.fit_transform(df['Route'])

df = df.drop(['Location','Route'])

location_vocab_size = len(location_encoder.classes_)  # Number of unique locations
route_vocab_size = len(route_encoder.classes_)  # Number of unique routes


df.dropna(axis=0, how='any', inplace=True) #drops empty rows where any are null  
df.set_index('Datetime',inplace=True)
# Extract year, month, day, hour, and minute from the Datetime index
df['year'] = df.index.year.astype(int)
df['month'] = df.index.month.astype(int)
df['day'] = df.index.day.astype(int)
df['hour'] = df.index.hour.astype(int)
df['minute'] = df.index.minute.astype(int)

# statistics = pd.DataFrame({
#     'max': df[['year', 'month', 'day', 'hour', 'minute']].max(),
#     'min': df[['year', 'month', 'day', 'hour', 'minute']].min(),
#     'std': df[['year', 'month', 'day', 'hour', 'minute']].std()
# })

# print(statistics)
    
scaler = RobustScaler() #because we want model to be robust to outliars of which there are a couple
# scaler = StandardScaler() #standard because we expect standard deviation
# scaler = MinMaxScaler() #min max because ...

df[['Min Delay']] = scaler.fit_transform(df[['Min Delay']])
df.dropna(axis=0, how='any', inplace=True)

# df = df.sort_index() - shouldn't need to sort again

print(df.head())

#split dataframe into embedded encoded features, and regular features (for sliding window)

embed_features = ['Location_encoded', 'Route_encoded']
df_embed = df[embed_features]
df = df.drop(embed_features,axis=1)


X, y = create_sliding_windows(df,n_steps,n_outputs,target_column=targets[0])

#splitting data
test_size = 0.2
split_index = int(len(df) * (1 - test_size))

X_train, X_test = X[:split_index], X[split_index:] #will only contain non - embed features
y_train, y_test = y[:split_index], y[split_index:] 

X_train_embed, X_test_embed = df_embed[:split_index], df_embed[split_index:]

print(X_train_embed.shape, X_train_embed.columns)

# X_train, X_test, y_train, y_test, scaler = getXandY(n_outputs,n_steps,targets, features, vehicle_type=vehicle_type,start_year=start_year,end_year = end_year)



print("Finished loading data")

print("x train shape:", X_train.shape)
print("x test shape:", X_test.shape)
print("y train shape:", y_train.shape)
print("y test shape:", y_test.shape)

Using features:
 ['Report Date', 'Time', 'Direction', 'Location', 'Route'] 
Targets: ['Min Delay']
target + features + dropped empty:
    Min Delay Report Date      Time Direction              Location  Route
0       10.0  2014-01-01  00:23:00         E    York Mills station   95.0
1       33.0  2014-01-01  00:55:00       b/w  Entire run for route  102.0
2       10.0  2014-01-01  01:28:00        WB   lawrence and Warden   54.0
3       18.0  2014-01-01  01:30:00         N       Kipling Station  112.0
4       10.0  2014-01-01  01:37:00         n      VP and Ellesmere   24.0
                     Min Delay              Location  Route  Direction_e  \
Datetime                                                                   
2014-01-01 00:23:00   0.000000    York Mills station   95.0          1.0   
2014-01-01 00:55:00   2.555556  Entire run for route  102.0          0.0   
2014-01-01 01:28:00   0.000000   lawrence and Warden   54.0          0.0   
2014-01-01 01:30:00   0.888889       Kipl

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
# LSTM model
## Creating models

#parameters:
adam_lr = 0.0001
num_neurons = 64
batch_size = 512
num_epochs = 10


# location_input = Input(shape=(1,), name='Location_input')
# route_input = Input(shape=(1,), name='Route_input')

# direction_input = Input(shape=(n_steps, len(one_hot_encoder.get_feature_names_out(['Direction']))), name='Direction_input') #will be nsteps,6, (n,s,w,e,b,u)
# datetime_input = Input(shape=(n_steps,5), name='Datetime_input')  # 5 for year,month,day,hour,minute

#making a lr_schedule
def step_decay(epoch, lr):
    # Reduce LR by 50% after each epoch
    return lr * 0.5

lr_schedule = LearningRateScheduler(step_decay)

# Define the input shapes for time features, location, and route
n_steps = X_train.shape[1]  # number of time steps in the sliding window
n_features = X_train.shape[2]  # number of features for time inputs (e.g., year, month, etc.)

# Input for time-related features (n_steps, n_time_features)
features_input = layers.Input(shape=(n_steps, n_features), name='features_input') #regular inputs

# Inputs for categorical features (location, route)
location_input = layers.Input(shape=(1,), name='Location_input')
route_input = layers.Input(shape=(1,), name='Route_input')


# print(X_train.shape)
# print(X_train_embed['Location_encoded'].values.shape)
# print(X_train_embed['Route_encoded'].values.shape)

# print(np.any(np.isnan(X_train_embed['Location_encoded'].values)), np.any(np.isnan(X_train_embed['Route_encoded'].values)))
# print(np.any(np.isinf(X_train_embed['Location_encoded'].values)), np.any(np.isinf(X_train_embed['Route_encoded'].values)))

# X_train_location = X_train_embed['Location_encoded'].values.reshape(-1, 1)
# X_train_route = X_train_embed['Route_encoded'].values.reshape(-1, 1)
# X_train_location = X_train_location.astype(np.float32)
# X_train_route = X_train_route.astype(np.float32)

# # Check data types of inputs
# print(X_train.dtype)  # Check if it's float32
# print(X_train_location.dtype)  # Check if it's float32
# print(X_train_route.dtype)  # Check if it's float32
# print(y_train.dtype)  # Check if it's float32

# # Convert all inputs to float32 if they're not already
# X_train = X_train.astype(np.float32)
# X_train_location = X_train_location.astype(np.float32)
# X_train_route = X_train_route.astype(np.float32)
# y_train = y_train.astype(np.float32)




## MODEL Architecture:
# LSTM model for time series
lstm_out = layers.LSTM(num_neurons, activation='relu', return_sequences=True)(features_input)
lstm_out = layers.BatchNormalization()(lstm_out)
lstm_out = layers.Dropout(0.3)(lstm_out)

lstm_out = layers.LSTM(num_neurons // 2, activation='relu')(lstm_out)
lstm_out = layers.Dropout(0.3)(lstm_out)

# Embedding layers for categorical features (location and route)
location_embedded = layers.Embedding(input_dim=location_vocab_size, output_dim=location_embedding_dim)(location_input)
route_embedded = layers.Embedding(input_dim=route_vocab_size, output_dim=route_embedding_dim)(route_input)

# Flatten the embeddings to feed into the dense layer
location_embedded = layers.Flatten()(location_embedded)
route_embedded = layers.Flatten()(route_embedded)

# Concatenate the LSTM output with the embedded categorical features
x = layers.Concatenate()([lstm_out, location_embedded, route_embedded])

# Dense layer for final output
output = layers.Dense(n_outputs)(x)

# Build the model
model = models.Model(inputs=[features_input, location_input, route_input], outputs=output)

# Compile the model
optimizer = Adam(learning_rate=adam_lr)
model.compile(optimizer=optimizer, metrics=['mae'], loss='mse')

# Model summary
model.summary()



# Fit the model
start_time = time.time()
history = model.fit([X_train, X_train_embed['Location_encoded'].values, X_train_embed['Route_encoded'].values], y_train,
# history = model.fit([X_train, X_train_location, X_train_route], y_train,
                    epochs=num_epochs,
                    batch_size=batch_size,
                    callbacks=[lr_schedule],
                    validation_split=0.2,
                    shuffle=False,
                    verbose=1)
end_time = time.time()

# Print total time taken for training
print("Total time: {0} s".format(round((end_time - start_time), 2)))


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
#Testing model:
# getting predictions of model:
y_test_pred = lstm_model.predict(X_test)
y_train_pred = lstm_model.predict(X_train)

train_loss, train_mae = lstm_model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = lstm_model.evaluate(X_test, y_test, verbose=0)

# # print("Predictions:", y_pred)

# Print model evaluation results
print(f"TensorFlow Evaluation:")
print(f"Training Loss (MSE): {train_loss:.4f}, Training MAE: {train_mae:.4f}")
print(f"Testing Loss (MSE): {test_loss:.4f}, Testing MAE: {test_mae:.4f}")


plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_predictions_vs_real(y_test, y_test_pred, X_test, scaler, output_index=0, title="Predictions vs Actual"):
    """
    Plots the predicted vs actual values on separate plots, limited to the last month.
    
    Parameters:
    - y_test: Ground truth values (2D array with shape [samples, n_outputs]).
    - y_test_pred: Predicted values (2D array with shape [samples, n_outputs]).
    - X_test: Input features (3D array with time steps and features).
    - scaler: Scaler used to scale the data (e.g., StandardScaler).
    - output_index: Index of the output time step you want to plot (0-based).
    - title: Title for the plot.
    """
    # Extract only the output_index-th output for both y_test and y_test_pred
    y_test_single = y_test[:, output_index]
    y_test_pred_single = y_test_pred[:, output_index]

    # Ensure data is 1D
    y_test_single = y_test_single.flatten()
    y_test_pred_single = y_test_pred_single.flatten()

    # Unscale the data
    y_test_unscaled = scaler.inverse_transform(y_test_single.reshape(-1, 1)).flatten()
    y_test_pred_unscaled = scaler.inverse_transform(y_test_pred_single.reshape(-1, 1)).flatten()

    # Extract datetime features from X_test
    datetime_features = X_test[:, :, -5:]  # Last 5 columns for datetime features (year, month, etc.)
    last_timestep_datetime = datetime_features[:, -1, :]  # Take datetime info from the last time step

    # Create a DataFrame for datetime information
    # datetime_df = pd.DataFrame(last_timestep_datetime, columns=['year', 'month', 'day', 'hour', 'minute'])
    # datetime_df['datetime'] = pd.to_datetime(datetime_df[['year', 'month', 'day', 'hour', 'minute']])
    datetime_df = pd.DataFrame(last_timestep_datetime, columns=['year', 'month', 'day', 'hour'])
    datetime_df['datetime'] = pd.to_datetime(datetime_df[['year', 'month', 'day', 'hour']])


    # Filter the datetime range to the last month
    datetime_df['datetime'] = pd.to_datetime(datetime_df['datetime'])
    last_month = datetime_df['datetime'].max() - pd.DateOffset(months=1)
    datetime_df = datetime_df[datetime_df['datetime'] >= last_month]

    # Filter the data for the last month
    y_test_unscaled_month = y_test_unscaled[datetime_df.index]
    y_test_pred_unscaled_month = y_test_pred_unscaled[datetime_df.index]

    # Plotting the results with separate plots for actual and predicted values
    fig, axs = plt.subplots(2, 1, figsize=(18, 6), sharex=True)

    # Plot actual values
    axs[0].plot(datetime_df['datetime'], y_test_unscaled_month, label="Actual", color="blue", alpha=0.7)
    axs[0].set_ylabel("Actual Delay (minutes)")
    axs[0].set_ylim(bottom=0)
    axs[0].grid(True)

    # Plot predicted values
    axs[1].plot(datetime_df['datetime'], y_test_pred_unscaled_month, label="Predicted", color="red", alpha=0.7)
    axs[1].set_xlabel("Date and Time")
    axs[1].set_ylabel("Predicted Delay (minutes)")
    axs[1].set_ylim(bottom=0)
    axs[1].grid(True)

    # Set the main title for the entire figure
    fig.suptitle(title, fontsize=16)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()


# Call the plotting function
plot_predictions_vs_real(y_test, y_test_pred, X_test, scaler, output_index=0)


In [None]:
input_shape = (X_train.shape[0], X_train.shape[2])  # num samples, num features in each sample

# GRU model
gru_model = models.Sequential()
gru_model.add(layers.GRU(num_neurons,
                         activation='relu',
                         return_sequences=True,  # so we can use multiple GRU layers
                         input_shape=input_shape))
gru_model.add(layers.Dropout(0.3))
gru_model.add(layers.GRU(num_neurons//2,
                         activation='relu'))
gru_model.add(layers.Dropout(0.3))
gru_model.add(layers.Dense(1))  # for the final output layer since it's only 1 output

optimizer = Adam(learning_rate=adam_lr)  # setting optimizer

gru_model.compile(optimizer=optimizer, metrics=['mae'], loss='mse')  # use mae for regression

# creating model
gru_model.summary()

# fitting model
start_time = time.time()
gru_history = gru_model.fit(X_train, y_train,
                        epochs=num_epochs,
                        batch_size=batch_size,
                        validation_split=0.2,  # 0.2 of the training set to be used for validation
                        verbose=1)
end_time = time.time()

print("Total time: {0} s".format(round((end_time - start_time), 2)))

In [None]:
#Testing model:
# getting predictions of model:
y_test_pred = gru_model.predict(X_test)
y_train_pred = gru_model.predict(X_train)

train_loss, train_mae = gru_model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = gru_model.evaluate(X_test, y_test, verbose=0)

# # print("Predictions:", y_pred)

# Print model evaluation results
print(f"TensorFlow Evaluation:")
print(f"Training Loss (MSE): {train_loss:.4f}, Training MAE: {train_mae:.4f}")
print(f"Testing Loss (MSE): {test_loss:.4f}, Testing MAE: {test_mae:.4f}")


plt.plot(gru_history.history['loss'], label='Training Loss')
plt.plot(gru_history.history['val_loss'], label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()



In [None]:
plot_predictions_vs_real(y_test, y_test_pred, X_test, scaler, output_index=0)

In [None]:
# RNN model
rnn_model = models.Sequential()
rnn_model.add(layers.SimpleRNN(num_neurons,
                               activation='relu',
                               return_sequences=True,  # so we can use multiple RNN layers
                               input_shape=input_shape))
rnn_model.add(layers.Dropout(0.3))
rnn_model.add(layers.SimpleRNN(num_neurons//2,
                               activation='relu'))
rnn_model.add(layers.Dropout(0.3))
rnn_model.add(layers.Dense(1))  # for the final output layer since it's only 1 output

optimizer = Adam(learning_rate=adam_lr)  # setting optimizer

rnn_model.compile(optimizer=optimizer, metrics=['mae'], loss='mse')  # use mae for regression

# creating model
rnn_model.summary()

# fitting model
start_time = time.time()
rnn_history = rnn_model.fit(X_train, y_train,
                        epochs=num_epochs,
                        batch_size=batch_size,
                        validation_split=0.2,  # 0.2 of the training set to be used for validation
                        verbose=1)
end_time = time.time()

print("Total time: {0} s".format(round((end_time - start_time), 2)))


In [None]:
#Testing model:
# getting predictions of model:
y_test_pred = rnn_model.predict(X_test)
y_train_pred = rnn_model.predict(X_train)

train_loss, train_mae = rnn_model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = rnn_model.evaluate(X_test, y_test, verbose=0)

# # print("Predictions:", y_pred)

# Print model evaluation results
print(f"TensorFlow Evaluation:")
print(f"Training Loss (MSE): {train_loss:.4f}, Training MAE: {train_mae:.4f}")
print(f"Testing Loss (MSE): {test_loss:.4f}, Testing MAE: {test_mae:.4f}")


plt.plot(rnn_history.history['loss'], label='Training Loss')
plt.plot(rnn_history.history['val_loss'], label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()



In [None]:
# output_index = 0
# title = "x"
# y_test_single = y_test[:, output_index]
# y_test_pred_single = y_test_pred[:, output_index]

# # Ensure data is 1D
# y_test_single = y_test_single.flatten()
# y_test_pred_single = y_test_pred_single.flatten()

# # Unscale the data
# y_test_unscaled = scaler.inverse_transform(y_test_single.reshape(-1, 1)).flatten()
# y_test_pred_unscaled = scaler.inverse_transform(y_test_pred_single.reshape(-1, 1)).flatten()

# # Extract datetime features from X_test
# datetime_features = X_test[:, :, -5:]  # Last 5 columns for datetime features (year, month, etc.)
# last_timestep_datetime = datetime_features[:, -1, :]  # Take datetime info from the last time step

# # Create a DataFrame for datetime information
# datetime_df = pd.DataFrame(last_timestep_datetime, columns=['year', 'month', 'day', 'hour', 'minute'])
# datetime_df['datetime'] = pd.to_datetime(datetime_df[['year', 'month', 'day', 'hour', 'minute']])

# # Filter the datetime range to the last month
# datetime_df['datetime'] = pd.to_datetime(datetime_df['datetime'])
# last_month = datetime_df['datetime'].max() - pd.DateOffset(months=1)
# datetime_df = datetime_df[datetime_df['datetime'] >= last_month]

# # Filter the data for the last month
# y_test_unscaled_month = y_test_unscaled[datetime_df.index]
# y_test_pred_unscaled_month = y_test_pred_unscaled[datetime_df.index]

# # Plotting the results on the same plot
# plt.figure(figsize=(18, 6))

# # Plot actual values
# # plt.plot(datetime_df['datetime'], y_test_unscaled_month, label="Actual", color="blue", alpha=0.7)

# # Plot predicted values
# plt.plot(datetime_df['datetime'], y_test_pred_unscaled_month, label="Predicted", color="red", alpha=0.7)

# # Labels and title
# plt.xlabel("Date and Time")
# plt.ylabel("Delay (minutes)")
# plt.title(title, fontsize=16)
# plt.legend()
# plt.grid(True)

# # Show the plot
# plt.tight_layout()
# plt.show()

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd

# def plot_forecast_only(forecasted_values_unscaled, X_test, title="Forecast for Next 30 Days"):
#     """
#     Plots the forecasted values for the next 30 days.

#     Parameters:
#     - forecasted_values_unscaled: Predicted values for the next 30 days (1D array of length 30).
#     - X_test: The input data (to extract datetime information).
#     - title: Title for the plot.
#     """
#     # Extract datetime information from the last window of X_test
#     datetime_features = X_test[-1, :, -5:]  # Last window (most recent window) from X_test
#     last_timestep_datetime = datetime_features[-1, :]  # Get the last timestep's datetime
#     datetime_df = pd.DataFrame([last_timestep_datetime], columns=['year', 'month', 'day', 'hour', 'minute'])
    
#     # Convert to scalar datetime
#     last_datetime = pd.to_datetime(datetime_df.iloc[0])
    
#     # Generate the datetime range
#     datetime_range = last_datetime + pd.to_timedelta(np.arange(30), unit='D')

#     # Check if forecasted values and datetime range match
#     if len(forecasted_values_unscaled) != len(datetime_range):
#         raise ValueError(
#             f"Mismatch in dimensions: forecasted_values_unscaled={len(forecasted_values_unscaled)}, "
#             f"datetime_range={len(datetime_range)}"
#         )

#     # Plot the forecasted values for the next 30 days
#     plt.figure(figsize=(12, 6))
#     plt.plot(datetime_range, forecasted_values_unscaled, label="Forecasted", color="red", alpha=0.7)
#     plt.title(title)
#     plt.xlabel("Time")
#     plt.ylabel("Values")
#     plt.legend()
#     plt.grid(True)
#     plt.show()


# # Step 1: Get the last window of data
# last_window = X_test[-1]  # The last sliding window (most recent data)

# # Step 2: Reshape the input data for prediction (LSTM expects 3D input: [samples, timesteps, features])
# last_window = last_window.reshape((1, last_window.shape[0], last_window.shape[1]))  # Shape: (1, n_steps, n_features)

# # Step 3: Predict the next 30 days
# forecasted_values = lstm_model.predict(last_window, batch_size=1)

# # Step 4: Reshape the forecasted values (correctly)
# forecasted_values = forecasted_values.flatten()  # Ensure it's a flat array with 30 elements

# # Step 5: If you used a scaler for your target variable, unscale the predictions
# forecasted_values_unscaled = lstm_scaler.inverse_transform(forecasted_values.reshape(-1, 1)).flatten()

# # Call the plotting function with forecasted values
# plot_forecast_only(forecasted_values_unscaled, X_test)

