In [1]:
import os
import pickle
import datetime as dt
import itertools

import pandas as pd
import numpy as np

import keras
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import RMSprop
from keras.callbacks import CSVLogger, EarlyStopping
from keras.optimizers import Adam

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow is imported here because the version report below uses `tf`;
# without it this cell raises NameError on a fresh kernel (Restart & Run All),
# since the only other `import tensorflow as tf` lives in a later cell.
import tensorflow as tf

# Record the library versions this notebook was executed with.
print('numpy ver.: ' + np.__version__)
print('pandas ver.: ' + pd.__version__)
print('tensorflow ver.: ' + tf.__version__)
print('keras ver.: ' + keras.__version__)

numpy ver.: 1.25.1
pandas ver.: 1.5.3
tensorflow ver.: 2.12.0
keras ver.: 2.12.0


## Code Section: Merging and Preprocessing Data  
  
This code section performs the following tasks:  
  
1. Load the `merged_df` DataFrame from a pickle file if it exists. If not, load the `cnn_data` and `ANN_data` DataFrames from their respective pickle files.  
  
2. Preprocess the `cnn_data` DataFrame by splitting the 'i' column into three new columns: 'TripId', 'Date', and 'Stop_id'. Then, drop the original 'i' column.  
  
3. Convert the 'TripId' and 'stopOrder' columns in both `cnn_data` and `ANN_data` DataFrames to string data type.  
  
4. Merge the `ANN_data` and `cnn_data` DataFrames on 'TripId' and 'stopOrder' columns, and replace missing values with -1.  
  
5. Drop unnecessary columns from the merged DataFrame and calculate the average link time for each link.  
  
6. Create a dictionary to store the link times for each trip and update the link time values based on the average link times.  
  
7. Iterate through the merged DataFrame and store the link times up to the current stop in a new column named 'links_refs_till_current_stop'.  
  
8. Save the final merged DataFrame to a pickle file named `merged_df_final_1.pkl` for future use.  


In [2]:
# Load the cached, fully preprocessed frame if it is available; otherwise
# rebuild it from the two source pickles and cache the result.
# SECURITY NOTE: pickle.load can execute arbitrary code. Only load pickle
# files that come from a trusted source.
try:
    with open('../finalmodel/data/merged_df_final_1.pkl', 'rb') as file:
        merged_df = pickle.load(file)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Only a missing or unreadable cache triggers the rebuild; any other
    # error (including KeyboardInterrupt) propagates instead of being hidden.
    with open('../finalmodel/data/aia/Final_Data.pkl', 'rb') as file:
        cnn_data = pickle.load(file)

    # Split the 'i' column into three new columns, then drop the original.
    cnn_data[['TripId', 'Date', 'Stop_id']] = cnn_data['i'].str.split('_', expand=True)
    cnn_data = cnn_data.drop(columns=['i'])

    with open('../finalmodel/data/matan/ANN_Complete_data2.pkl', 'rb') as file:
        ANN_data = pickle.load(file)

    # Both merge keys must share a dtype on both sides, so cast them to str.
    ANN_data['TripId'] = ANN_data['TripId'].astype(str)
    ANN_data['stopOrder'] = ANN_data['stopOrder'].astype(str)

    cnn_data['TripId'] = cnn_data['TripId'].astype(str)
    cnn_data['stopOrder'] = cnn_data['stopOrder'].astype(str)

    merged_df = pd.merge(ANN_data, cnn_data, on=['TripId', 'stopOrder'], how='left')

    # -1 is the sentinel for "missing" everywhere below.
    merged_df = merged_df.replace({np.nan: -1, pd.NaT: -1, 'None': -1, 'N/A': -1})

    merged_df = merged_df.drop(columns=['actualArrivalTime_y', 'Linkref_y', 'actualDepartureTime_y'])

    # Average link time per link, computed from valid (non-negative) samples
    # only. Missing values were replaced by -1 above; including them would
    # drag every average down. Links with no valid sample get no entry.
    valid_link_rows = merged_df[merged_df['linkTime'] >= 0]
    avg_times = valid_link_rows.groupby('Linkref_x')['linkTime'].mean().to_dict()

    # For every trip, the list of link times in row order, with negative
    # (missing) values replaced by the link's average, or -1 if the link has
    # no average.
    links_time = {
        trip_id: [
            link_time if link_time >= 0 else avg_times.get(link_ref, -1)
            for link_ref, link_time in zip(group_df['Linkref_x'], group_df['linkTime'])
        ]
        for trip_id, group_df in merged_df.groupby('TripId')
    }

    # For each row, join the trip's first (stopOrder - 2) link times with '_'.
    # Rows with stopOrder <= 2 have no preceding links and are left as NaN;
    # guarding with `> 0` (rather than `!= 0`) prevents a negative slice bound
    # from silently returning "all but the last k" link times.
    # NOTE(review): this assumes each trip's rows appear in stop order,
    # starting at stopOrder 2 — TODO confirm against the source data.
    links_till_current_stop = []
    for trip_id, stop_order in zip(merged_df['TripId'], merged_df['stopOrder']):
        num_previous_links = int(stop_order) - 2
        if num_previous_links > 0:
            previous_links = links_time[trip_id][:num_previous_links]
            links_till_current_stop.append('_'.join(str(item) for item in previous_links))
        else:
            links_till_current_stop.append(np.nan)
    merged_df['links_refs_till_current_stop'] = links_till_current_stop

    with open('../finalmodel/data/merged_df_final_1.pkl', 'wb') as file:
        pickle.dump(merged_df, file)

## Code Section: Defining Functions to Extract Relevant Features  
  
This code section defines three functions to extract relevant features from the input DataFrames:  
  
1. `get_ann_info(row)`: Extracts features related to the ANN model, including travel time, headway time, link reference ID, arrival and departure times, delay levels, and time periods.  
  
2. `get_cnn_info(row)`: Extracts features related to the CNN model, including direction, stop sequence, day of the week, time categories, and previous delay information.  
  
3. `get_multi_models_info(row)`: Extracts features related to the multi-models approach, including link references till the current stop, ID, and stop order.  


In [3]:
def get_ann_info(row):
    """Return the entries of ``row`` that the ANN model takes as input.

    The selection is done with ``row[[...]]`` and preserves the column order
    listed below.
    """
    travel_time_columns = ['K-1_Travel_Time', 'K-2_Travel_Time', 'K-3_Travel_Time']
    headway_columns = ['Headway_Time', 'K-1_Headway_Time', 'K-2_Headway_Time',
                       'K-3_Headway_Time']
    link_columns = ['LinkrefID']
    clock_columns = ['ArrivalTimeHour', 'ArrivalTimeMinute', 'ArrivalTimeSecond',
                     'DepartureTimeHour', 'DepartureTimeMinute', 'DepartureTimeSecond']
    delay_level_columns = ['Delay_Level_Level 1', 'Delay_Level_Level 2',
                           'Delay_Level_Level 3', 'Delay_Level_Level 4']
    time_period_columns = ['Time_Period_Weekday a.m. peak hours',
                           'Time_Period_Weekday off-peak hours',
                           'Time_Period_Weekday p.m. peak hours',
                           'Time_Period_Weekend all-day']

    ann_columns = (travel_time_columns + headway_columns + link_columns
                   + clock_columns + delay_level_columns + time_period_columns)
    return row[ann_columns]

def get_cnn_info(row):
    """Return the entries of ``row`` that the CNN model takes as input.

    The selection is done with ``row[[...]]`` and preserves the column order
    listed below.
    """
    trip_columns = ['direction', 'StopSequence']
    weekday_columns = ['DayInWeek_friday', 'DayInWeek_monday', 'DayInWeek_saturday',
                       'DayInWeek_sunday', 'DayInWeek_thursday', 'DayInWeek_tuesday',
                       'DayInWeek_wednesday']
    previous_columns = ['preD1', 'preD2']
    time_category_columns = ['timeCategory_Unknown', 'timeCategory_d1',
                             'timeCategory_d2', 'timeCategory_d3', 'timeCategory_d4',
                             'timeCategory_d5', 'timeCategory_d6']

    cnn_columns = (trip_columns + weekday_columns + previous_columns
                   + time_category_columns)
    return row[cnn_columns]

def get_multi_models_info(row):
    """Return the entries of ``row`` used by the per-line (multi-models) step.

    Selected, in order: 'links_refs_till_current_stop', 'ID', 'stopOrder'.
    """
    multi_models_columns = ['links_refs_till_current_stop', 'ID', 'stopOrder']
    return row[multi_models_columns]

In this code section, we prepare `merged_df` and load everything needed by the different models:

1. Import the required libraries, including TensorFlow, Keras, and Scikit-learn.

2. Load the CNN training data and fit a `StandardScaler` object to it, excluding the 'i' and 'targetTime' columns.

3. Load the pre-trained CNN and ANN models.

4. Define a `DataHolder` class to store information about the lines, routes, and models.

5. Define a `build_model` function to create an ANN model with a specified number of input features (number of stops).

6. Load line information from a pickle file and create a dictionary of `DataHolder` objects.

7. Process a DataFrame called `merged_df`:
   - Split the 'LINE_DESC' column by '-' and create a new 'ID' column from the first part of the split.
   - Create a new 'DIRECTION' column from the second part of the split.
   - Concatenate the 'ID' and 'DIRECTION' columns as the new 'ID' column.


In [4]:
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler

# Load the data used to fit the CNN feature scaler.
# SECURITY NOTE: pickle.load can execute arbitrary code. Only load pickle
# files that come from a trusted source.
with open('../finalmodel/data/aia/F.Data.pkl', 'rb') as file:
    cnn_data = pickle.load(file)

# Fit the scaler on every column except 'i' and 'targetTime'. Only the fitted
# statistics are needed here, so use fit() rather than fit_transform(), whose
# transformed output was being computed and thrown away.
# NOTE(review): the prediction cell passes the columns chosen by
# get_cnn_info() to transform(); confirm they match the columns fitted here
# in both number and order.
cnn_scaler_normalizer = StandardScaler()
cnn_scaler_normalizer.fit(cnn_data.drop(columns=['i', 'targetTime']))

# Pre-trained models used for single-row predictions later in the notebook.
cnn_model = load_model("../finalmodel/data/aia/simpleANN_BusTracker_2_64.h5")
ann_model = load_model("../finalmodel/data/matan/my_model1.h5")

class DataHolder:
    """Plain container bundling one line's name, links, routes, data and models.

    NOTE(review): instances appear to be restored from a pickle file later in
    this notebook, so attribute names (including the spelling ``routs``) are
    kept exactly as they are.
    """

    def __init__(self, line_name, links, routs, links_defaults, X, Y, models):
        """Store every argument on the instance.

        ``X`` is stored as both ``x_train`` and ``x_test``, and ``Y`` as both
        ``y_train`` and ``y_test``; the constructor takes no separate test
        split.
        """
        self.line_name, self.links, self.routs = line_name, links, routs
        self.links_defaults = links_defaults
        # Train and test attributes reference the same objects.
        self.x_train = self.x_test = X
        self.y_train = self.y_test = Y
        self.models = models

def build_model(num_of_stops):
    """Build and compile the dense network used for the per-line models.

    Args:
        num_of_stops: number of input features (size of the input vector).

    Returns:
        A compiled Keras ``Sequential`` model: a 64-unit input layer with no
        activation, six ReLU hidden layers, and a single ReLU output unit,
        compiled with Adam and the "MAE" loss.
    """
    hidden_layer_widths = (128, 256, 512, 256, 128, 64)

    network = Sequential()
    # The first layer fixes the input size and has no activation argument.
    network.add(Dense(units=64, input_shape=(num_of_stops,)))
    for width in hidden_layer_widths:
        network.add(Dense(width, activation='relu'))
    # Single-unit output; ReLU keeps predictions non-negative.
    network.add(Dense(1, activation='relu'))

    network.compile(optimizer=Adam(), loss="MAE", metrics=[])
    return network

# Load the per-line holders and index them by line name.
# NOTE(review): presumably the pickle contains DataHolder instances, which
# would require the DataHolder class to be defined before this point — confirm.
with open('../finalmodel/data/haim/lines_info.pickle', 'rb') as file:
    loaded_lines = pickle.load(file)
multi_models = {holder.line_name: holder for holder in loaded_lines}

# Split LINE_DESC on '-' once: element 0 seeds the 'ID' column, element 1 is
# the 'DIRECTION' column.
line_desc_parts = merged_df['LINE_DESC'].str.split('-')
merged_df['ID'] = line_desc_parts.str[0]
merged_df['DIRECTION'] = line_desc_parts.str[1]
# Final 'ID' is '<id>_<direction>', matched against the keys of multi_models
# in the prediction cell.
merged_df['ID'] = merged_df['ID'].astype(str) + '_' + merged_df['DIRECTION'].astype(str)

In this code section, we perform the following tasks:

1. Select a specific row from the `merged_df` DataFrame.

2. Extract the ANN, CNN, and multi-models information for the selected row using the `get_ann_info`, `get_cnn_info`, and `get_multi_models_info` functions.

3. Check if the extracted CNN row does not contain any -1 or null values:
   - Reshape the CNN row and scale it using the `cnn_scaler_normalizer` object.
   - Pass the scaled and reshaped CNN row to the pre-trained CNN model and print the prediction.

4. Check if the extracted ANN row does not contain any -1 or null values:
   - Reshape the ANN row and pass it to the pre-trained ANN model.
   - Print the prediction from the ANN model.

5. Check if the extracted multi-models row does not contain any -1 or null values:
   - Split the 'links_refs_till_current_stop' column by '_' and ensure it does not contain any -1 values.
   - Check if the multi-models row 'ID' exists in the `multi_models` dictionary.
   - Build an ANN model using the `build_model` function with the appropriate input features (number of stops).
   - Set the weights of the built model using the pre-trained model from the `multi_models` dictionary.
   - Convert the 'links_time' values to float, reshape, and pass them to the built model.
   - Print the prediction from the built model.


In [5]:
# row with all data of all models
row = merged_df.iloc[478]

ann_row = get_ann_info(row)
cnn_row = get_cnn_info(row)
multi_models_row = get_multi_models_info(row)

# CNN model: only predict when no feature is missing (-1 sentinel or null).
if not (cnn_row == -1).any() and not cnn_row.isnull().any():
    cnn_features = np.array(list(cnn_row)).reshape(1, -1)
    cnn_features = cnn_scaler_normalizer.transform(cnn_features)
    print(cnn_model.predict(cnn_features))

# ANN model: same missing-value guard, no scaling applied.
if not (ann_row == -1).any() and not ann_row.isnull().any():
    print(ann_model.predict(np.array(list(ann_row)).reshape(1, -1)))

# Per-line model: one set of weights per (line ID, stop order).
if not (multi_models_row == -1).any() and not multi_models_row.isnull().any():
    links_time = multi_models_row['links_refs_till_current_stop'].split('_')
    # Parse before checking for the -1 sentinel: the split yields strings, so
    # the previous `-1 not in links_time` test could never detect a missing
    # link time.
    links_time_float = np.array(links_time).astype(float)
    has_missing_link_time = (links_time_float < 0).any()
    if not has_missing_link_time and multi_models_row['ID'] in multi_models:
        # The model for a stop takes (stopOrder - 2) link times as input and
        # is stored at index (stopOrder - 3) of the line's `models` list.
        num_links = int(multi_models_row['stopOrder']) - 2
        model = build_model(num_links)
        model.set_weights(multi_models[multi_models_row['ID']].models[num_links - 1])
        links_time_reshaped = links_time_float.reshape(1, -1)
        print(model.predict(links_time_reshaped))



[[13383.347]]
[[23.31433]]
[[0.]]
