## Packages

In [11]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from warnings import filterwarnings
from pprint import pprint
import gc
import pickle
filterwarnings("ignore")
%matplotlib inline

In [2]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

## Global Parameters

In [3]:
# Project root: the data and model paths used below are all joined onto it.
# Set the THESIS_ROOT_DIRECTORY environment variable to run the notebook on another
# machine; without it the original location is used unchanged.
ROOT_DIRECTORY = os.environ.get("THESIS_ROOT_DIRECTORY", "/home/kaan.aytekin/Thesis")
# Non-feature columns
# NOTE: this list is reassigned to a shorter one in the "Top Features" section
# before its first use, so the definition below only documents the full set.
non_feature_columns = ["simulation_run", "is_accident_simulation", 
                       "accident_location", "accident_start_time", 
                       "accident_duration", "accident_lane", 
                       "prev_detector_detector_number","next_detector_detector_number",
                       "detector_number", "timestamp"
]

## UDFs

In [4]:
def sample_from_array(array,freq):
    """
    Draw a random subset of {array} without replacement, keeping the original element order.

    The subset holds ceil(len(array) * freq) elements. {array} must support
    boolean-mask indexing (e.g. a numpy array). Randomness comes from numpy's
    global random state, so seed it beforehand for reproducible draws.
    """
    element_count = len(array)
    picked_count = int(np.ceil(element_count * freq))
    picked_positions = np.random.choice(
        range(0, element_count), size=picked_count, replace=False
    )
    # Mark the drawn positions; masking (instead of fancy indexing) keeps input order.
    keep_mask = np.zeros(element_count, dtype=bool)
    keep_mask[picked_positions] = True
    return array[keep_mask]

def kfolds_from_array(array,k,seed=None):
    """
    Randomly partition {array} into {k} folds of (nearly) equal size.

    Parameters
    ----------
    array : array-like
        Elements to distribute over the folds. It is NOT modified.
    k : int
        Number of folds; forwarded to np.array_split, so folds may differ in
        size by one element and may be empty when k > len(array).
    seed : int, optional
        When given (including 0), numpy's global random state is re-seeded
        first so the partition is reproducible.

    Returns
    -------
    list of np.ndarray
        The k folds, in shuffled order.
    """
    # `is not None` instead of truthiness: 0 is a valid seed and must not be ignored.
    if seed is not None:
        np.random.seed(seed)
    # permutation() shuffles a copy, so the caller's array keeps its order.
    shuffled = np.random.permutation(array)
    return np.array_split(shuffled, k)

def simulation_based_k_folds_split(df,k=10,seed=None):
    """
    Yield {k} train/test row masks for {df}, assigning whole simulations to folds.

    Within every (is_accident_simulation, accident_lane) group the distinct
    simulation_run values are shuffled and split into {k} folds with
    kfolds_from_array. Fold i of every group together forms the test set of
    the i-th split; all remaining rows form its training set.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns simulation_run, is_accident_simulation and accident_lane.
    k : int
        Number of folds / number of yielded splits.
    seed : int, optional
        When given (including 0), numpy's global random state is re-seeded first.

    Yields
    ------
    (train_index, test_index) : tuple of boolean pd.Series aligned to df.index.

    Notes
    -----
    This is a generator: seeding and fold assignment happen when the first
    split is requested, not when the function is called.
    """
    # `is not None` instead of truthiness: 0 is a valid seed and must not be ignored.
    if seed is not None:
        np.random.seed(seed)
    group_columns = ["is_accident_simulation", "accident_lane"]
    unique_simulation_combinations = (
        df[["simulation_run"] + group_columns].drop_duplicates().reset_index(drop=True)
    )
    # One row per group; its simulation_run cell holds the list of k fold arrays.
    test_simulation_runs = (
        unique_simulation_combinations.groupby(group_columns)
        .simulation_run.unique()
        .apply(lambda runs: kfolds_from_array(runs, k=k))
        .reset_index()
    )

    for fold_number in range(k):
        # Start from an all-False mask so the result is a boolean Series even
        # when there is no group to iterate over.
        test_index = pd.Series(False, index=df.index)
        for row in test_simulation_runs.itertuples():
            test_index = test_index | (
                (df.is_accident_simulation == row.is_accident_simulation)
                & (df.accident_lane == row.accident_lane)
                & (df.simulation_run.isin(row.simulation_run[fold_number]))
            )
        train_index = ~test_index
        yield train_index, test_index

def simulation_based_train_test_split(df, test_size=0.2, seed=None):
    """
    Build boolean train/test row masks for {df}, sampling whole simulations into the test set.

    Within every (is_accident_simulation, accident_lane) group, a {test_size}
    share of the distinct simulation_run values is drawn with sample_from_array;
    every row belonging to a drawn run goes to the test set, all other rows to
    the training set.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns simulation_run, is_accident_simulation and accident_lane.
    test_size : float
        Fraction of simulation runs per group to put into the test set.
    seed : int, optional
        When given (including 0), numpy's global random state is re-seeded first.

    Returns
    -------
    (train_index, test_index) : tuple of boolean pd.Series aligned to df.index.
        These are masks, not DataFrames; select rows with df[train_index] / df[test_index].
    """
    # `is not None` instead of truthiness: 0 is a valid seed and must not be ignored.
    if seed is not None:
        np.random.seed(seed)
    group_columns = ["is_accident_simulation", "accident_lane"]
    unique_simulation_combinations = (
        df[["simulation_run"] + group_columns].drop_duplicates().reset_index(drop=True)
    )
    # One row per group; its simulation_run cell holds the runs drawn for testing.
    test_simulation_runs = (
        unique_simulation_combinations.groupby(group_columns)
        .simulation_run.unique()
        .apply(lambda runs: sample_from_array(runs, freq=test_size))
        .reset_index()
    )

    # Start from an all-False mask so the result is a boolean Series even
    # when there is no group to iterate over.
    test_index = pd.Series(False, index=df.index)
    for row in test_simulation_runs.itertuples():
        test_index = test_index | (
            (df.is_accident_simulation == row.is_accident_simulation)
            & (df.accident_lane == row.accident_lane)
            & (df.simulation_run.isin(row.simulation_run))
        )
    train_index = ~test_index
    return train_index, test_index

def custom_cross_validation(models_list,performance_metrics_list,df_train,test_size=0.2,repetition_count=5,k_folds=10,seed=None,feature_columns=None):
    """
    Repeated cross-validation of several models with simulation-based splits.

    Every repetition re-splits {df_train} by whole simulations:
      * k_folds >= 2: each split yielded by simulation_based_k_folds_split is
        used once as the validation set;
      * otherwise: one hold-out split of size {test_size} from
        simulation_based_train_test_split is used.
    For every split each model is fit on the training rows, predicts the
    validation rows, and each metric is called as metric(y_true, y_predicted).

    NOTE(review): the previous draft could not run (it iterated over an int
    and over a function object and never stored any result), so the protocol
    above is an interpretation of it - confirm it matches the intended experiment.

    Parameters
    ----------
    models_list : list
        Estimators exposing fit(x, y) and predict(x). They are refit in place.
    performance_metrics_list : list of callables
        Each is called as metric(y_true, y_predicted).
    df_train : pd.DataFrame
        Data with the feature columns, a "target" column and the columns the
        split functions need. It is not modified.
    test_size : float
        Hold-out fraction, only used when k_folds < 2.
    repetition_count : int
        Number of times the whole procedure is repeated with a new split.
    k_folds : int
        Number of folds per repetition.
    seed : int, optional
        When given (including 0), numpy's global random state is seeded once up front.
    feature_columns : list of str, optional
        Model input columns; defaults to the notebook-level FEATURE_COLUMNS.

    Returns
    -------
    list of dict
        One record per (repetition, fold, model, metric) with the keys
        "repetition", "fold", "model_index", "model", "metric" and "value".
    """
    if seed is not None:
        np.random.seed(seed)
    if feature_columns is None:
        # Looked up at call time: FEATURE_COLUMNS is defined further down in the notebook.
        feature_columns = FEATURE_COLUMNS
    results = []
    for repetition in range(repetition_count):
        if k_folds and k_folds > 1:
            splits = simulation_based_k_folds_split(df_train, k=k_folds)
        else:
            splits = [simulation_based_train_test_split(df_train, test_size=test_size)]
        for fold_number, (train_index, validate_index) in enumerate(splits):
            # The split functions return boolean masks, so select rows explicitly.
            x_train = df_train.loc[train_index, feature_columns]
            y_train = df_train.loc[train_index, "target"]
            x_validate = df_train.loc[validate_index, feature_columns]
            y_validate = df_train.loc[validate_index, "target"]
            for model_index, model in enumerate(models_list):
                model.fit(x_train, y_train)
                y_predicted = model.predict(x_validate)
                for performance_metric in performance_metrics_list:
                    results.append({
                        "repetition": repetition,
                        "fold": fold_number,
                        "model_index": model_index,
                        "model": type(model).__name__,
                        "metric": getattr(performance_metric, "__name__", repr(performance_metric)),
                        "value": performance_metric(y_validate, y_predicted),
                    })
    return results

## Data Loading

In [5]:
# Load the pre-split train/test features and targets, in this order:
# x_train, y_train, x_test, y_test.
x_train, y_train, x_test, y_test = (
    pd.read_csv(os.path.join(ROOT_DIRECTORY, relative_path))
    for relative_path in (
        "data/thesis_data/x_train.csv",
        "data/thesis_data/y_train.csv",
        "data/thesis_data/x_test.csv",
        "data/thesis_data/y_test.csv",
    )
)

In [6]:
# Preview the first rows of the training features as loaded from disk.
x_train.head()

Unnamed: 0,simulation_run,is_accident_simulation,accident_location,accident_start_time,accident_duration,accident_lane,prev_detector_detector_number,next_detector_detector_number,detector_number,timestamp,...,prev_detector_flow_vehph_lag9,prev_detector_density_vehpkm_lag9,prev_detector_avg_speed_kmph_lag9,prev_detector_section_travel_time_sec_lag9,prev_detector_delay_time_sec_lag9,prev_detector_flow_vehph_lag10,prev_detector_density_vehpkm_lag10,prev_detector_avg_speed_kmph_lag10,prev_detector_section_travel_time_sec_lag10,prev_detector_delay_time_sec_lag10
0,0,0,0,0,0,0,1.0,2.0,1,915,...,1440.0,13.652174,49.2,36.585366,19.343987,1440.0,13.652174,49.2,36.585366,19.343987
1,0,0,0,0,0,0,1.0,2.0,1,930,...,1440.0,13.652174,49.2,36.585366,19.343987,1440.0,13.652174,49.2,36.585366,19.343987
2,0,0,0,0,0,0,1.0,2.0,1,945,...,1440.0,13.652174,49.2,36.585366,19.343987,1440.0,13.652174,49.2,36.585366,19.343987
3,0,0,0,0,0,0,1.0,2.0,1,960,...,1440.0,13.652174,49.2,36.585366,19.343987,1440.0,13.652174,49.2,36.585366,19.343987
4,0,0,0,0,0,0,1.0,2.0,1,975,...,1440.0,13.652174,49.2,36.585366,19.343987,1440.0,13.652174,49.2,36.585366,19.343987


## Feature Selection & Preprocessing

### Top Features

In [7]:
# Read the pre-computed feature ranking and keep the names in its "feature" column.
top_features_df = pd.read_csv(
    os.path.join(ROOT_DIRECTORY, "data/thesis_data/top_features.txt")
)
selected_features = top_features_df["feature"].tolist()

In [8]:
# Narrowed list of non-feature columns carried along with the features.
# Deliberately left out here (they were part of the earlier, longer list):
# accident_location, accident_start_time, accident_duration,
# prev_detector_detector_number, next_detector_detector_number,
# detector_number, timestamp.
non_feature_columns = ["simulation_run", "is_accident_simulation", "accident_lane"]

### Configs + Features

In [9]:
# Keep only the non-feature columns followed by the selected features, in that order.
x_train = x_train.loc[:, non_feature_columns + selected_features]
x_test = x_test.loc[:, non_feature_columns + selected_features]

### One Hot Encoding for "Accident Lane"

In [10]:
x_train_accident_lane_categorical = x_train[["accident_lane"]]
x_test_accident_lane_categorical = x_test[["accident_lane"]]
# Fit on the training data only; drop="first" omits the indicator column of the first category.
one_hot_encoder = OneHotEncoder(drop="first")
one_hot_encoder.fit(x_train_accident_lane_categorical)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(one_hot_encoder, "get_feature_names_out"):
    accident_lane_column_names = one_hot_encoder.get_feature_names_out(["accident_lane"])
else:
    accident_lane_column_names = one_hot_encoder.get_feature_names(["accident_lane"])

# Reuse the source frame's index so the later column-wise concat aligns row by row
# even if x_train / x_test do not carry a default RangeIndex.
x_train_accident_lane_df = pd.DataFrame(
    one_hot_encoder.transform(x_train_accident_lane_categorical).toarray(),
    index=x_train.index,
    columns=accident_lane_column_names,
)

x_test_accident_lane_df = pd.DataFrame(
    one_hot_encoder.transform(x_test_accident_lane_categorical).toarray(),
    index=x_test.index,
    columns=accident_lane_column_names,
)

In [12]:
# Persist the fitted encoder so the same encoding can be re-applied outside this notebook.
with open(os.path.join(ROOT_DIRECTORY, "model/one_hot_encoder.pkl"), mode="wb") as encoder_file:
    pickle.dump(one_hot_encoder, encoder_file)

In [11]:
# Put the one-hot columns in front of the existing columns.
# Note: this reassigns x_train / x_test, so running the cell twice duplicates the one-hot columns.
x_train = pd.concat(objs=[x_train_accident_lane_df, x_train], axis="columns")
x_test = pd.concat(objs=[x_test_accident_lane_df, x_test], axis="columns")

In [12]:
# Attach the targets to copies of the feature frames (x_train / x_test stay untouched).
df_train = x_train.assign(target=y_train)

df_test = x_test.assign(target=y_test)

In [13]:
# Show the columns of the assembled training frame (one-hot columns first, "target" added last).
df_train.columns

Index(['accident_lane_1', 'accident_lane_2', 'accident_lane_3',
       'simulation_run', 'is_accident_simulation', 'accident_lane',
       'section_travel_time_sec', 'delay_time_sec', 'avg_speed_kmph',
       'avg_speed_kmph_lag1', 'section_travel_time_sec_lag1',
       'delay_time_sec_lag1', 'prev_detector_avg_speed_kmph_lag3',
       'prev_detector_section_travel_time_sec_lag3',
       'prev_detector_delay_time_sec_lag3', 'avg_speed_kmph_lag2',
       'prev_detector_avg_speed_kmph_lag4', 'density_vehpkm',
       'prev_detector_delay_time_sec_lag2',
       'prev_detector_section_travel_time_sec_lag2',
       'section_travel_time_sec_lag2', 'delay_time_sec_lag2',
       'prev_detector_avg_speed_kmph_lag2',
       'prev_detector_delay_time_sec_lag4',
       'prev_detector_section_travel_time_sec_lag4',
       'section_travel_time_sec_lag3', 'delay_time_sec_lag3',
       'avg_speed_kmph_lag3', 'prev_detector_avg_speed_kmph_lag1',
       'prev_detector_delay_time_sec_lag1',
       'prev_d

In [14]:
# Model input columns: the one-hot lane indicators, the raw lane column, then the selected features.
# NOTE(review): the raw "accident_lane" column is listed next to its one-hot
# encoding - confirm that keeping both is intended.
FEATURE_COLUMNS = [
    "accident_lane_1",
    "accident_lane_2",
    "accident_lane_3",
    "accident_lane",
    *selected_features,
]

## Serialize Data

In [15]:
# Write the processed frames (features plus target) without the index column.
df_train.to_csv(
    path_or_buf=os.path.join(ROOT_DIRECTORY, "data/thesis_data/x_train_processed.csv"),
    index=False,
)
df_test.to_csv(
    path_or_buf=os.path.join(ROOT_DIRECTORY, "data/thesis_data/x_test_processed.csv"),
    index=False,
)

In [16]:
# Store the feature column names, one per line and without a trailing newline.
processed_feature_columns_path = os.path.join(
    ROOT_DIRECTORY, "data/thesis_data/processed_feature_columns.txt"
)
with open(processed_feature_columns_path, mode="w") as feature_file:
    print(*FEATURE_COLUMNS, sep="\n", end="", file=feature_file)