## Packages

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from warnings import filterwarnings
from pprint import pprint
import gc
filterwarnings("ignore")
%matplotlib inline

## Global Parameters

In [2]:
ROOT_DIRECTORY = "/home/kaan.aytekin/Thesis"

## UDFs

In [3]:
def sample_from_array(array,freq):
    array_size = len(array)
    sample_size = int(np.ceil(array_size*freq))
    array_slicer = np.zeros(array_size)
    test_index =  np.random.choice(range(0,array_size),size=sample_size,replace=False)
    array_slicer[test_index] = 1
    return array[array_slicer.astype(bool)]


def simulation_based_train_test_split(df, test_size=0.2, seed=None):
    """
    Splits {df} into train and test datasets by their simulation-type with given {test_size}
    """
    if seed:
        np.random.seed(seed)
    unique_simulation_combinations = (
        df[non_feature_columns[:-4]].drop_duplicates().reset_index(drop=True)
    )
    test_simulation_runs = (
        unique_simulation_combinations.groupby(
            ["is_accident_simulation", "accident_lane", "connected_vehicle_ratio"]
        )
        .simulation_run.unique()
        .apply(lambda x: sample_from_array(x, freq=test_size))
    )
    test_simulation_runs = test_simulation_runs.reset_index()

    complete_test_index = []
    for row in test_simulation_runs.itertuples():
        current_test_index = (
            (df.is_accident_simulation == row.is_accident_simulation)
            & (df.accident_lane == row.accident_lane)
            & (df.simulation_run.isin(row.simulation_run))
        )
        if len(complete_test_index):
            complete_test_index = complete_test_index | current_test_index
        else:
            complete_test_index = current_test_index
    train_index = ~complete_test_index
    test_index = complete_test_index
    #df_train = df[~complete_test_index].reset_index(drop=True)
    #df_test = df[complete_test_index].reset_index(drop=True)
    return train_index, test_index #df_train, df_test

## Data Loading

In [4]:
feature_engineered_data_path = os.path.join(ROOT_DIRECTORY,"data/thesis_data/feature_engineered_data.csv")
df = pd.read_csv(feature_engineered_data_path)
#df = df.replace(np.Inf,9999999)
# Rephrasing the problem with another target?
df["target_delay_time_diff"] = df["target_delay_time"] - df["delay_time_sec"]
df_columns = list(df.columns)
# Non-feature columns
non_feature_columns = ["simulation_run", "connected_vehicle_ratio", "is_accident_simulation", 
                       "accident_location", "accident_start_time", 
                       "accident_duration", "accident_lane", 
                       "prev_detector_detector_number","next_detector_detector_number",
                       "detector_number", "timestamp"
]
target_columns = ["target_delay_time", "target_delay_time_diff"]
feature_columns = [column for column in df_columns if column not in non_feature_columns + target_columns]
feature_columns = ["time_after_accident_started", "distance_to_accident"] + [column for column in feature_columns if column not in ["time_after_accident_started", "distance_to_accident"]]
# Reorder the data
df = df[non_feature_columns + feature_columns + target_columns]

In [5]:
feature_list_path = os.path.join(ROOT_DIRECTORY,f"data/thesis_data/feature_names_list.txt")
with open(feature_list_path,"w+") as writer:
    writer.write('\n'.join(feature_columns))

## Train/Test Split

In [6]:
train_index, test_index = simulation_based_train_test_split(df, test_size=0.2, seed=5)
df_train = df[train_index]
df_test = df[test_index]

x_train = df_train[non_feature_columns + feature_columns]
y_train = df_train[["target_delay_time"]]

x_test = df_test[non_feature_columns + feature_columns]
y_test = df_test[["target_delay_time"]]

del df, df_train, df_test
gc.collect()

98

In [7]:
x_train.shape, x_test.shape

((644740, 179), (2354052, 179))

## Serialize

In [8]:
for df_to_serialize,df_name in zip([x_train,y_train,x_test,y_test],["x_train","y_train","x_test","y_test"]):
    csv_path = os.path.join(ROOT_DIRECTORY,f"data/thesis_data/{df_name}.csv")
    df_to_serialize.to_csv(csv_path,index=False)