In [3]:
from sklearn.feature_selection import mutual_info_regression
import os
import pandas as pd
import numpy as np
import pprint
import seaborn as sns
from matplotlib import pyplot as plt

In [5]:
# Project root that every data path in this notebook is joined onto.
# Set the THESIS_ROOT_DIRECTORY environment variable to run on another machine
# (e.g. "/Users/kaan.aytekin/Desktop/Kaan/Thesis") instead of editing this cell;
# when the variable is unset the original default below is used.
ROOT_DIRECTORY = os.environ.get("THESIS_ROOT_DIRECTORY", "/home/kaan.aytekin/Thesis")

## Data Loading

In [6]:
# Read the ordered list of feature names (one name per line) used to select model inputs.
feature_list_path = os.path.join(ROOT_DIRECTORY, "data/thesis_data/feature_names_list.txt")
with open(feature_list_path, "r") as reader:
    # splitlines() plus the blank-line filter prevents a trailing newline in the file
    # from producing an empty "" feature name, which would raise a KeyError when the
    # list is later used to select DataFrame columns.
    feature_columns = [line.strip() for line in reader.read().splitlines() if line.strip()]

In [3]:
# Load the train/test splits, keeping only the columns named in feature_columns, and
# extract the regression target from each y file as a 1-D NumPy array.
# Series.to_numpy() is used instead of Series.ravel(), which recent pandas deprecates.
x_train = pd.read_csv(os.path.join(ROOT_DIRECTORY,"data/thesis_data/x_train.csv"))[feature_columns]
y_train = pd.read_csv(os.path.join(ROOT_DIRECTORY,"data/thesis_data/y_train.csv")).target_delay_time.to_numpy()
# Fail early with a clear message if features and targets are misaligned.
assert len(x_train) == len(y_train), "x_train and y_train have different row counts"

# Training features and target in one frame (consumed by the correlation plot).
df_train = x_train.copy()
df_train["target"] = y_train

x_test = pd.read_csv(os.path.join(ROOT_DIRECTORY,"data/thesis_data/x_test.csv"))[feature_columns]
y_test = pd.read_csv(os.path.join(ROOT_DIRECTORY,"data/thesis_data/y_test.csv")).target_delay_time.to_numpy()
assert len(x_test) == len(y_test), "x_test and y_test have different row counts"

In [4]:
# Label discrete feature(s)
# Build a boolean mask over feature_columns that is True only for the discrete feature;
# it is passed to mutual_info_regression as `discrete_features`.
# list(...).index works whether feature_columns is a plain list (as loaded from the
# text file) or a pandas Index; the previous `.get_loc` call only exists on an Index.
# A ValueError is raised if the column is missing.
discrete_feature_index = list(feature_columns).index("is_accident_timestamp")
discrete_feature_index_array = np.zeros(len(feature_columns), dtype=bool)
discrete_feature_index_array[discrete_feature_index] = True

In [5]:
# Display the dimensions of the training feature frame as (rows, columns).
x_train.shape

(474807, 168)

## Data Sampling

In [6]:
# Fixed seed so the subsample is reproducible.
seed = 500
# Work on a 20,000-row random subsample of the training features.
x_train_sample = x_train.sample(n=20_000,random_state=seed)
# y_train is a NumPy array, so it must be indexed by position. Translate the sampled
# row labels into positions explicitly rather than relying on x_train carrying a
# default 0..n-1 index (for which labels and positions happen to coincide).
sample_positions = x_train.index.get_indexer(x_train_sample.index)
y_train_sample = y_train[sample_positions]

## Calculate Mutual Information

In [7]:
# Computationally expensive
# Estimate the mutual information between every sampled feature and the target.
# `discrete_features` marks which columns are discrete. `random_state` is fixed
# because the estimator adds small random noise to continuous variables; without
# it the scores (and the resulting ranking) change from run to run.
feature_mutual_informations = mutual_info_regression(
    x_train_sample,
    y_train_sample,
    discrete_features=discrete_feature_index_array,
    random_state=seed,
)

In [8]:
# Positions of the features ordered from highest to lowest mutual information.
mutual_information_descending_importance_index = np.argsort(feature_mutual_informations)[::-1]
# A plain Python list cannot be indexed with an integer array, so view the names as an
# object array first (dtype=object keeps the elements as ordinary Python strings).
feature_name_array = np.asarray(feature_columns, dtype=object)
# (feature name, mutual information) pairs, most informative first.
labeled_sorted_feature_mutual_informations = list(
    zip(
        feature_name_array[mutual_information_descending_importance_index],
        feature_mutual_informations[mutual_information_descending_importance_index],
    )
)

In [9]:
# Pretty-print the (feature name, mutual information) pairs, highest score first.
pprint.pprint(labeled_sorted_feature_mutual_informations)

[('section_travel_time_sec', 0.6592846285418261),
 ('delay_time_sec', 0.6591475516718184),
 ('avg_speed_kmph', 0.6573405685591958),
 ('avg_speed_kmph_lag1', 0.45575133160180314),
 ('section_travel_time_sec_lag1', 0.45543609123517026),
 ('delay_time_sec_lag1', 0.4550983482140314),
 ('prev_detector_avg_speed_kmph_lag3', 0.38692412018222555),
 ('prev_detector_section_travel_time_sec_lag3', 0.380536681409823),
 ('prev_detector_delay_time_sec_lag3', 0.3797768915581443),
 ('avg_speed_kmph_lag2', 0.36880068111145103),
 ('prev_detector_avg_speed_kmph_lag4', 0.3636199987887849),
 ('density_vehpkm', 0.36313145449743267),
 ('prev_detector_delay_time_sec_lag2', 0.3581387307647548),
 ('prev_detector_section_travel_time_sec_lag2', 0.3579961577779347),
 ('section_travel_time_sec_lag2', 0.3547419391830644),
 ('delay_time_sec_lag2', 0.3542522726229045),
 ('prev_detector_avg_speed_kmph_lag2', 0.3518340870223371),
 ('prev_detector_delay_time_sec_lag4', 0.3502123095668104),
 ('prev_detector_section_travel

## Calculate Feature~Target Correlation

## Correlation Plot

In [None]:
sns.set_theme(style="white")

# Compute the pairwise correlation matrix of all features plus the target.
# list(...) makes the concatenation work whether feature_columns is a list or a pandas Index.
correlation_matrix = df_train[list(feature_columns) + ["target"]].corr()

# Mask the upper triangle (including the diagonal): the matrix is symmetric, so only
# the lower triangle is drawn.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Set up the matplotlib figure; it is very large so that every feature gets its own cell.
f, ax = plt.subplots(figsize=(100, 60))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio, explicitly on the Axes
# created above instead of relying on matplotlib's implicit "current axes".
sns.heatmap(correlation_matrix, mask=mask,
            cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            ax=ax)
ax.set_title("Pairwise correlation of features and target");

In [12]:
# Copy of the sampled features with the sampled target appended as a last column
# named "target"; x_train_sample itself is left untouched.
x_train_sample_extended = x_train_sample.assign(target=y_train_sample)

In [24]:
# Correlation of every feature with the target; the target's self-correlation is
# removed by label rather than by assuming it is the last row.
feature_correlation_information = x_train_sample_extended.corr()["target"].drop("target")

# Rank by absolute correlation, strongest first. The ranking is done on plain NumPy
# arrays: indexing a label-indexed Series with integer positions is deprecated in
# pandas, and a Python list of names cannot be indexed with an array at all.
correlation_values = feature_correlation_information.to_numpy()
absolute_correlation_values = np.abs(correlation_values)
# An undefined (NaN) correlation is ranked below every real value instead of being
# sorted to the top of the descending order.
ranking_key = np.where(np.isnan(absolute_correlation_values), -1.0, absolute_correlation_values)
correlation_descending_importance_index = np.argsort(ranking_key)[::-1]

# (feature name, signed correlation) pairs, strongest absolute correlation first.
# Names come from the correlation Series' own index so names and values cannot drift apart.
correlation_feature_names = feature_correlation_information.index.to_numpy()
labeled_sorted_feature_correlation_informations = list(
    zip(
        correlation_feature_names[correlation_descending_importance_index],
        correlation_values[correlation_descending_importance_index],
    )
)

In [26]:
# Pretty-print the (feature name, correlation with target) pairs in ranked order.
pprint.pprint(labeled_sorted_feature_correlation_informations)

[('delay_time_sec', 0.8317583239701359),
 ('section_travel_time_sec', 0.8317583239701358),
 ('delay_time_sec_lag1', 0.7233034308133626),
 ('section_travel_time_sec_lag1', 0.7233034308133625),
 ('section_travel_time_sec_lag3', 0.6879271253873855),
 ('delay_time_sec_lag3', 0.6879271253873855),
 ('section_travel_time_sec_lag4', 0.6817676740353644),
 ('delay_time_sec_lag4', 0.6817676740353642),
 ('delay_time_sec_lag5', 0.6791516096845567),
 ('section_travel_time_sec_lag5', 0.6791516096845566),
 ('delay_time_sec_lag2', 0.6773836771700158),
 ('section_travel_time_sec_lag2', 0.6773836771700157),
 ('next_detector_delay_time_sec_lag6', 0.6253425988876425),
 ('next_detector_section_travel_time_sec_lag6', 0.625342598887642),
 ('delay_time_sec_lag6', 0.6187852283681293),
 ('section_travel_time_sec_lag6', 0.6187852283681291),
 ('delay_time_sec_lag7', 0.6102088535215989),
 ('section_travel_time_sec_lag7', 0.6102088535215988),
 ('next_detector_section_travel_time_sec_lag5', 0.607583723301211),
 ('nex

## Serialize

In [76]:
def _build_feature_order_df(labeled_sorted_values):
    """Turn ranked (feature, value) pairs into a frame with a 1-based rank column.

    Parameters
    ----------
    labeled_sorted_values : list of (str, float) tuples
        Pairs already sorted from most to least important.

    Returns
    -------
    pandas.DataFrame
        Columns "feature", "value" and "order", where "order" is the 1-based
        position of the row in the input.
    """
    # Passing the column names to the constructor (instead of assigning `.columns`
    # afterwards) also yields a well-formed empty frame when the input is empty.
    feature_order_df = pd.DataFrame(labeled_sorted_values, columns=["feature", "value"])
    feature_order_df["order"] = feature_order_df.index + 1
    return feature_order_df


# Build both rankings with the same helper so their layouts cannot diverge.
mutual_information_feature_order_df = _build_feature_order_df(labeled_sorted_feature_mutual_informations)
correlation_feature_order_df = _build_feature_order_df(labeled_sorted_feature_correlation_informations)

# Persist the rankings as CSV files (without the index column) under the thesis data directory.
mutual_information_feature_order_df.to_csv(os.path.join(ROOT_DIRECTORY,"data/thesis_data/mutual_information_feature_orders.csv"),index=False)
correlation_feature_order_df.to_csv(os.path.join(ROOT_DIRECTORY,"data/thesis_data/correlation_feature_orders.csv"),index=False)