In [1]:
import os
import pandas as pd
import pprint
from IPython.display import display, Markdown

In [2]:
def printmd(string, color=None):
    """Render ``string`` as Markdown in the notebook output, optionally colored.

    Parameters
    ----------
    string : str
        Markdown (or inline HTML) text to render.
    color : str, optional
        CSS color value applied through an inline ``<span>``. When omitted,
        the text is rendered unwrapped instead of emitting the invalid CSS
        declaration ``color:None``.
    """
    if color is None:
        markdown_text = str(string)
    else:
        markdown_text = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(markdown_text))

In [3]:
# Root of the thesis project tree; the data paths in this notebook are built
# relative to it. Set the THESIS_ROOT_DIRECTORY environment variable to run on
# another machine (e.g. "/Users/kaan.aytekin/Desktop/Kaan/Thesis") without
# editing this cell; the original server path remains the default.
ROOT_DIRECTORY = os.environ.get("THESIS_ROOT_DIRECTORY", "/home/kaan.aytekin/Thesis")

## Load Data

In [4]:
# Locations of the four per-method feature rankings (one CSV per method).
mi_feat_data_path = os.path.join(
    ROOT_DIRECTORY, "data/thesis_data/mutual_information_feature_orders.csv"
)
corr_feat_data_path = os.path.join(
    ROOT_DIRECTORY, "data/thesis_data/correlation_feature_orders.csv"
)
rf_feat_data_path = os.path.join(
    ROOT_DIRECTORY, "data/thesis_data/random_forest_feature_orders.csv"
)
rrf_feat_data_path = os.path.join(
    ROOT_DIRECTORY, "data/thesis_data/regularized_random_forest_feature_orders.csv"
)

# Read every ranking into its own DataFrame, in the same order as the paths above.
(
    mutual_information_feature_importance_df,
    correlation_feature_importance_df,
    random_forest_feature_importance_df,
    regularized_random_forest_feature_importance_df,
) = (
    pd.read_csv(mi_feat_data_path),
    pd.read_csv(corr_feat_data_path),
    pd.read_csv(rf_feat_data_path),
    pd.read_csv(rrf_feat_data_path),
)

In [5]:
# Label every row with the ranking method it came from, so the frames can be
# stacked and later pivoted on this "source" column.
for ranking_df, source_label in (
    (mutual_information_feature_importance_df, "mutual_importance"),
    (correlation_feature_importance_df, "correlation"),
    (random_forest_feature_importance_df, "rf_feature_importance"),
    (regularized_random_forest_feature_importance_df, "rrf_feature_importance"),
):
    ranking_df["source"] = source_label

In [6]:
# Preview the best- and worst-ranked features of each method under a green heading.
for heading, ranking_df in (
    ("**Mutual Information**", mutual_information_feature_importance_df),
    ("**Correlation**", correlation_feature_importance_df),
    ("**Random Forest**", random_forest_feature_importance_df),
    ("**Regularized Random Forest**", regularized_random_forest_feature_importance_df),
):
    printmd(heading, color="green")
    display(ranking_df.head(), ranking_df.tail())

<span style='color:green'>**Mutual Information**</span>

Unnamed: 0,feature,value,order,source
0,section_travel_time_sec,0.659285,1,mutual_importance
1,delay_time_sec,0.659148,2,mutual_importance
2,avg_speed_kmph,0.657341,3,mutual_importance
3,avg_speed_kmph_lag1,0.455751,4,mutual_importance
4,section_travel_time_sec_lag1,0.455436,5,mutual_importance


Unnamed: 0,feature,value,order,source
163,next_detector_flow_vehph_lag6,0.046786,164,mutual_importance
164,next_detector_flow_vehph_lag9,0.04324,165,mutual_importance
165,distance_to_accident,0.033427,166,mutual_importance
166,time_after_accident_started,0.011453,167,mutual_importance
167,is_accident_timestamp,0.00597,168,mutual_importance


<span style='color:green'>**Correlation**</span>

Unnamed: 0,feature,value,order,source
0,delay_time_sec,0.831758,1,correlation
1,section_travel_time_sec,0.831758,2,correlation
2,delay_time_sec_lag1,0.723303,3,correlation
3,section_travel_time_sec_lag1,0.723303,4,correlation
4,section_travel_time_sec_lag3,0.687927,5,correlation


Unnamed: 0,feature,value,order,source
163,next_detector_flow_vehph_lag9,0.065011,164,correlation
164,next_detector_flow_vehph_lag7,0.064051,165,correlation
165,next_detector_flow_vehph_lag10,0.061084,166,correlation
166,is_accident_timestamp,0.051727,167,correlation
167,distance_to_accident,-0.051723,168,correlation


<span style='color:green'>**Random Forest**</span>

Unnamed: 0,feature,value,order,source
0,section_travel_time_sec,7124881.0,1,rf_feature_importance
1,delay_time_sec,6708992.0,2,rf_feature_importance
2,avg_speed_kmph,4010971.0,3,rf_feature_importance
3,delay_time_sec_lag1,1822658.0,4,rf_feature_importance
4,section_travel_time_sec_lag1,1624110.0,5,rf_feature_importance


Unnamed: 0,feature,value,order,source
163,flow_vehph_lag6,9142.837747,164,rf_feature_importance
164,flow_vehph_lag9,8622.276305,165,rf_feature_importance
165,prev_detector_section_travel_time_sec_lag6,8202.162755,166,rf_feature_importance
166,prev_detector_delay_time_sec_lag6,8125.234508,167,rf_feature_importance
167,is_accident_timestamp,2495.723727,168,rf_feature_importance


<span style='color:green'>**Regularized Random Forest**</span>

Unnamed: 0,feature,value,order,source
0,delay_time_sec,7398173.0,1,rrf_feature_importance
1,section_travel_time_sec,6533061.0,2,rrf_feature_importance
2,avg_speed_kmph,4033253.0,3,rrf_feature_importance
3,section_travel_time_sec_lag1,1840471.0,4,rrf_feature_importance
4,delay_time_sec_lag1,1372206.0,5,rrf_feature_importance


Unnamed: 0,feature,value,order,source
163,prev_detector_avg_speed_kmph_lag6,9514.557003,164,rrf_feature_importance
164,flow_vehph_lag7,9210.54846,165,rrf_feature_importance
165,prev_detector_section_travel_time_sec_lag5,7594.364747,166,rrf_feature_importance
166,prev_detector_section_travel_time_sec_lag7,5404.750878,167,rrf_feature_importance
167,is_accident_timestamp,953.132423,168,rrf_feature_importance


In [7]:
# Stack the four per-method rankings into one long frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) yields the same rows and index.
feature_importance_df = pd.concat(
    [
        mutual_information_feature_importance_df,
        correlation_feature_importance_df,
        random_forest_feature_importance_df,
        regularized_random_forest_feature_importance_df,
    ],
    ignore_index=True,
)

In [8]:
# Sort keys in priority order: ties on an earlier method's rank are broken by the next.
rank_sort_priority = [
    "mutual_importance",
    "rrf_feature_importance",
    "rf_feature_importance",
    "correlation",
]

# Reshape long -> wide: one row per feature, one column per ranking method,
# each cell holding the feature's "order" under that method.
feature_importance_order_df = (
    feature_importance_df
    .pivot(index="feature", columns="source", values="order")
    .sort_values(by=rank_sort_priority)
)

In [9]:
# Turn the "feature" index back into a regular column and clear the "source"
# label that pivot() leaves on the columns axis.
# rename_axis replaces `del df.columns.name`: in recent pandas releases
# Index.name is a property without a deleter, so the `del` statement fails there.
feature_importance_order_df = (
    feature_importance_order_df
    .reset_index()
    .rename_axis(None, axis="columns")
)

## Serialize Data

In [10]:
feature_importance_order_data_path = os.path.join(ROOT_DIRECTORY,"data/thesis_data/feature_importance_orders.csv")
# Cache semantics: a previously saved table takes precedence over the one just
# computed above; delete the CSV to force a refresh. The freshly computed table
# is written only when the saved file is missing, unreadable, empty or malformed.
# Only those failures are caught, so KeyboardInterrupt and genuine programming
# errors are no longer swallowed by a bare `except:`.
try:
    feature_importance_order_df = pd.read_csv(feature_importance_order_data_path)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    feature_importance_order_df.to_csv(feature_importance_order_data_path, index=False)

In [11]:
# Keep a feature when at least one method ranks it below 30.
# NOTE(review): the comparison is strict, so with ranks starting at 1 this
# selects ranks 1-29 — confirm whether rank 30 was meant to be included.
rank_columns = [
    "correlation",
    "mutual_importance",
    "rf_feature_importance",
    "rrf_feature_importance",
]
ranked_high_by_any_method = feature_importance_order_df[rank_columns].lt(30).any(axis=1)
top_features_df = feature_importance_order_df[ranked_high_by_any_method].reset_index(drop=True)

In [12]:
# Show the features selected by at least one ranking method, with their rank under each.
display(top_features_df)

Unnamed: 0,feature,correlation,mutual_importance,rf_feature_importance,rrf_feature_importance
0,section_travel_time_sec,2,1,1,2
1,delay_time_sec,1,2,2,1
2,avg_speed_kmph,46,3,3,3
3,avg_speed_kmph_lag1,53,4,6,6
4,section_travel_time_sec_lag1,4,5,5,4
5,delay_time_sec_lag1,3,6,4,5
6,prev_detector_avg_speed_kmph_lag3,90,7,71,74
7,prev_detector_section_travel_time_sec_lag3,93,8,62,65
8,prev_detector_delay_time_sec_lag3,94,9,63,69
9,avg_speed_kmph_lag2,56,10,18,20


In [13]:
# Persist the selected features as comma-separated text (note the .txt extension),
# without writing the row index.
top_features_data_path = os.path.join(
    ROOT_DIRECTORY,
    "data/thesis_data/top_features.txt",
)
top_features_df.to_csv(path_or_buf=top_features_data_path, index=False)