In [None]:
#install.packages("RRF")
#install.packages("data.table")

In [1]:
library(data.table)
library(RRF);set.seed(1)

RRF 1.9.1

Type rrfNews() to see new features/changes/bug fixes.



In [2]:
ROOT_DIRECTORY = "/home/kaan.aytekin/Thesis"
#ROOT_DIRECTORY = "/Users/kaan.aytekin/Desktop/Kaan/Thesis"

## Load Data

In [86]:
feature_list_path <- file.path(ROOT_DIRECTORY,"data/thesis_data/feature_names_list.txt")
feature_columns <- as.character(read.csv(feature_list_path,header = FALSE)[,1])
#feature_columns <- feature_columns[,1]
#feature_columns <- as.character(feature_columns)

In [88]:
feature_engineered_data_path = file.path(ROOT_DIRECTORY,"data/thesis_data/feature_engineered_data.csv")
x_train_data_path <- file.path(ROOT_DIRECTORY,"data/thesis_data/x_train.csv")
y_train_data_path <- file.path(ROOT_DIRECTORY,"data/thesis_data/y_train.csv")

x_train <- data.table::fread(x_train_data_path)[,..feature_columns]
#x_train <- x_train[,..feature_columns]
y_train <- data.table::fread(y_train_data_path)$target_delay_time


In [4]:
set.seed(500)
sample_index <- sample(1:nrow(x_train),20000,replace = FALSE)
x_train_sample <- x_train[c(sample_index)]
y_train_sample <- y_train[c(sample_index)]

In [6]:
nrow(x_train_sample)
nrow(y_train_sample)

NULL

## Standard Random Forest Feature Importance

In [7]:
# Standard Random Forest model
rf_model <- RRF(x=x_train_sample, y=y_train_sample, flagReg = 0)

In [11]:
rf_model_importance <- rf_model$importance
rf_model_importance <- rf_model_importance[,"IncNodePurity"]

In [14]:
rf_model_importance

In [92]:
sorted_rf_model_importance <- rf_model_importance[order(rf_model_importance,decreasing=TRUE)]
rf_model_importance_df <- as.data.table(sorted_rf_model_importance, keep.rownames=TRUE)
setnames(rf_model_importance_df,c("feature","value"))
rf_model_importance_df$order = 1:nrow(rf_model_importance_df)

In [93]:
head(rf_model_importance_df)

feature,value,order
<chr>,<dbl>,<int>
section_travel_time_sec,7124881,1
delay_time_sec,6708992,2
avg_speed_kmph,4010971,3
delay_time_sec_lag1,1822658,4
section_travel_time_sec_lag1,1624110,5
avg_speed_kmph_lag1,1052913,6


## Regularized Random Forest Feature Importance

In [19]:
#Regularized Random Forest Model
rf_model_importance_normalized <- rf_model_importance/(max(rf_model_importance))#normalize the importance score
gamma <- 0.5
regularization_coeff <- (1-gamma)+gamma*rf_model_importance_normalized #weighted average

In [20]:
rrf_model <- RRF(x=x_train_sample, y=y_train_sample, coefReg=regularization_coeff, flagReg = 1)

In [22]:
# Feature Importances
rrf_model_importance <- rrf_model$importance
rrf_model_importance <- rrf_model_importance[,"IncNodePurity"]

In [89]:
sorted_rrf_model_importance <- rrf_model_importance[order(rrf_model_importance,decreasing=TRUE)]

In [90]:
rrf_model_importance_df <- as.data.table(sorted_rrf_model_importance, keep.rownames=TRUE)
setnames(rrf_model_importance_df,c("feature","value"))
rrf_model_importance_df$order = 1:nrow(rrf_model_importance_df)

In [94]:
head(rrf_model_importance_df)

feature,value,order
<chr>,<dbl>,<int>
delay_time_sec,7398172.6,1
section_travel_time_sec,6533061.3,2
avg_speed_kmph,4033253.4,3
section_travel_time_sec_lag1,1840471.2,4
delay_time_sec_lag1,1372206.3,5
avg_speed_kmph_lag1,925088.7,6


## Serialize

In [98]:
write.csv(rf_model_importance_df,file.path(ROOT_DIRECTORY,"data/thesis_data/random_forest_feature_orders.csv"),quote=FALSE,row.names=FALSE)
write.csv(rrf_model_importance_df,file.path(ROOT_DIRECTORY,"data/thesis_data/regularized_random_forest_feature_orders.csv"),quote=FALSE,row.names=FALSE)