# Custom Random Forest implementation
#### This script applies a customized random forest (RF) algorithm to the (in)famous "Iris" data set. Results are compared to the scikit-learn RF implementation. 
## 1. Load data, test functionality by calculating the confusion matrix and metrics

In [None]:
### Import libraries
# Matrix/vector handling
import numpy as np
import pandas as pd
# Scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold
# Own code
from lib import RandomForest as rf
from lib import Evaluation as evl
from lib import DataHandling as dat
from lib import Plotting as pl
# Plotting
import matplotlib.pyplot as plt

In [None]:
### Fetch the Iris data with 20 percent held out as test set
iris_split = dat.getIrisData(test_fraction=0.2)
X_train, X_test, y_train, y_test, X_labs, y_labs = iris_split

### Build the custom forest and fit it on the training part
irisRF = rf.RandomForest(
    num_trees=500,
    max_depth=10,
)
irisRF.growForest(
    X_train,
    y_train,
    feature_names=X_labs,
    target_names=y_labs,
)

# predict() yields a pair; only its first element (the predictions) is used
y_pred, _ = irisRF.predict(X_test)

In [None]:
### Confusion matrix and summary metrics for the held-out predictions

# Map every class id (as a string) to its label, for readable row/column names.
# Class ids are collected from train and test targets together, in sorted order.
all_targets = np.concatenate([y_train, y_test])
class_ids = sorted(set(all_targets))
y_strings = list(map(str, class_ids))
class_dict = dict(zip(y_strings, y_labs))

# Compute the matrix together with the overall metrics
conf_mat, overall_metrics = evl.confMatMetrics(
    y_test,
    y_pred,
    class_dict=class_dict,
)

# Show both results
print(conf_mat)
print(overall_metrics)

In [None]:
### Print both result tables as LaTeX source
conf_mat_caption = "Confusion matrix for the classification results on random 20 percent of the training data."
metrics_caption = "Chosen metrics for the classification results on random 20 percent of the training data."

# Same order as before: confusion matrix first, then the overall metrics
for table, caption in ((conf_mat, conf_mat_caption), (overall_metrics, metrics_caption)):
    print(table.to_latex(caption=caption))

## 2. Use cross validation to compare to scikit-learn

In [None]:
### Reload data without subsetting
# Without test_fraction the call is unpacked into four values:
# features, targets, feature names and target names.
X, y, X_labs, y_labs = dat.getIrisData()

### Create cross validation folds
# NOTE: shuffle=True without a fixed random_state, so the fold assignment
# (and therefore every score below) differs from run to run.
kf = KFold(n_splits=10, shuffle=True)
split_idx = kf.split(X)

### Initialize vectors to store performance metrics (one entry per fold)
custom_test_acc = []
custom_test_f1 = []
sklearn_test_acc = []
sklearn_test_f1 = []
custom_oob = []
sklearn_oob = []

### Loop through folds
for train_index, test_index in split_idx:

    # Subset data
    cur_X_train = X[train_index, :]
    cur_y_train = y[train_index]

    cur_X_test = X[test_index, :]
    cur_y_test = y[test_index]

    ### Scikit Learn
    # max_features='sqrt' replaces the former 'auto': for classifiers 'auto'
    # meant sqrt(n_features), and newer scikit-learn releases no longer accept
    # 'auto'. 'sqrt' gives the same forest on old and new versions.
    sklrf = RandomForestClassifier(
        n_estimators=500,
        criterion='gini',
        max_depth=10,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        verbose=0,
    )
    sklrf.fit(cur_X_train, cur_y_train)
    skl_y_pred = sklrf.predict(cur_X_test)

    ### Own implementation
    custrf = rf.RandomForest(
        num_trees=500,
        max_depth=10,
        task='classification',
        split_method='gini',
        verbose=False,
    )
    custrf.growForest(cur_X_train, cur_y_train, feature_names=X_labs, target_names=y_labs)
    # predict() returns a pair; only the predicted labels are needed here
    cust_y_pred, _ = custrf.predict(cur_X_test)

    ### CALCULATE SCORES :)
    # OOB: the custom forest reports calcOobError() while scikit-learn exposes
    # oob_score_. NOTE(review): presumably an error rate vs. a score, so the
    # two lists are not directly comparable until one of them is converted.
    custom_oob.append(custrf.calcOobError())
    sklearn_oob.append(sklrf.oob_score_)

    # Accuracy (same helper for both models so the numbers are comparable)
    custom_test_acc.append(evl.accuracy(cur_y_test, cust_y_pred))
    sklearn_test_acc.append(evl.accuracy(cur_y_test, skl_y_pred))

    # F1-score, weighted by class support
    custom_test_f1.append(metrics.f1_score(cur_y_test, cust_y_pred, average='weighted'))
    sklearn_test_f1.append(metrics.f1_score(cur_y_test, skl_y_pred, average='weighted'))

    
# Reference only: the full RandomForestClassifier keyword list, kept as a bare
# string literal. The constructor call inside the string is never run.
# NOTE(review): this mirrors the scikit-learn signature at the time of writing;
# some keywords (e.g. max_features='auto', min_impurity_split) may not be
# accepted by newer releases - verify against the installed version before reuse.
'''
Sklearn RF parameters:

sklrf = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=10, min_samples_split=2,\
                               min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',\
                               max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True,\
                               oob_score=True, n_jobs=None, random_state=None, verbose=0, warm_start=False,\
                               class_weight=None, ccp_alpha=0.0, max_samples=None)
'''

In [None]:
# Assemble the per-fold metrics for both implementations.
# The custom OOB values are flipped with 1 - value before being plotted
# under the 'OOB-score' tick next to the scikit-learn OOB values.
custom_oob_as_score = [1 - oob_err for oob_err in custom_oob]
data_cust = [
    custom_oob_as_score,
    custom_test_acc,
    custom_test_f1,
]
data_skl = [
    sklearn_oob,
    sklearn_test_acc,
    sklearn_test_f1,
]
ticks = ['OOB-score', 'Accuracy', 'F1-score']

# Draw the side-by-side box plot (not written to disk: _save=False)
pl.metricsBoxPlot(
    data_cust,
    data_skl,
    ticks,
    fig_size=(10,8),
    _save=False,
    savename="Box_SklVsOwn_AccF1oob.png",
)