# Mapping the vegetation distribution in Norway using Random Forest
### Analysis pt. 1
Main workflow for reading the preprocessed data and setting up the experiments.

In [None]:
### Import libraries
# Matrix/vector handling
import numpy as np
import pandas as pd
# Scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold
# Own code
from lib import RandomForest as rf
from lib import Evaluation as evl
from lib import DataHandling as dat
from lib import Plotting as pl
# Plotting
import matplotlib.pyplot as plt

## 1. Read and partition data 

In [None]:
### Load the feature matrix and the target from the preprocessed pickle files.
### The inputs are used as stored: no scaling or transformation is applied here.
# NOTE: pickle files execute code on load; only read files from a trusted source.
X_cat = pd.read_pickle("./DataFiles/FeatureMatrixCats.pkl")
y_cat = pd.read_pickle("./DataFiles/TargetVTs.pkl")

### Construct final feature matrix
# Columns that must not be used as predictors
rmvars = ["x","y","plot_id","geology_norge1"]
X_cat_final = X_cat.drop(columns=rmvars)

# Show which predictors remain
print(X_cat_final.columns)

In [None]:
### Hold out a test set from the full feature matrix / target
from sklearn.model_selection import train_test_split

testSetRatio = 0.2    # Proportion of data that should end up in test set
seed = 77             # Random seed to make results reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X_cat_final,
    y_cat,
    test_size=testSetRatio,
    random_state=seed,
)

In [None]:
### Scikit Learn
# Random forest used for the single train/test-split experiment.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is
#   deprecated and rejected by recent scikit-learn releases.
# - random_state=seed makes the fitted forest (and its OOB score) reproducible,
#   using the seed defined together with the train/test split.
sklrf = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=15, max_features='sqrt',
                               bootstrap=True, oob_score=True, verbose=0, n_jobs=-1,
                               random_state=seed)

In [None]:
# Fit the forest on the training split
sklrf.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test split
y_pred = sklrf.predict(X_test)

In [None]:
# Out-of-bag score (available because the forest was built with oob_score=True)
print(sklrf.oob_score_)
# Impurity-based importance of each feature, in column order of X_train
print(sklrf.feature_importances_)
# Class labels known to the fitted forest
print(sklrf.classes_)
# Number of features seen during fit. The older `n_features_` attribute is
# deprecated/removed in recent scikit-learn releases; `n_features_in_` replaces it.
print(sklrf.n_features_in_)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every feature, evaluated on the held-out test split
r = permutation_importance(
    sklrf,
    X_test,
    y_test,
    n_repeats=5,
    random_state=0,
    n_jobs=-1,
)

In [None]:
# List features from most to least important, keeping only those whose mean
# importance exceeds zero by more than two standard deviations.
ranked_features = r.importances_mean.argsort()[::-1]
for feat_idx in ranked_features:
    mean_imp = r.importances_mean[feat_idx]
    std_imp = r.importances_std[feat_idx]
    if mean_imp - 2 * std_imp > 0:
        print(f"{X_cat_final.columns[feat_idx]:<20}{mean_imp:.3f} +/- {std_imp:.3f}")

## Compute cross-validation metrics

In [None]:
from sklearn.model_selection import KFold

### Create cross validation folds
# random_state=seed makes the shuffled folds reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
# KFold.split() returns a one-shot generator. Materialise it so the fold loop
# can be re-run without silently iterating over nothing.
split_idx = list(kf.split(X_cat_final))

### Initialize empty lists
# One fitted model per fold
MODELS = []
# Per-fold scores; the *_test lists hold metrics on each fold's held-out part
cat_acc_cv, prec_cv, rec_cv, f1_cv, auc_cv = [],[],[],[],[]
cat_acc_cv_test, prec_cv_test, rec_cv_test, f1_cv_test, auc_cv_test = [],[],[],[],[]

In [None]:
# Macro-averaged precision of the single-split model on the held-out test set
metrics.precision_score(y_test,y_pred,average='macro')

In [None]:
from sklearn import metrics

### Loop through folds: fit one forest per fold and collect its scores
for train_index, test_index in split_idx:

    ### Subset data to current fold
    X_train_cur = X_cat_final.iloc[train_index, :]
    y_train_cur = y_cat.iloc[train_index]

    X_test_cur = X_cat_final.iloc[test_index, :]
    y_test_cur = y_cat.iloc[test_index]

    ### Scikit Learn
    # max_features='sqrt' is what 'auto' meant for classifiers ('auto' is
    # deprecated/removed in recent scikit-learn); random_state=seed makes the
    # per-fold forests reproducible.
    # NOTE(review): max_depth is 18 here but 15 for the single-split forest
    # `sklrf` -- confirm the difference is intentional.
    rf_cur = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=18,
                                    max_features='sqrt', bootstrap=True, oob_score=True,
                                    verbose=0, n_jobs=-1, random_state=seed)
    rf_cur.fit(X_train_cur, y_train_cur)
    y_pred_cur = rf_cur.predict(X_test_cur)

    ### Out-of-bag score of this fold's forest (estimated on its training part)
    cat_acc_cv.append(rf_cur.oob_score_)

    ### Metrics on the held-out part of the fold (macro-averaged over classes)
    # AUC is not collected: it would need class scores (predict_proba), not the
    # hard labels in y_pred_cur.
    cat_acc_cv_test.append(metrics.accuracy_score(y_test_cur, y_pred_cur))
    prec_cv_test.append(metrics.precision_score(y_test_cur, y_pred_cur, average='macro'))
    rec_cv_test.append(metrics.recall_score(y_test_cur, y_pred_cur, average='macro'))
    f1_cv_test.append(metrics.f1_score(y_test_cur, y_pred_cur, average='macro'))

    # Keep the fitted model of every fold
    MODELS.append(rf_cur)
    


In [None]:
# Score the single-split forest `sklrf` on the held-out part of the last CV fold
# (X_test_cur / y_test_cur still hold the values from the final loop iteration).
# NOTE(review): `sklrf` was fitted on X_train, a random subset of the same
# X_cat_final the folds are drawn from, so this fold most likely contains rows
# the model was trained on -- the accuracy below is probably optimistic.
y_2 = sklrf.predict(X_test_cur)
metrics.accuracy_score(y_test_cur, y_2)

In [None]:
# Collect the per-fold held-out metrics into one table (one row per fold).
# Only the test-side metrics are exported; the out-of-bag scores in cat_acc_cv
# are not written to disk.
test_metrics = {
    "categorical_accuracy": cat_acc_cv_test,
    "precision": prec_cv_test,
    "recall": rec_cv_test,
    "f1_score": f1_cv_test,
}
df_test = pd.DataFrame(data=test_metrics)

# Persist for the later comparison against the neural-network results
df_test.to_csv("./Results/Tables/RF_testmetrics.csv", sep=',', index=False)

## Plot a tree

In [None]:
from sklearn import tree

# Draw the top of the first tree in the forest as an illustration.
# feature_names / class_names are passed as plain lists of strings: some
# scikit-learn releases reject a pandas Index / numpy array here, and class
# labels must be strings to be rendered.
fn = list(X_train.columns)
cn = [str(c) for c in sklrf.classes_]
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
# NOTE(review): recent scikit-learn releases validate plot_tree arguments;
# confirm the non-integer fontsize is accepted by the installed version.
t = tree.plot_tree(
    sklrf.estimators_[0],
    feature_names=fn,
    class_names=cn,
    filled=True,
    max_depth=1,       # only the root split and its children are drawn
    fontsize=3.5,
    precision=2,
    proportion=False,
    ax=axes,           # draw on the axes created above (explicit instead of gca())
)
#fig.savefig('Results/FigureFiles/RF_treeExample.png')

In [None]:
# Inspect what kind of object tree.plot_tree returned
type(t)

### Test collinearity

In [None]:
from scipy.stats import spearmanr
from scipy.cluster import hierarchy

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 10))

# Spearman rank correlation between all feature columns
corr = spearmanr(X_cat_final).correlation

# Hierarchical (Ward) clustering of the features.
# NOTE(review): ward() receives the correlation matrix itself, so each row is
# treated as an observation vector rather than as a distance (e.g. 1 - |corr|)
# -- confirm this is the intended notion of feature similarity.
corr_linkage = hierarchy.ward(corr)

# Left panel: dendrogram labelled with the feature names
dendro = hierarchy.dendrogram(
    corr_linkage,
    labels=X_cat_final.columns.tolist(),
    ax=ax1,
    leaf_rotation=90,
)
leaf_order = dendro['leaves']
leaf_labels = dendro['ivl']
dendro_idx = np.arange(0, len(leaf_labels))

# Right panel: correlation matrix with rows/columns reordered like the dendrogram
ax2.imshow(corr[np.ix_(leaf_order, leaf_order)])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(leaf_labels, rotation='vertical')
ax2.set_yticklabels(leaf_labels)

fig.tight_layout()
plt.show()

In [None]:
from collections import defaultdict

# Cut the feature dendrogram into flat clusters and group the column positions
# by cluster id.
# The second argument of fcluster is the distance threshold (here 1):
#   larger threshold  --> clusters are merged more aggressively (fewer, bigger
#                         groups, so fewer features are kept below)
#   smaller threshold --> features must be more similar to share a cluster
cluster_ids = hierarchy.fcluster(corr_linkage, 1, criterion='distance')
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)

# Keep one representative per cluster: the first column that landed in it
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]

X_train_sel = X_train.iloc[:, selected_features]
X_test_sel = X_test.iloc[:, selected_features]

# Refit a forest on the reduced feature set and report its test accuracy
clf_sel = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
clf_sel.fit(X_train_sel, y_train)
print(f"Accuracy on test data with features removed: {clf_sel.score(X_test_sel, y_test):.2f}")

In [None]:
# Distinct cluster labels produced by fcluster
set(cluster_ids)

In [None]:
# Column positions grouped per cluster
cluster_id_to_feature_ids.values()

In [None]:
# Show the feature names that ended up together in each cluster
for member_positions in cluster_id_to_feature_ids.values():
    cluster_columns = X_train.columns[member_positions]
    print(cluster_columns)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance for the forest trained on the reduced feature set.
# NOTE(review): n_jobs is hard-coded to 7 here while the other cells use -1.
r_sub = permutation_importance(
    clf_sel,
    X_test_sel,
    y_test,
    n_repeats=5,
    random_state=0,
    n_jobs=7,
)

In [None]:
# List the selected features from most to least important, keeping only those
# whose mean importance exceeds zero by more than two standard deviations.
ranked_sub = r_sub.importances_mean.argsort()[::-1]
for feat_idx in ranked_sub:
    mean_imp = r_sub.importances_mean[feat_idx]
    std_imp = r_sub.importances_std[feat_idx]
    if mean_imp - 2 * std_imp > 0:
        print(f"{X_train_sel.columns[feat_idx]:<20}{mean_imp:.3f} +/- {std_imp:.3f}")

## Read metrics, plot for comparison

In [None]:
# Random-forest metrics written by the cross-validation section of this notebook
data_rf = pd.read_csv('./Results/Tables/RF_testmetrics.csv')
# Neural-network metrics; this file is not written anywhere in this notebook
# (presumably produced by a separate analysis -- verify it exists before running)
data_nn = pd.read_csv('./Results/Tables/NN_testmetrics.csv')

In [None]:
# The RF table has no AUC column, so drop it from the NN table to compare the
# same four metrics. errors='ignore' keeps the cell re-runnable: without it a
# second execution fails with KeyError because 'auc' is already gone.
data_nn = data_nn.drop(columns=['auc'], errors='ignore')
# Tick labels for the comparison plot
labels = ['Accuracy', 'Precision', 'Recall', 'F1-score']

In [None]:
# Preview the first column of the NN metrics table
data_nn.iloc[:,0]

In [None]:
# Side-by-side box plots of the RF and NN metrics via the project's plotting
# helper; _save=False, so savename is passed but saving is switched off.
pl.metricsBoxPlot(
    data_rf,
    data_nn,
    ticks=labels,
    fig_size=(10,8),
    _save=False,
    savename="MetricsCompariRFvsNN.png",
    title='Comparison of classification metrics - 5 fold cv\n Random Forest vs. Deep Neural Network',
    name_data_1='RandomForest',
    name_data_2='DeepNN',
)

In [None]:
# Reload the project's plotting module so edits to it are picked up without
# restarting the kernel
import importlib
importlib.reload(pl)

In [None]:
# Scratch cell: number of rows and shape of the NN metrics table
print(len(data_nn))
print(data_nn.shape)
# One value per row: 2*i - 0.4 (presumably box positions for the plotting
# helper -- verify against lib/Plotting)
np.array(range(len(data_nn)))*2.0-0.4

In [None]:
# Quick look at the NN metrics as plain matplotlib box plots
plt.boxplot(data_nn)