# Validate Random Forest classifier on external validation set

## Read data 

### Read the training set

In [None]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Suffix of the feature file to load; also used for the validation file.
time='80_100'
# Read the training features into a pandas dataframe
features_train = pd.read_csv('../features/features_training2/features_{}.csv'.format(time))
# Raw values as loaded, captured before any NaN replacement
features_train_num=features_train.to_numpy()
# Replace NaN/inf in the numeric columns only. Calling np.nan_to_num on the
# whole frame does nothing: the string 'quality' column makes the array
# object-typed, and nan_to_num makes no replacements on non-float arrays.
# The earlier fillna(mean) call was dropped: its result was never assigned,
# and the mean of a frame that still holds the string column fails on
# recent pandas versions.
numeric_cols_train = features_train.select_dtypes(include=[np.number]).columns
features_train[numeric_cols_train] = np.nan_to_num(
    features_train[numeric_cols_train].to_numpy(dtype=float))
# NOTE(review): RSEED is not referenced in the visible cells; kept as defined.
RSEED=52
labels_train = features_train['quality']
# Remove the labels from the features
# axis 1 refers to the columns
features_train= features_train.drop('quality',axis = 1)
# Encode the class labels as integers: native -> 1, non-native -> 0
y = labels_train.map({'native':1,"non-native":0})
x = features_train.values



### Read the validation set

In [None]:

# Read the validation features into a pandas dataframe
features_test = pd.read_csv('../features/features_validation2/features_{}.csv'.format(time))
# Raw values as loaded, captured before any NaN replacement
features_test_num=features_test.to_numpy()
# Replace NaN/inf in the numeric columns only. Calling np.nan_to_num on the
# whole frame does nothing: the string 'quality' column makes the array
# object-typed, and nan_to_num makes no replacements on non-float arrays.
# The earlier fillna(...) call was dropped because its result was never
# assigned, so it had no effect on the data.
numeric_cols_test = features_test.select_dtypes(include=[np.number]).columns
features_test[numeric_cols_test] = np.nan_to_num(
    features_test[numeric_cols_test].to_numpy(dtype=float))
# NOTE(review): RSEED is not referenced in the visible cells; kept as defined.
RSEED=50
labels_test = features_test['quality']

# Remove the labels from the features
# axis 1 refers to the columns
features_test= features_test.drop('quality',axis = 1)
# Encode the class labels as integers: native -> 1, non-native -> 0
y1 = labels_test.map({'native':1,"non-native":0})
x1 = features_test.values

## Run the Random Forest classifier 

In [None]:
# Using scikit-learn's random forest classifier (train_test_split is imported but not used in this cell)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# Random forest used for both cross-validation and external validation.
# max_features='sqrt' is what the former 'auto' setting meant for classifiers;
# 'auto' is deprecated and no longer accepted by newer scikit-learn releases.
# oob_score=True makes rf.oob_score_ available after fitting.
rf = RandomForestClassifier(
    bootstrap=True,
    max_depth=50,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=1000,
    oob_score=True,
    random_state=42,
)

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn import metrics


from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score 
from sklearn.model_selection import cross_val_score,  cross_val_predict , cross_validate

# Train the forest on the complete training set
rf.fit(x, y)

# Accuracy from 10-fold cross-validation on the training set
scores = cross_val_score(rf, x, y, cv=10)
cv_mean = scores.mean()
cv_spread = scores.std() * 2
print("Accuracy cv_score training set: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

# NOTE(review): this runs 5-fold cross-validation on the validation set itself
# instead of using the forest fitted above, and `predicted` is not read again
# in the visible cells -- confirm whether this call is still needed.
predicted = cross_val_predict(rf, x1, y1, cv=5)

# Labels predicted for the validation set by the forest fitted on the training set
pred_labels = rf.predict(x1)

# Accuracy, F1, precision and recall on the validation set
ex_accuracy = accuracy_score(y1, pred_labels)
ex_f1 = f1_score(y1, pred_labels)
ex_precision = precision_score(y1, pred_labels)
ex_recall = recall_score(y1, pred_labels)

# Report every metric with two decimals, followed by the out-of-bag score
for template, value in (
    ('Accuracy for validation set: %0.2f ', ex_accuracy),
    ('f1-score  for validation set: %0.2f ', ex_f1),
    ('precision for validation set: %0.2f ', ex_precision),
    ('recall for validation set: %0.2f ', ex_recall),
    ('oob score:  %0.2f', rf.oob_score_),
):
    print(template % (value))


In [None]:
print(__doc__)
sns.set(style="white" )


from sklearn.model_selection import RepeatedKFold


import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.model_selection import LeavePOut
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold


# #############################################################################
# Classification and ROC analysis on the external validation set

# NOTE(review): `cv` is not used below. random_state was removed because it
# has no effect without shuffle=True and newer scikit-learn versions raise a
# ValueError for that combination.
cv = StratifiedKFold(n_splits=20)



fig, ax = plt.subplots()
tprs = []
aucs = []
# Common false-positive-rate grid onto which the ROC curve is resampled
mean_fpr = np.linspace(0, 1, 100)

# Counter of ROC curves drawn (a single one in this cell)
i = 0

# Fit on the training set and get class probabilities for the validation set
probas_ = rf.fit(x, y).predict_proba(x1)
# Compute the ROC curve and the area under it, scoring with column 1 of the probabilities
fpr, tpr, thresholds = roc_curve(y1, probas_[:, 1])
# np.interp replaces scipy.interp, which is no longer available in current SciPy
tprs.append(np.interp(mean_fpr, fpr, tpr))
tprs[-1][0] = 0.0
roc_auc = auc(fpr, tpr)
aucs.append(roc_auc)
plt.plot(fpr, tpr, lw=1, alpha=0.1)

i += 1
# Diagonal reference line
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=1)

# Only one curve is collected, so the "mean" is that curve resampled onto
# mean_fpr and both standard deviations below are zero.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=1)

# Band of +/- one standard deviation around the mean curve, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.1,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")


plt.show()