# Cross validation of training set 

## Open features file

In [None]:

# Pandas is used for data manipulation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Label of the data set to analyse; it selects the CSV file below and is
# reused in the title of the feature-importance plot at the end.
time='80_100'
# Read the feature table for the selected label into a pandas DataFrame
features = pd.read_csv('../features/features_training1//features_{}.csv'.format(time))
# Snapshot of the table as a NumPy array
features_num=features.to_numpy()
# Write the nan_to_num() result back into every cell of the DataFrame.
# NOTE(review): the table also holds the 'quality' label column used in later
# cells; if that column is non-numeric, features_num is presumably an object
# array and nan_to_num() may leave it unchanged -- confirm this step has the
# intended effect.
features[:] = np.nan_to_num(features_num)
# Locate null entries in the snapshot taken before cleaning.
# NOTE(review): the result is neither assigned nor displayed.
np.where(pd.isnull(features_num))
# Random seed constant; not referenced by the cells visible in this file.
RSEED=50
# Display the first 7 rows as a quick sanity check
features.head(7)
#features.describe(include='all')

## Plot the distribution of features for native and non-native models

In [None]:
# Pairwise views of all columns in `features`, coloured by the 'quality' label.
colors = ['#91174A', '#0072B2']
sns.set(context='notebook', font_scale=3)

# First figure: default pair plot drawn with the two-colour palette above.
sns.pairplot(data=features, hue='quality', palette=colors, height=4)

# Second figure: a separate grid with kernel-density estimates both on the
# diagonal and in the off-diagonal panels (6 contour levels).
g = sns.PairGrid(data=features, hue='quality', height=4)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=6);



### Define Features and Labels and Convert Data to Arrays

In [None]:
# Use numpy to convert to arrays
import numpy as np


# Split the table into the class label ('quality') and the predictor columns,
# clean the predictors, and expose them as `x` (2-D array) and `y` (0/1 labels).
labels = features['quality']

# Remove the labels from the features
# axis 1 refers to the columns
features = features.drop('quality', axis=1)

# Saving feature names for later use
feature_list = list(features.columns)

# Treat infinities of either sign as missing values. The previous code only
# matched +inf, so -inf entries survived and were never imputed.
features[(features == np.inf) | (features == -np.inf)] = np.nan

# Impute missing values with the column mean. numeric_only=True keeps this
# working on pandas versions that no longer skip non-numeric columns silently.
features = features.fillna(features.mean(numeric_only=True))

# Encode the class label: native -> 1, non-native -> 0.
y = labels.map({'native': 1, "non-native": 0})

# Series.map() turns any label outside the mapping into NaN; fail here with a
# clear message instead of letting the NaN reach the classifier.
if y.isnull().any():
    raise ValueError(
        "Unexpected value(s) in 'quality' column: {}".format(
            labels[y.isnull()].unique().tolist()
        )
    )

x = features.values

## Specify the Random Forest classifier

In [None]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# Random forest used by the cross-validation cell below.
# max_features='sqrt' replaces the former 'auto': for classifiers 'auto' was an
# alias of 'sqrt', and the alias has been deprecated and removed in current
# scikit-learn releases, so this keeps the same model while staying runnable.
rf = RandomForestClassifier(
    bootstrap=True,
    max_depth=50,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=1000,
    oob_score=True,
    random_state=42,
)

# Split the data into training and testing sets (25 % held out, fixed seed)
train_features, test_features, train_labels, test_labels = train_test_split(
    x, y, test_size=0.25, random_state=42
)

### Define plotting of ROC curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
def plot_roc(pred,y):
    """Draw a ROC curve for one set of predictions and show the figure.

    pred -- the predictions (scores passed to ``roc_curve``)
    y    -- the expected output (true labels)

    The area under the curve is reported in the legend.
    """
    false_pos, true_pos, _thresholds = roc_curve(y, pred)
    area = auc(false_pos, true_pos)

    plt.figure()
    # Classifier curve, then the diagonal reference line
    plt.plot(false_pos, true_pos, label='ROC curve (area = %0.2f)' % area)
    plt.plot([0, 1], [0, 1], 'k--')

    # Axes limits, labels and title
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')

    plt.legend(loc="lower right")
    plt.show()




## Run the Random Forest classifier and plot the ROC curve

In [None]:
# Cross-validated ROC analysis: refit the random forest on every stratified
# fold, draw each fold's ROC curve faintly, then overlay the mean curve with a
# +/- 1 standard-deviation band.
sns.set(style="white" )


from sklearn.model_selection import RepeatedKFold


import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from sklearn.model_selection import LeavePOut
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold


# #############################################################################
# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves.
# The folds are not shuffled. A random_state was previously passed here, but
# without shuffle=True it has no effect and recent scikit-learn versions reject
# that combination. To randomise the folds reproducibly, use
# StratifiedKFold(n_splits=100, shuffle=True, random_state=26124) instead.
cv = StratifiedKFold(n_splits=100)

# cv.split() yields positional indices, so index the labels through a plain
# array rather than relying on the Series index matching row positions.
y_values = np.asarray(y)

fig, ax = plt.subplots()
tprs = []
aucs = []
# Common FPR grid on which every fold's TPR is resampled for averaging
mean_fpr = np.linspace(0, 1, 100)

for train, test in cv.split(x, y_values):
    # Fit on the training part of the fold, score the held-out part
    probas_ = rf.fit(x[train], y_values[train]).predict_proba(x[test])
    # Compute ROC curve and area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y_values[test], probas_[:, 1])
    # np.interp replaces scipy.interp, which was only an alias of it and is
    # no longer importable from recent SciPy releases.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # force every resampled curve to start at (0, 0)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.1)

# Diagonal reference line
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=1)

# Mean ROC over all folds; the spread reported in the legend is the standard
# deviation of the per-fold AUC values.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=1)

# Shaded band of +/- 1 standard deviation of the TPR, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.1,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('Receiver operating characteristic example', fontsize=14)
plt.legend(loc="lower right", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)



## Extract Feature Importances

In [None]:


# Importance score of every feature, in the same order as feature_list.
# NOTE: `rf` reflects its most recent fit() call; in the cell order of this
# file that is the last cross-validation fold.
importances = list(rf.feature_importances_)

# (name, importance rounded to 2 decimals) pairs, most important first
feature_importances = [
    (feat_name, round(feat_score, 2))
    for feat_name, feat_score in zip(feature_list, importances)
]
feature_importances.sort(key=lambda pair: pair[1], reverse=True)

# Print one line per feature
for feat_name, feat_score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feat_name, feat_score))

# Bar chart of the unrounded importances, one bar per feature in file order
x_values = list(range(len(importances)))
plt.bar(x_values, importances, orientation='vertical')

# Feature names as vertical tick labels on the x axis
plt.xticks(x_values, feature_list, rotation='vertical', fontsize=15)
plt.yticks(fontsize=15)

# Axis labels and title
plt.ylabel('Importance', fontsize=20)
plt.xlabel('Variable', fontsize=20)
plt.title('Variable Importances time = {}ns'.format(time), fontsize=25);
