### Import the Libraries and load the Dataset, Code names and models

In [1]:
from Kennard_Stone import kennardstonealgorithm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
from datetime import datetime
from Bayesian_Optimization import BayesOpt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score,matthews_corrcoef,
                             classification_report, confusion_matrix)
%matplotlib inline

# Load the pre-processed dataset
df = pd.read_csv('pre-precessed_dataset.csv')

# Read the sample codes, one per line. The context manager closes the file
# handle deterministically; readlines() keeps each line's trailing newline,
# which the cells that look a code up remove themselves.
with open('codes.txt', 'r') as codes_file:
    codes = codes_file.readlines()

# Load the previously trained Random Forest.
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code - only load model files that come from a trusted source.
clf = joblib.load('RF_model.sav')

df.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,COOH mmol/g,Endotoxins (EU/mg),Diameter min. (nm),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,Type_PRISTINE,% Total Impurities,Genotoxicity
0,0.0,0.198214,0.559322,0.934783,0.03639,0.349333,0.020378,0.23491,0.067882,0.306306,...,0.079208,0.34,0.332558,0.206494,0,0,0,1,0.134906,0
1,0.093822,0.283929,0.567797,1.0,0.066958,0.730667,0.13246,0.375204,0.054205,0.570571,...,0.405941,0.42,0.390698,0.298701,0,0,1,0,0.075646,0
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,1.0,0.5,0.372093,0.394805,1,0,0,0,0.04878,0
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,0.034653,0.48,0.872093,0.775325,0,0,0,1,0.219889,0
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,0.044554,0.52,0.444186,0.419481,0,0,1,0,0.190736,1


Keep the same train test split

In [2]:
# Row labels of the fixed hold-out set, listed once so that the test selection
# and the train exclusion can never drift apart
TEST_ROWS = [0, 1, 8, 11, 12]

# .copy() makes both splits independent frames: later cells drop columns from
# them in place, which on a frame derived from df can trigger pandas'
# SettingWithCopyWarning
test = df.loc[TEST_ROWS].copy()
train = df.drop(index=TEST_ROWS).copy()

# Separate the target column from the features (pop removes it from the frame)
train_labels = train.pop('Genotoxicity')
test_labels = test.pop('Genotoxicity')

In [3]:
# Keep the codes of the test samples, in the row order of `test`.
# rstrip('\n') removes only the line terminator; slicing off the last character
# would truncate the code on a final line that has no trailing newline.
test_codes = [codes[i].rstrip('\n') for i in test.index]

### RFE for RF

#### 1st iteration

Get the features' importance

In [4]:
# Feature names (every column except the target) and the importances of the loaded forest
# NOTE(review): assumes the saved model was fitted on these columns in this order - confirm
feature_list = list(df.drop('Genotoxicity',axis=1).columns)
importances = list(clf.feature_importances_)

# (name, importance rounded to 3 decimals) pairs, most important first;
# sorted() is stable, so tied features keep their column order
feature_importances = sorted(
    ((name, round(score, 3)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report one feature per line
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: Length ave. (nm)     Importance: 0.444
Variable: Zave (12.5 ug/ml)    Importance: 0.111
Variable: PdI  (12.5 ug/ml)    Importance: 0.111
Variable: Zave  (200 ug/ml)    Importance: 0.111
Variable: Purity (%)           Importance: 0.074
Variable: ROS                  Importance: 0.074
Variable: Endotoxins (EU/mg)   Importance: 0.037
Variable: Diameter max. (nm)   Importance: 0.037
Variable: Diameter ave. (nm)   Importance: 0.0
Variable: BET (m2/g)           Importance: 0.0
Variable: Zave (batch)         Importance: 0.0
Variable: PdI (batch)          Importance: 0.0
Variable: PdI  (200 ug/ml)     Importance: 0.0
Variable: Peak (ug/ml)         Importance: 0.0
Variable: CEA: C.H.N.O (wt%)   Importance: 0.0
Variable: OH mmol/g            Importance: 0.0
Variable: COOH mmol/g          Importance: 0.0
Variable: Diameter min. (nm)   Importance: 0.0
Variable: Type_COOH            Importance: 0.0
Variable: Type_NH2             Importance: 0.0
Variable: Type_OH              Importance: 0

Exclude features with zero importance

In [5]:
# Names of the features whose listed importance is exactly zero
zero_importance = [name for name, score in feature_importances if score == 0.0]

# Remove them from both splits in one call each. errors='ignore' keeps the cell
# re-runnable: a column that was already removed is skipped instead of raising KeyError.
train.drop(columns=zero_importance, inplace=True, errors='ignore')
test.drop(columns=zero_importance, inplace=True, errors='ignore')

train.head()

Unnamed: 0,Length ave. (nm),Purity (%),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),ROS,Endotoxins (EU/mg),Diameter max. (nm)
2,0.039432,1.0,0.014556,0.097879,0.031408,0.103896,0.5,0.394805
3,0.048716,0.923913,0.0,0.0,0.013171,0.298701,0.48,0.775325
4,0.15655,0.934783,0.425036,0.365416,0.241641,0.220779,0.52,0.419481
5,0.199591,0.684783,0.27802,0.363785,0.0846,0.194805,0.66,0.412987
6,0.038274,0.945652,0.659389,0.40783,0.52229,0.857143,0.36,0.405195


Optimize Random Forest

In [6]:
# Set up the optimiser helper on the current training features and labels
# NOTE(review): folds / n_iter / log_scaling are interpreted by the local
# Bayesian_Optimization module - presumably CV folds and search iterations; confirm there
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [7]:
# Search range (lower, upper) handed to the optimiser for each hyperparameter
params = {
    "n_estimators": (6,20),
    "min_samples_split": (0.1,0.9),
    "max_features": (0.1, 0.9),
}

# Wall-clock stamp (HH:MM:SS) taken right before the search
start_time = datetime.now().strftime("%H:%M:%S")

# Run the search
rf_optimum = bo.optimize_rf(params)

# Wall-clock stamp (HH:MM:SS) taken right after the search
end_time = datetime.now().strftime("%H:%M:%S")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.6667  [0m | [0m 0.3996  [0m | [0m 0.8606  [0m | [0m 16.25   [0m |
| [95m 2       [0m | [95m 0.75    [0m | [95m 0.5789  [0m | [95m 0.2248  [0m | [95m 8.184   [0m |
| [0m 3       [0m | [0m 0.75    [0m | [0m 0.1465  [0m | [0m 0.7929  [0m | [0m 14.42   [0m |
| [95m 4       [0m | [95m 0.8333  [0m | [95m 0.6665  [0m | [95m 0.1165  [0m | [95m 19.58   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 0.766   [0m | [0m 0.2699  [0m | [0m 8.546   [0m |
| [0m 6       [0m | [0m 0.75    [0m | [0m 0.1013  [0m | [0m 0.7614  [0m | [0m 19.96   [0m |
| [0m 7       [0m | [0m 0.75    [0m | [0m 0.9     [0m | [0m 0.1     [0m | [0m 18.95   [0m |
| [0m 8       [0m | [0m 0.8333  [0m | [0m 0.6887  [0m | [0m 0.1     [0m | [0m 19.62   [0m |
| [0m 9       [0m | [0m 0.75    [0m | [0m 0.8

In [8]:
# Elapsed time between the two HH:MM:SS stamps, reported in minutes
clock_format = '%H:%M:%S'
elapsed = datetime.strptime(end_time, clock_format) - datetime.strptime(start_time, clock_format)
print('Minutes to execute:', round(elapsed.seconds/60,2))

Minutes to execute: 0.07


Test model's performance on the training set

In [9]:
# Best hyperparameters reported by the optimiser
best_params = rf_optimum['params']

# Build the forest with those settings (fixed seed so repeated runs match)
clf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_features=best_params['max_features'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42,
)

# Fit it on the training split
clf.fit(train, train_labels);

###### Metrics on the Training set

In [10]:
# Predict a class label for every sample of the training split with the fitted forest
predictions = clf.predict(train)

In [11]:
# Fraction of training samples whose predicted label matches the true label
print("RF's training accuracy:", accuracy_score(train_labels, predictions))

RF's training accuracy: 1.0


In [12]:
# Per-class precision, recall, F1-score and support on the training split
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [13]:
# Confusion matrix of the training predictions: rows are true classes, columns predicted classes
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
top_row = cm[0]
print(top_row[0],'|',top_row[1])
print('-----')
bottom_row = cm[1]
print(bottom_row[0],'|',bottom_row[1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [14]:
# Flatten the 2x2 confusion matrix row by row: (tn, fp) for class 0, (fn, tp) for class 1
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()

# Specificity: true negatives over all actual negatives;
# sensitivity: true positives over all actual positives
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives
sensitivity = tp / actual_positives
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [15]:
# Matthews correlation coefficient between the true and predicted training labels
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [16]:
# Score the current model settings with 4-fold cross-validation on the training split
# (the classifier's default scorer is used, since no `scoring` is passed)
scores = cross_val_score(estimator=clf, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [1.         0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


#### 2nd iteration

Get the features' importance

In [17]:
# Feature names in column order, alongside the importances of the fitted forest
feature_list = list(train.columns)
importances = list(clf.feature_importances_)

# (name, importance rounded to 3 decimals) pairs, most important first;
# sorted() is stable, so tied features keep their column order
feature_importances = sorted(
    ((name, round(score, 3)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report one feature per line
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: Length ave. (nm)     Importance: 0.311
Variable: Purity (%)           Importance: 0.2
Variable: Zave (12.5 ug/ml)    Importance: 0.198
Variable: Zave  (200 ug/ml)    Importance: 0.135
Variable: ROS                  Importance: 0.09
Variable: Endotoxins (EU/mg)   Importance: 0.047
Variable: PdI  (12.5 ug/ml)    Importance: 0.02
Variable: Diameter max. (nm)   Importance: 0.0


Exclude features with zero importance

In [18]:
# Names of the features whose listed importance is exactly zero
zero_importance = [name for name, score in feature_importances if score == 0.0]

# Remove them from both splits in one call each. errors='ignore' keeps the cell
# re-runnable: a column that was already removed is skipped instead of raising KeyError.
train.drop(columns=zero_importance, inplace=True, errors='ignore')
test.drop(columns=zero_importance, inplace=True, errors='ignore')

train.head()

Unnamed: 0,Length ave. (nm),Purity (%),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),ROS,Endotoxins (EU/mg)
2,0.039432,1.0,0.014556,0.097879,0.031408,0.103896,0.5
3,0.048716,0.923913,0.0,0.0,0.013171,0.298701,0.48
4,0.15655,0.934783,0.425036,0.365416,0.241641,0.220779,0.52
5,0.199591,0.684783,0.27802,0.363785,0.0846,0.194805,0.66
6,0.038274,0.945652,0.659389,0.40783,0.52229,0.857143,0.36


Optimize Random Forest

In [19]:
# Set up the optimiser helper on the current (reduced) training features and labels
# NOTE(review): folds / n_iter / log_scaling are interpreted by the local
# Bayesian_Optimization module - presumably CV folds and search iterations; confirm there
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [21]:
# Search range (lower, upper) handed to the optimiser for each hyperparameter
params = {
    "n_estimators": (6,20),
    "min_samples_split": (0.1,0.9),
    "max_features": (0.1, 0.5),
}

# Wall-clock stamp (HH:MM:SS) taken right before the search
start_time = datetime.now().strftime("%H:%M:%S")

# Run the search
rf_optimum = bo.optimize_rf(params)

# Wall-clock stamp (HH:MM:SS) taken right after the search
end_time = datetime.now().strftime("%H:%M:%S")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.6667  [0m | [0m 0.2498  [0m | [0m 0.8606  [0m | [0m 16.25   [0m |
| [95m 2       [0m | [95m 0.9167  [0m | [95m 0.3395  [0m | [95m 0.2248  [0m | [95m 8.184   [0m |
| [0m 3       [0m | [0m 0.9167  [0m | [0m 0.1232  [0m | [0m 0.7929  [0m | [0m 14.42   [0m |
| [0m 4       [0m | [0m 0.9167  [0m | [0m 0.3832  [0m | [0m 0.1165  [0m | [0m 19.58   [0m |
| [0m 5       [0m | [0m 0.9167  [0m | [0m 0.433   [0m | [0m 0.2699  [0m | [0m 8.546   [0m |
| [0m 6       [0m | [0m 0.625   [0m | [0m 0.1     [0m | [0m 0.731   [0m | [0m 12.89   [0m |
| [0m 7       [0m | [0m 0.4167  [0m | [0m 0.1     [0m | [0m 0.9     [0m | [0m 18.94   [0m |
| [0m 8       [0m | [0m 0.5     [0m | [0m 0.5     [0m | [0m 0.8759  [0m | [0m 8.257   [0m |
| [0m 9       [0m | [0m 0.8333  [0m | [0m 0.1208  

In [22]:
# Elapsed time between the two HH:MM:SS stamps, reported in minutes
clock_format = '%H:%M:%S'
elapsed = datetime.strptime(end_time, clock_format) - datetime.strptime(start_time, clock_format)
print('Minutes to execute:', round(elapsed.seconds/60,2))

Minutes to execute: 0.07


Step 5: Test model's performance (on the training set)

In [23]:
# Best hyperparameters reported by the optimiser
best_params = rf_optimum['params']

# Build the forest with those settings (fixed seed so repeated runs match)
clf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_features=best_params['max_features'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42,
)

# Fit it on the training split
clf.fit(train, train_labels);

###### Metrics on the Training set

In [24]:
# Predict a class label for every sample of the training split with the fitted forest
predictions = clf.predict(train)

In [25]:
# Fraction of training samples whose predicted label matches the true label
print("RF's training accuracy:", accuracy_score(train_labels, predictions))

RF's training accuracy: 1.0


In [26]:
# Per-class precision, recall, F1-score and support on the training split
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [27]:
# Confusion matrix of the training predictions: rows are true classes, columns predicted classes
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
top_row = cm[0]
print(top_row[0],'|',top_row[1])
print('-----')
bottom_row = cm[1]
print(bottom_row[0],'|',bottom_row[1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [28]:
# Flatten the 2x2 confusion matrix row by row: (tn, fp) for class 0, (fn, tp) for class 1
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()

# Specificity: true negatives over all actual negatives;
# sensitivity: true positives over all actual positives
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives
sensitivity = tp / actual_positives
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [29]:
# Matthews correlation coefficient between the true and predicted training labels
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [30]:
# Score the current model settings with 4-fold cross-validation on the training split
# (the classifier's default scorer is used, since no `scoring` is passed)
scores = cross_val_score(estimator=clf, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [1.         0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


#### 3rd iteration

Get the features' importance

In [31]:
# Feature names in column order, alongside the importances of the fitted forest
feature_list = list(train.columns)
importances = list(clf.feature_importances_)

# (name, importance rounded to 3 decimals) pairs, most important first;
# sorted() is stable, so tied features keep their column order
feature_importances = sorted(
    ((name, round(score, 3)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report one feature per line
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: Purity (%)           Importance: 0.302
Variable: Length ave. (nm)     Importance: 0.286
Variable: Zave (12.5 ug/ml)    Importance: 0.143
Variable: ROS                  Importance: 0.143
Variable: Endotoxins (EU/mg)   Importance: 0.079
Variable: PdI  (12.5 ug/ml)    Importance: 0.048
Variable: Zave  (200 ug/ml)    Importance: 0.0


Exclude features with zero importance

In [32]:
# Names of the features whose listed importance is exactly zero
zero_importance = [name for name, score in feature_importances if score == 0.0]

# Remove them from both splits in one call each. errors='ignore' keeps the cell
# re-runnable: a column that was already removed is skipped instead of raising KeyError.
train.drop(columns=zero_importance, inplace=True, errors='ignore')
test.drop(columns=zero_importance, inplace=True, errors='ignore')

train.head()

Unnamed: 0,Length ave. (nm),Purity (%),Zave (12.5 ug/ml),PdI (12.5 ug/ml),ROS,Endotoxins (EU/mg)
2,0.039432,1.0,0.014556,0.097879,0.103896,0.5
3,0.048716,0.923913,0.0,0.0,0.298701,0.48
4,0.15655,0.934783,0.425036,0.365416,0.220779,0.52
5,0.199591,0.684783,0.27802,0.363785,0.194805,0.66
6,0.038274,0.945652,0.659389,0.40783,0.857143,0.36


Optimize Random Forest

In [33]:
# Set up the optimiser helper on the current (reduced) training features and labels
# NOTE(review): folds / n_iter / log_scaling are interpreted by the local
# Bayesian_Optimization module - presumably CV folds and search iterations; confirm there
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [34]:
# Search range (lower, upper) handed to the optimiser for each hyperparameter
params = {
    "n_estimators": (6,10),
    "min_samples_split": (0.5,0.9),
    "max_features": (0.1, 0.5),
}

# Wall-clock stamp (HH:MM:SS) taken right before the search
start_time = datetime.now().strftime("%H:%M:%S")

# Run the search
rf_optimum = bo.optimize_rf(params)

# Wall-clock stamp (HH:MM:SS) taken right after the search
end_time = datetime.now().strftime("%H:%M:%S")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.5     [0m | [0m 0.2498  [0m | [0m 0.8803  [0m | [0m 8.928   [0m |
| [95m 2       [0m | [95m 0.7083  [0m | [95m 0.3395  [0m | [95m 0.5624  [0m | [95m 6.624   [0m |
| [0m 3       [0m | [0m 0.4167  [0m | [0m 0.1232  [0m | [0m 0.8465  [0m | [0m 8.404   [0m |
| [95m 4       [0m | [95m 0.8333  [0m | [95m 0.3832  [0m | [95m 0.5082  [0m | [95m 9.88    [0m |
| [0m 5       [0m | [0m 0.625   [0m | [0m 0.433   [0m | [0m 0.5849  [0m | [0m 6.727   [0m |
| [0m 6       [0m | [0m 0.5     [0m | [0m 0.5     [0m | [0m 0.835   [0m | [0m 10.0    [0m |
| [0m 7       [0m | [0m 0.8333  [0m | [0m 0.2881  [0m | [0m 0.5     [0m | [0m 9.782   [0m |
| [0m 8       [0m | [0m 0.8333  [0m | [0m 0.1     [0m | [0m 0.5     [0m | [0m 10.0    [0m |
| [0m 9       [0m | [0m 0.6667  [0m | [0m 0.1

In [35]:
# Elapsed time between the two HH:MM:SS stamps, reported in minutes
clock_format = '%H:%M:%S'
elapsed = datetime.strptime(end_time, clock_format) - datetime.strptime(start_time, clock_format)
print('Minutes to execute:', round(elapsed.seconds/60,2))

Minutes to execute: 0.07


Step 5: Test model's performance (on the training set)

In [36]:
# Declare the model with the best hyperparameters reported by the optimiser
# (a stray `9` literal that preceded this comment has been removed; it was a no-op expression)
clf = RandomForestClassifier(
    n_estimators=rf_optimum['params']['n_estimators'],
    max_features=rf_optimum['params']['max_features'],
    min_samples_split=rf_optimum['params']['min_samples_split'],
    random_state=42,  # fixed seed so repeated runs match
)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [37]:
# Predict a class label for every sample of the training split with the fitted forest
predictions = clf.predict(train)

In [38]:
# Fraction of training samples whose predicted label matches the true label
print("RF's training accuracy:", accuracy_score(train_labels, predictions))

RF's training accuracy: 1.0


In [39]:
# Per-class precision, recall, F1-score and support on the training split
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [40]:
# Confusion matrix of the training predictions: rows are true classes, columns predicted classes
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
top_row = cm[0]
print(top_row[0],'|',top_row[1])
print('-----')
bottom_row = cm[1]
print(bottom_row[0],'|',bottom_row[1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [41]:
# Flatten the 2x2 confusion matrix row by row: (tn, fp) for class 0, (fn, tp) for class 1
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()

# Specificity: true negatives over all actual negatives;
# sensitivity: true positives over all actual positives
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives
sensitivity = tp / actual_positives
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [42]:
# Matthews correlation coefficient between the true and predicted training labels
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [43]:
# Score the current model settings with 4-fold cross-validation on the training split
# (the classifier's default scorer is used, since no `scoring` is passed)
scores = cross_val_score(estimator=clf, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [1.         0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


#### 4th iteration

Get the features' importance

In [44]:
# Feature names in column order, alongside the importances of the fitted forest
feature_list = list(train.columns)
importances = list(clf.feature_importances_)

# (name, importance rounded to 3 decimals) pairs, most important first;
# sorted() is stable, so tied features keep their column order
feature_importances = sorted(
    ((name, round(score, 3)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report one feature per line
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: Length ave. (nm)     Importance: 0.667
Variable: Purity (%)           Importance: 0.167
Variable: Zave (12.5 ug/ml)    Importance: 0.167
Variable: PdI  (12.5 ug/ml)    Importance: 0.0
Variable: ROS                  Importance: 0.0
Variable: Endotoxins (EU/mg)   Importance: 0.0


Exclude features with zero importance

In [45]:
# Names of the features whose listed importance is exactly zero
zero_importance = [name for name, score in feature_importances if score == 0.0]

# Remove them from both splits in one call each. errors='ignore' keeps the cell
# re-runnable: a column that was already removed is skipped instead of raising KeyError.
train.drop(columns=zero_importance, inplace=True, errors='ignore')
test.drop(columns=zero_importance, inplace=True, errors='ignore')

train.head()

Unnamed: 0,Length ave. (nm),Purity (%),Zave (12.5 ug/ml)
2,0.039432,1.0,0.014556
3,0.048716,0.923913,0.0
4,0.15655,0.934783,0.425036
5,0.199591,0.684783,0.27802
6,0.038274,0.945652,0.659389


Optimize Random Forest

In [46]:
# Set up the optimiser helper on the current (reduced) training features and labels
# NOTE(review): folds / n_iter / log_scaling are interpreted by the local
# Bayesian_Optimization module - presumably CV folds and search iterations; confirm there
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [47]:
# Search range (lower, upper) handed to the optimiser for each hyperparameter
params = {
    "n_estimators": (6,20),
    "min_samples_split": (0.1,0.9),
    "max_features": (0.1, 0.9),
}

# Wall-clock stamp (HH:MM:SS) taken right before the search
start_time = datetime.now().strftime("%H:%M:%S")

# Run the search
rf_optimum = bo.optimize_rf(params)

# Wall-clock stamp (HH:MM:SS) taken right after the search
end_time = datetime.now().strftime("%H:%M:%S")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.6667  [0m | [0m 0.3996  [0m | [0m 0.8606  [0m | [0m 16.25   [0m |
| [0m 2       [0m | [0m 0.625   [0m | [0m 0.5789  [0m | [0m 0.2248  [0m | [0m 8.184   [0m |
| [95m 3       [0m | [95m 0.75    [0m | [95m 0.1465  [0m | [95m 0.7929  [0m | [95m 14.42   [0m |
| [95m 4       [0m | [95m 0.9167  [0m | [95m 0.6665  [0m | [95m 0.1165  [0m | [95m 19.58   [0m |
| [0m 5       [0m | [0m 0.8333  [0m | [0m 0.766   [0m | [0m 0.2699  [0m | [0m 8.546   [0m |
| [0m 6       [0m | [0m 0.8333  [0m | [0m 0.8771  [0m | [0m 0.6059  [0m | [0m 19.8    [0m |
| [0m 7       [0m | [0m 0.5417  [0m | [0m 0.2286  [0m | [0m 0.4192  [0m | [0m 7.211   [0m |
| [0m 8       [0m | [0m 0.5417  [0m | [0m 0.3602  [0m | [0m 0.4833  [0m | [0m 7.999   [0m |
| [0m 9       [0m | [0m 0.9167  [0m | [0m 0.1

In [48]:
# Elapsed time between the two HH:MM:SS stamps, reported in minutes
clock_format = '%H:%M:%S'
elapsed = datetime.strptime(end_time, clock_format) - datetime.strptime(start_time, clock_format)
print('Minutes to execute:', round(elapsed.seconds/60,2))

Minutes to execute: 0.07


Step 5: Test model's performance (on the training set)

In [49]:
# Best hyperparameters reported by the optimiser
best_params = rf_optimum['params']

# Build the forest with those settings (fixed seed so repeated runs match)
clf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_features=best_params['max_features'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42,
)

# Fit it on the training split
clf.fit(train, train_labels);

###### Metrics on the Training set

In [50]:
# Predict a class label for every sample of the training split with the fitted forest
predictions = clf.predict(train)

In [51]:
# Fraction of training samples whose predicted label matches the true label
print("RF's training accuracy:", accuracy_score(train_labels, predictions))

RF's training accuracy: 1.0


In [52]:
# Per-class precision, recall, F1-score and support on the training split
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [53]:
# Confusion matrix of the training predictions: rows are true classes, columns predicted classes
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
top_row = cm[0]
print(top_row[0],'|',top_row[1])
print('-----')
bottom_row = cm[1]
print(bottom_row[0],'|',bottom_row[1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [54]:
# Flatten the 2x2 confusion matrix row by row: (tn, fp) for class 0, (fn, tp) for class 1
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()

# Specificity: true negatives over all actual negatives;
# sensitivity: true positives over all actual positives
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives
sensitivity = tp / actual_positives
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [55]:
# Matthews correlation coefficient between the true and predicted training labels
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [56]:
# Score the current model settings with 4-fold cross-validation on the training split
# (the classifier's default scorer is used, since no `scoring` is passed)
scores = cross_val_score(estimator=clf, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [1.         0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


Below this point of the process the cross-validation score lowers significantly. Hence, the process terminates here. In the next cells we present some metrics of the final model on the testing set. 

###### Metrics on the Testing set

In [57]:
# Predict a class label for every sample of the held-out test split with the final forest
predictions = clf.predict(test)

In [58]:
# Fraction of test samples whose predicted label matches the true label
print("RF's Testing accuracy:", accuracy_score(test_labels, predictions))

RF's Testing accuracy: 0.8


In [59]:
# Per-class precision, recall, F1-score and support on the test split
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.50      1.00      0.67         1

    accuracy                           0.80         5
   macro avg       0.75      0.88      0.76         5
weighted avg       0.90      0.80      0.82         5



In [60]:
# Confusion matrix of the test predictions: rows are true classes, columns predicted classes
cm = confusion_matrix(test_labels,predictions)
print('Confusion Matrix:')
top_row = cm[0]
print(top_row[0],'|',top_row[1])
print('-----')
bottom_row = cm[1]
print(bottom_row[0],'|',bottom_row[1])

Confusion Matrix:
3 | 1
-----
0 | 1


In [61]:
# Flatten the 2x2 confusion matrix row by row: (tn, fp) for class 0, (fn, tp) for class 1
tn, fp, fn, tp = confusion_matrix(test_labels,predictions).ravel()

# Specificity: true negatives over all actual negatives;
# sensitivity: true positives over all actual positives
actual_negatives = tn + fp
actual_positives = tp + fn
specificity = tn / actual_negatives
sensitivity = tp / actual_positives
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.75
Sensitivity: 1.0


In [62]:
# Matthews correlation coefficient between the true and predicted test labels
print('MCC:', matthews_corrcoef(test_labels,predictions))

MCC: 0.6123724356957946


Hence, the final RF model is the above, using 3 features, and the importance of the features is the following 

In [63]:
# Feature names in column order, alongside the importances of the final forest
feature_list = list(train.columns)
importances = list(clf.feature_importances_)

# (name, importance rounded to 9 decimals) pairs, most important first;
# sorted() is stable, so tied features keep their column order
feature_importances = sorted(
    ((name, round(score, 9)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report one feature per line
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: Length ave. (nm)     Importance: 0.35670194
Variable: Purity (%)           Importance: 0.334766314
Variable: Zave (12.5 ug/ml)    Importance: 0.308531746


The classification probabilities of the testing samples: 

In [64]:
# Header of the probability table
print('-----------------------------------------------------------------')
print('  Sample                                    Prob(0)    Prob(1)')
print('-----------------------------------------------------------------')

# Score the whole test frame in a single call. Passing the DataFrame (rather than
# one reshaped NumPy row at a time) keeps the column names the forest was fitted
# with, which avoids scikit-learn's "X does not have valid feature names" warning,
# and replaces one predict_proba call per sample with one call in total.
test_probabilities = clf.predict_proba(test)

for row_label, class_probabilities in zip(test.index, test_probabilities):
    # rstrip('\n') drops only the line terminator of the code; slicing off the
    # last character would truncate a final line that has no trailing newline
    print('{:40} {}'.format(codes[row_label].rstrip('\n'), class_probabilities))

-----------------------------------------------------------------
  Sample                                    Prob(0)    Prob(1)
-----------------------------------------------------------------
NRCWE- 040                               [0.73684211 0.26315789]
NRCWE- 041                               [0.73684211 0.26315789]
NRCWE- 048                               [0.68421053 0.31578947]
NM-401                                   [0.31578947 0.68421053]
NM-402                                   [0. 1.]


### Domain of Applicability

In [65]:
# Raw code lines of the test samples, exactly as stored in `codes`
# (no newline stripping is applied here, unlike the other code look-ups in this notebook)
test_names = [codes[i] for i in test.index]

In [66]:
# Warning leverage: three times the number of features divided by the number of training samples
n_train_samples, n_features = train.shape
leverage_threshold = 3*n_features/n_train_samples
print('The Leverage threshold is:', round(leverage_threshold, 2))

The Leverage threshold is: 0.9


In [67]:
# NumPy versions of the feature frames, used for the leverage matrix algebra
l_train = np.array(train)
l_test = np.array(test)

# NumPy versions of the class labels
l_train_labels = np.array(train_labels)
l_test_labels = np.array(test_labels)

In [68]:
from numpy.linalg import matrix_power

# (X^T X)^-1 of the training matrix. It does not depend on the test sample,
# so it is computed once instead of being re-inverted on every loop iteration.
xtx_inverse = matrix_power(l_train.T@l_train, -1)

H = []            # leverage value of each test sample
reliability = []  # 'reliable' / 'unreliable' flag of each test sample
for sample in l_test:
    # Leverage of the sample: h = x^T (X^T X)^-1 x
    leverage = sample@xtx_inverse@sample
    H.append(leverage)
    # A sample is inside the domain of applicability when its leverage
    # does not exceed the threshold
    reliability.append('reliable' if leverage <= leverage_threshold else 'unreliable')

# (code, leverage rounded to 2 decimals, flag) per test sample
LV = [(sample, round(l_val, 2),rely) for sample, l_val, rely in zip(test_codes, H, reliability)]
for code, l_val, rely in LV:
    print('Sample: {:40} Leverage Value: {:5}    Reliability: {:20}'.format(code, l_val, rely))

Sample: NRCWE- 040                               Leverage Value:   0.2    Reliability: reliable            
Sample: NRCWE- 041                               Leverage Value:  0.18    Reliability: reliable            
Sample: NRCWE- 048                               Leverage Value:  0.33    Reliability: reliable            
Sample: NM-401                                   Leverage Value:  0.48    Reliability: reliable            
Sample: NM-402                                   Leverage Value:  0.13    Reliability: reliable            
