### Import the Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np

# Load the dataset; the `Genotoxicity` column is the endpoint used further down.
df = pd.read_csv('pre-precessed_dataset.csv')

# One sample code per line. `readlines()` keeps the line terminator on every
# entry, so it has to be removed before a code is printed. The context manager
# closes the file handle as soon as the lines have been read.
with open('codes.txt', 'r') as codes_file:
    codes = codes_file.readlines()

df.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,COOH mmol/g,Endotoxins (EU/mg),Diameter min. (nm),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,Type_PRISTINE,% Total Impurities,Genotoxicity
0,0.0,0.198214,0.559322,0.934783,0.03639,0.349333,0.020378,0.23491,0.067882,0.306306,...,0.079208,0.34,0.332558,0.206494,0.0,0.0,0.0,1.0,0.134906,0
1,0.093822,0.283929,0.567797,1.0,0.066958,0.730667,0.13246,0.375204,0.054205,0.570571,...,0.405941,0.42,0.390698,0.298701,0.0,0.0,1.0,0.0,0.075646,0
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,1.0,0.5,0.372093,0.394805,1.0,0.0,0.0,0.0,0.04878,0
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,0.034653,0.48,0.872093,0.775325,0.0,0.0,0.0,1.0,0.219889,0
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,0.044554,0.52,0.444186,0.419481,0.0,0.0,1.0,0.0,0.190736,1


We keep only the three descriptors used by the model (plus the `Genotoxicity` endpoint) and drop all other columns.

In [2]:
# Candidate columns exclude the endpoint, so 'Genotoxicity' is never dropped.
all_columns = list(df.drop('Genotoxicity',axis=1).columns)

# Descriptors retained as model inputs.
# NOTE(review): 'Length ave. (nm) ' carries a trailing space; it presumably
# matches the CSV header exactly -- confirm before "cleaning" it up.
keep = ['Purity (%)','Zave (12.5 ug/ml)','Length ave. (nm) ']

# Remove every other descriptor in one in-place call; the surviving columns
# keep their original relative order.
unwanted_columns = [column for column in all_columns if column not in keep]
df.drop(columns=unwanted_columns, inplace=True)

In [3]:
# Check that only the kept descriptors and the 'Genotoxicity' endpoint remain.
df.head()

Unnamed: 0,Length ave. (nm),Purity (%),Zave (12.5 ug/ml),Genotoxicity
0,0.0,0.934783,0.020378,0
1,0.093822,1.0,0.13246,0
2,0.039432,1.0,0.014556,0
3,0.048716,0.923913,0.0,0
4,0.15655,0.934783,0.425036,1


### Instantiate a Random Forest classifier

The hyperparameter values were obtained from the final result of the "RFE for RF and LR.ipynb" notebook.

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Tuned hyperparameters. The float values of `min_samples_split` and
# `max_features` are fractions (of the samples and of the features
# respectively), not absolute counts.
clf = RandomForestClassifier(
    n_estimators=19,
    min_samples_split=0.11646759543664197,
    max_features=0.6664580622368363,
    random_state=42,  # fixed seed so the fitted forest is reproducible
)

### Extract the endpoint column

In [5]:
# `pop` returns the endpoint column AND removes it from `df` in place, leaving
# only the feature columns. Re-running this cell without reloading `df`
# raises a KeyError because the column is already gone.
labels = df.pop('Genotoxicity')

In [6]:
# `df` now holds the feature columns only (the endpoint was popped above).
df.head()


Unnamed: 0,Length ave. (nm),Purity (%),Zave (12.5 ug/ml)
0,0.0,0.934783,0.020378
1,0.093822,1.0,0.13246
2,0.039432,1.0,0.014556
3,0.048716,0.923913,0.0
4,0.15655,0.934783,0.425036


In [7]:
# Peek at the extracted endpoint values.
labels.head()

0    0
1    0
2    0
3    0
4    1
Name: Genotoxicity, dtype: int64

### Train the model

In [8]:
# Fit on every row of `df` (no hold-out split is made in this notebook).
# The trailing `;` suppresses the estimator repr in the cell output.
clf.fit(df,labels);

### Confirmation on the training data

In [9]:
# Predict on the same rows the model was fitted on: the metrics below describe
# the fit to the training data, not performance on unseen samples.
predictions = clf.predict(df)

In [10]:
from sklearn.metrics import (accuracy_score,matthews_corrcoef,
                             classification_report, confusion_matrix)

In [11]:
# Fraction of training samples whose predicted class equals the true class.
print('Accuracy:',accuracy_score(labels, predictions))

Accuracy: 1.0


In [12]:
# Per-class precision, recall, F1 and support on the training data.
print(classification_report(labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         6

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15



In [13]:
# Rows are the true classes and columns the predicted classes, so for the
# 0/1 endpoint the layout is [[TN, FP], [FN, TP]].
cm = confusion_matrix(labels, predictions)

print('Confusion Matrix:')
print(f'{cm[0, 0]} | {cm[0, 1]}')
print('-----')
print(f'{cm[1, 0]} | {cm[1, 1]}')

Confusion Matrix:
9 | 0
-----
0 | 6


In [14]:
# Flattening the 2x2 confusion matrix row by row yields TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()

specificity = tn / (tn + fp)  # share of true class-0 samples predicted as 0
sensitivity = tp / (tp + fn)  # share of true class-1 samples predicted as 1

for metric_label, metric_value in (('Specificity:', specificity),
                                   ('Sensitivity:', sensitivity)):
    print(metric_label, metric_value)

Specificity: 1.0
Sensitivity: 1.0


In [15]:
# Class probabilities for all samples in a single call. Passing the DataFrame
# (instead of one reshaped NumPy row per sample) keeps the feature names the
# model was fitted with and avoids one model call per row.
# The two values per row follow the order of `clf.classes_`.
probabilities = clf.predict_proba(df)

print('-----------------------------------------------------------------')
print('  Sample                                    Prob(0)    Prob(1)')
print('-----------------------------------------------------------------')
for i, sample_probabilities in zip(df.index, probabilities):
    # rstrip('\n') removes only the line terminator; slicing with [:-1] would
    # cut a real character from a last line that has no trailing newline.
    print('{:40} {}'.format(codes[i].rstrip('\n'), sample_probabilities))

-----------------------------------------------------------------
  Sample                                    Prob(0)    Prob(1)
-----------------------------------------------------------------
NRCWE- 040                               [0.94736842 0.05263158]
NRCWE- 041                               [0.94736842 0.05263158]
NRCWE- 042                               [1. 0.]
NRCWE- 043                               [0.94736842 0.05263158]
NRCWE- 044                               [0.31578947 0.68421053]
NRCWE- 045                               [0.21052632 0.78947368]
NRCWE- 046                               [0.94736842 0.05263158]
NRCWE- 047                               [1. 0.]
NRCWE- 048                               [1. 0.]
NRCWE- 049                               [1. 0.]
NM-400                                   [0.15789474 0.84210526]
NM-401                                   [0.68421053 0.31578947]
NM-402                                   [0.05263158 0.94736842]
NM-403                  

### Feature Importances

In [16]:
# Feature names and their importances from the fitted forest, in column order
feature_list = list(df.columns)
importances = list(clf.feature_importances_)

# (feature, importance) pairs, rounded to 9 decimals and ordered from the most
# to the least important feature
feature_importances = sorted(
    ((name, round(score, 9)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# One line per feature
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

Variable: Purity (%)           Importance: 0.416037705
Variable: Zave (12.5 ug/ml)    Importance: 0.338027762
Variable: Length ave. (nm)     Importance: 0.245934533


### Domain of Applicability

In [17]:
# Sample codes aligned with the rows of `df` (line terminators still attached).
test_names = [codes[row_label] for row_label in df.index]

# Leverage cut-off h* = 3 * p / n, with p feature columns and n samples.
n_samples, n_features = df.shape
leverage_threshold = 3*n_features/n_samples
print('The Leverage threshold is:', round(leverage_threshold, 2))

The Leverage threshold is: 0.6


In [18]:
from numpy.linalg import matrix_power

# Convert to numpy arrays
np_labels = np.array(labels)
np_df = np.array(df)

# (X^T X)^-1 is identical for every sample, so it is inverted once here
# instead of once per row inside the loop.
xtx_inv = matrix_power(np_df.T@np_df, -1)

# Leverage of sample i: h_i = x_i^T (X^T X)^-1 x_i. A sample is flagged
# 'reliable' when its leverage does not exceed `leverage_threshold`.
H = list()
reliability = list()
for sample_row in np_df:
    leverage = sample_row.T@xtx_inv@sample_row
    H.append(leverage)
    reliability.append('reliable' if leverage<=leverage_threshold else 'unreliable')

# rstrip('\n') removes only the line terminator; slicing with [:-1] would cut a
# real character from a sample code that has no trailing newline.
LV = [(sample.rstrip('\n'), round(l_val, 2), rely) for sample, l_val, rely in zip(test_names, H, reliability)]
for sample_name, l_val, rely in LV:
    print('Sample: {:40} Leverage Value: {:5}    Reliability: {:20}'.format(sample_name, l_val, rely))

Sample: NRCWE- 040                               Leverage Value:  0.13    Reliability: reliable            
Sample: NRCWE- 041                               Leverage Value:  0.11    Reliability: reliable            
Sample: NRCWE- 042                               Leverage Value:  0.14    Reliability: reliable            
Sample: NRCWE- 043                               Leverage Value:  0.13    Reliability: reliable            
Sample: NRCWE- 044                               Leverage Value:  0.13    Reliability: reliable            
Sample: NRCWE- 045                               Leverage Value:  0.05    Reliability: reliable            
Sample: NRCWE- 046                               Leverage Value:  0.47    Reliability: reliable            
Sample: NRCWE- 047                               Leverage Value:  0.13    Reliability: reliable            
Sample: NRCWE- 048                               Leverage Value:  0.18    Reliability: reliable            
Sample: NRCWE- 049          

### Save the final model

In [19]:
import joblib
# Persist the fitted classifier to 'Final_model.sav' in the working directory.
# joblib files are pickle-based: only load them back (joblib.load) from a
# trusted source. The trailing `;` hides the returned list of file names.
joblib.dump(clf, 'Final_model.sav');