### Import the Lirbraries and load the Dataset, Code names

In [29]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
%matplotlib inline

df = pd.read_csv('pre-precessed_dataset.csv')
codes = open('codes.txt','r').readlines()

df.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,COOH mmol/g,Endotoxins (EU/mg),Diameter min. (nm),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,Type_PRISTINE,% Total Impurities,Genotoxicity
0,0.0,0.198214,0.559322,0.934783,0.03639,0.349333,0.020378,0.23491,0.067882,0.306306,...,0.079208,0.34,0.332558,0.206494,0.0,0.0,0.0,1.0,0.134906,0
1,0.093822,0.283929,0.567797,1.0,0.066958,0.730667,0.13246,0.375204,0.054205,0.570571,...,0.405941,0.42,0.390698,0.298701,0.0,0.0,1.0,0.0,0.075646,0
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,1.0,0.5,0.372093,0.394805,1.0,0.0,0.0,0.0,0.04878,0
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,0.034653,0.48,0.872093,0.775325,0.0,0.0,0.0,1.0,0.219889,0
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,0.044554,0.52,0.444186,0.419481,0.0,0.0,1.0,0.0,0.190736,1


### Train-Test Split

In [30]:
from Kennard_Stone import kennardstonealgorithm

Using the Kennart-Stone algorithm, we split the dataset into 2 sets, one for training and one for validation

In [31]:
train, test, train_labels, test_labels = kennardstonealgorithm(df,'Genotoxicity',5)

In [32]:
print('Training Features Shape:', train.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (10, 34)
Training Labels Shape: (10,)
Testing Features Shape: (5, 34)
Testing Labels Shape: (5,)


In [33]:
print('The names of the testing samples are:')
print('=====================================')
for i in test.index:
    print(codes[i][:-1])

The names of the testing samples are:
NRCWE- 040
NRCWE- 045
NRCWE- 048
NM-401
NM-402


### Domain of Applicability

In [34]:
# Convert to numpy array
np_train_labels = np.array(train_labels)
np_test_labels = np.array(test_labels)
np_test = np.array(test)
np_train = np.array(train)

In [35]:
leverage_threshold = 3*np_train.shape[1]/np_train.shape[0]
print('The Leverage threshold is:', round(leverage_threshold, 2))

The Leverage threshold is: 10.2


In [36]:
from numpy.linalg import matrix_power
H=[]
reliability=[]
for i in range(len(np_test)):
    H.append(np_test[i].T@(matrix_power(np_train.T@np_train, -1))@np_test[i])
    if H[i]<=leverage_threshold:
        reliability.append('reliable')
    else:
        reliability.append('unreliable')

LV = [(sample[:-1], round(l_val, 2),rely) for sample, l_val, rely in zip(codes, H, reliability)]
for i in range(len(np_test)):
    [print('Sample: {:13} Leverage Value: {}    Reliability: {}'.format(LV[i][0],LV[i][1],LV[i][2]))];

Sample: NRCWE- 040    Leverage Value: -121732331865742.0    Reliability: reliable
Sample: NRCWE- 041    Leverage Value: -5645595156943288.0    Reliability: reliable
Sample: NRCWE- 042    Leverage Value: -5257845161379162.0    Reliability: reliable
Sample: NRCWE- 043    Leverage Value: -1904003326267884.2    Reliability: reliable
Sample: NRCWE- 044    Leverage Value: 1379342182684633.5    Reliability: unreliable


### Import Bayesian Optimization Tool and Search for the Best Model

In [9]:
from Bayesian_Optimization import optimize_svm, optimize_rfc, optimize_lr

#### Optimization for the SVM model

In [10]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
bo_dict={'C':(200,1000) , 'gamma' : (0.1,1)}

# Optimization
svm_optimum = optimize_svm(train,train_labels,test,test_labels,bo_dict,5,3)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for linear kernel
|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m 1       [0m | [0m 80.0    [0m | [0m 499.6   [0m | [0m 0.9556  [0m |
| [0m 2       [0m | [0m 80.0    [0m | [0m 785.6   [0m | [0m 0.6388  [0m |
| [0m 3       [0m | [0m 80.0    [0m | [0m 324.8   [0m | [0m 0.2404  [0m |
| [0m 4       [0m | [0m 80.0    [0m | [0m 1e+03   [0m | [0m 0.45    [0m |
| [0m 5       [0m | [0m 80.0    [0m | [0m 200.0   [0m | [0m 0.2804  [0m |
| [0m 6       [0m | [0m 80.0    [0m | [0m 1e+03   [0m | [0m 0.2242  [0m |
| [0m 7       [0m | [0m 80.0    [0m | [0m 200.1   [0m | [0m 0.8236  [0m |
| [0m 8       [0m | [0m 80.0    [0m | [0m 1e+03   [0m | [0m 0.96    [0m |
Optimizing for poly kernel
|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m 1       [0m | [0m 60.0    [0m | [0m 499.6   [0m | [0m 0.9556  [0m |
| [0m

In [11]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.83


#### Optimization for the RF model

In [12]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
bo_dict={"n_estimators": (10,250), "min_samples_split": (0.1,0.5), "max_features": (0.1, 0.9)}

# Optimization
rf_optimum = optimize_rfc(train,train_labels,test,test_labels,bo_dict,10,5)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 80.0    [0m | [0m 0.3996  [0m | [0m 0.4803  [0m | [0m 185.7   [0m |
| [0m 2       [0m | [0m 80.0    [0m | [0m 0.5789  [0m | [0m 0.1624  [0m | [0m 47.44   [0m |
| [0m 3       [0m | [0m 60.0    [0m | [0m 0.1465  [0m | [0m 0.4465  [0m | [0m 154.3   [0m |
| [0m 4       [0m | [0m 80.0    [0m | [0m 0.6665  [0m | [0m 0.1082  [0m | [0m 242.8   [0m |
| [0m 5       [0m | [0m 80.0    [0m | [0m 0.766   [0m | [0m 0.1849  [0m | [0m 53.64   [0m |
| [0m 6       [0m | [0m 60.0    [0m | [0m 0.1     [0m | [0m 0.5     [0m | [0m 50.57   [0m |
| [0m 7       [0m | [0m 80.0    [0m | [0m 0.5279  [0m | [0m 0.2735  [0m | [0m 83.06   [0m |
| [0m 8       [0m | [0m 80.0    [0m | [0m 0.2148  [0m | [0m 0.1272  [0m | [0m 206.8   [0m |
| [0m 9       [0m | [0m 60.0    [0m | [0m 0.1261  [0m 

In [13]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.13


#### Optimization for the LR model

In [14]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
bo_dict={'C' : (10,100)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
lr_optimum = optimize_lr(train,train_labels,test,test_labels,bo_dict,5,3)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 norm
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 80.0    [0m | [0m 43.71   [0m |
| [0m 2       [0m | [0m 80.0    [0m | [0m 95.56   [0m |
| [0m 3       [0m | [0m 80.0    [0m | [0m 75.88   [0m |
| [0m 4       [0m | [0m 60.0    [0m | [0m 10.0    [0m |
| [0m 5       [0m | [0m 80.0    [0m | [0m 85.98   [0m |
| [0m 6       [0m | [0m 80.0    [0m | [0m 50.13   [0m |
| [0m 7       [0m | [0m 80.0    [0m | [0m 65.74   [0m |
| [0m 8       [0m | [0m 80.0    [0m | [0m 58.52   [0m |
Optimizing for l2 norm
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 60.0    [0m | [0m 43.71   [0m |
| [0m 2       [0m | [0m 60.0    [0m | [0m 95.56   [0m |
| [0m 3       [0m | [0m 60.0    [0m | [0m 75.88   [0m |
| [0m 4       [0m | [0m 60.0    [0m | [0m 10.0    [0m |
| [0m 5       [0m | [0m 60.0    [0m | [0m 10.0    [0

In [15]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.33


### Fit the models and Test the performance

In [25]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score,matthews_corrcoef,
                             classification_report, confusion_matrix)

#### Fit SVM

In [17]:
# Declare the model
svc = SVC(C=svm_optimum['params']['C'], gamma=svm_optimum['params']['gamma'], kernel=svm_optimum['params']['kernel'], random_state=42)

# Train the model on training data
svc.fit(train, train_labels);

###### Metrics on the Training set

In [18]:
# Use the model's predict method 
predictions = svc.predict(train)

In [19]:
# Print the Training accuracy
print("SVM's training accuracy:", accuracy_score(train_labels, predictions))

SVM's training accuracy: 1.0


In [20]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         4

   micro avg       1.00      1.00      1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [21]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
6 | 0
-----
0 | 4


In [26]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [22]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [23]:
#Print the Cross-Validation Score
scores = cross_val_score(svc, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.66666667 0.5        0.5       ]
Mean of Cross Validtation: 0.5833333333333333


###### Metrics on the Testing set

In [24]:
# Use the model's predict method
predictions = svc.predict(test)

In [25]:
# Print the Testing accuracy
print("SVM's testing accuracy:", accuracy_score(test_labels, predictions))

SVM's testing accuracy: 0.8


In [26]:
# Print the classification report
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      1.00      0.80         2

   micro avg       0.80      0.80      0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5



In [45]:
cm = confusion_matrix(test_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
2 | 1
-----
0 | 2


In [46]:
tn, fp, fn, tp = confusion_matrix(test_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.6666666666666666
Sensitivity: 1.0


In [28]:
# Print the MCC
print('MCC:', matthews_corrcoef(test_labels,predictions))

MCC: 0.6666666666666666


#### Fit RF

In [29]:
# Declare the model
clf = RandomForestClassifier(n_estimators=rf_optimum['params']['n_estimators'], max_features=rf_optimum['params']['max_features'], min_samples_split=rf_optimum['params']['min_samples_split'],  random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [30]:
# Use the model's predict method
predictions = clf.predict(train)

In [31]:
# Print the Train accuracy
print("RF's training accuracy:", accuracy_score(train_labels, predictions))

RF's training accuracy: 1.0


In [32]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         4

   micro avg       1.00      1.00      1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [33]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
6 | 0
-----
0 | 4


In [34]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [49]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [35]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.33333333 0.5        0.5       ]
Mean of Cross Validtation: 0.5


###### Metrics on the Testing set

In [36]:
# Use the model's predict method
predictions = clf.predict(test)

In [37]:
# Print the Test accuracy
print("RF's testing accuracy:", accuracy_score(test_labels, predictions))

RF's testing accuracy: 0.8


In [38]:
# Print the classification report
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      1.00      0.80         2

   micro avg       0.80      0.80      0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5



In [39]:
cm = confusion_matrix(test_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
2 | 1
-----
0 | 2


In [51]:
tn, fp, fn, tp = confusion_matrix(test_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.6666666666666666
Sensitivity: 1.0


In [40]:
# Print the MCC
print('MCC:', matthews_corrcoef(test_labels,predictions))

MCC: 0.6666666666666666


#### Fit LR

In [41]:
# Declare the model
logmodel = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['norm'],random_state=42)

# Train the model on training data
logmodel.fit(train, train_labels);

###### Metrics on the Training set

In [42]:
# Use the model's predict method
predictions = logmodel.predict(train)

In [43]:
# Print the train accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [44]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         4

   micro avg       1.00      1.00      1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [45]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
6 | 0
-----
0 | 4


In [54]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [46]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [47]:
#Print the Cross-Validation Score
scores = cross_val_score(logmodel, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.33333333 1.         0.5       ]
Mean of Cross Validtation: 0.625


###### Metrics on the Testing set

In [48]:
# Use the model's predict method
predictions = logmodel.predict(test)

In [49]:
# Print the Test accuracy
print("LR's testing accuracy:", accuracy_score(test_labels, predictions))

LR's testing accuracy: 0.8


In [50]:
# Print the classification report
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.67      1.00      0.80         2

   micro avg       0.80      0.80      0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5



In [51]:
cm = confusion_matrix(test_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
2 | 1
-----
0 | 2


In [56]:
tn, fp, fn, tp = confusion_matrix(test_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.6666666666666666
Sensitivity: 1.0


In [52]:
# Print the MCC
print('MCC:', matthews_corrcoef(test_labels,predictions))

MCC: 0.6666666666666666


#### Fit NB

In [53]:
# Declare the model
gnb = GaussianNB()

# Train the model on training data
gnb.fit(train,train_labels);

###### Metrics on the Training set

In [54]:
# Use the model's predict method
predictions = gnb.predict(train)

In [55]:
# Print the train accuracy
print("NB's accuracy:", accuracy_score(train_labels,predictions))

NB's accuracy: 0.8


In [56]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         6
           1       0.67      1.00      0.80         4

   micro avg       0.80      0.80      0.80        10
   macro avg       0.83      0.83      0.80        10
weighted avg       0.87      0.80      0.80        10



In [57]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
4 | 2
-----
0 | 4


In [66]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.6666666666666666
Sensitivity: 1.0


In [58]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 0.6666666666666666


In [59]:
#Print the Cross-Validation Score
scores = cross_val_score(gnb, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.33333333 0.5        1.        ]
Mean of Cross Validtation: 0.625


###### Metrics on the Testing set

In [60]:
# Use the model's predict method on the test data
predictions = gnb.predict(test)

In [61]:
# Print the Test accuracy
print("NB's accuracy:", accuracy_score(test_labels,predictions))

NB's accuracy: 0.4


In [62]:
# Print the classification report
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.33      0.50      0.40         2

   micro avg       0.40      0.40      0.40         5
   macro avg       0.42      0.42      0.40         5
weighted avg       0.43      0.40      0.40         5



In [63]:
cm = confusion_matrix(test_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
1 | 2
-----
1 | 1


In [74]:
tn, fp, fn, tp = confusion_matrix(test_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.3333333333333333
Sensitivity: 0.5


In [64]:
# Print the MCC
print('MCC:', matthews_corrcoef(test_labels,predictions))

MCC: -0.16666666666666666


### Save models for the RFE method

In [65]:
import joblib
joblib.dump(clf, 'RF_model.sav');
joblib.dump(logmodel, 'LR_model.sav');