### Import the Libraries and Load the Dataset and Code Names

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
%matplotlib inline

# Pre-processed feature table; the 'Genotoxicity' column is used as the label further down
df = pd.read_csv('pre-precessed_dataset.csv')

# Sample code names, one per line of codes.txt. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# Each entry keeps its trailing newline, exactly as readlines() returns it.
# NOTE(review): line i is presumably the code name of row i in df — confirm.
with open('codes.txt', 'r') as codes_file:
    codes = codes_file.readlines()

df.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,COOH mmol/g,Endotoxins (EU/mg),Diameter min. (nm),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,Type_PRISTINE,% Total Impurities,Genotoxicity
0,0.0,0.198214,0.559322,0.934783,0.03639,0.349333,0.020378,0.23491,0.067882,0.306306,...,0.079208,0.34,0.332558,0.206494,0,0,0,1,0.134906,0
1,0.093822,0.283929,0.567797,1.0,0.066958,0.730667,0.13246,0.375204,0.054205,0.570571,...,0.405941,0.42,0.390698,0.298701,0,0,1,0,0.075646,0
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,1.0,0.5,0.372093,0.394805,1,0,0,0,0.04878,0
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,0.034653,0.48,0.872093,0.775325,0,0,0,1,0.219889,0
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,0.044554,0.52,0.444186,0.419481,0,0,1,0,0.190736,1


### Train-Test Split

In [2]:
from Kennard_Stone import kennardstonealgorithm

Using the Kennard-Stone algorithm, we split the dataset into two sets: one for training and one for testing (validation).

In [3]:
# Kennard-Stone split of df, with 'Genotoxicity' as the label column.
# NOTE(review): the third argument (5) equals the test-set size printed below,
# so it is presumably the number of held-out samples — confirm in Kennard_Stone.py.
train, test, train_labels, test_labels = kennardstonealgorithm(
    df, 'Genotoxicity', 5
)

In [4]:
# Report the shape of each part of the split, one per line
print('\n'.join(
    f'{title} {part.shape}'
    for title, part in (
        ('Training Features Shape:', train),
        ('Training Labels Shape:', train_labels),
        ('Testing Features Shape:', test),
        ('Testing Labels Shape:', test_labels),
    )
))

Training Features Shape: (10, 23)
Training Labels Shape: (10,)
Testing Features Shape: (5, 23)
Testing Labels Shape: (5,)


In [5]:
# List the code names of the samples that ended up in the test set.
# NOTE(review): test.index is used as a line number into codes, which assumes the
# split keeps df's original 0-based row index — confirm.
print('The names of the testing samples are:')
print('=====================================')
for i in test.index:
    # Strip only the line terminator: slicing with [:-1] would drop a real character
    # from the last entry when codes.txt does not end with a newline.
    print(codes[i].rstrip('\n'))

The names of the testing samples are:
NRCWE- 040
NRCWE- 041
NRCWE- 048
NM-401
NM-402


### Import Bayesian Optimization Tool and Search for the Best Model

In [6]:
from Bayesian_Optimization import BayesOpt

In [7]:
# Optimizer bound to the training split only.
# NOTE(review): folds=4 is presumably the number of cross-validation folds used to
# score each candidate — confirm in Bayesian_Optimization.py.
bo = BayesOpt(
    train,
    train_labels,
    folds=4,
    log_scaling=False,
)

#### Optimization for the SVM model

In [8]:
# Search space: boundaries of each SVM hyperparameter
params = dict(C=(1, 10), gamma=(0.1, 1))

# Timestamp (HH:MM:SS) taken right before the search starts
start_time = datetime.now().strftime('%H:%M:%S')

# Run the search; the result's ['params'] entry is read later to build the final SVC
svm_optimum = bo.optimize_svm(params)

# Timestamp (HH:MM:SS) taken right after the search ends
end_time = datetime.now().strftime('%H:%M:%S')

Optimizing for linear kernel
|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m 1       [0m | [0m 0.625   [0m | [0m 4.371   [0m | [0m 0.9556  [0m |
| [0m 2       [0m | [0m 0.625   [0m | [0m 7.588   [0m | [0m 0.6388  [0m |
| [0m 3       [0m | [0m 0.625   [0m | [0m 2.404   [0m | [0m 0.2404  [0m |
| [0m 4       [0m | [0m 0.625   [0m | [0m 1.523   [0m | [0m 0.8796  [0m |
| [0m 5       [0m | [0m 0.625   [0m | [0m 6.41    [0m | [0m 0.7373  [0m |
Error related to scaling.
Optimizing for poly kernel
|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m 1       [0m | [0m 0.3333  [0m | [0m 4.371   [0m | [0m 0.9556  [0m |
| [0m 2       [0m | [0m 0.3333  [0m | [0m 7.588   [0m | [0m 0.6388  [0m |
| [0m 3       [0m | [0m 0.3333  [0m | [0m 2.404   [0m | [0m 0.2404  [0m |
| [0m 4       [0m | [0m 0.3333  [0m | [0m 1.523   [0m | 

In [9]:
# Duration of the SVM search in minutes.
# start_time / end_time are 'HH:MM:SS' strings, so the result has whole-second resolution.
print(
    'Minutes to execute:',
    round(
        (
            datetime.strptime(end_time, '%H:%M:%S')
            - datetime.strptime(start_time, '%H:%M:%S')
        ).seconds / 60,
        2,
    ),
)

Minutes to execute: 0.02


#### Optimization for the RF model

In [10]:
# Search space: boundaries of each random-forest hyperparameter
params = dict(
    n_estimators=(1, 10),
    min_samples_split=(0.1, 0.9),
    max_features=(0.5, 0.9),
)

# Timestamp (HH:MM:SS) taken right before the search starts
start_time = datetime.now().strftime('%H:%M:%S')

# Run the search; the result's ['params'] entry is read later to build the final forest
rf_optimum = bo.optimize_rf(params)

# Timestamp (HH:MM:SS) taken right after the search ends
end_time = datetime.now().strftime('%H:%M:%S')

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.5     [0m | [0m 0.6498  [0m | [0m 0.8606  [0m | [0m 7.588   [0m |
| [0m 2       [0m | [0m 0.375   [0m | [0m 0.7395  [0m | [0m 0.2248  [0m | [0m 2.404   [0m |
| [95m 3       [0m | [95m 0.5833  [0m | [95m 0.5232  [0m | [95m 0.7929  [0m | [95m 6.41    [0m |
| [0m 4      [0m | [0m 0.9167  [0m | [0m 0.5     [0m | [0m 0.1     [0m | [0m 10.0    [0m |
| [0m 5       [0m | [0m 0.375   [0m | [0m 0.833   [0m | [0m 0.2699  [0m | [0m 2.636   [0m |
| [0m 6       [0m | [0m 0.4167  [0m | [0m 0.9     [0m | [0m 0.9     [0m | [0m 10.0    [0m |
| [0m 7       [0m | [0m 0.9167  [0m | [0m 0.8302  [0m | [0m 0.1147  [0m | [0m 9.733   [0m |
| [0m 8       [0m | [0m 0.9167  [0m | [0m 0.87    [0m | [0m 0.1     [0m | [0m 9.067   [0m |
| [0m 9       [0m | [0m 0.375   [0m | [0m 0.5     

In [11]:
# Duration of the random-forest search in minutes.
# start_time / end_time are 'HH:MM:SS' strings, so the result has whole-second resolution.
print(
    'Minutes to execute:',
    round(
        (
            datetime.strptime(end_time, '%H:%M:%S')
            - datetime.strptime(start_time, '%H:%M:%S')
        ).seconds / 60,
        2,
    ),
)

Minutes to execute: 0.03


#### Optimization for the LR model

In [12]:
# Silence FutureWarning messages so they do not flood the search log
# (the filter stays active for the rest of the session)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Search space: boundaries of the logistic-regression hyperparameter C
params = dict(C=(10, 100))

# Timestamp (HH:MM:SS) taken right before the search starts
start_time = datetime.now().strftime('%H:%M:%S')

# Run the search; the result's ['params'] entry is read later to build the final model
lr_optimum = bo.optimize_lr(params)

# Timestamp (HH:MM:SS) taken right after the search ends
end_time = datetime.now().strftime('%H:%M:%S')

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 43.71   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 95.56   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 75.88   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 63.88   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 24.04   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.4583  [0m | [0m 43.71   [0m |
| [0m 2       [0m | [0m 0.4583  [0m | [0m 95.56   [0m |
| [0m 3       [0m | [0m 0.4583  [0m | [0m 75.88   [0m |
| [0m 4       [0m | [0m 0.4583  [0m | [0m 63.88   [0m |
| [0m 5       [0m | [0m 0.3333  [0m | [0m 24.04   [0m |
| [0m 6       [0m | [0m 0.4583  [0m | [0m 53.29   [0m |
| [0m 7       [0m | [0m 0.4583  [0m | [0m 45.57   [0m |
| [0m 8       [0m | [0m

In [13]:
# Duration of the logistic-regression search in minutes.
# start_time / end_time are 'HH:MM:SS' strings, so the result has whole-second resolution.
print(
    'Minutes to execute:',
    round(
        (
            datetime.strptime(end_time, '%H:%M:%S')
            - datetime.strptime(start_time, '%H:%M:%S')
        ).seconds / 60,
        2,
    ),
)

Minutes to execute: 0.0


### Fit the models and Test the performance

In [14]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score,matthews_corrcoef,
                             classification_report, confusion_matrix)

#### Fit SVM

In [15]:
# SVM built from the hyperparameters selected by the search (svm_optimum)
svc = SVC(
    C=svm_optimum['params']['C'],
    gamma=svm_optimum['params']['gamma'],
    kernel=svm_optimum['params']['kernel'],
    random_state=42,
)

# Fit on the training split; the trailing ';' suppresses the estimator repr in the cell output
svc.fit(X=train, y=train_labels);

###### Metrics on the Training set

In [16]:
# Predicted class of every training sample, from the fitted SVM
predictions = svc.predict(X=train)

In [17]:
# Accuracy of the SVM predictions against the training labels
print(
    "SVM's training accuracy:",
    accuracy_score(y_true=train_labels, y_pred=predictions),
)

SVM's training accuracy: 0.9


In [18]:
# Per-class precision, recall, F1-score and support of the SVM on the training set
print(classification_report(y_true=train_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.83      1.00      0.91         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10



In [19]:
# Confusion matrix of the SVM on the training set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions, so the indexing below cannot go out of range.
cm = confusion_matrix(train_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
4 | 1
-----
0 | 5


In [20]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the SVM on the training set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(train_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.8
Sensitivity: 1.0


In [21]:
# Matthews correlation coefficient of the SVM on the training set
print('MCC:', matthews_corrcoef(y_true=train_labels, y_pred=predictions))

MCC: 0.816496580927726


In [22]:
# 4-fold cross-validation of the SVM on the training set: per-fold scores, then their mean
scores = cross_val_score(estimator=svc, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', np.mean(scores))

List of scores: [0.66666667 0.66666667 1.         0.5       ]
Mean of Cross Validtation: 0.7083333333333333


###### Metrics on the Testing set

In [23]:
# Predicted class of every held-out test sample, from the fitted SVM
predictions = svc.predict(X=test)

In [24]:
# Accuracy of the SVM predictions against the test labels
print(
    "SVM's testing accuracy:",
    accuracy_score(y_true=test_labels, y_pred=predictions),
)

SVM's testing accuracy: 0.8


In [25]:
# Per-class precision, recall, F1-score and support of the SVM on the test set
print(classification_report(y_true=test_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.50      1.00      0.67         1

    accuracy                           0.80         5
   macro avg       0.75      0.88      0.76         5
weighted avg       0.90      0.80      0.82         5



In [26]:
# Confusion matrix of the SVM on the test set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions — a real possibility with so few test samples.
cm = confusion_matrix(test_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
3 | 1
-----
0 | 1


In [27]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the SVM on the test set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(test_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.75
Sensitivity: 1.0


In [28]:
# Matthews correlation coefficient of the SVM on the test set
print('MCC:', matthews_corrcoef(y_true=test_labels, y_pred=predictions))

MCC: 0.6123724356957946


#### Fit RF

In [29]:
# Random forest built from the hyperparameters selected by the search (rf_optimum).
# NOTE(review): scikit-learn needs an integer n_estimators; presumably optimize_rf
# already returns it cast to int — confirm.
clf = RandomForestClassifier(
    n_estimators=rf_optimum['params']['n_estimators'],
    max_features=rf_optimum['params']['max_features'],
    min_samples_split=rf_optimum['params']['min_samples_split'],
    random_state=42,
)

# Fit on the training split; the trailing ';' suppresses the estimator repr in the cell output
clf.fit(X=train, y=train_labels);

###### Metrics on the Training set

In [31]:
# Predicted class of every training sample, from the fitted random forest
predictions = clf.predict(X=train)

In [32]:
# Accuracy of the random-forest predictions against the training labels
print(
    "RF's training accuracy:",
    accuracy_score(y_true=train_labels, y_pred=predictions),
)

RF's training accuracy: 1.0


In [33]:
# Per-class precision, recall, F1-score and support of the random forest on the training set
print(classification_report(y_true=train_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [34]:
# Confusion matrix of the random forest on the training set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions, so the indexing below cannot go out of range.
cm = confusion_matrix(train_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [35]:
# Matthews correlation coefficient of the random forest on the training set
print('MCC:', matthews_corrcoef(y_true=train_labels, y_pred=predictions))

MCC: 1.0


In [36]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the random forest on the training set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(train_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [37]:
# 4-fold cross-validation of the random forest on the training set: per-fold scores, then their mean
scores = cross_val_score(estimator=clf, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', np.mean(scores))

List of scores: [1.         0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


###### Metrics on the Testing set

In [38]:
# Predicted class of every held-out test sample, from the fitted random forest
predictions = clf.predict(X=test)

In [39]:
# Accuracy of the random-forest predictions against the test labels
print(
    "RF's testing accuracy:",
    accuracy_score(y_true=test_labels, y_pred=predictions),
)

RF's testing accuracy: 0.6


In [40]:
# Per-class precision, recall, F1-score and support of the random forest on the test set
print(classification_report(y_true=test_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.33      1.00      0.50         1

    accuracy                           0.60         5
   macro avg       0.67      0.75      0.58         5
weighted avg       0.87      0.60      0.63         5



In [41]:
# Confusion matrix of the random forest on the test set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions — a real possibility with so few test samples.
cm = confusion_matrix(test_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
2 | 2
-----
0 | 1


In [42]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the random forest on the test set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(test_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.5
Sensitivity: 1.0


In [43]:
# Matthews correlation coefficient of the random forest on the test set
print('MCC:', matthews_corrcoef(y_true=test_labels, y_pred=predictions))

MCC: 0.4082482904638631


#### Fit LR

In [44]:
# Logistic regression built from the hyperparameters selected by the search (lr_optimum)
logmodel = LogisticRegression(
    C=lr_optimum['params']['C'],
    penalty=lr_optimum['params']['penalty'],
    random_state=42,
)

# Fit on the training split; the trailing ';' suppresses the estimator repr in the cell output
logmodel.fit(X=train, y=train_labels);

###### Metrics on the Training set

In [45]:
# Predicted class of every training sample, from the fitted logistic regression
predictions = logmodel.predict(X=train)

In [46]:
# Accuracy of the logistic-regression predictions against the training labels
print(
    "LR's training accuracy:",
    accuracy_score(y_true=train_labels, y_pred=predictions),
)

LR's training accuracy: 1.0


In [47]:
# Per-class precision, recall, F1-score and support of the logistic regression on the training set
print(classification_report(y_true=train_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [48]:
# Confusion matrix of the logistic regression on the training set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions, so the indexing below cannot go out of range.
cm = confusion_matrix(train_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [49]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the logistic regression on the training set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(train_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [50]:
# Matthews correlation coefficient of the logistic regression on the training set
print('MCC:', matthews_corrcoef(y_true=train_labels, y_pred=predictions))

MCC: 1.0


In [51]:
# 4-fold cross-validation of the logistic regression on the training set: per-fold scores, then their mean
scores = cross_val_score(estimator=logmodel, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', np.mean(scores))

List of scores: [0.         0.33333333 1.         0.5       ]
Mean of Cross Validtation: 0.4583333333333333


###### Metrics on the Testing set

In [52]:
# Predicted class of every held-out test sample, from the fitted logistic regression
predictions = logmodel.predict(X=test)

In [53]:
# Accuracy of the logistic-regression predictions against the test labels
print(
    "LR's testing accuracy:",
    accuracy_score(y_true=test_labels, y_pred=predictions),
)

LR's testing accuracy: 0.8


In [54]:
# Per-class precision, recall, F1-score and support of the logistic regression on the test set
print(classification_report(y_true=test_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.50      1.00      0.67         1

    accuracy                           0.80         5
   macro avg       0.75      0.88      0.76         5
weighted avg       0.90      0.80      0.82         5



In [55]:
# Confusion matrix of the logistic regression on the test set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions — a real possibility with so few test samples.
cm = confusion_matrix(test_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
3 | 1
-----
0 | 1


In [56]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the logistic regression on the test set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(test_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.75
Sensitivity: 1.0


In [57]:
# Matthews correlation coefficient of the logistic regression on the test set
print('MCC:', matthews_corrcoef(y_true=test_labels, y_pred=predictions))

MCC: 0.6123724356957946


#### Fit NB

In [58]:
# Gaussian naive Bayes with default settings (no hyperparameter search was run for it)
gnb = GaussianNB()

# Fit on the training split; the trailing ';' suppresses the estimator repr in the cell output
gnb.fit(X=train, y=train_labels);

###### Metrics on the Training set

In [59]:
# Predicted class of every training sample, from the fitted naive Bayes model
predictions = gnb.predict(X=train)

In [60]:
# Accuracy of the naive Bayes predictions against the training labels
print(
    "NB's accuracy:",
    accuracy_score(y_true=train_labels, y_pred=predictions),
)

NB's accuracy: 1.0


In [61]:
# Per-class precision, recall, F1-score and support of the naive Bayes model on the training set
print(classification_report(y_true=train_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [62]:
# Confusion matrix of the naive Bayes model on the training set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions, so the indexing below cannot go out of range.
cm = confusion_matrix(train_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [63]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the naive Bayes model on the training set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(train_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [64]:
# Matthews correlation coefficient of the naive Bayes model on the training set
print('MCC:', matthews_corrcoef(y_true=train_labels, y_pred=predictions))

MCC: 1.0


In [65]:
# 4-fold cross-validation of the naive Bayes model on the training set: per-fold scores, then their mean
scores = cross_val_score(estimator=gnb, X=train, y=train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', np.mean(scores))

List of scores: [0.         0.33333333 0.5        1.        ]
Mean of Cross Validtation: 0.4583333333333333


###### Metrics on the Testing set

In [66]:
# Predicted class of every held-out test sample, from the fitted naive Bayes model
predictions = gnb.predict(X=test)

In [67]:
# Accuracy of the naive Bayes predictions against the test labels
print(
    "NB's accuracy:",
    accuracy_score(y_true=test_labels, y_pred=predictions),
)

NB's accuracy: 0.6


In [68]:
# Per-class precision, recall, F1-score and support of the naive Bayes model on the test set
print(classification_report(y_true=test_labels, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.33      1.00      0.50         1

    accuracy                           0.60         5
   macro avg       0.67      0.75      0.58         5
weighted avg       0.87      0.60      0.63         5



In [69]:
# Confusion matrix of the naive Bayes model on the test set (rows: true class, columns: predicted class).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both the
# labels and the predictions — a real possibility with so few test samples.
cm = confusion_matrix(test_labels, predictions, labels=[0, 1])
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
2 | 2
-----
0 | 1


In [70]:
# Specificity (tn / (tn + fp)) and sensitivity (tp / (tp + fn)) of the naive Bayes model on the test set.
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works when
# one class is missing from both the true labels and the predictions.
tn, fp, fn, tp = confusion_matrix(test_labels, predictions, labels=[0, 1]).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.5
Sensitivity: 1.0


In [71]:
# Matthews correlation coefficient of the naive Bayes model on the test set
print('MCC:', matthews_corrcoef(y_true=test_labels, y_pred=predictions))

MCC: 0.4082482904638631


### Save models for the RFE method

In [72]:
import joblib

# Persist the fitted random forest and logistic regression to the working directory;
# the trailing ';' suppresses the list of written file names in the cell output
joblib.dump(value=clf, filename='RF_model.sav');
joblib.dump(value=logmodel, filename='LR_model.sav');