### Import the Lirbraries and load the Dataset, Code names and models

In [1]:
from Kennard_Stone import kennardstonealgorithm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
from datetime import datetime
from Bayesian_Optimization import BayesOpt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score,matthews_corrcoef,
                             classification_report, confusion_matrix)
%matplotlib inline

df = pd.read_csv('pre-precessed_dataset.csv')
codes = open('codes.txt','r').readlines()

clf = joblib.load('LR_model.sav')

df.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,COOH mmol/g,Endotoxins (EU/mg),Diameter min. (nm),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,Type_PRISTINE,% Total Impurities,Genotoxicity
0,0.0,0.198214,0.559322,0.934783,0.03639,0.349333,0.020378,0.23491,0.067882,0.306306,...,0.079208,0.34,0.332558,0.206494,0,0,0,1,0.134906,0
1,0.093822,0.283929,0.567797,1.0,0.066958,0.730667,0.13246,0.375204,0.054205,0.570571,...,0.405941,0.42,0.390698,0.298701,0,0,1,0,0.075646,0
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,1.0,0.5,0.372093,0.394805,1,0,0,0,0.04878,0
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,0.034653,0.48,0.872093,0.775325,0,0,0,1,0.219889,0
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,0.044554,0.52,0.444186,0.419481,0,0,1,0,0.190736,1


Keep the same train test split

In [2]:
test = df.loc[[0,1,8,11,12]]
train = df.drop(index=[0,1,8,11,12],axis=0)

train_labels = train.pop('Genotoxicity')
test_labels = test.pop('Genotoxicity')

In [3]:
# Keep the test codes
test_codes = [codes[i][:-1] for i in test.index]

### RFE for LR

#### 1st iteration

Get features' importance

In [4]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(df.drop('Genotoxicity', axis=1).columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Endotoxins (EU/mg)   Importance: 2.238
Variable: Zave (12.5 ug/ml)    Importance: 2.104
Variable: PdI  (200 ug/ml)     Importance: 2.021
Variable: Length ave. (nm)     Importance: 1.939
Variable: % Total Impurities   Importance: 1.467
Variable: Peak (ug/ml)         Importance: 1.336
Variable: Type_OH              Importance: 0.974
Variable: PdI  (12.5 ug/ml)    Importance: 0.931
Variable: Zave  (200 ug/ml)    Importance: 0.728
Variable: Zave (batch)         Importance: 0.624
Variable: Type_COOH            Importance: 0.587
Variable: Type_PRISTINE        Importance: -0.304
Variable: Diameter min. (nm)   Importance: -0.587
Variable: Diameter ave. (nm)   Importance: -0.829
Variable: Diameter max. (nm)   Importance: -0.878
Variable: OH mmol/g            Importance: -0.999
Variable: COOH mmol/g          Importance: -1.012
Variable: ROS                  Importance: -1.032
Variable: BET (m2/g)           Importance: -1.164
Variable: Type_NH2             Importance: -1.258
Variable: C

Exclude features with minimum importance

In [5]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter min. (nm),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,% Total Impurities
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,0.8,1.0,1.0,0.5,0.372093,0.394805,1,0,0,0.04878
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,0.8,0.036946,0.034653,0.48,0.872093,0.775325,0,0,0,0.219889
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,0.9,0.049261,0.044554,0.52,0.444186,0.419481,0,0,1,0.190736
5,0.199591,0.342857,0.427966,0.684783,0.032156,0.0,0.27802,0.363785,0.0846,0.255255,...,0.5,0.147783,0.143564,0.66,0.339535,0.412987,1,0,0,0.517711
6,0.038274,0.323214,0.868644,0.945652,0.019584,0.552,0.659389,0.40783,0.52229,0.573574,...,0.8,0.147783,0.148515,0.36,0.302326,0.405195,0,0,0,0.136716


Optimize Logistc Regression

In [6]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [7]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.4167  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.4167  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.4167  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.4167  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.4167  [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.41666666666666663 and the optimal parameters are C=4.370861069626263 and p

In [8]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Test model's performance on the traning set

In [9]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [10]:
# Use the model's predict method 
predictions = clf.predict(train)

In [11]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [12]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [13]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [14]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [15]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [16]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.33333333 1.         0.        ]
Mean of Cross Validtation: 0.41666666666666663


#### 2nd iteration

Get the features' importance

In [17]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.015
Variable: Endotoxins (EU/mg)   Importance: 0.951
Variable: Length ave. (nm)     Importance: 0.927
Variable: PdI  (200 ug/ml)     Importance: 0.777
Variable: % Total Impurities   Importance: 0.777
Variable: Peak (ug/ml)         Importance: 0.566
Variable: PdI  (12.5 ug/ml)    Importance: 0.374
Variable: Zave  (200 ug/ml)    Importance: 0.365
Variable: Type_OH              Importance: 0.357
Variable: Zave (batch)         Importance: 0.319
Variable: Type_COOH            Importance: 0.259
Variable: Diameter min. (nm)   Importance: -0.225
Variable: Diameter ave. (nm)   Importance: -0.329
Variable: Diameter max. (nm)   Importance: -0.352
Variable: BET (m2/g)           Importance: -0.487
Variable: ROS                  Importance: -0.501
Variable: OH mmol/g            Importance: -0.513
Variable: COOH mmol/g          Importance: -0.519
Variable: Type_NH2             Importance: -0.612
Variable: CEA: C.H.N.O (wt%)   Importance: -0.928
Variable: P

Exclude features with minimum importance

In [18]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),Zave (batch),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),...,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,% Total Impurities
2,0.039432,0.342857,0.521186,1.0,0.035861,0.413333,0.014556,0.097879,0.031408,0.357357,...,1.0,0.8,1.0,1.0,0.5,0.394805,1,0,0,0.04878
3,0.048716,0.796429,0.271186,0.923913,0.011777,0.026667,0.0,0.0,0.013171,0.0,...,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0,0,0.219889
4,0.15655,0.3875,0.237288,0.934783,0.045124,0.024,0.425036,0.365416,0.241641,0.822823,...,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,0,1,0.190736
5,0.199591,0.342857,0.427966,0.684783,0.032156,0.0,0.27802,0.363785,0.0846,0.255255,...,1.0,0.5,0.147783,0.143564,0.66,0.412987,1,0,0,0.517711
6,0.038274,0.323214,0.868644,0.945652,0.019584,0.552,0.659389,0.40783,0.52229,0.573574,...,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0,0,0.136716


Optimize Logistc Regression

In [19]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [20]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.4167  [0m | [0m 4.371   [0m |
| [95m 2       [0m | [95m 0.5417  [0m | [95m 9.556   [0m |
| [0m 3       [0m | [0m 0.5417  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.5417  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.4167  [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.5417  [0m | [0m 8.664   [0m |
| [0m 7       [0m | [0m 0.5417  [0m | [0m 9.999   [0m |
| [0m 8       [0m | 

In [21]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.05


Step 5: Test model's performance (on the testing set)

In [22]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [23]:
# Use the model's predict method 
predictions = clf.predict(train)

In [24]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [25]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [26]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [27]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [28]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [29]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.33333333 1.         0.5       ]
Mean of Cross Validtation: 0.5416666666666666


#### 3rd iteration

Get the features' importance

In [30]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.366
Variable: Endotoxins (EU/mg)   Importance: 1.333
Variable: Length ave. (nm)     Importance: 1.238
Variable: PdI  (200 ug/ml)     Importance: 1.208
Variable: % Total Impurities   Importance: 1.021
Variable: Peak (ug/ml)         Importance: 0.798
Variable: Type_OH              Importance: 0.613
Variable: PdI  (12.5 ug/ml)    Importance: 0.591
Variable: Zave  (200 ug/ml)    Importance: 0.467
Variable: Type_COOH            Importance: 0.444
Variable: Zave (batch)         Importance: 0.407
Variable: Diameter ave. (nm)   Importance: -0.55
Variable: Diameter max. (nm)   Importance: -0.573
Variable: OH mmol/g            Importance: -0.658
Variable: COOH mmol/g          Importance: -0.667
Variable: BET (m2/g)           Importance: -0.669
Variable: ROS                  Importance: -0.685
Variable: Type_NH2             Importance: -0.802
Variable: CEA: C.H.N.O (wt%)   Importance: -1.337
Variable: PdI (batch)          Importance: -1.487
Variable: Pu

Exclude features with minimum importance

In [31]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_COOH,Type_NH2,Type_OH,% Total Impurities
2,0.039432,0.342857,0.521186,1.0,0.413333,0.014556,0.097879,0.031408,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,1,0,0,0.04878
3,0.048716,0.796429,0.271186,0.923913,0.026667,0.0,0.0,0.013171,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0,0,0.219889
4,0.15655,0.3875,0.237288,0.934783,0.024,0.425036,0.365416,0.241641,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,0,1,0.190736
5,0.199591,0.342857,0.427966,0.684783,0.0,0.27802,0.363785,0.0846,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,1,0,0,0.517711
6,0.038274,0.323214,0.868644,0.945652,0.552,0.659389,0.40783,0.52229,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0,0,0.136716


Optimize Logistc Regression

In [32]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [33]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.4167  [0m | [0m 4.371   [0m |
| [95m 2       [0m | [95m 0.5417  [0m | [95m 9.556   [0m |
| [0m 3       [0m | [0m 0.5417  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.5417  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.4167  [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.5417  [0m | [0m 8.664   [0m |
| [0m 7       [0m | [0m 0.5417  [0m | [0m 9.999   [0m |
| [0m 8       [0m | 

In [34]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.07


Step 5: Test model's performance (on the testing set)

In [35]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [36]:
# Use the model's predict method 
predictions = clf.predict(train)

In [37]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [38]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [39]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [40]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [41]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [42]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.33333333 1.         0.5       ]
Mean of Cross Validtation: 0.5416666666666666


#### 4th iteration

Get the features' importance

In [43]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.393
Variable: Endotoxins (EU/mg)   Importance: 1.351
Variable: Length ave. (nm)     Importance: 1.287
Variable: PdI  (200 ug/ml)     Importance: 1.214
Variable: % Total Impurities   Importance: 1.013
Variable: Peak (ug/ml)         Importance: 0.794
Variable: Type_OH              Importance: 0.604
Variable: PdI  (12.5 ug/ml)    Importance: 0.6
Variable: Zave  (200 ug/ml)    Importance: 0.503
Variable: Type_COOH            Importance: 0.438
Variable: Diameter ave. (nm)   Importance: -0.533
Variable: Diameter max. (nm)   Importance: -0.557
Variable: OH mmol/g            Importance: -0.659
Variable: COOH mmol/g          Importance: -0.668
Variable: BET (m2/g)           Importance: -0.682
Variable: ROS                  Importance: -0.712
Variable: Type_NH2             Importance: -0.795
Variable: CEA: C.H.N.O (wt%)   Importance: -1.327
Variable: PdI (batch)          Importance: -1.48
Variable: Purity (%)           Importance: -2.38


Exclude features with minimum importance

In [44]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),Zave (200 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_NH2,Type_OH,% Total Impurities
2,0.039432,0.342857,0.521186,1.0,0.413333,0.014556,0.097879,0.031408,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,0,0,0.04878
3,0.048716,0.796429,0.271186,0.923913,0.026667,0.0,0.0,0.013171,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0,0.219889
4,0.15655,0.3875,0.237288,0.934783,0.024,0.425036,0.365416,0.241641,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,1,0.190736
5,0.199591,0.342857,0.427966,0.684783,0.0,0.27802,0.363785,0.0846,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,0,0,0.517711
6,0.038274,0.323214,0.868644,0.945652,0.552,0.659389,0.40783,0.52229,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0,0.136716


Optimize Logistc Regression

In [45]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [46]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.5417  [0m | [0m 4.371   [0m |
| [95m 2       [0m | [95m 0.625   [0m | [95m 9.556   [0m |
| [0m 3       [0m | [0m 0.625   [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.625   [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.4167  [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.625   [0m | [0m 10.0    [0m |
| [0m 7       [0m | [0m 0.625   [0m | [0m 8.535   [0m |
| [0m 8       [0m | 

In [47]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.07


Step 5: Test model's performance (on the testing set)

In [48]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [49]:
# Use the model's predict method 
predictions = clf.predict(train)

In [50]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [51]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [52]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [53]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [54]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [55]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         0.5       ]
Mean of Cross Validtation: 0.625


#### 5th iteration

Get the features' importance

In [56]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Endotoxins (EU/mg)   Importance: 1.412
Variable: Zave (12.5 ug/ml)    Importance: 1.374
Variable: Length ave. (nm)     Importance: 1.287
Variable: PdI  (200 ug/ml)     Importance: 1.154
Variable: % Total Impurities   Importance: 1.063
Variable: Peak (ug/ml)         Importance: 0.85
Variable: PdI  (12.5 ug/ml)    Importance: 0.61
Variable: Type_OH              Importance: 0.559
Variable: Zave  (200 ug/ml)    Importance: 0.473
Variable: OH mmol/g            Importance: -0.555
Variable: COOH mmol/g          Importance: -0.564
Variable: Diameter max. (nm)   Importance: -0.586
Variable: Diameter ave. (nm)   Importance: -0.589
Variable: BET (m2/g)           Importance: -0.684
Variable: ROS                  Importance: -0.772
Variable: Type_NH2             Importance: -0.816
Variable: CEA: C.H.N.O (wt%)   Importance: -1.379
Variable: PdI (batch)          Importance: -1.518
Variable: Purity (%)           Importance: -2.337


Exclude features with minimum importance

In [57]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_NH2,Type_OH,% Total Impurities
2,0.039432,0.342857,0.521186,1.0,0.413333,0.014556,0.097879,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,0,0,0.04878
3,0.048716,0.796429,0.271186,0.923913,0.026667,0.0,0.0,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0,0.219889
4,0.15655,0.3875,0.237288,0.934783,0.024,0.425036,0.365416,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,1,0.190736
5,0.199591,0.342857,0.427966,0.684783,0.0,0.27802,0.363785,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,0,0,0.517711
6,0.038274,0.323214,0.868644,0.945652,0.552,0.659389,0.40783,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0,0.136716


Optimize Logistc Regression

In [58]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [59]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.625   [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.625   [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.625   [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.625   [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.625   [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.625 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [60]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [61]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [62]:
# Use the model's predict method 
predictions = clf.predict(train)

In [63]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [64]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [65]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [66]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [67]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [68]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         0.5       ]
Mean of Cross Validtation: 0.625


#### 6th iteration

Get the features' importance

In [69]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.105
Variable: Endotoxins (EU/mg)   Importance: 1.016
Variable: Length ave. (nm)     Importance: 1.001
Variable: PdI  (200 ug/ml)     Importance: 0.789
Variable: % Total Impurities   Importance: 0.784
Variable: Peak (ug/ml)         Importance: 0.551
Variable: PdI  (12.5 ug/ml)    Importance: 0.404
Variable: Type_OH              Importance: 0.325
Variable: Diameter ave. (nm)   Importance: -0.34
Variable: Diameter max. (nm)   Importance: -0.347
Variable: OH mmol/g            Importance: -0.462
Variable: COOH mmol/g          Importance: -0.468
Variable: BET (m2/g)           Importance: -0.494
Variable: ROS                  Importance: -0.529
Variable: Type_NH2             Importance: -0.625
Variable: CEA: C.H.N.O (wt%)   Importance: -0.926
Variable: PdI (batch)          Importance: -1.081
Variable: Purity (%)           Importance: -1.7


Exclude features with minimum importance

In [70]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Diameter ave. (nm),BET (m2/g),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_NH2,% Total Impurities
2,0.039432,0.342857,0.521186,1.0,0.413333,0.014556,0.097879,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,0,0.04878
3,0.048716,0.796429,0.271186,0.923913,0.026667,0.0,0.0,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0.219889
4,0.15655,0.3875,0.237288,0.934783,0.024,0.425036,0.365416,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,0.190736
5,0.199591,0.342857,0.427966,0.684783,0.0,0.27802,0.363785,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,0,0.517711
6,0.038274,0.323214,0.868644,0.945652,0.552,0.659389,0.40783,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0.136716


Optimize Logistc Regression

In [71]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [72]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.625   [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.625   [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.625   [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.625   [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.625   [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.625 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [73]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [74]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [75]:
# Use the model's predict method 
predictions = clf.predict(train)

In [76]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [77]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [78]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [79]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [80]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [81]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         0.5       ]
Mean of Cross Validtation: 0.625


#### 7th iteration

Get the features' importance

In [82]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.094
Variable: Endotoxins (EU/mg)   Importance: 0.978
Variable: Length ave. (nm)     Importance: 0.977
Variable: PdI  (200 ug/ml)     Importance: 0.855
Variable: % Total Impurities   Importance: 0.761
Variable: Peak (ug/ml)         Importance: 0.525
Variable: PdI  (12.5 ug/ml)    Importance: 0.393
Variable: Diameter ave. (nm)   Importance: -0.367
Variable: Diameter max. (nm)   Importance: -0.371
Variable: OH mmol/g            Importance: -0.492
Variable: COOH mmol/g          Importance: -0.498
Variable: BET (m2/g)           Importance: -0.521
Variable: ROS                  Importance: -0.574
Variable: Type_NH2             Importance: -0.665
Variable: CEA: C.H.N.O (wt%)   Importance: -0.88
Variable: PdI (batch)          Importance: -1.126
Variable: Purity (%)           Importance: -1.648


Exclude features with minimum importance

In [83]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),BET (m2/g),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_NH2,% Total Impurities
2,0.039432,0.521186,1.0,0.413333,0.014556,0.097879,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,0,0.04878
3,0.048716,0.271186,0.923913,0.026667,0.0,0.0,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0.219889
4,0.15655,0.237288,0.934783,0.024,0.425036,0.365416,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,0.190736
5,0.199591,0.427966,0.684783,0.0,0.27802,0.363785,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,0,0.517711
6,0.038274,0.868644,0.945652,0.552,0.659389,0.40783,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0.136716


Optimize Logistc Regression

In [84]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [85]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.625   [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.625   [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.625   [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.625   [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.625   [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.625 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [86]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [87]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [88]:
# Use the model's predict method 
predictions = clf.predict(train)

In [89]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [90]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [91]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [92]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [93]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [94]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         0.5       ]
Mean of Cross Validtation: 0.625


#### 8th iteration

Get the features' importance

In [95]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.085
Variable: Length ave. (nm)     Importance: 0.948
Variable: Endotoxins (EU/mg)   Importance: 0.946
Variable: PdI  (200 ug/ml)     Importance: 0.9
Variable: % Total Impurities   Importance: 0.78
Variable: Peak (ug/ml)         Importance: 0.513
Variable: PdI  (12.5 ug/ml)    Importance: 0.424
Variable: Diameter max. (nm)   Importance: -0.434
Variable: OH mmol/g            Importance: -0.478
Variable: BET (m2/g)           Importance: -0.479
Variable: COOH mmol/g          Importance: -0.485
Variable: ROS                  Importance: -0.568
Variable: Type_NH2             Importance: -0.656
Variable: CEA: C.H.N.O (wt%)   Importance: -0.9
Variable: PdI (batch)          Importance: -1.102
Variable: Purity (%)           Importance: -1.683


Exclude features with minimum importance

In [96]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),BET (m2/g),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_NH2,% Total Impurities
2,0.039432,0.521186,1.0,0.413333,0.014556,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,0,0.04878
3,0.048716,0.271186,0.923913,0.026667,0.0,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0.219889
4,0.15655,0.237288,0.934783,0.024,0.425036,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,0.190736
5,0.199591,0.427966,0.684783,0.0,0.27802,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,0,0.517711
6,0.038274,0.868644,0.945652,0.552,0.659389,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0.136716


Optimize Logistc Regression

In [97]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [98]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.75    [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.75    [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.75    [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.75    [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.75 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [99]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [100]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [101]:
# Use the model's predict method 
predictions = clf.predict(train)

In [102]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [103]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [104]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [105]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [106]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [107]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.75


#### 9th iteration

Get the features' importance

In [108]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.104
Variable: PdI  (200 ug/ml)     Importance: 0.969
Variable: Length ave. (nm)     Importance: 0.964
Variable: Endotoxins (EU/mg)   Importance: 0.943
Variable: % Total Impurities   Importance: 0.78
Variable: Peak (ug/ml)         Importance: 0.517
Variable: BET (m2/g)           Importance: -0.458
Variable: Diameter max. (nm)   Importance: -0.473
Variable: OH mmol/g            Importance: -0.498
Variable: COOH mmol/g          Importance: -0.505
Variable: ROS                  Importance: -0.556
Variable: Type_NH2             Importance: -0.587
Variable: CEA: C.H.N.O (wt%)   Importance: -0.911
Variable: PdI (batch)          Importance: -1.054
Variable: Purity (%)           Importance: -1.692


Exclude features with minimum importance

In [109]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Diameter max. (nm),Type_NH2,% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0.394805,0,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0.775325,0,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0.419481,0,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0.412987,0,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0.405195,0,0.136716


Optimize Logistc Regression

In [110]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [111]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.75    [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.75    [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.75    [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.75    [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.75 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [112]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [113]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [114]:
# Use the model's predict method 
predictions = clf.predict(train)

In [115]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [116]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [117]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [118]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [119]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [120]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.75


#### 10th iteration

Get the features' importance

In [121]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.108
Variable: Endotoxins (EU/mg)   Importance: 1.002
Variable: Length ave. (nm)     Importance: 1.001
Variable: PdI  (200 ug/ml)     Importance: 0.95
Variable: % Total Impurities   Importance: 0.76
Variable: Peak (ug/ml)         Importance: 0.6
Variable: Diameter max. (nm)   Importance: -0.409
Variable: OH mmol/g            Importance: -0.512
Variable: COOH mmol/g          Importance: -0.519
Variable: Type_NH2             Importance: -0.598
Variable: ROS                  Importance: -0.602
Variable: CEA: C.H.N.O (wt%)   Importance: -0.885
Variable: PdI (batch)          Importance: -1.125
Variable: Purity (%)           Importance: -1.64


Exclude features with minimum importance

In [122]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),OH mmol/g,COOH mmol/g,Endotoxins (EU/mg),Type_NH2,% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.103896,1.0,0.8,1.0,1.0,0.5,0,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.298701,1.0,0.8,0.036946,0.034653,0.48,0,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.220779,1.0,0.9,0.049261,0.044554,0.52,0,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.194805,1.0,0.5,0.147783,0.143564,0.66,0,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.857143,0.094203,0.8,0.147783,0.148515,0.36,0,0.136716


Optimize Logistc Regression

In [123]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [124]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.75    [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.75    [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.75    [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.75    [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.75 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [125]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [126]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [127]:
# Use the model's predict method 
predictions = clf.predict(train)

In [128]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [129]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [130]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [131]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [132]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [133]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.75


#### 11th iteration

Get the features' importance

In [134]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.09
Variable: PdI  (200 ug/ml)     Importance: 1.021
Variable: Length ave. (nm)     Importance: 0.96
Variable: Endotoxins (EU/mg)   Importance: 0.948
Variable: % Total Impurities   Importance: 0.789
Variable: Peak (ug/ml)         Importance: 0.577
Variable: OH mmol/g            Importance: -0.497
Variable: COOH mmol/g          Importance: -0.505
Variable: Type_NH2             Importance: -0.576
Variable: ROS                  Importance: -0.592
Variable: CEA: C.H.N.O (wt%)   Importance: -0.919
Variable: PdI (batch)          Importance: -1.078
Variable: Purity (%)           Importance: -1.704


Exclude features with minimum importance

In [135]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),COOH mmol/g,Endotoxins (EU/mg),Type_NH2,% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.103896,1.0,0.8,1.0,0.5,0,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.298701,1.0,0.8,0.034653,0.48,0,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.220779,1.0,0.9,0.044554,0.52,0,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.194805,1.0,0.5,0.143564,0.66,0,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.857143,0.094203,0.8,0.148515,0.36,0,0.136716


Optimize Logistc Regression

In [136]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [137]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.75    [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.75    [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.75    [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.75    [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.75 and the optimal parameters are C=4.370861069626263 and penalty=l2


In [138]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [139]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [140]:
# Use the model's predict method 
predictions = clf.predict(train)

In [141]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [142]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [143]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [144]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [145]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [146]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.33333333 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.75


#### 12th iteration

Get the features' importance

In [147]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.121
Variable: PdI  (200 ug/ml)     Importance: 1.03
Variable: Length ave. (nm)     Importance: 0.979
Variable: Endotoxins (EU/mg)   Importance: 0.933
Variable: % Total Impurities   Importance: 0.815
Variable: Peak (ug/ml)         Importance: 0.563
Variable: Type_NH2             Importance: -0.556
Variable: ROS                  Importance: -0.573
Variable: COOH mmol/g          Importance: -0.652
Variable: CEA: C.H.N.O (wt%)   Importance: -0.905
Variable: PdI (batch)          Importance: -1.11
Variable: Purity (%)           Importance: -1.714


Exclude features with minimum importance

In [148]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,Peak (ug/ml),CEA: C.H.N.O (wt%),COOH mmol/g,Endotoxins (EU/mg),% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.103896,1.0,0.8,1.0,0.5,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.298701,1.0,0.8,0.034653,0.48,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.220779,1.0,0.9,0.044554,0.52,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.194805,1.0,0.5,0.143564,0.66,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.857143,0.094203,0.8,0.148515,0.36,0.136716


Optimize Logistc Regression

In [149]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [150]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.8333  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.8333  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.8333  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.8333  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.8333  [0m | [0m 5.329   [0m |
| [0m 7       [0m | [0m 0.8333  [0m | [0m 8.62    [0m |
| [0m 8       [0m | [0m

In [151]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.05


Step 5: Test model's performance (on the testing set)

In [152]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [153]:
# Use the model's predict method 
predictions = clf.predict(train)

In [154]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [155]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [156]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [157]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [158]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [159]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.8333333333333333


#### 13th iteration

Get the features' importance

In [160]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.164
Variable: Length ave. (nm)     Importance: 0.977
Variable: Endotoxins (EU/mg)   Importance: 0.971
Variable: PdI  (200 ug/ml)     Importance: 0.956
Variable: % Total Impurities   Importance: 0.836
Variable: Peak (ug/ml)         Importance: 0.496
Variable: ROS                  Importance: -0.587
Variable: COOH mmol/g          Importance: -0.619
Variable: CEA: C.H.N.O (wt%)   Importance: -0.918
Variable: PdI (batch)          Importance: -1.229
Variable: Purity (%)           Importance: -1.747


Exclude features with minimum importance

In [161]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,CEA: C.H.N.O (wt%),COOH mmol/g,Endotoxins (EU/mg),% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.103896,0.8,1.0,0.5,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.298701,0.8,0.034653,0.48,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.220779,0.9,0.044554,0.52,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.194805,0.5,0.143564,0.66,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.857143,0.8,0.148515,0.36,0.136716


Optimize Logistc Regression

In [162]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [163]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.8333  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.8333  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.8333  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.8333  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.8333  [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.8333333333333333 and the optimal parameters are C=4.370861069626263 and pe

In [164]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [165]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [166]:
# Use the model's predict method 
predictions = clf.predict(train)

In [167]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [168]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [169]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [170]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [171]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [172]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.8333333333333333


#### 14th iteration

Get the features' importance

In [173]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.108
Variable: Endotoxins (EU/mg)   Importance: 1.014
Variable: Length ave. (nm)     Importance: 1.001
Variable: % Total Impurities   Importance: 0.936
Variable: PdI  (200 ug/ml)     Importance: 0.929
Variable: COOH mmol/g          Importance: -0.599
Variable: ROS                  Importance: -0.729
Variable: CEA: C.H.N.O (wt%)   Importance: -0.828
Variable: PdI (batch)          Importance: -1.286
Variable: Purity (%)           Importance: -1.704


Exclude features with minimum importance

In [174]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),ROS,CEA: C.H.N.O (wt%),Endotoxins (EU/mg),% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.103896,0.8,0.5,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.298701,0.8,0.48,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.220779,0.9,0.52,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.194805,0.5,0.66,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.857143,0.8,0.36,0.136716


Optimize Logistc Regression

In [175]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [176]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.8333  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.8333  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.8333  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.8333  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.8333  [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.8333333333333333 and the optimal parameters are C=4.370861069626263 and pe

In [177]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [178]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [179]:
# Use the model's predict method 
predictions = clf.predict(train)

In [180]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [181]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [182]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [183]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [184]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [185]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.8333333333333333


#### 15th iteration

Get the features' importance

In [186]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.167
Variable: Length ave. (nm)     Importance: 1.035
Variable: Endotoxins (EU/mg)   Importance: 0.981
Variable: % Total Impurities   Importance: 0.981
Variable: PdI  (200 ug/ml)     Importance: 0.958
Variable: ROS                  Importance: -0.685
Variable: CEA: C.H.N.O (wt%)   Importance: -0.806
Variable: PdI (batch)          Importance: -1.335
Variable: Purity (%)           Importance: -1.724


In [187]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),CEA: C.H.N.O (wt%),Endotoxins (EU/mg),% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.8,0.5,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.8,0.48,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.9,0.52,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.5,0.66,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.8,0.36,0.136716


Optimize Logistc Regression

In [188]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [189]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.8333  [0m | [0m 4.371   [0m |
| [95m 2       [0m | [95m 0.9167  [0m | [95m 9.556   [0m |
| [0m 3       [0m | [0m 0.9167  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.8333  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.8333  [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.9167  [0m | [0m 9.491   [0m |
| [0m 7       [0m | [0m 0.9167  [0m | [0m 8.352   [0m |
| [0m 8       [0m | 

In [190]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.05


Step 5: Test model's performance (on the testing set)

In [191]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [192]:
# Use the model's predict method 
predictions = clf.predict(train)

In [193]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [194]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [195]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [196]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [197]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [198]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 1.         1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


#### 16th iteration

Get the features' importance

In [199]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: PdI  (200 ug/ml)     Importance: 1.598
Variable: Length ave. (nm)     Importance: 1.549
Variable: % Total Impurities   Importance: 1.502
Variable: Endotoxins (EU/mg)   Importance: 1.446
Variable: Zave (12.5 ug/ml)    Importance: 1.386
Variable: CEA: C.H.N.O (wt%)   Importance: -0.972
Variable: PdI (batch)          Importance: -2.193
Variable: Purity (%)           Importance: -2.297


In [200]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),PdI (200 ug/ml),Endotoxins (EU/mg),% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.357357,0.5,0.04878
3,0.048716,0.923913,0.026667,0.0,0.0,0.48,0.219889
4,0.15655,0.934783,0.024,0.425036,0.822823,0.52,0.190736
5,0.199591,0.684783,0.0,0.27802,0.255255,0.66,0.517711
6,0.038274,0.945652,0.552,0.659389,0.573574,0.36,0.136716


Optimize Logistc Regression

In [201]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [202]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.9167  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.9167  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.9167  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.9167  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.8333  [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.9167  [0m | [0m 5.329   [0m |
| [0m 7       [0m | [0m 0.9167  [0m | [0m 8.62    [0m |
| [0m 8       [0m | [0m

In [203]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.05


Step 5: Test model's performance (on the testing set)

In [204]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [205]:
# Use the model's predict method 
predictions = clf.predict(train)

In [206]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [207]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [208]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [209]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [210]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [211]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 1.         1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


#### 17th iteration

Get the features' importance

In [212]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Endotoxins (EU/mg)   Importance: 1.087
Variable: Length ave. (nm)     Importance: 1.066
Variable: Zave (12.5 ug/ml)    Importance: 1.034
Variable: % Total Impurities   Importance: 0.983
Variable: PdI  (200 ug/ml)     Importance: 0.941
Variable: PdI (batch)          Importance: -1.437
Variable: Purity (%)           Importance: -1.894


In [213]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),Endotoxins (EU/mg),% Total Impurities
2,0.039432,1.0,0.413333,0.014556,0.5,0.04878
3,0.048716,0.923913,0.026667,0.0,0.48,0.219889
4,0.15655,0.934783,0.024,0.425036,0.52,0.190736
5,0.199591,0.684783,0.0,0.27802,0.66,0.517711
6,0.038274,0.945652,0.552,0.659389,0.36,0.136716


Optimize Logistc Regression

In [214]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [215]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.8333  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.8333  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.8333  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.8333  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.8333  [0m | [0m 2.404   [0m |
Error related to scaling.


Final result: The optimal model's accuracy is 0.8333333333333333 and the optimal parameters are C=4.370861069626263 and pe

In [216]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.0


Step 5: Test model's performance (on the testing set)

In [217]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [218]:
# Use the model's predict method 
predictions = clf.predict(train)

In [219]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [220]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [221]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [222]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [223]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [224]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 0.66666667 1.         1.        ]
Mean of Cross Validtation: 0.8333333333333333


#### 18th iteration

Get the features' importance

In [225]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.138
Variable: Length ave. (nm)     Importance: 1.058
Variable: Endotoxins (EU/mg)   Importance: 1.01
Variable: % Total Impurities   Importance: 0.894
Variable: PdI (batch)          Importance: -1.271
Variable: Purity (%)           Importance: -2.006


In [226]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml),Endotoxins (EU/mg)
2,0.039432,1.0,0.413333,0.014556,0.5
3,0.048716,0.923913,0.026667,0.0,0.48
4,0.15655,0.934783,0.024,0.425036,0.52
5,0.199591,0.684783,0.0,0.27802,0.66
6,0.038274,0.945652,0.552,0.659389,0.36


Optimize Logistc Regression

In [227]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [228]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 0.9167  [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.8333  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 0.9167  [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 0.9167  [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.9167  [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 0.75    [0m | [0m 1.0     [0m |
| [0m 7       [0m | [0m 0.9167  [0m | [0m 6.988   [0m |
| [0m 8       [0m | [0m

In [229]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.05


Step 5: Test model's performance (on the testing set)

In [230]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [231]:
# Use the model's predict method 
predictions = clf.predict(train)

In [232]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [233]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [234]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [235]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [236]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [237]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [0.66666667 1.         1.         1.        ]
Mean of Cross Validtation: 0.9166666666666666


#### 19th iteration

Get the features' importance

In [238]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.218
Variable: Length ave. (nm)     Importance: 1.074
Variable: Endotoxins (EU/mg)   Importance: 0.919
Variable: PdI (batch)          Importance: -1.337
Variable: Purity (%)           Importance: -2.155


In [239]:
for i in range(len(feature_importances)):
    if round(abs(feature_importances[i][1]),3)==min([round(abs(i),3) for i in importances]):
        train.drop(feature_importances[i][0], axis=1, inplace=True)
        test.drop(feature_importances[i][0], axis=1, inplace=True)
        
train.head()

Unnamed: 0,Length ave. (nm),Purity (%),PdI (batch),Zave (12.5 ug/ml)
2,0.039432,1.0,0.413333,0.014556
3,0.048716,0.923913,0.026667,0.0
4,0.15655,0.934783,0.024,0.425036
5,0.199591,0.684783,0.0,0.27802
6,0.038274,0.945652,0.552,0.659389


Optimize Logistc Regression

In [240]:
bo = BayesOpt(train, train_labels, folds = 4, n_iter=30,log_scaling=False)

In [241]:
# Get the time that the optimization started
start_time = datetime.now().strftime("%H:%M:%S")

# Boundaries of the hyperparameters
params={'C' : (1,10)}

# Optimization
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

lr_optimum = bo.optimize_lr(params)

# Get the time that the optimization ended
end_time = datetime.now().strftime("%H:%M:%S")

Optimizing for l1 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 2.404   [0m |
Error related to scaling.
Optimizing for l2 penalty
|   iter    |  target   |     C     |
-------------------------------------
| [0m 1       [0m | [0m 1.0     [0m | [0m 4.371   [0m |
| [0m 2       [0m | [0m 0.9167  [0m | [0m 9.556   [0m |
| [0m 3       [0m | [0m 1.0     [0m | [0m 7.588   [0m |
| [0m 4       [0m | [0m 1.0     [0m | [0m 6.388   [0m |
| [0m 5       [0m | [0m 0.75    [0m | [0m 2.404   [0m |
| [0m 6       [0m | [0m 1.0     [0m | [0m 5.335   [0m |
| [0m 7       [0m | [0m 1.0     [0m | [0m 8.408   [0m |
| [0m 8       [0m | [0m

In [242]:
print('Minutes to execute:', 
      round((datetime.strptime(end_time, '%H:%M:%S') - datetime.strptime(start_time, '%H:%M:%S')).seconds/60,2))

Minutes to execute: 0.05


Step 5: Test model's performance (on the testing set)

In [243]:
# Declare the model
clf = LogisticRegression(C=lr_optimum['params']['C'], penalty=lr_optimum['params']['penalty'],random_state=42)

# Train the model on training data
clf.fit(train, train_labels);

###### Metrics on the Training set

In [244]:
# Use the model's predict method 
predictions = clf.predict(train)

In [245]:
# Print the training accuracy
print("LR's training accuracy:", accuracy_score(train_labels, predictions))

LR's training accuracy: 1.0


In [246]:
# Print the classification report
print(classification_report(train_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



In [247]:
cm = confusion_matrix(train_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
5 | 0
-----
0 | 5


In [248]:
tn, fp, fn, tp = confusion_matrix(train_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 1.0
Sensitivity: 1.0


In [249]:
# Print the MCC
print('MCC:', matthews_corrcoef(train_labels,predictions))

MCC: 1.0


In [250]:
#Print the Cross-Validation Score
scores = cross_val_score(clf, train, train_labels, cv=4)

print('List of scores:', scores)
print('Mean of Cross Validtation:', scores.mean())

List of scores: [1. 1. 1. 1.]
Mean of Cross Validtation: 1.0


The cross validation score is maximized. Hence, the process terminates here. In the next cells we present some metrics of the final model on the testning set. 

###### Metrics on the Testing set

In [251]:
# Use the model's predict method
predictions = clf.predict(test)

In [252]:
# Print the testing accuracy
print("LR's Testing accuracy:", accuracy_score(test_labels, predictions))

LR's Testing accuracy: 0.8


In [253]:
# Print the classification report
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.50      1.00      0.67         1

    accuracy                           0.80         5
   macro avg       0.75      0.88      0.76         5
weighted avg       0.90      0.80      0.82         5



In [254]:
cm = confusion_matrix(test_labels,predictions)
print('Confusion Matrix:')
print(cm[0][0],'|',cm[0][1])
print('-----')
print(cm[1][0],'|',cm[1][1])

Confusion Matrix:
3 | 1
-----
0 | 1


In [255]:
tn, fp, fn, tp = confusion_matrix(test_labels,predictions).ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
print('Specificity:',specificity)
print('Sensitivity:',sensitivity)

Specificity: 0.75
Sensitivity: 1.0


In [256]:
# Print the MCC
print('MCC:', matthews_corrcoef(test_labels,predictions))

MCC: 0.6123724356957946


Hence, the final RF model is the above, using 4 features, and the importance of the features is the following 

In [257]:
# Get numerical feature importances and feature names
importances = list(clf.coef_.reshape(clf.coef_.shape[1]))
feature_list = list(train.columns)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Zave (12.5 ug/ml)    Importance: 1.262
Variable: Length ave. (nm)     Importance: 1.233
Variable: PdI (batch)          Importance: -1.538
Variable: Purity (%)           Importance: -2.092


The classification probabilities of the testing samples: 

In [258]:
print('-----------------------------------------------------------------')
print('  Sample                                    Prob(0)    Prob(1)')
print('-----------------------------------------------------------------')
for i in test.index:
    print('{:40} {}'.format(codes[i][:-1],clf.predict_proba(np.array(test.loc[i]).reshape(1, -1))[0]));

-----------------------------------------------------------------
  Sample                                    Prob(0)    Prob(1)
-----------------------------------------------------------------
NRCWE- 040                               [0.73751777 0.26248223]
NRCWE- 041                               [0.81740429 0.18259571]
NRCWE- 048                               [0.72970493 0.27029507]
NM-401                                   [0.36099822 0.63900178]
NM-402                                   [0.31110851 0.68889149]


### Domain of Applicability

In [259]:
test_names = [codes[i] for i in test.index]

In [260]:
leverage_threshold = 3*train.shape[1]/train.shape[0]
print('The Leverage threshold is:', round(leverage_threshold, 2))

The Leverage threshold is: 1.2


In [261]:
# Labels are the values we want to predict
l_train_labels = np.array(train_labels)
l_test_labels = np.array(test_labels)
# Convert to numpy array
l_test = np.array(test)
l_train = np.array(train)

In [262]:
from numpy.linalg import matrix_power
H=list()
reliability=list()
for i in range(len(l_test)):
    H.append(l_test[i].T@(matrix_power(l_train.T@l_train, -1))@l_test[i])
    if H[i]<=leverage_threshold:
        reliability.append('reliable')
    else:
        reliability.append('unreliable')

LV = [(sample[:-1], round(l_val, 2),rely) for sample, l_val, rely in zip(test_names, H, reliability)]
for i in range(len(l_test)):
    [print('Sample: {:40} Leverage Value: {:5}    Reliability: {:20}'.format(LV[i][0],LV[i][1],LV[i][2]))];

Sample: NRCWE- 040                               Leverage Value:   0.2    Reliability: reliable            
Sample: NRCWE- 041                               Leverage Value:  0.36    Reliability: reliable            
Sample: NRCWE- 048                               Leverage Value:  0.43    Reliability: reliable            
Sample: NM-401                                   Leverage Value:  0.51    Reliability: reliable            
Sample: NM-402                                   Leverage Value:  0.19    Reliability: reliable            
