In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced

from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pprint import pprint

from sklearn.model_selection import RandomizedSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler

# Load Data

In [3]:
# Load data and create DF
df = pd.read_csv("https://raw.githubusercontent.com/jenamis/purifAI/jen/MachineLearning/merged_for_testing.csv")
print(df.shape)
df.head()

(1086, 71)


Unnamed: 0,SAMPLE_ID,STRUCTURE_ID,PREFERRED_LCMS_METHOD,SPE_METHOD,METHOD,SPE_SUCCESSFUL,CRASHED_OUT,SAMPLE_STATUS,SAMPLE_CURRENT_STATUS,TERMINATION_CAUSE,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,MTA0ST2022-051-001_S2,MTA0ST2022-051-001,Xbridge HpH,HLB,HLB/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,2,9,2,0,1,1,3
1,MTA0ST2022-051-003_G1,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,1,8,2,0,1,1,3
2,MTA0ST2022-051-004,MTA0ST2022-051-004,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Stored,,...,0,2,3,1,6,2,0,1,1,3
3,MTA0ST2022-051-005,MTA0ST2022-051-005,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Stored,,...,1,2,2,1,6,2,0,1,1,3
4,MTA0ST2022-051-006,MTA0ST2022-051-006,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,3,2,1,6,2,0,1,1,4


In [4]:
# Encode SPE method
le = LabelEncoder()
df_encoded = df.copy()
df_encoded["SPE_METHOD"] = le.fit_transform(df_encoded["SPE_METHOD"])

# Define Features and Target and Split and Scale Data

In [5]:
# Create features
X = df.drop(columns = ["SAMPLE_ID", 
                       "STRUCTURE_ID",
                       "PREFERRED_LCMS_METHOD", 
                       "SPE_METHOD", 
                       "METHOD", 
                       "SPE_SUCCESSFUL", 
                       "CRASHED_OUT", 
                       "SAMPLE_STATUS", 
                       "SAMPLE_CURRENT_STATUS", 
                       "TERMINATION_CAUSE", 
                       "TERMINATION_STEP", 
                       "TERMINATION_DETAILS", 
                       "REACTION_SCALE_(mmol)", 
                       "SELECTED_FRACTIONS", 
                       "VOLUME_COLLECTED_(mL)", 
                       "TOTAL_FRACTIONS_COLLECTED", 
                       "RECOVERED_SAMPLE_DRY_MASS_(mg)", 
                       "PERCENT_YIELD", 
                       "%_PURITY_(BY_LCMS)", 
                       "PURIFICATION_COMMENTS",
                       "EXTRA_COMMENTS",
                       'logP', 
                       'HBD', 
                       'HBA',
                       'Flexibility',
                       'Rotatable Bonds'])

# Create target
y = df_encoded["SPE_METHOD"]

In [6]:
X.describe()

Unnamed: 0,MolWt,ExactMolWt,TPSA,qed,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,NumValenceElectrons,MaxPartialCharge,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
count,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,...,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0
mean,390.582017,390.148447,70.864291,0.616214,370.89353,3.953765,104.919159,0.276478,143.001842,0.28436,...,1.539595,3.197053,5.049724,1.279006,7.571823,4.201657,0.044199,0.328729,0.372928,3.952118
std,83.810932,83.65081,21.882793,0.173756,79.761121,1.247837,22.850108,0.144754,30.10259,0.100642,...,0.75824,0.860731,1.292058,0.863207,2.126397,1.944112,0.226938,0.561141,0.59333,0.846522
min,226.283,226.121846,16.13,0.138213,212.171,0.40492,62.0307,0.0,80.0,0.036113,...,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0
25%,326.3095,326.081302,55.915,0.470119,310.219,3.07739,88.019,0.1875,122.0,0.226267,...,1.0,3.0,4.0,1.0,6.0,3.0,0.0,0.0,0.0,3.0
50%,375.453,375.104148,68.1,0.639582,357.135,3.95262,101.86485,0.277778,138.0,0.255791,...,2.0,3.0,5.0,1.0,7.0,4.0,0.0,0.0,0.0,4.0
75%,450.326,449.102145,83.12,0.765451,430.063,4.7524,116.9609,0.363636,164.0,0.375954,...,2.0,4.0,6.0,2.0,9.0,5.0,0.0,1.0,1.0,4.0
max,774.895,774.238214,164.5,0.92578,737.599,7.0606,200.3182,0.882353,286.0,0.585809,...,4.0,5.0,13.0,4.0,16.0,12.0,3.0,4.0,4.0,10.0


In [7]:
# Check balance of target values
y.value_counts()

1    896
0    190
Name: SPE_METHOD, dtype: int64

In [8]:
# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [9]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled

array([[-1.20369719, -1.20312573,  1.20773171, ..., -0.6034376 ,
        -0.65025003,  0.05177056],
       [ 0.34225786,  0.34456683,  1.60849655, ...,  1.19582323,
         1.04081555,  0.05177056],
       [-0.23632135, -0.23441384, -1.1869444 , ...,  1.19582323,
         1.04081555,  0.05177056],
       ...,
       [-0.63430121, -0.63320249, -1.23934122, ...,  1.19582323,
         1.04081555,  0.05177056],
       [ 0.62083169,  0.62304415, -0.19895759, ..., -0.6034376 ,
        -0.65025003,  0.05177056],
       [-0.67103409, -0.66991226,  0.09418135, ...,  2.99508407,
         2.73188113,  0.05177056]])

### Grid search on Logistic Regression with SMOTE Oversampling

In [10]:
ml_list = {"Logistic Regression with Random Oversampling":RandomOverSampler(random_state = 1),
            "Logistic Regression with SMOTE Oversampling":SMOTE(random_state = 1, sampling_strategy = "auto"),
            "Logistic Regression with Random Undersampling":RandomUnderSampler(random_state = 1),
            "Logistic Regression with Cluster Centroids Undersampling":ClusterCentroids(random_state = 1),
          "Logistic Regression with SMOTEENN Combination Over- and Undersampling":SMOTEENN(random_state=1)}
    
ret = []
for x in ml_list:
    X_resampled, y_resampled = ml_list[x].fit_resample(X_train_scaled, y_train)
    # setting the testing parameters
    params = {"C" :np.logspace(-3,3,7),
             'penalty'    : ['l1' , 'l2']}

    # create model
    model = LogisticRegression(random_state=1)

    # Random search of parameters, using 3 fold cross validation
    grid_search = GridSearchCV(model, param_grid=params, 
                          cv = 3, n_jobs = -1)

    # Fit the grid search model
    grid_search.fit(X_resampled, y_resampled)
    grid_search.best_params_
    
    def evaluate(model, X_test_scaled, y_test):
        y_pred = model.predict(X_test_scaled)
        ba_score = balanced_accuracy_score(y_test, y_pred)
        return ba_score

    # performance of base model
    base_model = LogisticRegression(random_state=1)
    base_model.fit(X_resampled, y_resampled)
    base_accuracy = evaluate(base_model, X_test_scaled, y_test)
    
    # performance of grid model for best parameter setting
    best_grid = grid_search.best_estimator_
    best_grid.fit(X_resampled, y_resampled)
    grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)

    improvement = '{:0.2f}%'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy)
    
    ret.append({
        "Name": x,
        "Base model Balanced Accuracy":base_accuracy,
        "Grid model Balanced Accuracy":grid_accuracy,
        "Improvement" : improvement
    })


In [11]:
summary_df = pd.DataFrame(ret).sort_values("Grid model Balanced Accuracy", ascending=False)
summary_df

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
4,Logistic Regression with SMOTEENN Combination ...,0.842432,0.878198,4.25%
1,Logistic Regression with SMOTE Oversampling,0.862703,0.875946,1.54%
2,Logistic Regression with Random Undersampling,0.829189,0.85018,2.53%
3,Logistic Regression with Cluster Centroids Und...,0.867207,0.847928,-2.22%
0,Logistic Regression with Random Oversampling,0.843694,0.824685,-2.25%


In [12]:
df_lr = pd.DataFrame(ret)
df_lr

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Logistic Regression with Random Oversampling,0.843694,0.824685,-2.25%
1,Logistic Regression with SMOTE Oversampling,0.862703,0.875946,1.54%
2,Logistic Regression with Random Undersampling,0.829189,0.85018,2.53%
3,Logistic Regression with Cluster Centroids Und...,0.867207,0.847928,-2.22%
4,Logistic Regression with SMOTEENN Combination ...,0.842432,0.878198,4.25%


In [45]:
df_xgb = pd.read_csv('df_xgb.csv')
df_xgb

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,XGBoost,0.854234,0.863243,1.05%


In [46]:
df_ee = pd.read_csv('df_ee.csv')
df_ee


Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Easy Ensemble AdaBoost,0.864685,0.86018,-0.52%


In [47]:
summary = pd.merge(df_lr, df_xgb, how ="outer")
summary

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Logistic Regression with Random Oversampling,0.843694,0.824685,-2.25%
1,Logistic Regression with SMOTE Oversampling,0.862703,0.875946,1.54%
2,Logistic Regression with Random Undersampling,0.829189,0.85018,2.53%
3,Logistic Regression with Cluster Centroids Und...,0.867207,0.847928,-2.22%
4,Logistic Regression with SMOTEENN Combination ...,0.842432,0.878198,4.25%
5,XGBoost,0.854234,0.863243,1.05%


In [48]:
summary = pd.merge(summary, df_ee, how ="outer").sort_values("Grid model Balanced Accuracy", ascending=False)
summary

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
4,Logistic Regression with SMOTEENN Combination ...,0.842432,0.878198,4.25%
1,Logistic Regression with SMOTE Oversampling,0.862703,0.875946,1.54%
5,XGBoost,0.854234,0.863243,1.05%
6,Easy Ensemble AdaBoost,0.864685,0.86018,-0.52%
2,Logistic Regression with Random Undersampling,0.829189,0.85018,2.53%
3,Logistic Regression with Cluster Centroids Und...,0.867207,0.847928,-2.22%
0,Logistic Regression with Random Oversampling,0.843694,0.824685,-2.25%
