In [2]:
import warnings
# Install a blanket "ignore" filter: no category or module is specified, so
# every warning raised through the `warnings` module is silenced from here on.
# NOTE(review): this also hides any convergence or deprecation warnings emitted
# by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Import dependencies
import numpy as np
from numpy import loadtxt
import pandas as pd
from collections import Counter
import sqlalchemy
from sqlalchemy import create_engine, text

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN


# Define Features and Target, Split, and Scale the Data

In [4]:
# Read the full dataset from the CSV file
df = pd.read_csv("lcms_df.csv")

# Target vector: the preferred LC-MS method recorded for each row
y = df["preferred_lcms_method"]

# Feature matrix: every remaining column once the identifier and the target
# column have been removed
X = df.drop(columns=["structure_id", "preferred_lcms_method"])

In [6]:
# Show every column name in the loaded frame (identifier, target, and features)
df.columns

Index(['structure_id', 'preferred_lcms_method', 'MolWt', 'exactMolWt', 'qed',
       'TPSA', 'HeavyAtomMolWt', 'MolLogP', 'MolMR', 'FractionCSP3',
       'NumValenceElectrons', 'MaxPartialCharge', 'MinPartialCharge',
       'FpDensityMorgan1', 'BalabanJ', 'BertzCT', 'HallKierAlpha', 'Ipc',
       'Kappa2', 'LabuteASA', 'PEOE_VSA10', 'PEOE_VSA2', 'SMR_VSA10',
       'SMR_VSA4', 'SlogP_VSA2', 'SlogP_VSA6', 'MaxEStateIndex',
       'MinEStateIndex', 'EState_VSA3', 'EState_VSA8', 'HeavyAtomCount',
       'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
       'NumAliphaticHeterocycles', 'NumAliphaticRings',
       'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings',
       'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
       'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles',
       'NumSaturatedRings', 'RingCount'],
      dtype='object')

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per feature column
X.describe()

Unnamed: 0,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,NumValenceElectrons,MaxPartialCharge,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
count,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,...,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0
mean,388.563602,388.127765,0.620003,70.103597,369.037449,3.943946,104.416344,0.275646,142.206107,0.283503,...,1.548664,3.201336,5.005725,1.243321,7.507634,4.138359,0.044847,0.323473,0.368321,3.959924
std,83.193165,83.027644,0.172518,21.376005,79.287778,1.249958,22.610226,0.143889,29.787716,0.100816,...,0.761739,0.857863,1.262177,0.837831,2.115107,1.904837,0.228972,0.562555,0.594329,0.852166
min,226.283,226.121846,0.138213,16.13,212.171,0.40492,62.0307,0.0,80.0,0.036113,...,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0
25%,324.65225,324.185915,0.479639,55.63,307.96475,3.06918,87.391475,0.1875,122.0,0.225395,...,1.0,3.0,4.0,1.0,6.0,3.0,0.0,0.0,0.0,3.0
50%,374.331,373.620195,0.64819,67.35,355.229,3.94468,101.61065,0.277778,138.0,0.255791,...,2.0,3.0,5.0,1.0,7.0,4.0,0.0,0.0,0.0,4.0
75%,447.36725,446.356872,0.767278,82.015,427.326,4.747525,116.444975,0.360909,160.0,0.348704,...,2.0,4.0,6.0,2.0,9.0,5.0,0.0,1.0,1.0,4.0
max,774.895,774.238214,0.92578,164.5,737.599,7.0606,200.3182,0.882353,286.0,0.585809,...,4.0,5.0,13.0,4.0,16.0,12.0,3.0,4.0,4.0,10.0


In [5]:
# Check balance of target values: count how many rows carry each class label
y.value_counts()

Xbridge HpH    729
Gemini LpH     319
Name: preferred_lcms_method, dtype: int64

In [6]:
# Plain (non-resampled) train/test split using scikit-learn's default split
# proportions; random_state pins the shuffle so the split is reproducible.
# NOTE(review): no `stratify` argument is passed, so the class ratio in each
# split is left to chance — confirm this is intended for the imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [7]:
# Standardize the features: the scaler is fitted on the training split only,
# so nothing from the test split enters the fitted scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Display the scaled training matrix
X_train_scaled

array([[ 0.59733689,  0.58882916,  0.92822901, ...,  1.15766035,
         1.01840514, -1.11432097],
       [ 0.11735691,  0.10851085,  0.5338891 , ...,  1.15766035,
         1.01840514,  0.01727629],
       [-0.37292342, -0.3724427 , -0.34769352, ..., -0.56894503,
        -0.61519984,  0.01727629],
       ...,
       [-0.77102791, -0.77100915,  0.55035922, ..., -0.56894503,
        -0.61519984,  0.01727629],
       [-1.2995918 , -1.29924377,  1.29041326, ...,  1.15766035,
         1.01840514, -1.11432097],
       [-0.08062311, -0.07826255,  1.40469134, ...,  1.15766035,
         1.01840514, -1.11432097]])

# Test Machine Learning Models

In [8]:
# Classifiers that are fitted directly on the scaled training split.
# NOTE(review): y_train holds string class labels; recent XGBoost releases may
# reject non-integer labels — confirm against the installed version.
ml_list1 = {
    "Balanced Random Forest Classifier": BalancedRandomForestClassifier(
        n_estimators=100, random_state=1
    ),
    "Easy Ensemble AdaBoost Classifier": EasyEnsembleClassifier(
        n_estimators=100, random_state=1
    ),
    "XGBoost Classifier": XGBClassifier(n_estimators=100, random_state=1),
}

# Resamplers: each one resamples the scaled training split, after which a
# LogisticRegression is fitted on the resampled data.
ml_list2 = {
    "Logistic Regression with Random Oversampling": RandomOverSampler(
        random_state=1
    ),
    "Logistic Regression with SMOTE Oversampling": SMOTE(
        random_state=1, sampling_strategy="auto"
    ),
    "Logistic Regression with Random Undersampling": RandomUnderSampler(
        random_state=1
    ),
    "Logistic Regression with Cluster Centroids Undersampling": ClusterCentroids(
        random_state=1
    ),
    "Logistic Regression with SMOTEENN Combination Over- and Undersampling": SMOTEENN(
        random_state=1
    ),
}

# One {"Name", "Balanced Accuracy Score"} record per evaluated configuration
ret = []

# Pass 1: fit each classifier as-is and score it on the scaled test split
for name, model in ml_list1.items():
    ml = model.fit(X_train_scaled, y_train)
    y_pred = ml.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    ret.append({"Name": name, "Balanced Accuracy Score": ba_score})

# Pass 2: resample the training split, fit logistic regression, then score.
# Only the training data is resampled; the test split is used unchanged.
for name, sampler in ml_list2.items():
    X_resampled, y_resampled = sampler.fit_resample(X_train_scaled, y_train)
    ml = LogisticRegression(random_state=1).fit(X_resampled, y_resampled)
    y_pred = ml.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    ret.append({"Name": name, "Balanced Accuracy Score": ba_score})
    



In [9]:
# Tabulate the collected scores, highest balanced accuracy first
summary_df = pd.DataFrame(ret).sort_values(
    by="Balanced Accuracy Score",
    ascending=False,
)
summary_df

Unnamed: 0,Name,Balanced Accuracy Score
0,Balanced Random Forest Classifier,0.873397
6,Logistic Regression with Cluster Centroids Und...,0.872631
2,XGBoost Classifier,0.865803
4,Logistic Regression with SMOTE Oversampling,0.86427
3,Logistic Regression with Random Oversampling,0.860577
7,Logistic Regression with SMOTEENN Combination ...,0.858835
5,Logistic Regression with Random Undersampling,0.856884
1,Easy Ensemble AdaBoost Classifier,0.852425
