In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by any library used
# in later cells are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import numpy as np
from numpy import loadtxt
import pandas as pd
from collections import Counter
import sqlalchemy
from sqlalchemy import create_engine, text


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.combine import SMOTEENN


# Load Data

In [3]:
# Connect to AWS database
# The connection URL embeds the database password, so it is read from the
# environment instead of being committed with the notebook. Any password that
# was previously stored in this cell should be treated as exposed and rotated.
import os

database_url = os.environ.get("PURIFAI_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the PURIFAI_DATABASE_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://<user>:<password>@<host>/<database>)."
    )

engine = sqlalchemy.create_engine(database_url)
# NOTE(review): this long-lived connection is not used by the cells visible here
# (the query cell opens its own connection); it is kept only so the name stays
# defined — confirm nothing else needs it and drop it, or close it explicitly.
connection = engine.connect()

In [4]:
# Establish connection with engine object
# Select the successful SPE outcomes joined to their rows in `structures`.
# The statement is wrapped in text(): SQLAlchemy 2.0 no longer accepts a plain
# string in Connection.execute().
spe_query = text(
    "SELECT * FROM outcomes INNER JOIN structures ON outcomes.structure_id = structures.structure_id WHERE spe_successful = 'true';"
)

with engine.connect() as conn:
    # NOTE(review): the result object is read in the next cell, after this
    # `with` block has released the connection — confirm the rows remain
    # available with the installed driver/SQLAlchemy version.
    spe_analysis = conn.execute(spe_query)

In [5]:
# Column labels and row records taken from the query result
columns = list(spe_analysis.keys())
data = list(spe_analysis)

# Build the DataFrame, then keep only the first occurrence of any column label
# that appears more than once
spe_analysis_df = pd.DataFrame(data, columns=columns)
is_repeated_label = spe_analysis_df.columns.duplicated()
spe_analysis_df = spe_analysis_df.loc[:, ~is_repeated_label].copy()
print(spe_analysis_df.shape)
spe_analysis_df.head()

(1080, 65)


Unnamed: 0,sample_id,structure_id,preferred_lcms_method,spe_method,method,spe_successful,crashed_out,sample_status,sample_current_status,termination_cause,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,00YLL22-042-002,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,00YLL22-042-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,00YLL22-042-004,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,00YLL22-042-005,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,00YLL22-042-008,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,2,0,1,1,5


In [6]:
# Retrieve the column names as a list
list(spe_analysis_df.columns)

['sample_id',
 'structure_id',
 'preferred_lcms_method',
 'spe_method',
 'method',
 'spe_successful',
 'crashed_out',
 'sample_status',
 'sample_current_status',
 'termination_cause',
 'termination_step',
 'termination_details',
 'reaction_scale',
 'selected_fractions',
 'volume_collected',
 'total_fractions_collected',
 'recovered_sample_dry_mass',
 'percent_yield',
 'percent_purity',
 'purification_comments',
 'MolWt',
 'exactMolWt',
 'qed',
 'TPSA',
 'HeavyAtomMolWt',
 'MolLogP',
 'MolMR',
 'FractionCSP3',
 'NumValenceElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'FpDensityMorgan1',
 'BalabanJ',
 'BertzCT',
 'HallKierAlpha',
 'Ipc',
 'Kappa2',
 'LabuteASA',
 'PEOE_VSA10',
 'PEOE_VSA2',
 'SMR_VSA10',
 'SMR_VSA4',
 'SlogP_VSA2',
 'SlogP_VSA6',
 'MaxEStateIndex',
 'MinEStateIndex',
 'EState_VSA3',
 'EState_VSA8',
 'HeavyAtomCount',
 'NHOHCount',
 'NOCount',
 'NumAliphaticCarbocycles',
 'NumAliphaticHeterocycles',
 'NumAliphaticRings',
 'NumAromaticCarbocycles',
 'NumAromaticHe

In [7]:
# Columns that are not inputs to the ML model (identifiers, run status,
# termination details and purification results)
non_feature_columns = [
    "sample_id",
    "preferred_lcms_method",
    "method",
    "spe_successful",
    "crashed_out",
    "sample_status",
    "sample_current_status",
    "termination_cause",
    "termination_step",
    "termination_details",
    "reaction_scale",
    "selected_fractions",
    "volume_collected",
    "total_fractions_collected",
    "recovered_sample_dry_mass",
    "percent_yield",
    "percent_purity",
    "purification_comments",
]

# Everything else (structure_id, spe_method and the descriptor columns) is kept
df_ml = spe_analysis_df.drop(columns=non_feature_columns)

df_ml.head()

Unnamed: 0,structure_id,spe_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,MCX,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,MCX,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,MCX,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,MCX,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,MCX,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


In [8]:
# Count rows that repeat an earlier row in every column
is_duplicate_row = df_ml.duplicated()
is_duplicate_row.sum()

22

In [9]:
# Remove fully identical rows, keeping the first occurrence of each.
# Rows sharing a structure ID but differing in SPE method are not identical,
# so structures successfully purified with both methods keep one row per method.
df_ml = df_ml.drop_duplicates(keep="first")
df_ml.head()

Unnamed: 0,structure_id,spe_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,MCX,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,MCX,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,MCX,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,MCX,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,MCX,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


In [10]:
# Encode the SPE method labels as integers; df_ml itself is left untouched
le = LabelEncoder()
spe_method_codes = le.fit_transform(df_ml["spe_method"])
df_encoded = df_ml.assign(spe_method=spe_method_codes)

# Define Features and Target, Split, and Scale Data

In [11]:
# Target: the encoded SPE method
target_column = "spe_method"
y = df_encoded[target_column]

# Features: every remaining column except the structure identifier
X = df_encoded.drop(columns=[target_column, "structure_id"])

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column
X.describe()

Unnamed: 0,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,NumValenceElectrons,MaxPartialCharge,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
count,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,...,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0,1058.0
mean,389.160841,388.725909,0.618677,70.119026,369.651563,3.955435,104.494754,0.275119,142.404537,0.284755,...,1.548204,3.205104,5.008507,1.247637,7.529301,4.148393,0.044423,0.323251,0.367675,3.961248
std,83.429934,83.266409,0.173401,21.34971,79.551184,1.257792,22.630721,0.1441,29.86407,0.101214,...,0.7597,0.861503,1.262562,0.832817,2.122901,1.912391,0.227927,0.561664,0.593225,0.850961
min,226.283,226.121846,0.138213,16.13,212.171,0.40492,62.0307,0.0,80.0,0.036113,...,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0
25%,325.372,325.013067,0.478493,55.63,308.25075,3.073165,87.467125,0.185185,122.0,0.225437,...,1.0,3.0,4.0,1.0,6.0,3.0,0.0,0.0,0.0,3.0
50%,374.4645,374.120132,0.647325,67.29,355.796,3.94736,101.7447,0.277778,138.0,0.255791,...,2.0,3.0,5.0,1.0,7.0,4.0,0.0,0.0,0.0,4.0
75%,447.85025,447.113784,0.76705,81.93,427.326,4.76242,116.614325,0.359286,160.0,0.390685,...,2.0,4.0,6.0,2.0,9.0,5.0,0.0,1.0,1.0,4.0
max,774.895,774.238214,0.92578,164.5,737.599,7.0606,200.3182,0.882353,286.0,0.585809,...,4.0,5.0,13.0,4.0,16.0,12.0,3.0,4.0,4.0,10.0


In [13]:
# Check balance of target values: number of rows per encoded SPE-method class
y.value_counts()

1    873
0    185
Name: spe_method, dtype: int64

In [14]:
# Plain (non-stratified) train/test split with scikit-learn's default split sizes
from sklearn.model_selection import train_test_split

# NOTE(review): the target classes are imbalanced; consider passing stratify=y so
# both partitions keep the same class ratio (this would change the reported scores).
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Learn the scaling parameters from the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)
X_train_scaled

array([[-0.23601421, -0.23375002,  0.48955629, ..., -0.59315847,
        -0.64066596, -1.15652533],
       [-0.7835774 , -0.78252458,  1.0243091 , ..., -0.59315847,
        -0.64066596,  0.04547282],
       [ 2.0575004 ,  2.06085669, -1.30592787, ..., -0.59315847,
        -0.64066596,  0.04547282],
       ...,
       [-0.65166017, -0.65066704,  0.62824528, ..., -0.59315847,
        -0.64066596, -1.15652533],
       [-1.29642992, -1.29611669,  1.4488616 , ...,  1.16197089,
         1.05282772, -1.15652533],
       [-0.24719506, -0.24515829,  1.56059483, ...,  1.16197089,
         1.05282772, -1.15652533]])

# Test Machine Learning Models

In [16]:
# Classifiers that are fitted directly on the scaled training data
ml_list1 = {
    "Balanced Random Forest Classifier": BalancedRandomForestClassifier(n_estimators=100, random_state=1),
    "Easy Ensemble AdaBoost Classifier": EasyEnsembleClassifier(n_estimators=100, random_state=1),
    "XGBoost Classifier": XGBClassifier(n_estimators=100, random_state=1),
}

# Resamplers applied to the scaled training data before fitting a logistic regression
ml_list2 = {
    "LR with Random Oversampling": RandomOverSampler(random_state=1),
    "LR with SMOTE Oversampling": SMOTE(random_state=1, sampling_strategy="auto"),
    "LR with Random Undersampling": RandomUnderSampler(random_state=1),
    "LR with Cluster Centroids Undersampling": ClusterCentroids(random_state=1),
    "LR with SMOTEENN Over and Undersampling": SMOTEENN(random_state=1),
}

# One record per model: its display name and balanced accuracy on the test split
ret = []

for model_name, estimator in ml_list1.items():
    ml = estimator.fit(X_train_scaled, y_train)
    y_pred = ml.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    ret.append({"Name": model_name, "Balanced Accuracy Score": ba_score})

for model_name, resampler in ml_list2.items():
    # Only the training data is resampled; the test split is scored as-is
    X_resampled, y_resampled = resampler.fit_resample(X_train_scaled, y_train)
    ml = LogisticRegression(random_state=1).fit(X_resampled, y_resampled)
    y_pred = ml.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    ret.append({"Name": model_name, "Balanced Accuracy Score": ba_score})
    



In [17]:
# Rank the models from highest to lowest balanced accuracy
results_table = pd.DataFrame(ret)
summary_df = results_table.sort_values(by="Balanced Accuracy Score", ascending=False)
summary_df

Unnamed: 0,Name,Balanced Accuracy Score
0,Balanced Random Forest Classifier,0.900943
2,XGBoost Classifier,0.891509
1,Easy Ensemble AdaBoost Classifier,0.879717
4,LR with SMOTE Oversampling,0.863208
6,LR with Cluster Centroids Undersampling,0.851415
7,LR with SMOTEENN Over and Undersampling,0.839623
3,LR with Random Oversampling,0.837264
5,LR with Random Undersampling,0.825472
