In [1]:
import warnings
# Install a blanket 'ignore' filter: no message, category or module restriction is
# given, so every warning raised after this cell is silenced.
# NOTE(review): this also hides deprecation notices from the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import os
from collections import Counter
from pprint import pprint

import numpy as np
from numpy import loadtxt
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine, text
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

# Load Data

In [3]:
# Connect to AWS database
# The connection URL embeds the database password, so it is read from the
# environment instead of being stored in the notebook (and its version history).
# Expected form: postgresql://<user>:<password>@<host>/<database>
# NOTE(review): the password that was previously committed in this cell should be
# rotated, since it remains visible in earlier revisions.
database_url = os.environ.get("PURIFAI_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the PURIFAI_DATABASE_URL environment variable to the PostgreSQL "
        "connection URL before running this notebook."
    )

# The engine only manages a connection pool; individual cells open short-lived
# connections with `with engine.connect() as conn:` so nothing is left open here.
engine = sqlalchemy.create_engine(database_url)

In [4]:
# Establish connection with engine object
# Select joined outcome/structure rows for successful SPE runs that have a
# preferred LCMS method other than 'LunaOmega LpH'.
# The SQL is wrapped in text(): SQLAlchemy 2.x no longer accepts a bare string in
# Connection.execute(), while text() works on both 1.x and 2.x.
lcms_query = text(
    """
    SELECT *
    FROM outcomes
    INNER JOIN structures
        ON outcomes.structure_id = structures.structure_id
    WHERE spe_successful = 'true'
      AND preferred_lcms_method IS NOT NULL
      AND NOT preferred_lcms_method = 'LunaOmega LpH';
    """
)

# NOTE(review): `lcms_analysis` is read by the next cell after this `with` block has
# closed the connection; confirm the installed SQLAlchemy version still allows rows
# to be fetched at that point, or move the fetch inside the block.
with engine.connect() as conn:
    lcms_analysis = conn.execute(lcms_query)

In [5]:
# Column labels come from the result's keys; the rows are read by iterating the result
columns = list(lcms_analysis.keys())
data = list(lcms_analysis)

# Create DF, then keep only the first occurrence of any repeated column label
lcms_analysis_df = pd.DataFrame(data, columns=columns)
is_first_occurrence = ~lcms_analysis_df.columns.duplicated()
lcms_analysis_df = lcms_analysis_df.loc[:, is_first_occurrence].copy()
print(lcms_analysis_df.shape)
lcms_analysis_df.head()

(1069, 65)


Unnamed: 0,sample_id,structure_id,preferred_lcms_method,spe_method,method,spe_successful,crashed_out,sample_status,sample_current_status,termination_cause,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,00YLL22-042-002,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,00YLL22-042-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,00YLL22-042-004,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,00YLL22-042-005,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,00YLL22-042-008,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,2,0,1,1,5


In [6]:
# Columns that are not used for the ML model
unused_columns = [
    "sample_id",
    "spe_method",
    "method",
    "spe_successful",
    "crashed_out",
    "sample_status",
    "sample_current_status",
    "termination_cause",
    "termination_step",
    "termination_details",
    "reaction_scale",
    "selected_fractions",
    "volume_collected",
    "total_fractions_collected",
    "recovered_sample_dry_mass",
    "percent_yield",
    "percent_purity",
    "purification_comments",
]

# drop() returns a new frame, so lcms_analysis_df itself is left unchanged
df = lcms_analysis_df.drop(columns=unused_columns)

df.head()

Unnamed: 0,structure_id,preferred_lcms_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,Gemini LpH,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,Xbridge HpH,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,Xbridge HpH,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,Xbridge HpH,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,Xbridge HpH,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


In [7]:
# Count rows that exactly repeat an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

21

In [8]:
# Remove repeated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")
print(df.shape)
df.head()

(1048, 47)


Unnamed: 0,structure_id,preferred_lcms_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,Gemini LpH,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,Xbridge HpH,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,Xbridge HpH,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,Xbridge HpH,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,Xbridge HpH,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


# Define Features and Target, Split, and Scale Data

In [9]:
# Target: the preferred LCMS method label
y = df["preferred_lcms_method"]

# Features: everything except the target and the structure identifier
non_feature_columns = ["preferred_lcms_method", "structure_id"]
X = df.drop(columns=non_feature_columns)

In [10]:
# Count how many samples fall into each target class
class_counts = y.value_counts()
class_counts

Xbridge HpH    729
Gemini LpH     319
Name: preferred_lcms_method, dtype: int64

In [11]:
# Plain (non-stratified) train-test split with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [12]:
# Fit the scaler on the training split only, so the test split does not
# influence the scaling statistics
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted transform to the training split first, then the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)
X_train_scaled

array([[ 0.59716775,  0.58864192,  0.93079866, ...,  1.15766035,
         1.01840514, -1.11664399],
       [ 0.11661332,  0.10774292,  0.53609135, ...,  1.15766035,
         1.01840514,  0.01584914],
       [-0.37425378, -0.37379209, -0.34631261, ..., -0.56894503,
        -0.61519984,  0.01584914],
       ...,
       [-0.77283473, -0.77284039,  0.55257681, ..., -0.56894503,
        -0.61519984,  0.01584914],
       [-1.30203121, -1.30171363,  1.29332034, ...,  1.15766035,
         1.01840514, -1.11664399],
       [-0.08160365, -0.07925628,  1.40770489, ...,  1.15766035,
         1.01840514, -1.11664399]])

### Grid Search on Easy Ensemble AdaBoost

In [13]:
# Show every parameter of the Easy Ensemble classifier as configured for the
# base model (100 estimators, fixed seed).
# GridSearchCV and pprint are imported in the dependency cell at the top of the
# notebook rather than here, so all imports live in one place.
model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
print('Parameters currently in use:\n')
pprint(model.get_params())

Parameters currently in use:

{'base_estimator': None,
 'n_estimators': 100,
 'n_jobs': None,
 'random_state': 1,
 'replacement': False,
 'sampling_strategy': 'auto',
 'verbose': 0,
 'warm_start': False}


In [15]:
# Use grid search to search for best hyperparameters
# Candidate ensemble sizes: 50, 100, ..., 750
params = {'n_estimators': list(range(50, 800, 50))}

# Estimator whose n_estimators is being tuned
model = EasyEnsembleClassifier(random_state=1)

# 3-fold cross-validated search over the grid.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the search's
# default scorer, while the cells below report balanced accuracy; confirm that
# is the intended selection metric.
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=3, n_jobs=None)

# Fit the grid search to the scaled training data and show the winning setting
grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_

{'n_estimators': 250}

In [13]:
# Compare the base model with the best grid search model.

def evaluate(model, X_test_scaled, y_test):
    """Print and return the balanced accuracy of ``model`` on the given test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_scaled: Feature matrix passed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        The value returned by ``balanced_accuracy_score(y_test, predictions)``.
    """
    predictions = model.predict(X_test_scaled)
    score = balanced_accuracy_score(y_test, predictions)
    print('Model Performance')
    print(f'Balanced accuracy score: {score}')
    return score

def f1_calc(model, X_test_scaled, y_test):
    """Print and return the weighted F1 score of ``model`` on the given test data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_scaled: Feature matrix passed to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        The value returned by ``f1_score`` with ``average="weighted"``.
    """
    predictions = model.predict(X_test_scaled)
    weighted_f1 = f1_score(y_test, predictions, average="weighted")
    print(f"Weighted F1 Score: {weighted_f1}")
    return weighted_f1

# Baseline for comparison: Easy Ensemble with 100 estimators, trained on the
# scaled training split and scored on the scaled test split
base_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
base_model.fit(X_train_scaled, y_train)

base_accuracy = evaluate(model=base_model, X_test_scaled=X_test_scaled, y_test=y_test)
base_f1 = f1_calc(model=base_model, X_test_scaled=X_test_scaled, y_test=y_test)

Model Performance
Balanced accuracy score: 0.8524247491638797
Weighted F1 Score: 0.8580660055457039


In [14]:
# performance of the best_grid
# Take the winning hyperparameters from the fitted grid search instead of
# re-typing them, so this cell cannot drift from the search result when the data
# or the parameter grid changes.
best_grid = EasyEnsembleClassifier(**grid_search.best_params_, random_state=1)
best_grid.fit(X_train_scaled, y_train)
grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)
grid_f1 = f1_calc(best_grid, X_test_scaled, y_test)

# Relative change versus the base model, formatted as a percentage string
grid_ba_difference = f"{100 * (grid_accuracy - base_accuracy) / base_accuracy:0.2f}%"
grid_f1_difference = f"{100 * (grid_f1 - base_f1) / base_f1:0.2f}%"
print(f"Balanced accuracy score difference for model with best grid parameters: {grid_ba_difference}")
print(f"F1 score difference for model with best grid parameters: {grid_f1_difference}")

Model Performance
Balanced accuracy score: 0.8551421404682273
Weighted F1 Score: 0.8616244737952081
Balanced accuracy score difference for model with best grid parameters: 0.32%
F1 score difference for model with best grid parameters: 0.41%


In [16]:
# One-row summary of base vs. grid-search results for the Easy Ensemble model
ee_list = {
    "Name": 'Easy Ensemble AdaBoost',
    "Base Model BA": base_accuracy,
    "Grid Model BA": grid_accuracy,
    "BA Difference": grid_ba_difference,
    "Base Model F1": base_f1,
    "Grid Model F1": grid_f1,
    "F1 Difference": grid_f1_difference,
}

# The values are scalars, so an explicit single-row index is supplied
df_ee = pd.DataFrame(data=ee_list, index=[0])
df_ee

Unnamed: 0,Name,Base Model BA,Grid Model BA,BA Difference,Base Model F1,Grid Model F1,F1 Difference
0,Easy Ensemble AdaBoost,0.852425,0.855142,0.32%,0.858066,0.861624,0.41%


In [17]:
# Write the summary row to CSV without the index column
df_ee.to_csv(path_or_buf='df_lcms_ee_with_f1.csv', index=False)