In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import os
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
import sqlalchemy
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine, text
from xgboost import XGBClassifier

# Load Data

In [3]:
# Connect to AWS database
# The connection URL (including the password) is read from the environment so that
# no secret is stored in the notebook.
# Expected format: postgresql://<user>:<password>@<host>/<database>
database_url = os.environ.get("PURIFAI_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the PURIFAI_DATABASE_URL environment variable to the PostgreSQL connection URL."
    )
engine = create_engine(database_url)
# NOTE(review): this connection is not used by any cell visible here and is never
# closed; it is kept only so that the `connection` name stays available.
connection = engine.connect()

In [4]:
# Run the query on a managed connection that is released when the block exits.
# The statement is wrapped in text() because SQLAlchemy 2.0 no longer accepts a
# plain string in Connection.execute(); the SQL itself is unchanged.
# NOTE(review): the result is read by a later cell after the connection has been
# released — confirm the rows are still available with the driver in use.
with engine.connect() as conn:
    lcms_analysis = conn.execute(
        text(
            "SELECT * FROM outcomes "
            "INNER JOIN structures ON outcomes.structure_id = structures.structure_id "
            "WHERE spe_successful = 'true' "
            "AND preferred_lcms_method IS NOT NULL "
            "AND NOT preferred_lcms_method = 'LunaOmega LpH';"
        )
    )

In [5]:
# Column labels come from the result's keys; rows are materialised into a list
columns = list(lcms_analysis.keys())
data = list(lcms_analysis)

# Build the DataFrame, then keep only the first occurrence of every column label
lcms_analysis_df = pd.DataFrame(data, columns=columns)
is_repeated_label = lcms_analysis_df.columns.duplicated()
lcms_analysis_df = lcms_analysis_df.loc[:, ~is_repeated_label].copy()
print(lcms_analysis_df.shape)
lcms_analysis_df.head()

(1069, 65)


Unnamed: 0,sample_id,structure_id,preferred_lcms_method,spe_method,method,spe_successful,crashed_out,sample_status,sample_current_status,termination_cause,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,00YLL22-042-002,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,00YLL22-042-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,00YLL22-042-004,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,00YLL22-042-005,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,00YLL22-042-008,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Shipped,,...,2,3,5,1,9,2,0,1,1,5


In [6]:
# Drop the columns that are not used by the ML model
df = lcms_analysis_df.drop(
    labels=[
        "sample_id",
        "spe_method",
        "method",
        "spe_successful",
        "crashed_out",
        "sample_status",
        "sample_current_status",
        "termination_cause",
        "termination_step",
        "termination_details",
        "reaction_scale",
        "selected_fractions",
        "volume_collected",
        "total_fractions_collected",
        "recovered_sample_dry_mass",
        "percent_yield",
        "percent_purity",
        "purification_comments",
    ],
    axis="columns",
)

df.head()

Unnamed: 0,structure_id,preferred_lcms_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,Gemini LpH,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,Xbridge HpH,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,Xbridge HpH,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,Xbridge HpH,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,Xbridge HpH,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


In [7]:
# Count the rows that exactly repeat an earlier row
df.duplicated(keep="first").sum()

21

In [8]:
# Discard repeated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")
print(df.shape)
df.head()

(1048, 47)


Unnamed: 0,structure_id,preferred_lcms_method,MolWt,exactMolWt,qed,TPSA,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YLL22-042-002,Gemini LpH,450.326,449.102145,0.648315,83.46,429.158,2.7852,114.4697,0.35,...,2,3,5,1,10,5,0,0,0,4
1,00YLL22-042-003,Xbridge HpH,446.338,445.10723,0.65459,74.23,425.17,3.5488,115.3177,0.380952,...,2,3,4,1,9,2,0,1,1,5
2,00YLL22-042-004,Xbridge HpH,434.327,433.10723,0.688605,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,3,0,1,1,5
3,00YLL22-042-005,Xbridge HpH,447.326,446.102479,0.670782,77.47,427.166,3.0478,114.5557,0.35,...,2,3,4,1,10,3,0,1,1,5
4,00YLL22-042-008,Xbridge HpH,434.327,433.10723,0.673755,66.39,413.159,3.011,112.1937,0.4,...,2,3,5,1,9,2,0,1,1,5


# Define Features and Target, then Split and Scale the Data

In [9]:
# Target: the preferred LC-MS method column
y = df["preferred_lcms_method"]

# Features: every column except the target and the structure identifier
X = df.drop(["preferred_lcms_method", "structure_id"], axis=1)

In [10]:
# Number of samples per target class, most frequent first
y.value_counts(sort=True, ascending=False)

Xbridge HpH    729
Gemini LpH     319
Name: preferred_lcms_method, dtype: int64

In [11]:
# Plain random train/test split: 25% held out for testing, fixed seed for repeatability.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [12]:
# Fit the scaler on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)
X_train_scaled

array([[ 0.59716775,  0.58864192,  0.93079866, ...,  1.15766035,
         1.01840514, -1.11664399],
       [ 0.11661332,  0.10774292,  0.53609135, ...,  1.15766035,
         1.01840514,  0.01584914],
       [-0.37425378, -0.37379209, -0.34631261, ..., -0.56894503,
        -0.61519984,  0.01584914],
       ...,
       [-0.77283473, -0.77284039,  0.55257681, ..., -0.56894503,
        -0.61519984,  0.01584914],
       [-1.30203121, -1.30171363,  1.29332034, ...,  1.15766035,
         1.01840514, -1.11664399],
       [-0.08160365, -0.07925628,  1.40770489, ...,  1.15766035,
         1.01840514, -1.11664399]])

# Grid Search on XGBoost

In [13]:
# Inspect the hyperparameters of an XGBClassifier created with default settings
model = XGBClassifier(random_state=1)
print('Parameters currently in use:\n')
default_params = model.get_params()
pprint(default_params)

Parameters currently in use:

{'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'binary:logistic',
 'predictor': None,
 'random_state': 1,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'use_label_encoder': True,
 'validate_parameters': None,
 'verbosity': None}


In [15]:
# Exhaustive grid search over a hand-picked set of XGBoost hyperparameter ranges.
# NOTE(review): no random search is visible in this notebook; the ranges below are
# taken as given.

# Parameter grid to search
params = {
    "n_estimators": [100, 200, 400, 600, 800],
    "learning_rate": [0.001, 0.01, 0.05, 0.1],
    "gamma": [0.0, 0.05, 0.1],
    # colsample_bytree is documented with the range (0, 1]; the out-of-range value 0
    # was removed from the grid.
    "colsample_bytree": [0.3, 0.5],
    "max_depth": range(3, 10, 2),
    "min_child_weight": range(1, 6, 2),
}

# Create a base model
model = XGBClassifier(n_estimators=100, random_state=1)

# Instantiate the grid search model.
# NOTE(review): no `scoring` is passed, so candidates are ranked with the estimator's
# default score, while later cells report balanced accuracy — confirm this is intended.
grid_search = GridSearchCV(model, param_grid=params, cv=3, n_jobs=None)

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_

















































































































































































































































































































































































































































































































{'colsample_bytree': 0.5,
 'gamma': 0.1,
 'learning_rate': 0.01,
 'max_depth': 9,
 'min_child_weight': 3,
 'n_estimators': 200}

In [16]:
# Compare the base model with the best grid search model.
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
    
def evaluate(model, X_test_scaled, y_test):
    """Print and return the balanced accuracy of ``model`` on the given test data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test_scaled : feature matrix passed to ``model.predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    The value returned by ``balanced_accuracy_score(y_test, predictions)``.
    """
    score = balanced_accuracy_score(y_test, model.predict(X_test_scaled))
    print(f'Balanced accuracy score: {score}')
    return score

# Performance of the base model
base_model = XGBClassifier(n_estimators=100, random_state=1)
base_model.fit(X_train_scaled, y_train)
base_accuracy = evaluate(base_model, X_test_scaled, y_test)

# Performance of the best grid-search model.
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is evaluated directly instead of being fitted again.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)

# Relative change in balanced accuracy over the base model, as a percentage string
relative_gain = 100 * (grid_accuracy - base_accuracy) / base_accuracy
improvement = f'{relative_gain:0.2f}%'
print(f'Improvement of best_grid on XGBoost is {improvement}')


Balanced accuracy score: 0.8630852842809364
Balanced accuracy score: 0.8857998885172798
Improvement of best_grid on XGBoost is 2.63%


In [17]:
# One-row summary of the base and grid-search balanced accuracy for XGBoost
xgb_list = {
    "Name": 'XGBoost',
    "Base model Balanced Accuracy": base_accuracy,
    "Grid model Balanced Accuracy": grid_accuracy,
    "Improvement": improvement,
}
df_xgb = pd.DataFrame(data=xgb_list, index=[0])
df_xgb

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,XGBoost,0.863085,0.8858,2.63%


In [18]:
# Write the summary table to CSV without the index column
df_xgb.to_csv(path_or_buf='df_lcms_xgb.csv', index=False)

### Compare recall and precision of the base and grid-search models

In [19]:
# Make predictions using test data
y_pred = best_grid.predict(X_test_scaled)

# Class labels in sorted order; they are passed to confusion_matrix explicitly and
# reused for the display names so the matrix and its headings cannot disagree.
class_labels = sorted(set(y_test) | set(y_pred))

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print("Best grid Confusion Matrix:")
display(cm_df)
print(f"\n\nBest grid Imbalanced Classification Report: \n\n{ic_report}")

Best grid Confusion Matrix:


Unnamed: 0,Predicted Gemini LpH,Predicted Xbridge HpH
Actual Gemini LpH,64,14
Actual Xbridge HpH,9,175




Best grid Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

 Gemini LpH       0.88      0.82      0.95      0.85      0.88      0.77        78
Xbridge HpH       0.93      0.95      0.82      0.94      0.88      0.79       184

avg / total       0.91      0.91      0.86      0.91      0.88      0.78       262



In [20]:
# Make predictions using test data
y_pred = base_model.predict(X_test_scaled)

# Class labels in sorted order; they are passed to confusion_matrix explicitly and
# reused for the display names so the matrix and its headings cannot disagree.
class_labels = sorted(set(y_test) | set(y_pred))

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print("Base model Confusion Matrix:")
display(cm_df)
print(f"\n\nBase model Imbalanced Classification Report: \n\n{ic_report}")

Base model Confusion Matrix:


Unnamed: 0,Predicted Gemini LpH,Predicted Xbridge HpH
Actual Gemini LpH,63,15
Actual Xbridge HpH,15,169




Base model Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

 Gemini LpH       0.81      0.81      0.92      0.81      0.86      0.73        78
Xbridge HpH       0.92      0.92      0.81      0.92      0.86      0.75       184

avg / total       0.89      0.89      0.84      0.89      0.86      0.75       262

