In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import numpy as np
from numpy import loadtxt
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine, text
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from collections import Counter

# Load Data

In [3]:
# Load data and create DF
df = pd.read_csv("https://raw.githubusercontent.com/jenamis/purifAI/jen/MachineLearning/merged_for_testing.csv")
print(df.shape)
df.head()

(1086, 71)


Unnamed: 0,SAMPLE_ID,STRUCTURE_ID,PREFERRED_LCMS_METHOD,SPE_METHOD,METHOD,SPE_SUCCESSFUL,CRASHED_OUT,SAMPLE_STATUS,SAMPLE_CURRENT_STATUS,TERMINATION_CAUSE,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,MTA0ST2022-051-001_S2,MTA0ST2022-051-001,Xbridge HpH,HLB,HLB/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,2,9,2,0,1,1,3
1,MTA0ST2022-051-003_G1,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,1,8,2,0,1,1,3
2,MTA0ST2022-051-004,MTA0ST2022-051-004,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Stored,,...,0,2,3,1,6,2,0,1,1,3
3,MTA0ST2022-051-005,MTA0ST2022-051-005,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Stored,,...,1,2,2,1,6,2,0,1,1,3
4,MTA0ST2022-051-006,MTA0ST2022-051-006,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,3,2,1,6,2,0,1,1,4


In [4]:
# # Connect to AWS database
# database_url = f'postgresql://postgres:purifai2022@purifai.ceoinb9nwfxg.us-west-1.rds.amazonaws.com/postgres'
# engine = sqlalchemy.create_engine(database_url)
# connection = engine.connect()

In [5]:
# # Establish connection with engine object
# with engine.connect() as conn:
#     result = conn.execute(text("SELECT * FROM spe_analysis;"))

In [6]:
# # Create DF
# df = pd.DataFrame(result, columns = result.keys())
# df = df.drop(columns = "_c0")
# print(df.shape)
# df.head()

In [7]:
# Encode SPE method
le = LabelEncoder()
df_encoded = df.copy()
df_encoded["SPE_METHOD"] = le.fit_transform(df_encoded["SPE_METHOD"])

In [8]:
# # Keep only rows where "SPE_SUCCESSFUL" is TRUE
# df = df[df["SPE_SUCCESSFUL"] == "TRUE"]
# print(df.shape)
# df.head()

# Define Features and Target and Split and Scale Data

In [9]:
# Create features
X = df.drop(columns = ["SAMPLE_ID", 
                       "STRUCTURE_ID",
                       "PREFERRED_LCMS_METHOD", 
                       "SPE_METHOD", 
                       "METHOD", 
                       "SPE_SUCCESSFUL", 
                       "CRASHED_OUT", 
                       "SAMPLE_STATUS", 
                       "SAMPLE_CURRENT_STATUS", 
                       "TERMINATION_CAUSE", 
                       "TERMINATION_STEP", 
                       "TERMINATION_DETAILS", 
                       "REACTION_SCALE_(mmol)", 
                       "SELECTED_FRACTIONS", 
                       "VOLUME_COLLECTED_(mL)", 
                       "TOTAL_FRACTIONS_COLLECTED", 
                       "RECOVERED_SAMPLE_DRY_MASS_(mg)", 
                       "PERCENT_YIELD", 
                       "%_PURITY_(BY_LCMS)", 
                       "PURIFICATION_COMMENTS",
                       "EXTRA_COMMENTS",
                       'logP', 
                       'HBD', 
                       'HBA',
                       'Flexibility',
                       'Rotatable Bonds'])

# Create target
y = df_encoded["SPE_METHOD"]

In [10]:
X.describe()

Unnamed: 0,MolWt,ExactMolWt,TPSA,qed,HeavyAtomMolWt,MolLogP,MolMR,FractionCSP3,NumValenceElectrons,MaxPartialCharge,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
count,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,...,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0,1086.0
mean,390.582017,390.148447,70.864291,0.616214,370.89353,3.953765,104.919159,0.276478,143.001842,0.28436,...,1.539595,3.197053,5.049724,1.279006,7.571823,4.201657,0.044199,0.328729,0.372928,3.952118
std,83.810932,83.65081,21.882793,0.173756,79.761121,1.247837,22.850108,0.144754,30.10259,0.100642,...,0.75824,0.860731,1.292058,0.863207,2.126397,1.944112,0.226938,0.561141,0.59333,0.846522
min,226.283,226.121846,16.13,0.138213,212.171,0.40492,62.0307,0.0,80.0,0.036113,...,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0
25%,326.3095,326.081302,55.915,0.470119,310.219,3.07739,88.019,0.1875,122.0,0.226267,...,1.0,3.0,4.0,1.0,6.0,3.0,0.0,0.0,0.0,3.0
50%,375.453,375.104148,68.1,0.639582,357.135,3.95262,101.86485,0.277778,138.0,0.255791,...,2.0,3.0,5.0,1.0,7.0,4.0,0.0,0.0,0.0,4.0
75%,450.326,449.102145,83.12,0.765451,430.063,4.7524,116.9609,0.363636,164.0,0.375954,...,2.0,4.0,6.0,2.0,9.0,5.0,0.0,1.0,1.0,4.0
max,774.895,774.238214,164.5,0.92578,737.599,7.0606,200.3182,0.882353,286.0,0.585809,...,4.0,5.0,13.0,4.0,16.0,12.0,3.0,4.0,4.0,10.0


In [11]:
# Check balance of target values
y.value_counts()

1    896
0    190
Name: SPE_METHOD, dtype: int64

In [12]:
# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [13]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled

array([[-1.20369719, -1.20312573,  1.20773171, ..., -0.6034376 ,
        -0.65025003,  0.05177056],
       [ 0.34225786,  0.34456683,  1.60849655, ...,  1.19582323,
         1.04081555,  0.05177056],
       [-0.23632135, -0.23441384, -1.1869444 , ...,  1.19582323,
         1.04081555,  0.05177056],
       ...,
       [-0.63430121, -0.63320249, -1.23934122, ...,  1.19582323,
         1.04081555,  0.05177056],
       [ 0.62083169,  0.62304415, -0.19895759, ..., -0.6034376 ,
        -0.65025003,  0.05177056],
       [-0.67103409, -0.66991226,  0.09418135, ...,  2.99508407,
         2.73188113,  0.05177056]])

#  Grid Search

### Grid search on XGBoost

In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Look at parameters used by our current forest
model = XGBClassifier(random_state=1)
print('Parameters currently in use:\n')
pprint(model.get_params())

Parameters currently in use:

{'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'binary:logistic',
 'predictor': None,
 'random_state': 1,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'use_label_encoder': True,
 'validate_parameters': None,
 'verbosity': None}


In [30]:
# Use the random grid to search for best hyperparameters
from sklearn.model_selection import RandomizedSearchCV

# setting the testing parameters
params = {"n_estimators" : [100,200,300,400,500,600,700,800,900,1000],
    "learning_rate"    : [0.001, 0.005, 0.05, 0.10, 0.2] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7, 10 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] }

# create model
model = XGBClassifier()

# Random search of parameters, using 3 fold cross validation
random_search = RandomizedSearchCV(model, param_distributions=params, scoring='balanced_accuracy', 
                                   n_jobs=-1, cv=3, random_state=1)


# Fit the random search model
random_search.fit(X_train_scaled, y_train)

random_search.best_params_






{'n_estimators': 200,
 'min_child_weight': 1,
 'max_depth': 6,
 'learning_rate': 0.2,
 'gamma': 0.3,
 'colsample_bytree': 0.3}

In [31]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3,
              enable_categorical=False, gamma=0.3, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [32]:
# Evaluate Random Search
# Compare the base model with the best random search model.

def evaluate(model, X_test_scaled, y_test):
    y_pred = model.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    print('Model Performance')
    print(f'Balanced accuracy score: {ba_score}')
    return ba_score

# performance of base model
base_model = XGBClassifier(n_estimators = 100, random_state = 1)
base_model.fit(X_train_scaled, y_train)
base_accuracy = evaluate(base_model, X_test_scaled, y_test)



Model Performance
Balanced accuracy score: 0.8542342342342342


In [33]:
# performance of random model for best parameter setting
best_random = random_search.best_estimator_
best_random.fit(X_train_scaled, y_train)
random_accuracy = evaluate(best_random, X_test_scaled, y_test)

Model Performance
Balanced accuracy score: 0.8687387387387387


In [34]:
# Improvement 
print('Improvement of best random model: {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of best random model: 1.70%.


## Further improve our results by using grid search

In [37]:
#  further improve our results by using grid search to focus on the 
# most promising hyperparameters ranges found in the random search.

from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
params = {"n_estimators" : [100,200,400,600,800],
    'min_child_weight': [1,3,5,7,9,11], 
 'max_depth': [1,6,8,11],
 'learning_rate': [0.001, 0.01, 0.05, 0.1],
 'gamma': [0.0, 0.05, 0.1],
 'colsample_bytree': [0, 0.3, 0.5]}

# Create a based model
model = XGBClassifier(n_estimators = 100, random_state = 1)
# Instantiate the grid search model
grid_search = GridSearchCV(model, param_grid=params, 
                          cv = 3, n_jobs = -1)

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































KeyboardInterrupt: 

In [None]:
# performance of the best_grid
best_grid = grid_search.best_estimator_
best_grid.fit(X_train_scaled, y_train)
grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

In [21]:
# # Grid search on Easy Ensemble AdaBoost
# from imblearn.ensemble import EasyEnsembleClassifier
# from sklearn.model_selection import GridSearchCV

# parameters = {'n_estimators': [100, 400]}
# model = EasyEnsembleClassifier()
# clf = GridSearchCV(estimator=model,
#              param_grid=parameters)
# clf.fit(X_train_scaled, y_train)

# print(clf.get_params())
# clf.score(X_train_scaled, y_train)

In [22]:
# # A parameter grid for Easy Ensemble AdaBoost
# params = {'n_estimators': [100, 400]}

# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import GridSearchCV
# folds = 5
# param_comb = 50

# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1)
# print('start！！')
# random_search = GridSearchCV(model, param_grid=params, scoring='balanced_accuracy', n_jobs=1, cv=skf.split(X_train_scaled,y_train), 
#                                    verbose=3)

# # Here we go
# random_search.fit(X_train_scaled, y_train)
# print('finished!!!')

# random_search.score(X_train_scaled, y_train)

In [23]:
## Grid search on Balanced Random Forest

# from imblearn.ensemble import BalancedRandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# parameters = {
#               'n_estimators': [100, 400],
#                 'max_depth':[100,1000],
#     "random_state":[1,42]
# }
# model = BalancedRandomForestClassifier()
# clf = GridSearchCV(estimator=model,
#              param_grid=parameters)
# clf.fit(X_train_scaled, y_train)

# print(clf.get_params())
# clf.score(X_train_scaled, y_train)

In [24]:
# # A parameter grid for Balanced Random Forest
# params = {'n_estimators': [100, 400],
#                 'max_depth':[100,1000],
#     "random_state":[1,42]
# }

# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import GridSearchCV
# folds = 5
# param_comb = 50

# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1)
# print('start！！')
# random_search = GridSearchCV(model, param_grid=params, scoring='balanced_accuracy', n_jobs=1, cv=skf.split(X_train_scaled,y_train), 
#                                    verbose=3)

# # Here we go
# random_search.fit(X_train_scaled, y_train)
# print('finished!!!')

# random_search.score(X_train_scaled, y_train)