In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Import dependencies
import numpy as np
from numpy import loadtxt
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine, text
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from collections import Counter

# Define Features and Target and Split and Scale Data

In [14]:
# load data
df = pd.read_csv("lcms_df.csv")

# Create features
X = df.drop(columns = ["preferred_lcms_method", "structure_id"])

# Create target
y = df["preferred_lcms_method"]

In [15]:
# Check balance of target values
y.value_counts()

Xbridge HpH    729
Gemini LpH     319
Name: preferred_lcms_method, dtype: int64

In [16]:
# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [17]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled

array([[ 0.59733689,  0.58882916,  0.92822901, ...,  1.15766035,
         1.01840514, -1.11432097],
       [ 0.11735691,  0.10851085,  0.5338891 , ...,  1.15766035,
         1.01840514,  0.01727629],
       [-0.37292342, -0.3724427 , -0.34769352, ..., -0.56894503,
        -0.61519984,  0.01727629],
       ...,
       [-0.77102791, -0.77100915,  0.55035922, ..., -0.56894503,
        -0.61519984,  0.01727629],
       [-1.2995918 , -1.29924377,  1.29041326, ...,  1.15766035,
         1.01840514, -1.11432097],
       [-0.08062311, -0.07826255,  1.40469134, ...,  1.15766035,
         1.01840514, -1.11432097]])

### Grid search on Easy Ensemble AdaBoost 

In [18]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Look at parameters used by our current forest
model = EasyEnsembleClassifier(n_estimators=100, random_state=1)
print('Parameters currently in use:\n')
pprint(model.get_params())

Parameters currently in use:

{'base_estimator': None,
 'n_estimators': 100,
 'n_jobs': None,
 'random_state': 1,
 'replacement': False,
 'sampling_strategy': 'auto',
 'verbose': 0,
 'warm_start': False}


In [20]:
# Evaluate Random Search
# Compare the base model with the best random search model.

def evaluate(model, X_test_scaled, y_test):
    y_pred = model.predict(X_test_scaled)
    ba_score = balanced_accuracy_score(y_test, y_pred)
    print('Model Performance')
    print(f'Balanced accuracy score: {ba_score}')
    return ba_score

# performance of base model
base_model = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
base_model.fit(X_train_scaled, y_train)
base_accuracy = evaluate(base_model, X_test_scaled, y_test)



Model Performance
Balanced accuracy score: 0.8524247491638797


## Further improve our results by using grid search

In [21]:
#  further improve our results by using grid search to focus on the 
# most promising hyperparameters ranges found in the random search.

from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
params = {
 'n_estimators': [i for i in range(50,800,50)]}

# Create a based model
model = EasyEnsembleClassifier(random_state = 1)
# Instantiate the grid search model
grid_search = GridSearchCV(model, param_grid=params, 
                          cv = 3, n_jobs = -1)

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)
grid_search.best_params_

{'n_estimators': 100}

In [25]:
# performance of the best_grid
best_grid = grid_search.best_estimator_
best_grid.fit(X_train_scaled, y_train)
grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)
improvement = '{:0.2f}%'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy)
print(f'Improvement of best_grid on Easy Ensemble AdaBoost is {improvement}')


Model Performance
Balanced accuracy score: 0.8524247491638797
Improvement of best_grid on Easy Ensemble AdaBoost is 0.00%


In [23]:
ee_list = {
        "Name": 'Easy Ensemble AdaBoost',
        "Base model Balanced Accuracy":base_accuracy,
        "Grid model Balanced Accuracy":grid_accuracy,
        "Improvement" : improvement
    }
df_ee = pd.DataFrame(ee_list, index=[0])
df_ee

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Easy Ensemble AdaBoost,0.852425,0.852425,0.00%


In [24]:
df_ee.to_csv('df_ee.csv', index =False)