In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import numpy as np
import pandas as pd
import sqlalchemy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pprint import pprint

from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler

# Define Features and Target and Split and Scale Data

In [3]:
# load data
df = pd.read_csv("lcms_df.csv")

# Create features
X = df.drop(columns = ["preferred_lcms_method", "structure_id"])

# Create target
y = df["preferred_lcms_method"]

In [4]:
# Check balance of target values
y.value_counts()

Xbridge HpH    729
Gemini LpH     319
Name: preferred_lcms_method, dtype: int64

In [5]:
# Normal train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [6]:
# Create StandardScaler instance
scaler = StandardScaler()

# Fit StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled

array([[ 0.59733689,  0.58882916,  0.92822901, ...,  1.15766035,
         1.01840514, -1.11432097],
       [ 0.11735691,  0.10851085,  0.5338891 , ...,  1.15766035,
         1.01840514,  0.01727629],
       [-0.37292342, -0.3724427 , -0.34769352, ..., -0.56894503,
        -0.61519984,  0.01727629],
       ...,
       [-0.77102791, -0.77100915,  0.55035922, ..., -0.56894503,
        -0.61519984,  0.01727629],
       [-1.2995918 , -1.29924377,  1.29041326, ...,  1.15766035,
         1.01840514, -1.11432097],
       [-0.08062311, -0.07826255,  1.40469134, ...,  1.15766035,
         1.01840514, -1.11432097]])

### Grid search on Logistic Regression

In [7]:
ml_list = {"Logistic Regression with Random Oversampling":RandomOverSampler(random_state = 1),
            "Logistic Regression with SMOTE Oversampling":SMOTE(random_state = 1, sampling_strategy = "auto"),
            "Logistic Regression with Random Undersampling":RandomUnderSampler(random_state = 1),
            "Logistic Regression with Cluster Centroids Undersampling":ClusterCentroids(random_state = 1),
          "Logistic Regression with SMOTEENN Combination Over- and Undersampling":SMOTEENN(random_state=1)}
    
ret = []
for x in ml_list:
    X_resampled, y_resampled = ml_list[x].fit_resample(X_train_scaled, y_train)
    # setting the testing parameters
    params = {"C" :np.logspace(-5,5,50),
             'penalty'    : ['l1' , 'l2','none']}

    # create model
    model = LogisticRegression(random_state=1)

    # Random search of parameters, using 3 fold cross validation
    grid_search = GridSearchCV(model, param_grid=params, 
                          cv = 3, n_jobs = None)

    # Fit the grid search model
    grid_search.fit(X_resampled, y_resampled)
    grid_search.best_params_
    
    def evaluate(model, X_test_scaled, y_test):
        y_pred = model.predict(X_test_scaled)
        ba_score = balanced_accuracy_score(y_test, y_pred)
        return ba_score

    # performance of base model
    base_model = LogisticRegression(random_state=1)
    base_model.fit(X_resampled, y_resampled)
    base_accuracy = evaluate(base_model, X_test_scaled, y_test)
    
    # performance of grid model for best parameter setting
    best_grid = grid_search.best_estimator_
    best_grid.fit(X_resampled, y_resampled)
    grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)

    improvement = '{:0.2f}%'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy)
    
    ret.append({
        "Name": x,
        "Base model Balanced Accuracy":base_accuracy,
        "Grid model Balanced Accuracy":grid_accuracy,
        "Improvement" : improvement
    })


In [8]:
summary_df = pd.DataFrame(ret).sort_values("Grid model Balanced Accuracy", ascending=False)
summary_df

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
3,Logistic Regression with Cluster Centroids Und...,0.872631,0.872631,0.00%
4,Logistic Regression with SMOTEENN Combination ...,0.858835,0.867963,1.06%
1,Logistic Regression with SMOTE Oversampling,0.86427,0.866012,0.20%
0,Logistic Regression with Random Oversampling,0.860577,0.852425,-0.95%
2,Logistic Regression with Random Undersampling,0.856884,0.845039,-1.38%


In [9]:
df_lr = pd.DataFrame(ret)
df_lr

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Logistic Regression with Random Oversampling,0.860577,0.852425,-0.95%
1,Logistic Regression with SMOTE Oversampling,0.86427,0.866012,0.20%
2,Logistic Regression with Random Undersampling,0.856884,0.845039,-1.38%
3,Logistic Regression with Cluster Centroids Und...,0.872631,0.872631,0.00%
4,Logistic Regression with SMOTEENN Combination ...,0.858835,0.867963,1.06%


In [10]:
df_lcms_xgb = pd.read_csv('df_lcms_xgb.csv')
df_lcms_xgb

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,XGBoost,0.865803,0.87939,1.57%


In [11]:
df_lcms_ee = pd.read_csv('df_lcms_ee.csv')
df_lcms_ee


Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Easy Ensemble AdaBoost,0.852425,0.852425,0.00%


In [12]:
df_lcms_brf = pd.read_csv('df_lcms_brf.csv')
df_lcms_brf

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Balanced Random Forest,0.873397,0.874164,0.09%


In [13]:
summary = pd.merge(df_lr, df_lcms_xgb, how ="outer")
summary

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,Logistic Regression with Random Oversampling,0.860577,0.852425,-0.95%
1,Logistic Regression with SMOTE Oversampling,0.86427,0.866012,0.20%
2,Logistic Regression with Random Undersampling,0.856884,0.845039,-1.38%
3,Logistic Regression with Cluster Centroids Und...,0.872631,0.872631,0.00%
4,Logistic Regression with SMOTEENN Combination ...,0.858835,0.867963,1.06%
5,XGBoost,0.865803,0.87939,1.57%


In [14]:
summary = pd.merge(summary, df_lcms_ee, how ="outer").sort_values("Grid model Balanced Accuracy", ascending=False)
summary

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
5,XGBoost,0.865803,0.87939,1.57%
3,Logistic Regression with Cluster Centroids Und...,0.872631,0.872631,0.00%
4,Logistic Regression with SMOTEENN Combination ...,0.858835,0.867963,1.06%
1,Logistic Regression with SMOTE Oversampling,0.86427,0.866012,0.20%
0,Logistic Regression with Random Oversampling,0.860577,0.852425,-0.95%
6,Easy Ensemble AdaBoost,0.852425,0.852425,0.00%
2,Logistic Regression with Random Undersampling,0.856884,0.845039,-1.38%


In [15]:
summary = pd.merge(summary, df_lcms_brf, how ="outer").sort_values("Grid model Balanced Accuracy", ascending=False)
summary

Unnamed: 0,Name,Base model Balanced Accuracy,Grid model Balanced Accuracy,Improvement
0,XGBoost,0.865803,0.87939,1.57%
7,Balanced Random Forest,0.873397,0.874164,0.09%
1,Logistic Regression with Cluster Centroids Und...,0.872631,0.872631,0.00%
2,Logistic Regression with SMOTEENN Combination ...,0.858835,0.867963,1.06%
3,Logistic Regression with SMOTE Oversampling,0.86427,0.866012,0.20%
4,Logistic Regression with Random Oversampling,0.860577,0.852425,-0.95%
5,Easy Ensemble AdaBoost,0.852425,0.852425,0.00%
6,Logistic Regression with Random Undersampling,0.856884,0.845039,-1.38%
