In [1]:
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below — consider narrowing it.
import warnings
warnings.simplefilter('ignore')

In [2]:
# Import dependencies
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read CSV and Preprocess Data

In [None]:
# # Load data and create DF
# file_path = "../Resources/structures_method_test.csv"
# df = pd.read_csv(file_path, index_col = "STRUCTURE_ID")
# df.head()

In [3]:
# Read the cleaned structure-descriptor table, then show its size and first rows
structures_df = pd.read_csv(filepath_or_buffer = "../Resources/structures_clean.csv")
print(structures_df.shape)
structures_df.head(n = 5)

(1922, 51)


Unnamed: 0,STRUCTURE_ID,MolWt,ExactMolWt,logP,HBD,HBA,TPSA,Flexibility,Rotatable Bonds,qed,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,00YRB22-045-001,355.32,355.125609,1.519,1,7,75.25,0.1071,3,0.891533,...,2,2,7,1,10,2,0,1,1,4
1,00YRB22-045-002,355.32,355.125609,1.861,1,7,75.25,0.1071,3,0.891533,...,2,2,7,1,10,2,0,1,1,4
2,00YRB22-045-003,377.323,377.131089,0.5684,1,9,94.23,0.1034,3,0.817937,...,1,1,9,1,12,2,0,1,1,4
3,00YRB22-045-004,319.287,319.125609,0.7379,1,7,75.77,0.125,3,0.881979,...,1,1,7,1,10,2,0,1,1,3
4,00YRB22-045-005,372.347,372.140925,2.111,1,7,80.07,0.1034,3,0.852766,...,1,1,7,1,10,2,0,1,1,4


In [4]:
# Read the cleaned purification-outcome table, then show its size and first rows
outcomes_df = pd.read_csv(filepath_or_buffer = "../Resources/outcomes_clean.csv")
print(outcomes_df.shape)
outcomes_df.head(n = 5)

(2295, 21)


Unnamed: 0,SAMPLE_ID,STRUCTURE_ID,PREFERRED_LCMS_METHOD,SPE_METHOD,METHOD,SPE_SUCCESSFUL,CRASHED_OUT,SAMPLE_STATUS,SAMPLE_CURRENT_STATUS,TERMINATION_CAUSE,...,TERMINATION_DETAILS,REACTION_SCALE_(mmol),SELECTED_FRACTIONS,VOLUME_COLLECTED_(mL),TOTAL_FRACTIONS_COLLECTED,RECOVERED_SAMPLE_DRY_MASS_(mg),PERCENT_YIELD,%_PURITY_(BY_LCMS),PURIFICATION_COMMENTS,EXTRA_COMMENTS
0,MTA0ST2022-051-001,MTA0ST2022-051-001,Xbridge HpH,MCX,MCX/Xbridge HpH,,,Failed,Terminated,Challenges of Sample/Chemistry,...,No conversion (SM is major peak),0.1,,,,,47.70%,,,
1,MTA0ST2022-051-001_S2,MTA0ST2022-051-001,Xbridge HpH,HLB,HLB/Xbridge HpH,True,,Complete,Completed & Stored,,...,,0.1,F3L/F4S,10.17,2.0,17.6,0.00%,0.99,,
2,MTA0ST2022-051-002,MTA0ST2022-051-002,,,,,,Failed,Terminated,Challenges of Sample/Chemistry,...,"Insufficient COI detected, below threshold,No ...",0.1,,,,,11.00%,,,
3,MTA0ST2022-051-003_G1,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,,0.1,F3L/F1S,19.46,2.0,3.9,8.50%,0.98,,
4,MTA0ST2022-051-003_G2,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,,,Failed,Terminated,Challenges of Sample/Chemistry,...,No conversion (SM is major peak),0.1,F3L/F1S,9.78,2.0,3.0,41.30%,0.0,";;;""2nd group; no COI observed"";",


In [5]:
# Attach each outcome row to its structure descriptors through the shared
# STRUCTURE_ID key; the inner join keeps only IDs present in both tables
df = outcomes_df.merge(structures_df, how = "inner", on = "STRUCTURE_ID")
print(df.shape)
df.head(n = 5)

(1880, 71)


Unnamed: 0,SAMPLE_ID,STRUCTURE_ID,PREFERRED_LCMS_METHOD,SPE_METHOD,METHOD,SPE_SUCCESSFUL,CRASHED_OUT,SAMPLE_STATUS,SAMPLE_CURRENT_STATUS,TERMINATION_CAUSE,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
0,MTA0ST2022-051-001,MTA0ST2022-051-001,Xbridge HpH,MCX,MCX/Xbridge HpH,,,Failed,Terminated,Challenges of Sample/Chemistry,...,1,2,4,2,9,2,0,1,1,3
1,MTA0ST2022-051-001_S2,MTA0ST2022-051-001,Xbridge HpH,HLB,HLB/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,2,9,2,0,1,1,3
2,MTA0ST2022-051-002,MTA0ST2022-051-002,,,,,,Failed,Terminated,Challenges of Sample/Chemistry,...,2,3,4,1,8,2,0,1,1,4
3,MTA0ST2022-051-003_G1,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,1,8,2,0,1,1,3
4,MTA0ST2022-051-003_G2,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,,,Failed,Terminated,Challenges of Sample/Chemistry,...,1,2,4,1,8,2,0,1,1,3


In [8]:
# # Review data types
# dict(df.dtypes)

In [9]:
# # Check for null values
# dict(df.isnull().sum())

In [11]:
# # Check for duplicates
# df.duplicated().sum()

In [None]:
# # Identify duplicate rows
# duplicates = df[df.duplicated() == True]
# duplicates

In [None]:
# # Drop duplicates
# df = df.drop_duplicates()
# print(df.shape)
# df.head()

In [12]:
# Restrict the dataset to samples whose "SPE_SUCCESSFUL" flag is True
# (rows where the flag is missing or False are discarded)
spe_succeeded = df["SPE_SUCCESSFUL"].eq(True)
df = df.loc[spe_succeeded]
print(df.shape)
df.head(n = 5)

(1091, 71)


Unnamed: 0,SAMPLE_ID,STRUCTURE_ID,PREFERRED_LCMS_METHOD,SPE_METHOD,METHOD,SPE_SUCCESSFUL,CRASHED_OUT,SAMPLE_STATUS,SAMPLE_CURRENT_STATUS,TERMINATION_CAUSE,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
1,MTA0ST2022-051-001_S2,MTA0ST2022-051-001,Xbridge HpH,HLB,HLB/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,2,9,2,0,1,1,3
3,MTA0ST2022-051-003_G1,MTA0ST2022-051-003,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,2,4,1,8,2,0,1,1,3
5,MTA0ST2022-051-004,MTA0ST2022-051-004,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Stored,,...,0,2,3,1,6,2,0,1,1,3
6,MTA0ST2022-051-005,MTA0ST2022-051-005,Gemini LpH,MCX,MCX/Gemini LpH,True,,Complete,Completed & Stored,,...,1,2,2,1,6,2,0,1,1,3
7,MTA0ST2022-051-006,MTA0ST2022-051-006,Xbridge HpH,MCX,MCX/Xbridge HpH,True,,Complete,Completed & Stored,,...,1,3,2,1,6,2,0,1,1,4


# Define Features and Target, Then Split and Scale Data

In [13]:
# Feature matrix: remove the identifier and outcome/process columns listed
# below; whatever remains is used as model input
X = df.drop(
    labels = [
        "SAMPLE_ID",
        "STRUCTURE_ID",
        "PREFERRED_LCMS_METHOD",
        "SPE_METHOD",
        "METHOD",
        "SPE_SUCCESSFUL",
        "CRASHED_OUT",
        "SAMPLE_STATUS",
        "SAMPLE_CURRENT_STATUS",
        "TERMINATION_CAUSE",
        "TERMINATION_STEP",
        "TERMINATION_DETAILS",
        "REACTION_SCALE_(mmol)",
        "SELECTED_FRACTIONS",
        "VOLUME_COLLECTED_(mL)",
        "TOTAL_FRACTIONS_COLLECTED",
        "RECOVERED_SAMPLE_DRY_MASS_(mg)",
        "PERCENT_YIELD",
        "%_PURITY_(BY_LCMS)",
        "PURIFICATION_COMMENTS",
        "EXTRA_COMMENTS",
    ],
    axis = "columns",
)

# Target vector: the SPE method recorded for each sample
y = df.loc[:, "SPE_METHOD"]

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the feature columns
X.describe()

Unnamed: 0,MolWt,ExactMolWt,logP,HBD,HBA,TPSA,Flexibility,Rotatable Bonds,qed,HeavyAtomMolWt,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
count,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,...,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0,1091.0
mean,390.107102,389.674482,3.575146,1.278643,5.774519,70.774079,0.165605,5.246563,0.617345,370.428466,...,1.536205,3.192484,5.041247,1.278643,7.555454,4.19615,0.043996,0.331806,0.375802,3.949588
std,83.949167,83.788746,1.062014,0.862731,1.576936,21.890474,0.051668,2.365012,0.174272,79.907039,...,0.759841,0.861819,1.296836,0.862731,2.136059,1.941347,0.226437,0.561687,0.593482,0.847136
min,226.283,226.121846,0.6602,0.0,2.0,16.13,0.05,1.0,0.138213,212.171,...,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0
25%,325.777,325.126803,2.8255,1.0,5.0,55.87,0.125,3.0,0.472609,309.186,...,1.0,3.0,4.0,1.0,6.0,3.0,0.0,0.0,0.0,3.0
50%,375.351,375.090518,3.678,1.0,6.0,67.39,0.1667,5.0,0.642067,356.325,...,2.0,3.0,5.0,1.0,7.0,4.0,0.0,0.0,0.0,4.0
75%,449.832,448.628306,4.263,2.0,7.0,83.12,0.2051,7.0,0.766516,429.322,...,2.0,4.0,6.0,2.0,9.0,5.0,0.0,1.0,1.0,4.0
max,774.895,774.238214,6.399,4.0,12.0,164.5,0.3404,16.0,0.92578,737.599,...,4.0,5.0,13.0,4.0,16.0,12.0,3.0,4.0,4.0,10.0


In [15]:
# Check balance of target values: count how many samples carry each
# SPE_METHOD label to see how imbalanced the classes are
y.value_counts()

MCX    901
HLB    190
Name: SPE_METHOD, dtype: int64

In [16]:
# Normal (random, non-stratified) train-test split with the default 75/25 ratio.
# train_test_split is imported with the other dependencies at the top of the notebook.
# NOTE(review): the merged frame shows several samples sharing one STRUCTURE_ID,
# so identical feature rows may end up on both sides of the split — TODO confirm
# after the SPE_SUCCESSFUL filter and consider a group-aware split if so.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [17]:
# Standardise the features; the scaler learns its statistics from the training
# rows only, so nothing about the test rows leaks into the transformation
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

# Test Machine Learning Models

### Balanced Random Forest Classifier

In [18]:
# Train BalancedRandomForestClassifier on the scaled training features
brf_classifier = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
brf_classifier.fit(X_train_scaled, y_train)

# Make predictions using test data
y_pred = brf_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(brf_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8863275039745628


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,44,7
Actual MCX,20,202




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.69      0.86      0.91      0.77      0.89      0.78        51
        MCX       0.97      0.91      0.86      0.94      0.89      0.79       222

avg / total       0.91      0.90      0.87      0.91      0.89      0.79       273



In [19]:
# Pair every feature name with its importance in the fitted forest and
# list the pairs from most to least important
importances = brf_classifier.feature_importances_
ranked_features = list(zip(importances, X.columns))
ranked_features.sort(reverse = True)
ranked_features

[(0.06728142341608923, 'FractionCSP3'),
 (0.05567588935564286, 'SMR_VSA10'),
 (0.049712335709404, 'TPSA'),
 (0.043804794525455513, 'PEOE_VSA2'),
 (0.03992896487538047, 'logP'),
 (0.03777270536521909, 'MolMR'),
 (0.03644441912823002, 'qed'),
 (0.032004880309328736, 'RingCount'),
 (0.030214858184132592, 'MinEStateIndex'),
 (0.02903067538285011, 'FpDensityMorgan1'),
 (0.028966843760236873, 'MaxPartialCharge'),
 (0.02842255112806667, 'NumValenceElectrons'),
 (0.028312351298543988, 'LabuteASA'),
 (0.028082238162171678, 'HallKierAlpha'),
 (0.02731443833914136, 'MolLogP'),
 (0.026609840210730287, 'Flexibility'),
 (0.025368923909751786, 'EState_VSA8'),
 (0.024368096231674723, 'MaxEStateIndex'),
 (0.0242062794924948, 'MinPartialCharge'),
 (0.023477564390641848, 'BertzCT'),
 (0.022909918713137124, 'SlogP_VSA6'),
 (0.020793440946965135, 'NumAromaticHeterocycles'),
 (0.020713630312626696, 'Ipc'),
 (0.018374756542304645, 'SlogP_VSA2'),
 (0.01834492057012029, 'BalabanJ'),
 (0.018114858346370746, 'Ka

In [20]:
# Select features to include in model: SelectFromModel refits a copy of the
# forest on the training data and flags the features it keeps
brf_sel = SelectFromModel(brf_classifier).fit(X_train_scaled, y_train)

# Boolean mask (True = kept), shown next to the feature names with kept ones first
brf_selection = brf_sel.get_support()
selection_overview = list(zip(brf_selection, X.columns))
selection_overview.sort(reverse = True)
selection_overview

[(True, 'qed'),
 (True, 'logP'),
 (True, 'TPSA'),
 (True, 'SlogP_VSA6'),
 (True, 'SMR_VSA10'),
 (True, 'RingCount'),
 (True, 'PEOE_VSA2'),
 (True, 'NumValenceElectrons'),
 (True, 'NumAromaticHeterocycles'),
 (True, 'MolMR'),
 (True, 'MolLogP'),
 (True, 'MinPartialCharge'),
 (True, 'MinEStateIndex'),
 (True, 'MaxPartialCharge'),
 (True, 'MaxEStateIndex'),
 (True, 'LabuteASA'),
 (True, 'Ipc'),
 (True, 'HallKierAlpha'),
 (True, 'FractionCSP3'),
 (True, 'FpDensityMorgan1'),
 (True, 'Flexibility'),
 (True, 'EState_VSA8'),
 (True, 'BertzCT'),
 (False, 'SlogP_VSA2'),
 (False, 'SMR_VSA4'),
 (False, 'Rotatable Bonds'),
 (False, 'PEOE_VSA10'),
 (False, 'NumSaturatedRings'),
 (False, 'NumSaturatedHeterocycles'),
 (False, 'NumSaturatedCarbocycles'),
 (False, 'NumRotatableBonds'),
 (False, 'NumHeteroatoms'),
 (False, 'NumHDonors'),
 (False, 'NumHAcceptors'),
 (False, 'NumAromaticRings'),
 (False, 'NumAromaticCarbocycles'),
 (False, 'NumAliphaticRings'),
 (False, 'NumAliphaticHeterocycles'),
 (False

#### Test with Balanced Random Forest Selected Features Only

In [21]:
# Train-test split on the selected features only. The same y and random_state
# as the full-feature split are used, so the rows land in the same partitions
# and y_train / y_test keep their previous contents.
X_selected_train, X_selected_test, y_train, y_test = train_test_split(brf_sel.transform(X), y, random_state = 1)

# Fit a dedicated StandardScaler for the reduced feature set. Refitting the
# shared `scaler` object here would silently re-train `X_scaler` (the same
# object) on fewer columns and break any later use of it on the full features.
X_selected_scaler = StandardScaler().fit(X_selected_train)

# Scale data
X_selected_train_scaled = X_selected_scaler.transform(X_selected_train)
X_selected_test_scaled = X_selected_scaler.transform(X_selected_test)

In [22]:
# Train BalancedRandomForestClassifier on the selected, scaled training features
brf_selected_classifier = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
brf_selected_classifier.fit(X_selected_train_scaled, y_train)

# Make predictions using test data
y_pred = brf_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(brf_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8840752517223105


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,44,7
Actual MCX,21,201




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.86      0.91      0.76      0.88      0.78        51
        MCX       0.97      0.91      0.86      0.93      0.88      0.78       222

avg / total       0.91      0.90      0.87      0.90      0.88      0.78       273



### Easy Ensemble AdaBoost Classifier

In [23]:
# Train EasyEnsembleClassifier on the scaled training features
ee_classifier = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
ee_classifier.fit(X_train_scaled, y_train)

# Make predictions using test data
y_pred = ee_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(ee_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8811605723370429


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,46,5
Actual MCX,31,191




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.60      0.90      0.86      0.72      0.88      0.78        51
        MCX       0.97      0.86      0.90      0.91      0.88      0.77       222

avg / total       0.90      0.87      0.89      0.88      0.88      0.77       273



#### Test with Balanced Random Forest Selected Features Only

In [24]:
# Train EasyEnsembleClassifier on the selected, scaled training features
ee_selected_classifier = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
ee_selected_classifier.fit(X_selected_train_scaled, y_train)

# Make predictions using test data
y_pred = ee_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(ee_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8668521462639109


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,45,6
Actual MCX,33,189




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.58      0.88      0.85      0.70      0.87      0.75        51
        MCX       0.97      0.85      0.88      0.91      0.87      0.75       222

avg / total       0.90      0.86      0.88      0.87      0.87      0.75       273



### Logistic Regression with Random Oversampling

In [25]:
# Balance the training classes with RandomOverSampler, then show the
# per-class counts after resampling (test data is left untouched)
ros = RandomOverSampler(random_state = 1)
X_resampled, y_resampled = ros.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 679, 'HLB': 679})

In [26]:
# Train Logistic Regression model using resampled data
ros_lr_classifier = LogisticRegression(random_state = 1)
ros_lr_classifier.fit(X_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = ros_lr_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(ros_lr_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8516163222045575


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,40,11
Actual MCX,18,204




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.69      0.78      0.92      0.73      0.85      0.71        51
        MCX       0.95      0.92      0.78      0.93      0.85      0.73       222

avg / total       0.90      0.89      0.81      0.90      0.85      0.73       273



#### Test with Balanced Random Forest Selected Features Only

In [27]:
# Re-run the RandomOverSampler created earlier, this time on the
# selected-feature training data, and show the resulting class counts
X_selected_resampled, y_resampled = ros.fit_resample(X = X_selected_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 679, 'HLB': 679})

In [28]:
# Train Logistic Regression model using resampled selected-feature data
ros_lr_selected_classifier = LogisticRegression(random_state = 1)
ros_lr_selected_classifier.fit(X_selected_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = ros_lr_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(ros_lr_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8493640699523053


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,40,11
Actual MCX,19,203




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.78      0.91      0.73      0.85      0.71        51
        MCX       0.95      0.91      0.78      0.93      0.85      0.73       222

avg / total       0.90      0.89      0.81      0.89      0.85      0.72       273



### Logistic Regression with SMOTE Oversampling

In [29]:
# Balance the training classes with SMOTE oversampling, then show the
# per-class counts after resampling (test data is left untouched)
smote = SMOTE(sampling_strategy = "auto", random_state = 1)
X_resampled, y_resampled = smote.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 679, 'HLB': 679})

In [30]:
# Train Logistic Regression model using resampled data
smote_lr_classifier = LogisticRegression(random_state = 1)
smote_lr_classifier.fit(X_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = smote_lr_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(smote_lr_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8538685744568097


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,40,11
Actual MCX,17,205




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.70      0.78      0.92      0.74      0.85      0.71        51
        MCX       0.95      0.92      0.78      0.94      0.85      0.73       222

avg / total       0.90      0.90      0.81      0.90      0.85      0.73       273



#### Test with Balanced Random Forest Selected Features Only

In [31]:
# Re-run the SMOTE sampler created earlier, this time on the
# selected-feature training data, and show the resulting class counts
X_selected_resampled, y_resampled = smote.fit_resample(X = X_selected_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 679, 'HLB': 679})

In [32]:
# Train Logistic Regression model using resampled selected-feature data
smote_lr_selected_classifier = LogisticRegression(random_state = 1)
smote_lr_selected_classifier.fit(X_selected_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = smote_lr_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(smote_lr_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8516163222045575


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,40,11
Actual MCX,18,204




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.69      0.78      0.92      0.73      0.85      0.71        51
        MCX       0.95      0.92      0.78      0.93      0.85      0.73       222

avg / total       0.90      0.89      0.81      0.90      0.85      0.73       273



### Logistic Regression with Random Undersampling

In [33]:
# Balance the training classes with RandomUnderSampler, then show the
# per-class counts after resampling (test data is left untouched)
rus = RandomUnderSampler(random_state = 1)
X_resampled, y_resampled = rus.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 139, 'MCX': 139})

In [34]:
# Train Logistic Regression model using resampled data
rus_lr_classifier = LogisticRegression(random_state = 1)
rus_lr_classifier.fit(X_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = rus_lr_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(rus_lr_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8591679915209327


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,41,10
Actual MCX,19,203




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.80      0.91      0.74      0.86      0.73        51
        MCX       0.95      0.91      0.80      0.93      0.86      0.74       222

avg / total       0.90      0.89      0.82      0.90      0.86      0.74       273



#### Test with Balanced Random Forest Selected Features Only

In [35]:
# Re-run the RandomUnderSampler created earlier, this time on the
# selected-feature training data, and show the resulting class counts
X_selected_resampled, y_resampled = rus.fit_resample(X = X_selected_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 139, 'MCX': 139})

In [36]:
# Train Logistic Regression model using resampled selected-feature data
rus_lr_selected_classifier = LogisticRegression(random_state = 1)
rus_lr_selected_classifier.fit(X_selected_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = rus_lr_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(rus_lr_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8591679915209327


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,41,10
Actual MCX,19,203




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.80      0.91      0.74      0.86      0.73        51
        MCX       0.95      0.91      0.80      0.93      0.86      0.74       222

avg / total       0.90      0.89      0.82      0.90      0.86      0.74       273



### Logistic Regression with Cluster Centroids Undersampling

In [37]:
# Balance the training classes with ClusterCentroids undersampling, then show
# the per-class counts after resampling (test data is left untouched)
cc = ClusterCentroids(random_state = 1)
X_resampled, y_resampled = cc.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 139, 'MCX': 139})

In [38]:
# Train Logistic Regression model using resampled data
cc_lr_classifier = LogisticRegression(random_state = 1)
cc_lr_classifier.fit(X_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = cc_lr_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(cc_lr_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8720190779014309


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,43,8
Actual MCX,22,200




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.66      0.84      0.90      0.74      0.87      0.76        51
        MCX       0.96      0.90      0.84      0.93      0.87      0.76       222

avg / total       0.91      0.89      0.85      0.89      0.87      0.76       273



#### Test with Balanced Random Forest Selected Features Only

In [39]:
# Re-run the ClusterCentroids sampler created earlier, this time on the
# selected-feature training data, and show the resulting class counts
X_selected_resampled, y_resampled = cc.fit_resample(X = X_selected_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 139, 'MCX': 139})

In [40]:
# Train Logistic Regression model using resampled selected-feature data
cc_lr_selected_classifier = LogisticRegression(random_state = 1)
cc_lr_selected_classifier.fit(X_selected_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = cc_lr_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(cc_lr_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8403550609432962


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,40,11
Actual MCX,23,199




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.63      0.78      0.90      0.70      0.84      0.70        51
        MCX       0.95      0.90      0.78      0.92      0.84      0.71       222

avg / total       0.89      0.88      0.81      0.88      0.84      0.71       273



### Logistic Regression with SMOTEENN Combination Over- and Undersampling

In [41]:
# Rebalance the training classes with SMOTEENN (combined over- and
# undersampling), then show the per-class counts after resampling
smoteenn = SMOTEENN(random_state = 1)
X_resampled, y_resampled = smoteenn.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 624, 'MCX': 575})

In [42]:
# Train Logistic Regression model using resampled data
smoteenn_lr_classifier = LogisticRegression(random_state = 1)
smoteenn_lr_classifier.fit(X_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = smoteenn_lr_classifier.predict(X_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(smoteenn_lr_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8765235824059354


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,43,8
Actual MCX,20,202




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.84      0.91      0.75      0.88      0.76        51
        MCX       0.96      0.91      0.84      0.94      0.88      0.77       222

avg / total       0.91      0.90      0.86      0.90      0.88      0.77       273



#### Test with Balanced Random Forest Selected Features Only

In [43]:
# Re-run the SMOTEENN sampler created earlier, this time on the
# selected-feature training data, and show the resulting class counts
X_selected_resampled, y_resampled = smoteenn.fit_resample(X = X_selected_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 611, 'MCX': 585})

In [44]:
# Train Logistic Regression model using resampled selected-feature data
smoteenn_lr_selected_classifier = LogisticRegression(random_state = 1)
smoteenn_lr_selected_classifier.fit(X_selected_resampled, y_resampled)

# Make predictions using test data (never resampled)
y_pred = smoteenn_lr_selected_classifier.predict(X_selected_test_scaled)

# Calculate balanced accuracy score
ba_score = balanced_accuracy_score(y_test, y_pred)

# Generate confusion matrix; the label order is taken from the fitted model so
# the row/column captions always match the matrix layout instead of being assumed
class_labels = list(smoteenn_lr_selected_classifier.classes_)
cm = confusion_matrix(y_test, y_pred, labels = class_labels)
cm_df = pd.DataFrame(cm,
                     index = [f"Actual {label}" for label in class_labels],
                     columns = [f"Predicted {label}" for label in class_labels])

# Generate imbalanced classification report
ic_report = classification_report_imbalanced(y_test, y_pred)

# Display model performance metrics
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print("Confusion Matrix:")
display(cm_df)
print(f"\n\nImbalanced Classification Report: \n\n{ic_report}")

Balanced Accuracy Score: 0.8493640699523053


Confusion Matrix:


Unnamed: 0,Predicted HLB,Predicted MCX
Actual HLB,40,11
Actual MCX,19,203




Imbalanced Classification Report: 

                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.68      0.78      0.91      0.73      0.85      0.71        51
        MCX       0.95      0.91      0.78      0.93      0.85      0.73       222

avg / total       0.90      0.89      0.81      0.89      0.85      0.72       273

