In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read CSV and Preprocess Data

In [3]:
# Read the structures/method table and key each row by its STRUCTURE_ID,
# then preview the first five rows.
file_path = "../Resources/structures_method_test.csv"
df = pd.read_csv(file_path).set_index("STRUCTURE_ID")
df.head(5)

Unnamed: 0_level_0,MolWt,EXACT_MASS,logP,MW,HBD,HBA,TPSA,Flexibility,Rotatable Bonds,MolWt.1,...,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,SPE_METHOD
STRUCTURE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00YRB22-045-001,355.32,355.1,1.519,355.3,1,7,75.25,0.1071,3,355.32,...,2,7,1,10,2,0,1,1,4,MCX
00YRB22-045-002,355.32,355.1,1.861,355.3,1,7,75.25,0.1071,3,355.32,...,2,7,1,10,2,0,1,1,4,MCX
00YRB22-045-003,377.323,377.1,0.5684,377.3,1,9,94.23,0.1034,3,377.323,...,1,9,1,12,2,0,1,1,4,MCX
00YRB22-045-004,319.287,319.1,0.7379,319.3,1,7,75.77,0.125,3,319.287,...,1,7,1,10,2,0,1,1,3,MCX
00YRB22-045-005,372.347,372.1,2.111,372.3,1,7,80.07,0.1034,3,372.347,...,1,7,1,10,2,0,1,1,4,MCX


In [4]:
# Count missing values per column (isna is the same check as isnull)
df.isna().sum()

MolWt                       0
EXACT_MASS                  0
logP                        0
MW                          0
HBD                         0
HBA                         0
TPSA                        0
Flexibility                 0
Rotatable Bonds             0
MolWt.1                     0
ExactMolWt                  0
qed                         0
HeavyAtomMolWt              0
MolLogP                     0
MolMR                       0
FractionCSP3                0
NumValenceElectrons         0
MaxPartialCharge            0
MinPartialCharge            0
TPSA.1                      0
FpDensityMorgan1            0
BalabanJ                    0
BertzCT                     0
HallKierAlpha               0
Ipc                         0
Kappa2                      0
LabuteASA                   0
PEOE_VSA10                  0
PEOE_VSA2                   0
SMR_VSA10                   0
SMR_VSA4                    0
SlogP_VSA2                  0
SlogP_VSA6                  0
MaxEStateI

In [5]:
# Number of rows whose column values repeat an earlier row.
# The first occurrence of each repeated row is not counted.
df.duplicated(keep = "first").sum()

30

In [6]:
# Show the rows flagged as duplicates. duplicated() already returns a boolean
# mask, so it can index the frame directly (no "== True" comparison needed).
# Only column values are compared; the STRUCTURE_ID index is not part of the
# check, which is why rows with different IDs appear here.
duplicates = df[df.duplicated()]
duplicates

Unnamed: 0_level_0,MolWt,EXACT_MASS,logP,MW,HBD,HBA,TPSA,Flexibility,Rotatable Bonds,MolWt.1,...,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,SPE_METHOD
STRUCTURE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00YRB22-055-077,366.446,366.1,2.263,366.4,3,6,83.12,0.2069,6,366.446,...,3,5,3,7,5,0,1,1,4,MCX
00YRB22-057-002,370.371,370.2,2.262,370.4,1,5,62.13,0.2143,6,370.371,...,1,4,1,8,4,0,1,1,3,MCX
00YRB22-057-007,349.334,349.1,1.168,349.3,1,6,75.02,0.2,5,349.334,...,1,6,1,10,3,0,1,1,3,MCX
00YRB22-057-008,348.365,348.2,2.274,348.4,1,5,62.13,0.1923,5,348.365,...,0,4,1,8,3,1,1,2,3,MCX
00YRB22-057-010,362.392,362.2,2.56,362.4,1,5,62.13,0.1852,5,362.392,...,0,4,1,8,3,1,1,2,3,MCX
00YRB22-057-011,382.342,382.1,1.374,382.3,2,7,90.81,0.1667,5,382.342,...,2,5,2,10,3,0,1,1,4,MCX
00YRB22-057-012,382.342,382.1,1.374,382.3,2,7,90.81,0.1667,5,382.342,...,2,5,2,10,3,0,1,1,4,MCX
00YRB22-057-013,370.371,370.2,2.262,370.4,1,5,62.13,0.2143,6,370.371,...,1,4,1,8,4,0,1,1,3,MCX
00YRB22-057-014,370.371,370.2,2.262,370.4,1,5,62.13,0.2143,6,370.371,...,1,4,1,8,4,0,1,1,3,MCX
00YRB22-057-016,384.398,384.2,2.612,384.4,1,5,62.13,0.2069,6,384.398,...,1,4,1,8,4,0,1,1,3,MCX


In [7]:
# # Keep only rows where "SPE_SUCCESSFUL" is True
# df = df[df["SPE_SUCCESSFUL"] == True]
# print(df.shape)
# df.head()

# Define Features and Target, Split, and Scale Data

In [8]:
# Target: the SPE method label of each structure
y = df["SPE_METHOD"]

# Features: every column except the target.
# These names were also in the drop list at one point and are currently
# disabled: SAMPLE_ID, PREFERRED_LCMS_METHOD, METHOD, SPE_SUCCESSFUL,
# CRASHED_OUT, SAMPLE_STATUS, SAMPLE_CURRENT_STATUS, TERMINATION_CAUSE,
# TERMINATION_STEP, TERMINATION_DETAILS, REACTION_SCALE_(mmol),
# SELECTED_FRACTIONS, VOLUME_COLLECTED_(mL), TOTAL_FRACTIONS_COLLECTED,
# RECOVERED_SAMPLE_DRY_MASS_(mg), PERCENT_YIELD, %_PURITY_(BY_LCMS),
# PURIFICATION_COMMENTS
X = df.drop(columns = ["SPE_METHOD"])

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of every feature
X.describe(percentiles = [0.25, 0.5, 0.75])

Unnamed: 0,MolWt,EXACT_MASS,logP,MW,HBD,HBA,TPSA,Flexibility,Rotatable Bonds,MolWt.1,...,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount
count,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,...,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0,1923.0
mean,403.661791,403.272387,3.471085,403.653666,1.5039,6.131045,76.338435,0.180943,5.983879,403.661791,...,1.455018,3.176287,5.144566,1.5039,7.679147,4.718669,0.055122,0.399376,0.454498,4.024961
std,102.286066,102.1599,1.231912,102.283054,1.057829,2.005582,28.914773,0.069576,3.246586,102.286066,...,0.877176,1.0421,1.574017,1.057829,2.450237,2.58737,0.247944,0.566714,0.610679,1.025888
min,226.283,226.1,-0.8532,226.3,0.0,1.0,3.24,0.03448,1.0,226.283,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,335.373,335.1,2.676,335.4,1.0,5.0,60.73,0.1304,3.0,335.373,...,1.0,3.0,4.0,1.0,6.0,3.0,0.0,0.0,0.0,3.0
50%,382.489,382.1,3.535,382.5,1.0,6.0,75.25,0.1714,6.0,382.489,...,1.0,3.0,5.0,1.0,8.0,5.0,0.0,0.0,0.0,4.0
75%,450.543,450.2,4.253,450.5,2.0,7.0,91.93,0.2174,8.0,450.543,...,2.0,4.0,6.0,2.0,9.0,6.0,0.0,1.0,1.0,5.0
max,832.88,832.3,6.899,832.9,6.0,14.0,179.8,0.4737,19.0,832.88,...,4.0,7.0,13.0,6.0,16.0,15.0,3.0,4.0,4.0,10.0


In [10]:
# How many structures fall in each SPE method class, most frequent first
y.value_counts(sort = True, ascending = False)

MCX    1840
HLB      83
Name: SPE_METHOD, dtype: int64

In [11]:
# Plain (non-stratified) train/test split with the default test fraction and a
# fixed seed; train_test_split is imported with the other dependencies at the
# top of the notebook.
# NOTE(review): the classes are heavily imbalanced (1840 MCX vs 83 HLB), so
# passing stratify = y would keep the HLB share equal in both splits. It is
# not changed here because it would alter every score recorded below.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [12]:
# Standardise the features. The scaler learns its statistics from the training
# split only and is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to each split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

# Test Machine Learning Models

### Random Forest Classifier

In [13]:
# Baseline: a plain random forest fitted on the scaled training split and then
# used to predict the scaled test split.
rf_classifier = RandomForestClassifier(n_estimators = 100, random_state = 1)
y_pred = rf_classifier.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Balanced accuracy of the test-set predictions
ba_score = balanced_accuracy_score(y_test, y_pred)

# Confusion matrix with readable labels. confusion_matrix orders the classes
# by sorted label, so HLB is the first row/column and MCX the second.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)

# Report the three metrics, one print per metric
print(f"Balanced Accuracy Score: {ba_score}\n\n")
print(f"Confusion Matrix: \n{cm_df}\n\n")
print(f"Imbalanced Classification Report: \n{classification_report_imbalanced(y_test, y_pred)}")

Balanced Accuracy Score: 0.8333333333333333


Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             16              8
Actual MCX              0            457


Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       1.00      0.67      1.00      0.80      0.82      0.64        24
        MCX       0.98      1.00      0.67      0.99      0.82      0.69       457

avg / total       0.98      0.98      0.68      0.98      0.82      0.69       481



In [14]:
# Pair every feature importance of the random forest with its column name and
# list the pairs from most to least important.
importances = rf_classifier.feature_importances_
sorted(zip(importances, X.columns))[::-1]

[(0.06988503625672754, 'qed'),
 (0.06775909805738546, 'PEOE_VSA10'),
 (0.060356194309790806, 'MaxPartialCharge'),
 (0.04982122752298182, 'FpDensityMorgan1'),
 (0.04106803384316927, 'SlogP_VSA6'),
 (0.03403197765889834, 'MolLogP'),
 (0.032605094456677315, 'MaxEStateIndex'),
 (0.03155262853350493, 'MinEStateIndex'),
 (0.029373417809140476, 'BalabanJ'),
 (0.028001784384290983, 'logP'),
 (0.025174821800443704, 'HallKierAlpha'),
 (0.022366115294396364, 'MinPartialCharge'),
 (0.021865613323481872, 'NumRotatableBonds'),
 (0.01995944814777704, 'SlogP_VSA2'),
 (0.019698937765471616, 'TPSA.1'),
 (0.019220492861160395, 'EState_VSA3'),
 (0.019130396847501722, 'TPSA'),
 (0.019100332101591132, 'MolWt.1'),
 (0.01908235978508183, 'ExactMolWt'),
 (0.018326347176485142, 'NumSaturatedHeterocycles'),
 (0.018293557466810897, 'EState_VSA8'),
 (0.018113243227389702, 'HeavyAtomMolWt'),
 (0.01770246471465509, 'NumHAcceptors'),
 (0.01766641645451778, 'FractionCSP3'),
 (0.016475019479098033, 'BertzCT'),
 (0.0158

In [15]:
# Let SelectFromModel decide which features to keep, based on the random
# forest, then list each column with its keep/drop flag (kept columns first).
rf_sel = SelectFromModel(rf_classifier).fit(X_train_scaled, y_train)
rf_selection = rf_sel.get_support()
sorted(zip(rf_selection, X.columns))[::-1]

[(True, 'qed'),
 (True, 'logP'),
 (True, 'TPSA.1'),
 (True, 'TPSA'),
 (True, 'SlogP_VSA6'),
 (True, 'SlogP_VSA2'),
 (True, 'PEOE_VSA10'),
 (True, 'NumRotatableBonds'),
 (True, 'MolWt.1'),
 (True, 'MolLogP'),
 (True, 'MinPartialCharge'),
 (True, 'MinEStateIndex'),
 (True, 'MaxPartialCharge'),
 (True, 'MaxEStateIndex'),
 (True, 'HallKierAlpha'),
 (True, 'FpDensityMorgan1'),
 (True, 'ExactMolWt'),
 (True, 'EState_VSA3'),
 (True, 'BalabanJ'),
 (False, 'SMR_VSA4'),
 (False, 'SMR_VSA10'),
 (False, 'Rotatable Bonds'),
 (False, 'RingCount'),
 (False, 'PEOE_VSA2'),
 (False, 'NumValenceElectrons'),
 (False, 'NumSaturatedRings'),
 (False, 'NumSaturatedHeterocycles'),
 (False, 'NumSaturatedCarbocycles'),
 (False, 'NumHeteroatoms'),
 (False, 'NumHDonors'),
 (False, 'NumHAcceptors'),
 (False, 'NumAromaticRings'),
 (False, 'NumAromaticHeterocycles'),
 (False, 'NumAromaticCarbocycles'),
 (False, 'NumAliphaticRings'),
 (False, 'NumAliphaticHeterocycles'),
 (False, 'NumAliphaticCarbocycles'),
 (False, '

### TEST: Logistic Regression with all features and with RandomForest selected features only

In [16]:
# # Train-test split and scale selected features
# X_selected_train, X_selected_test, y_train, y_test = train_test_split(rf_sel.transform(X), y, random_state = 1)

# # Fit StandardScaler
# X_selected_scaler = scaler.fit(X_selected_train)

# # Scale data
# X_selected_train_scaled = X_selected_scaler.transform(X_selected_train)
# X_selected_test_scaled = X_selected_scaler.transform(X_selected_test)

In [17]:
# # Fit Logistic Regression model with all features
# model = LogisticRegression().fit(X_train_scaled, y_train)
# print(f'Training Score: {model.score(X_train_scaled, y_train)}')
# print(f'Testing Score: {model.score(X_test_scaled, y_test)}')

In [18]:
# # Fit Logistic Regression model with selected features
# selected_model = LogisticRegression()
# selected_model.fit(X_selected_train_scaled, y_train)
# print(f'Training Score: {selected_model.score(X_selected_train_scaled, y_train)}')
# print(f'Testing Score: {selected_model.score(X_selected_test_scaled, y_test)}')

### Balanced Random Forest Classifier

In [19]:
# Balanced random forest on the full scaled feature set: fit on the training
# split, then predict the held-out test split.
brf_classifier = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
y_pred = brf_classifier.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9298869438366155

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             22              2
Actual MCX             26            431

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.46      0.92      0.94      0.61      0.93      0.86        24
        MCX       1.00      0.94      0.92      0.97      0.93      0.87       457

avg / total       0.97      0.94      0.92      0.95      0.93      0.87       481



In [20]:
# Pair every feature importance of the balanced random forest with its column
# name and list the pairs from most to least important.
importances = brf_classifier.feature_importances_
sorted(zip(importances, X.columns))[::-1]

[(0.07489388934708288, 'qed'),
 (0.05841662356560493, 'PEOE_VSA10'),
 (0.05549839586543551, 'MolLogP'),
 (0.04305140133688488, 'NumAliphaticHeterocycles'),
 (0.03706301522538697, 'ExactMolWt'),
 (0.03678865610017803, 'NumHAcceptors'),
 (0.03392346802756164, 'BalabanJ'),
 (0.03299962185528018, 'PEOE_VSA2'),
 (0.032840940470299056, 'FpDensityMorgan1'),
 (0.03229033125202033, 'Kappa2'),
 (0.03138604914144967, 'NumAliphaticRings'),
 (0.030566498786376465, 'NumRotatableBonds'),
 (0.029916006559038893, 'FractionCSP3'),
 (0.02958949839848239, 'SMR_VSA10'),
 (0.02683717615884012, 'BertzCT'),
 (0.025695316378304286, 'HeavyAtomMolWt'),
 (0.02351478420347112, 'MolMR'),
 (0.02228280012672082, 'MW'),
 (0.021207231208814667, 'MinPartialCharge'),
 (0.019433262440490238, 'NOCount'),
 (0.01907438149408866, 'NumSaturatedRings'),
 (0.018856745502781182, 'TPSA.1'),
 (0.018829658456418742, 'MaxEStateIndex'),
 (0.018700305069411012, 'TPSA'),
 (0.014978177076748558, 'SlogP_VSA2'),
 (0.014785187932971646, 'EX

In [21]:
# Let SelectFromModel decide which features to keep, based on the balanced
# random forest, then list each column with its keep/drop flag (kept first).
brf_sel = SelectFromModel(brf_classifier).fit(X_train_scaled, y_train)
brf_selection = brf_sel.get_support()
sorted(zip(brf_selection, X.columns))[::-1]

[(True, 'qed'),
 (True, 'TPSA.1'),
 (True, 'TPSA'),
 (True, 'SMR_VSA10'),
 (True, 'PEOE_VSA2'),
 (True, 'PEOE_VSA10'),
 (True, 'NumSaturatedRings'),
 (True, 'NumRotatableBonds'),
 (True, 'NumHAcceptors'),
 (True, 'NumAliphaticRings'),
 (True, 'NumAliphaticHeterocycles'),
 (True, 'NOCount'),
 (True, 'MolMR'),
 (True, 'MolLogP'),
 (True, 'MinPartialCharge'),
 (True, 'MaxEStateIndex'),
 (True, 'MW'),
 (True, 'Kappa2'),
 (True, 'HeavyAtomMolWt'),
 (True, 'FractionCSP3'),
 (True, 'FpDensityMorgan1'),
 (True, 'ExactMolWt'),
 (True, 'BertzCT'),
 (True, 'BalabanJ'),
 (False, 'logP'),
 (False, 'SlogP_VSA6'),
 (False, 'SlogP_VSA2'),
 (False, 'SMR_VSA4'),
 (False, 'Rotatable Bonds'),
 (False, 'RingCount'),
 (False, 'NumValenceElectrons'),
 (False, 'NumSaturatedHeterocycles'),
 (False, 'NumSaturatedCarbocycles'),
 (False, 'NumHeteroatoms'),
 (False, 'NumHDonors'),
 (False, 'NumAromaticRings'),
 (False, 'NumAromaticHeterocycles'),
 (False, 'NumAromaticCarbocycles'),
 (False, 'NumAliphaticCarbocycle

#### Test with BRF Selected Features

In [22]:
# Split again, this time on only the columns the BRF-based selector kept.
# The seed and the target are the same as in the first split, so the rows land
# in the same train/test partitions and y_train / y_test keep their values.
X_selected_train, X_selected_test, y_train, y_test = train_test_split(brf_sel.transform(X), y, random_state = 1)

# Use a dedicated scaler for the reduced feature set. Re-fitting the shared
# `scaler` here would also change `X_scaler` (fit returns the same object),
# leaving it unusable for the full-width X_train / X_test.
X_selected_scaler = StandardScaler().fit(X_selected_train)

# Scale both splits with statistics learned from the training split only
X_selected_train_scaled = X_selected_scaler.transform(X_selected_train)
X_selected_test_scaled = X_selected_scaler.transform(X_selected_test)

In [23]:
# Balanced random forest restricted to the BRF-selected features.
# This rebinds brf_classifier to the model trained on the reduced feature set.
brf_classifier = BalancedRandomForestClassifier(n_estimators = 100, random_state = 1)
y_pred = brf_classifier.fit(X_selected_train_scaled, y_train).predict(X_selected_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9189004376367615

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             21              3
Actual MCX             17            440

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.55      0.88      0.96      0.68      0.92      0.84        24
        MCX       0.99      0.96      0.88      0.98      0.92      0.85       457

avg / total       0.97      0.96      0.88      0.96      0.92      0.85       481



### TEST: Logistic Regression with all features and with BalancedRandomForest selected features only

In [24]:
# Logistic regression on all scaled features, with no resampling.
# score() is plain accuracy. With 457 MCX vs 24 HLB rows in the test split,
# always predicting MCX would already score about 0.95, so read these numbers
# with that baseline in mind.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
print(f'Training Score: {model.score(X_train_scaled, y_train)}')
print(f'Testing Score: {model.score(X_test_scaled, y_test)}')

Training Score: 0.9882108183079057
Testing Score: 0.9875259875259875


In [25]:
# Same logistic regression, trained only on the BRF-selected features.
# score() is plain accuracy on the given split.
selected_model = LogisticRegression().fit(X_selected_train_scaled, y_train)
print(f'Training Score: {selected_model.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {selected_model.score(X_selected_test_scaled, y_test)}')

Training Score: 0.9840499306518724
Testing Score: 0.9854469854469855


### Easy Ensemble AdaBoost Classifier

In [26]:
# Easy ensemble on the full scaled feature set: fit on the training split,
# then predict the held-out test split.
ee_classifier = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
y_pred = ee_classifier.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9781181619256017

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             24              0
Actual MCX             20            437

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.55      1.00      0.96      0.71      0.98      0.96        24
        MCX       1.00      0.96      1.00      0.98      0.98      0.95       457

avg / total       0.98      0.96      1.00      0.96      0.98      0.95       481



#### Test with BRF Selected Features

In [27]:
# Easy ensemble restricted to the BRF-selected features.
# This rebinds ee_classifier to the model trained on the reduced feature set.
ee_classifier = EasyEnsembleClassifier(n_estimators = 100, random_state = 1)
y_pred = ee_classifier.fit(X_selected_train_scaled, y_train).predict(X_selected_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9583789204959883

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             23              1
Actual MCX             19            438

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.55      0.96      0.96      0.70      0.96      0.92        24
        MCX       1.00      0.96      0.96      0.98      0.96      0.92       457

avg / total       0.98      0.96      0.96      0.96      0.96      0.92       481



### Gradient Boosting Classifier

In [28]:
# TODO: decide whether to evaluate a gradient boosting classifier here (not implemented yet).

### Logistic Regression with Random Oversampling

In [29]:
# Randomly oversample the scaled training split (the test split is left
# untouched) and show the class counts after resampling.
ros = RandomOverSampler(random_state = 1)
X_resampled, y_resampled = ros.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 1383, 'HLB': 1383})

In [30]:
# Logistic regression fitted on the randomly oversampled training data and
# evaluated on the original scaled test split.
ros_lr_classifier = LogisticRegression(random_state = 1)
y_pred = ros_lr_classifier.fit(X_resampled, y_resampled).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9473924142961342

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             22              2
Actual MCX             10            447

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.69      0.92      0.98      0.79      0.95      0.89        24
        MCX       1.00      0.98      0.92      0.99      0.95      0.90       457

avg / total       0.98      0.98      0.92      0.98      0.95      0.90       481



#### Test with BRF Selected Features (TODO: confirm step order; proposed: split, then scale using training-set statistics, then resample the training split only)

In [31]:
# # Train-test split and scale selected features
# X_selected_train, X_selected_test, y_train, y_test = train_test_split(brf_sel.transform(X), y, random_state = 1)

# # Fit StandardScaler
# X_selected_scaler = scaler.fit(X_selected_train)

# # Scale data
# X_selected_train_scaled = X_selected_scaler.transform(X_selected_train)
# X_selected_test_scaled = X_selected_scaler.transform(X_selected_test)

# Randomly oversample the scaled, BRF-selected training features using the
# existing `ros` sampler, and show the class counts after resampling.
X_selected_resampled, y_resampled = ros.fit_resample(X = X_selected_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 1383, 'HLB': 1383})

In [32]:
# Logistic regression fitted on the oversampled, BRF-selected training data
# and evaluated on the BRF-selected scaled test split.
# This rebinds ros_lr_classifier to the model trained on the reduced features.
ros_lr_classifier = LogisticRegression(random_state = 1)
y_pred = ros_lr_classifier.fit(X_selected_resampled, y_resampled).predict(X_selected_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9024890590809628

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             21              3
Actual MCX             32            425

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.40      0.88      0.93      0.55      0.90      0.81        24
        MCX       0.99      0.93      0.88      0.96      0.90      0.82       457

avg / total       0.96      0.93      0.88      0.94      0.90      0.82       481



### Logistic Regression with SMOTE Oversampling

In [33]:
# Oversample the scaled training split with SMOTE (the test split is left
# untouched) and show the class counts after resampling.
smote = SMOTE(sampling_strategy = "auto", random_state = 1)
X_resampled, y_resampled = smote.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'MCX': 1383, 'HLB': 1383})

In [34]:
# Logistic regression fitted on the SMOTE-resampled training data and
# evaluated on the original scaled test split.
smote_lr_classifier = LogisticRegression(random_state = 1)
y_pred = smote_lr_classifier.fit(X_resampled, y_resampled).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9484865061998541

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             22              2
Actual MCX              9            448

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.71      0.92      0.98      0.80      0.95      0.89        24
        MCX       1.00      0.98      0.92      0.99      0.95      0.90       457

avg / total       0.98      0.98      0.92      0.98      0.95      0.90       481



### Logistic Regression with Random Undersampling

In [35]:
# Randomly undersample the scaled training split (the test split is left
# untouched) and show the class counts after resampling.
rus = RandomUnderSampler(random_state = 1)
X_resampled, y_resampled = rus.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 59, 'MCX': 59})

In [36]:
# Logistic regression fitted on the randomly undersampled training data and
# evaluated on the original scaled test split.
rus_lr_classifier = LogisticRegression(random_state = 1)
y_pred = rus_lr_classifier.fit(X_resampled, y_resampled).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9343544857768052

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             24              0
Actual MCX             60            397

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.29      1.00      0.87      0.44      0.93      0.88        24
        MCX       1.00      0.87      1.00      0.93      0.93      0.86       457

avg / total       0.96      0.88      0.99      0.91      0.93      0.86       481



### Logistic Regression with Cluster Centroids Undersampling

In [37]:
# Undersample the scaled training split with ClusterCentroids (the test split
# is left untouched) and show the class counts after resampling.
cc = ClusterCentroids(random_state = 1)
X_resampled, y_resampled = cc.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 59, 'MCX': 59})

In [38]:
# Logistic regression fitted on the ClusterCentroids-resampled training data
# and evaluated on the original scaled test split.
cc_lr_classifier = LogisticRegression(random_state = 1)
y_pred = cc_lr_classifier.fit(X_resampled, y_resampled).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9266046681254558

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             22              2
Actual MCX             29            428

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.43      0.92      0.94      0.59      0.93      0.86        24
        MCX       1.00      0.94      0.92      0.97      0.93      0.86       457

avg / total       0.97      0.94      0.92      0.95      0.93      0.86       481



### Logistic Regression with SMOTEENN Combination Over- and Undersampling

In [39]:
# Resample the scaled training split with SMOTEENN (the test split is left
# untouched) and show the class counts after resampling.
smoteenn = SMOTEENN(random_state = 1)
X_resampled, y_resampled = smoteenn.fit_resample(X = X_train_scaled, y = y_train)
Counter(y_resampled)

Counter({'HLB': 1383, 'MCX': 1334})

In [40]:
# Logistic regression fitted on the SMOTEENN-resampled training data and
# evaluated on the original scaled test split.
smoteenn_lr_classifier = LogisticRegression(random_state = 1)
y_pred = smoteenn_lr_classifier.fit(X_resampled, y_resampled).predict(X_test_scaled)

# Score the predictions: balanced accuracy, a labelled confusion matrix
# (rows = actual class, columns = predicted class) and the imbalanced report.
ba_score = balanced_accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index = ["Actual HLB", "Actual MCX"],
    columns = ["Predicted HLB", "Predicted MCX"],
)
ic_report = classification_report_imbalanced(y_test, y_pred)

# Show all three metrics in a single block of text
print(f"Balanced Accuracy Score: {ba_score}\n\n"
      f"Confusion Matrix: \n{cm_df}\n\n"
      f"Imbalanced Classification Report: \n{ic_report}")

Balanced Accuracy Score: 0.9462983223924143

Confusion Matrix: 
            Predicted HLB  Predicted MCX
Actual HLB             22              2
Actual MCX             11            446

Imbalanced Classification Report: 
                   pre       rec       spe        f1       geo       iba       sup

        HLB       0.67      0.92      0.98      0.77      0.95      0.89        24
        MCX       1.00      0.98      0.92      0.99      0.95      0.90       457

avg / total       0.98      0.97      0.92      0.97      0.95      0.90       481

