In [82]:
#import random
from collections import Counter
from importlib import reload

import h5py
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn as nn
import torch.optim as optim
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBClassifier

import models_Gloire
import utilities

reload(utilities)
reload(models_Gloire)

<module 'models_Gloire' from "/Users/gloirelinvani/Library/Mobile Documents/com~apple~CloudDocs/School/Magistère d'Informatique/LDD3/S6/Stage/Earthquakes/earthquakes/New_code/models_Gloire.py">

# Train, Test split

In [83]:
# Function to generate the training and test data
def generate_train_test_data(hdf5_out_put_file_path, train_percentage, n_pixels_lat, n_pixels_lon, z_component, cnn):
    with h5py.File(hdf5_out_put_file_path, 'r') as file:
        sequences = list(file.keys())
        n_sequences = len(sequences)
        n_train_sequences = int(train_percentage * n_sequences)
        n_test_sequences = n_sequences - n_train_sequences

        print("n_train", n_train_sequences, "n_test", n_test_sequences)
        print("total number of sequences", n_train_sequences + n_test_sequences)
        print("n_sequences", n_sequences)

        if not cnn:
            n_train_samples = n_pixels_lat * n_pixels_lon * n_train_sequences
            n_test_samples = n_pixels_lat * n_pixels_lon * n_test_sequences
            print("n_train_samples", n_train_samples)
            print("n_test_samples", n_test_samples)
            # 2 for displacement vectors norms and distance pixel to the mainshock
            x_train = np.zeros((n_train_samples, 2))
            y_train = np.zeros(n_train_samples)
            x_test = np.zeros((n_test_samples, 2))
            y_test = np.zeros(n_test_samples)

            sample_index = 0
            for i in range(n_train_sequences):
                group = file[sequences[i]]
                interpolated_displacements = group['interpolated_displacement'][()].reshape(-1, 5)

                # Remove masked pixels
                mask = interpolated_displacements[:, -1] == 1
                #print(mask.sum())
                interpolated_displacements = interpolated_displacements[mask]

                labels = group['labels'][()].flatten()
                labels = labels[mask]

                #print(interpolated_displacements)
                if z_component:
                    norms = np.linalg.norm(interpolated_displacements[..., :3], axis=1)
                else:
                    norms = np.linalg.norm(interpolated_displacements[..., :2], axis=1)
                #interpolated_displacements = interpolated_displacements[..., 3]
                #interpolated_displacements = interpolated_displacements.reshape(-1, 1)

                #print(norms)
                #sys.exit()

                n_samples = norms.shape[0]
                x_train[sample_index:sample_index + n_samples] = np.hstack(
                    (norms.reshape(-1, 1), interpolated_displacements[..., -2].reshape(-1, 1)))
                #[norms.reshape(-1, 1), interpolated_displacements[..., -2].reshape(-1,1)], axis=1)
                #print(x_train[sample_index:sample_index + n_samples].shape)
                #sys.exit()
                y_train[sample_index:sample_index + n_samples] = labels

                sample_index += n_samples

            # Remove excess zeros
            x_train = x_train[:sample_index]
            y_train = y_train[:sample_index]
            sample_index = 0
            for i in range(n_train_sequences, n_sequences):
                group = file[sequences[i]]
                interpolated_displacements = group['interpolated_displacement'][()].reshape(-1, 5)

                # Remove masked pixels
                mask = interpolated_displacements[:, -1] == 1
                interpolated_displacements = interpolated_displacements[mask]
                labels = group['labels'][()].flatten()
                labels = labels[mask]

                if z_component:
                    norms = np.linalg.norm(interpolated_displacements[..., :3], axis=1)
                else:
                    norms = np.linalg.norm(interpolated_displacements[..., :2], axis=1)
                # interpolated_displacements = interpolated_displacements[..., 3]
                #interpolated_displacements = interpolated_displacements.reshape(-1, 1)

                n_samples = norms.shape[0]
                x_test[sample_index:sample_index + n_samples] = np.hstack(
                    (norms.reshape(-1, 1), interpolated_displacements[..., -2].reshape(-1, 1)))
                y_test[sample_index:sample_index + n_samples] = labels

                sample_index += n_samples
            # Remove excess zeros
            x_test = x_test[:sample_index]
            y_test = y_test[:sample_index]

            return x_train, y_train, x_test, y_test
        else:
            # 2 for displacement vectors norms and distance pixel to the mainshock
            x_train = np.zeros((n_train_sequences, n_pixels_lat, n_pixels_lon, 3))
            y_train = np.zeros((n_train_sequences, n_pixels_lat, n_pixels_lon))
            x_test = np.zeros_like(x_train)
            y_test = np.zeros_like(y_train)

            for i in range(n_train_sequences):
                group = file[sequences[i]]
                interpolated_displacements = group['interpolated_displacement'][()]
                labels = group['labels'][()]
                if z_component:
                    norms = np.linalg.norm(interpolated_displacements[..., :3], axis=2)
                else:
                    norms = np.linalg.norm(interpolated_displacements[..., :2], axis=2)
                # shape: (n_pixels_lat, n_pixels_lon, 2) for the displacement vectors norms and distance pixel to the mainshock
                x_train[i] = np.stack([norms, interpolated_displacements[..., -2], interpolated_displacements[..., -1]],
                                      axis=-1)
                #print(x_train[i].shape)
                y_train[i] = labels

            for i in range(n_train_sequences, n_sequences):
                group = file[sequences[i]]
                interpolated_displacements = group['interpolated_displacement'][()]
                labels = group['labels'][()]
                if z_component:
                    norms = np.linalg.norm(interpolated_displacements[..., :3], axis=2)
                else:
                    norms = np.linalg.norm(interpolated_displacements[..., :2], axis=2)
                x_test[i - n_train_sequences] = np.stack(
                    [norms, interpolated_displacements[..., -2], interpolated_displacements[..., -1]], axis=-1)
                #print(x_test[i - n_train_sequences].shape)
                #sys.exit()
                y_test[i - n_train_sequences] = labels

            return x_train, y_train, x_test, y_test

# Parameters

In [84]:
#Path to the HDF5 output file
hdf5_out_put_file_path = "Data/Interpolated_Data_reg=False_soft_labels=False_elasticity=True_min_mainshock_mag=6_min_stations_per_main_shock=3_min_after_shock_mag=4_after_shock_time_window=45_n_days_before_mainshock=1_n_days_after_mainshock=1.hdf5"
train_percentage = 0.8
n_pixels_lat = 50  # Number of pixels in the latitude direction
n_pixels_lon = 50  # Number of pixels in the longitude direction
z_component = False  # Whether to use the z component of the displacement vectors
cnn = True  # Whether to use a CNN model or not

In [85]:
x_train_val, y_train_val, x_test, y_test = generate_train_test_data(hdf5_out_put_file_path, train_percentage,
                                                                    n_pixels_lat,
                                                                    n_pixels_lon,
                                                                    z_component,
                                                                    cnn)

n_train 132 n_test 34
total number of sequences 166
n_sequences 166


# Train, Validation split

In [86]:
# Calculate the split index
split_index = int(0.8 * len(x_train_val))  # 80% for training, 20% for validation

# Split the data
x_train = x_train_val[:split_index]
y_train = y_train_val[:split_index]

x_val = x_train_val[split_index:]
y_val = y_train_val[split_index:]

In [87]:
x_train[y_train == 1]

array([[  2.40014319, 110.80120583,   0.        ],
       [  2.35516775,  41.02931667,   0.        ],
       [  2.31825672,   0.        ,   1.        ],
       ...,
       [  0.33266482, 113.35751632,   0.        ],
       [  0.24977639, 149.95279387,   1.        ],
       [  0.24286018, 147.64151047,   0.        ]])

In [88]:
x_train.shape

(105, 50, 50, 3)

In [89]:
y_train

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [90]:
y_train.shape

(105, 50, 50)

In [91]:
y_train.sum()

11413.0

In [59]:
lr = LogisticRegression(random_state=42, n_jobs=-1)

In [60]:
lr.fit(x_train, y_train)

In [61]:
y_train_pred = lr.predict(x_train)
train_score = metrics.balanced_accuracy_score(y_train, y_train_pred)

In [62]:
train_score

0.5263875682634579

# Oversampling the training set

In [63]:
# Oversampling the training set using SMOTE
oversampler = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = oversampler.fit_resample(x_train, y_train)

In [64]:
Counter(y_train_resampled)

Counter({0.0: 122226, 1.0: 122226})

In [65]:
lr_resampled = LogisticRegression(random_state=42, n_jobs=-1)
lr_resampled.fit(x_train_resampled, y_train_resampled)

In [66]:
y_train_pred_resampled = lr_resampled.predict(x_train_resampled)
train_score_resampled = metrics.balanced_accuracy_score(y_train_resampled, y_train_pred_resampled)

In [67]:
train_score_resampled

0.6414919902475742

# Baseline models

In [68]:
x_train

array([[  2.44613239,  92.33735315],
       [  2.43237149,  88.58553127],
       [  2.4197729 ,  84.86292917],
       ...,
       [  2.18681355, 124.99688661],
       [  2.11637382, 123.16651474],
       [  2.01678129, 121.4375761 ]])

## CNN models

### Simple CNN

In [93]:
# Setting the seed for reproducibility
utilities.set_seed()
# Initialize the model
simple_cnn = models_Gloire.SimpleCNN()

# Loss and optimizer
criterion = nn.BCELoss(reduction='none')
optimizer = optim.Adam(simple_cnn.parameters(), lr=0.001)

# Assuming x_train and y_train are defined and preprocessed appropriately
x_train_tensor = torch.tensor(x_train[:, :, :, :2], dtype=torch.float32).permute(0, 3, 1,
                                                                                 2)  # Convert to (105, 2, 50, 50)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)  # Convert to (105, 1, 50, 50)
train_mask = torch.tensor(x_train[:, :, :, 2], dtype=torch.float32).unsqueeze(1)  # Convert to (105, 1, 50, 50)

x_val_tensor = torch.tensor(x_val[:, :, :, :2], dtype=torch.float32).permute(0, 3, 1, 2)  # Convert to (105, 2, 50, 50)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)  # Convert to (105, 1, 50, 50)
val_mask = torch.tensor(x_val[:, :, :, 2], dtype=torch.float32).unsqueeze(1)  # Convert to (105, 1, 50, 50)

x_test_tensor = torch.tensor(x_test[:, :, :, :2], dtype=torch.float32).permute(0, 3, 1,
                                                                               2)  # Convert to (105, 2, 50, 50)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)  # Convert to (105, 1, 50, 50)
test_mask = torch.tensor(x_test[:, :, :, 2], dtype=torch.float32).unsqueeze(1)  # Convert to (105, 1, 50, 50)

# Prepare DataLoader
train_dataset = TensorDataset(x_train_tensor, y_train_tensor, train_mask)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor, val_mask)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, generator=torch.Generator().manual_seed(42))
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Train the model
models_Gloire.train_simple_cnn_model(simple_cnn, train_loader, criterion, optimizer, num_epochs=20)

Epoch 1/20, Loss: 0.7388786245137453
Epoch 2/20, Loss: 0.5260149575769901
Epoch 3/20, Loss: 0.9482496194541454
Epoch 4/20, Loss: 0.4892433686181903
Epoch 5/20, Loss: 0.4411251377314329
Epoch 6/20, Loss: 0.43475123308598995
Epoch 7/20, Loss: 0.38018006831407547
Epoch 8/20, Loss: 0.0978187695145607
Epoch 9/20, Loss: 0.07730127405375242
Epoch 10/20, Loss: 0.10159548092633486
Epoch 11/20, Loss: 0.0734406765550375
Epoch 12/20, Loss: 0.08020349871367216
Epoch 13/20, Loss: 0.07465546485036612
Epoch 14/20, Loss: 0.07710205018520355
Epoch 15/20, Loss: 0.09552344772964716
Epoch 16/20, Loss: 0.08854550868272781
Epoch 17/20, Loss: 0.07213073456659913
Epoch 18/20, Loss: 0.09405478276312351
Epoch 19/20, Loss: 0.0865883557125926
Epoch 20/20, Loss: 0.07585322111845016
Balanced Accuracy: 50.00%
ROC AUC: 0.71
Accuracy: 95.81%


In [94]:
# Evaluate the model on the train set
models_Gloire.evaluate_cnn_model(simple_cnn, train_loader)

Balanced Accuracy: 50.00%
ROC AUC: 0.71
Accuracy: 95.81%


In [95]:
# Evaluate the model on the validation set
models_Gloire.evaluate_cnn_model(simple_cnn, val_loader)

Balanced Accuracy: 50.00%
ROC AUC: 0.63
Accuracy: 98.57%


### Improved CNN

In [99]:
# Initialize the model
improved_cnn = models_Gloire.ImprovedCNN()

# Compute class weights
pos_weight = torch.tensor([(y_train == 0).sum() / (y_train == 1).sum()])

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight, reduction='none')
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

# Train the model
models_Gloire.train_improved_cnn_model(improved_cnn, train_loader, criterion, optimizer, scheduler, num_epochs=20)


The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.



Epoch 1/20, Loss: 1.4121
Epoch 2/20, Loss: 1.4104
Epoch 3/20, Loss: 1.3732
Epoch 4/20, Loss: 1.4465
Epoch 5/20, Loss: 1.4411
Epoch 6/20, Loss: 1.3819
Epoch 7/20, Loss: 1.3238
Epoch 8/20, Loss: 1.4661
Epoch 9/20, Loss: 1.4003
Epoch 10/20, Loss: 1.3528
Epoch 11/20, Loss: 1.4245
Epoch 12/20, Loss: 1.3692
Epoch 13/20, Loss: 1.3934
Epoch 14/20, Loss: 1.4085
Epoch 15/20, Loss: 1.4749
Epoch 16/20, Loss: 1.3626
Epoch 17/20, Loss: 1.4063
Epoch 18/20, Loss: 1.3713
Epoch 19/20, Loss: 1.3176
Epoch 20/20, Loss: 1.4249


In [100]:
# Evaluate the model on the train set
models_Gloire.evaluate_cnn_model(improved_cnn, train_loader)

Balanced Accuracy: 49.51%
ROC AUC: 0.37
Accuracy: 92.20%


In [101]:
# Evaluate the model on the validation set
models_Gloire.evaluate_cnn_model(improved_cnn, val_loader)

Balanced Accuracy: 50.14%
ROC AUC: 0.54
Accuracy: 94.09%


## Other baseline models

In [73]:
# Calculate the scale_pos_weight hyperparameter for XGBoost
# This is used to balance the positive and negative class weights. Since the dataset is imbalanced, we want to give more weight to the positive class.
class_counts = Counter(y_train)
scale_pos_weight = class_counts[0] / class_counts[1]

# Calculate the class_weight hyperparameter for SVM
class_weight_options = {0: 1, 1: scale_pos_weight}

# Defining models
models = [
    {
        'name': 'Ensemble Soft Voting Random Forest Classifier',
        'estimator': models_Gloire.BootstrappedSoftVotingClassifier(RandomForestClassifier(random_state=42, n_jobs=-1,
                                                                                           n_estimators=10))
    },
    {
        'name': 'Ensemble Soft Voting XGBoost Classifier',
        'estimator': models_Gloire.BootstrappedSoftVotingClassifier(XGBClassifier(random_state=42, n_jobs=-1,
                                                                                  ), n_estimators=10, xg_boost=True)
    },
    {
        'name': 'Ensemble Soft Voting LightGBM Classifier',
        'estimator': models_Gloire.BootstrappedSoftVotingClassifier(LGBMClassifier(random_state=42, n_jobs=-1),
                                                                    n_estimators=10)

    },
    {
        'name': 'Ensemble Soft Voting Logistic Regression',
        'estimator': models_Gloire.BootstrappedSoftVotingClassifier(LogisticRegression(random_state=42, n_jobs=-1),
                                                                    n_estimators=10)
    },
    {
        'name': 'Random Forest Classifier',
        'estimator': RandomForestClassifier(random_state=42, n_jobs=-1, class_weight=class_weight_options)
    },
    {
        'name': 'XGBoost',
        'estimator': XGBClassifier(random_state=42, n_jobs=-1,
                                   scale_pos_weight=scale_pos_weight)
    },
    {
        'name': 'LightGBM',
        'estimator': LGBMClassifier(random_state=42, objective='binary', n_jobs=-1, class_weight=class_weight_options)

    },
    {
        'name': 'Logistic Regression',
        'estimator': LogisticRegression(random_state=42, n_jobs=-1, class_weight=class_weight_options)
    }
]


# Function to evaluate a baseline model
def evaluate_baseline_model(x_train, y_train, x_test, y_test, model, model_name):
    pipeline = make_pipeline(
        #SMOTE(random_state=42),
        model)

    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    y_probas = pipeline.predict_proba(x_test)
    precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_probas[:, 1])
    # Calculate the area under the precision-recall curve
    auprc = metrics.auc(recall, precision)
    auroc = metrics.roc_auc_score(y_test, y_probas[:, 1])
    return {
        'Model': model_name,
        'Recall': metrics.recall_score(y_test, y_pred, average='binary'),
        'Balanced Accuracy': metrics.balanced_accuracy_score(y_test, y_pred),
        'AUROC': auroc,
        'F1 Score': metrics.f1_score(y_test, y_pred, average='binary'),
        'AUPRC': auprc,
        'Accuracy': metrics.accuracy_score(y_test, y_pred)
    }, conf_matrix

In [70]:
def plot_confusion_matrix(conf_matrix, model_name):
    fig = px.imshow(conf_matrix, labels=dict(x="Predicted", y="True"),
                    x=['No Aftershock', 'Aftershock'], y=['No Aftershock', 'Aftershock'],
                    color_continuous_scale='Viridis',
                    color_continuous_midpoint=np.average(conf_matrix),
                    title=f"Confusion Matrix for {model_name}",
                    text_auto=True)
    fig.show()

# Train score

In [74]:
# Iterate through models and evaluate them
results_train = []
for model in models:
    score, conf_matrix = evaluate_baseline_model(x_train, y_train, x_train, y_train,
                                                 model['estimator'], model['name'])
    plot_confusion_matrix(conf_matrix, model['name'])
    results_train.append(score)

# Convert scores to a DataFrame
results_df_train = pd.DataFrame(results_train)

[LightGBM] [Info] Number of positive: 4812, number of negative: 122363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 127175, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 4799, number of negative: 122376
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 127175, number of used features: 2
[LightGBM] [Info] [bin

[LightGBM] [Info] Number of positive: 4949, number of negative: 122226
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001386 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 127175, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [75]:
results_df_train

Unnamed: 0,Model,Recall,Balanced Accuracy,AUROC,F1 Score,AUPRC,Accuracy
0,Ensemble Soft Voting Random Forest Classifier,0.647808,0.823638,0.999073,0.780049,0.976116,0.985783
1,Ensemble Soft Voting XGBoost Classifier,0.665387,0.793422,0.918948,0.369129,0.427494,0.911492
2,Ensemble Soft Voting LightGBM Classifier,0.571833,0.745508,0.861135,0.320553,0.346548,0.905665
3,Ensemble Soft Voting Logistic Regression,0.465751,0.642502,0.671182,0.157091,0.164605,0.805496
4,Random Forest Classifier,0.997777,0.998889,1.0,0.998887,1.0,0.999914
5,XGBoost,0.772883,0.8184,0.911017,0.301098,0.408712,0.860374
6,LightGBM,0.606789,0.752031,0.857322,0.292861,0.34511,0.885968
7,Logistic Regression,0.462922,0.642519,0.671308,0.158098,0.164536,0.808138


# Validation score

In [76]:
x_val

array([[  1.90237303, 119.81446225],
       [  1.78147696, 118.30152946],
       [  1.65926464, 116.90305595],
       ...,
       [  2.97959278, 151.18114928],
       [  2.79942072, 153.75925219],
       [  2.66760357, 156.40656952]])

In [77]:
y_val.shape

(31794,)

In [78]:
y_val.sum()

443.0

In [79]:
x_val.shape

(31794, 2)

In [80]:
# Iterate through models and evaluate them
results_val = []
for model in models:
    score, conf_matrix = evaluate_baseline_model(x_train, y_train, x_val, y_val,
                                                 model['estimator'], model['name'])
    plot_confusion_matrix(conf_matrix, model['name'])
    results_val.append(score)

# Convert scores to a DataFrame
results_df_val = pd.DataFrame(results_val)

[LightGBM] [Info] Number of positive: 4953, number of negative: 122222
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 127175, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 4911, number of negative: 122264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 127175, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 4949, number of negative: 122226
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 127175, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [81]:
results_df_val

Unnamed: 0,Model,Recall,Balanced Accuracy,AUROC,F1 Score,AUPRC,Accuracy
0,Ensemble Soft Voting Random Forest Classifier,0.051919,0.525002,0.618922,0.087452,0.052773,0.984903
1,Ensemble Soft Voting XGBoost Classifier,0.182844,0.571008,0.615275,0.0898,0.070128,0.948355
2,Ensemble Soft Voting LightGBM Classifier,0.194131,0.580527,0.62386,0.109834,0.072893,0.956155
3,Ensemble Soft Voting Logistic Regression,0.392777,0.627028,0.648906,0.070077,0.066338,0.854752
4,Random Forest Classifier,0.054176,0.525764,0.612825,0.087273,0.048262,0.984211
5,XGBoost,0.259594,0.576114,0.609046,0.058614,0.065934,0.883815
6,LightGBM,0.230248,0.5879,0.616051,0.090586,0.067343,0.935585
7,Logistic Regression,0.388262,0.625569,0.648907,0.070004,0.066329,0.856262


# Hyperparameter optimization

In [None]:
CV = 5  # Number of cross-validation folds

In [None]:
# Defining models and their hyperparameters
models = [
    {
        'name': 'Random Forest Classifier',
        'estimator': RandomForestClassifier(random_state=42, n_jobs=-1),
        'hyperparameters': {
            'model__n_estimators': [100, 300],
            'model__max_depth': [None, 5, 15],
            'model__min_samples_split': [3, 5],
            'model__min_samples_leaf': [1, 3],
            'model__bootstrap': [True],
            'model__criterion': ['gini', 'entropy']
        }
    },
    {
        'name': 'XGBoost',
        'estimator': XGBClassifier(random_state=42, objective='binary:logistic',
                                   n_jobs=-1),
        'hyperparameters': {
            'model__max_depth': [0, 5, 15],
            'model__learning_rate': [0.01, 0.1],
            'model__n_estimators': [100, 300],
            'model__subsample': [0.8, 1.0],
            'model__colsample_bytree': [0.8, 1.0],
        }
    },
    {
        'name': 'LightGBM',
        'estimator': LGBMClassifier(random_state=42, objective='binary', n_jobs=-1),
        'hyperparameters': {
            #'model__num_leaves': [15, 31, 63],
            'model__learning_rate': [0.01, 0.1],
            'model__n_estimators': [100, 300],
            'model__max_depth': [-1, 5, 15],
            'model__subsample': [0.8, 1.0],
            'model__colsample_bytree': [0.8, 1.0],
        }
    },
    {
        'name': 'Logistic Regression',
        'estimator': LogisticRegression(random_state=42, n_jobs=-1),
        'hyperparameters': {
            'model__max_iter': [100, 1000],
            'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'model__penalty': ['l1', 'l2'],
            'model__solver': ['liblinear']
        }
    }
]


# Function to perform model training, evaluation, and printing results
def evaluate_model(x_train, y_train, x_test, y_test, model_dict, hyperparameters):
    model = model_dict['estimator']
    results = []
    pipeline = Pipeline(steps=[('model', model)])

    # Defining the scoring metrics
    # scoring_metrics = {
    #'recall': metrics.make_scorer(metrics.recall_score, average='binary'),
    # balanced_accuracy = metrics.make_scorer(metrics.balanced_accuracy_score),
    #'f1_score': metrics.make_scorer(metrics.f1_score, average='binary')
    # }
    grid_search = model_selection.GridSearchCV(pipeline, hyperparameters,
                                               scoring=metrics.make_scorer(metrics.recall_score, average='binary'),
                                               n_jobs=-1,
                                               cv=CV,
                                               refit=True
                                               )
    grid_search.fit(x_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test)
    # Conf matrix
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    #print(f"Confusion Matrix for {model_dict['name']}:\n{conf_matrix}")
    y_probas = best_model.predict_proba(x_test)
    precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_probas[:, 1])
    # Calculate the area under the precision-recall curve
    auprc = metrics.auc(recall, precision)
    auroc = metrics.roc_auc_score(y_test, y_probas[:, 1])
    results.append({
        'Model': model_dict['name'],
        'Recall': metrics.recall_score(y_test, y_pred, average='binary'),
        'Balanced Accuracy': metrics.balanced_accuracy_score(y_test, y_pred),
        'AUROC': auroc,
        'F1 Score': metrics.f1_score(y_test, y_pred, average='binary'),
        'AUPRC': auprc,
        'Accuracy': metrics.accuracy_score(y_test, y_pred)
    })

    return grid_search, results, best_model, conf_matrix

In [None]:
grid_search_RF, results_RF, best_model_RF, conf_matrix_RF = evaluate_model(x_train, y_train, x_val,
                                                                           y_val, models[0],
                                                                           models[0]['hyperparameters'])
# Saving the best Random Forest model
joblib.dump(grid_search_RF, 'grid_search_RF.pkl')
print(f"Random Forest - Best Hyperparameters: {grid_search_RF.best_params_}")

In [None]:
print(f"Best RF model mean CV score: {grid_search_RF.best_score_:.2%})")

In [None]:
results_RF_df = pd.DataFrame(results_RF)
results_RF_df

In [None]:
grid_search_RF = joblib.load('grid_search_RF.pkl')
best_model_RF = grid_search_RF.best_estimator_
y_pred = best_model_RF.predict(x_val)
# Conf matrix
conf_matrix_RF = metrics.confusion_matrix(y_val, y_pred)

In [None]:
plot_confusion_matrix(conf_matrix_RF, 'Random Forest')

In [None]:
grid_search_XGB, results_XGB, best_model_XGB, conf_matrix_XGB = evaluate_model(x_train, y_train,
                                                                               x_val, y_val, models[1],
                                                                               models[1]['hyperparameters'])
# Saving the best XGBoost model
joblib.dump(grid_search_XGB, 'grid_search_XGB.pkl')
print(f"XGBoost - Best Hyperparameters: {grid_search_XGB.best_params_}")

In [None]:
print(f"Best XGB model mean CV score: {grid_search_XGB.best_score_:.2%})")

In [None]:
results_XGB_df = pd.DataFrame(results_XGB)
results_XGB_df

In [None]:
plot_confusion_matrix(conf_matrix_XGB, 'XGBoost')

In [None]:
grid_search_LGBM, results_LGBM, best_model_LGBM, conf_matrix_LGBM = evaluate_model(x_train, y_train,
                                                                                   x_val, y_val, models[2],
                                                                                   models[2]['hyperparameters'])
# Saving the best LightGBM model
joblib.dump(grid_search_LGBM, 'grid_search_LGBM.pkl')
print(f"LightGBM - Best Hyperparameters: {grid_search_LGBM.best_params_}")

In [None]:
print(f"Best LGBM model mean CV score: {grid_search_LGBM.best_score_:.2%})")

In [None]:
results_LGBM_df = pd.DataFrame(results_LGBM)
results_LGBM_df

In [None]:
plot_confusion_matrix(conf_matrix_LGBM, 'LightGBM')

In [None]:
grid_search_LR, results_LR, best_model_LR, conf_matrix_LR = evaluate_model(x_train, y_train, x_val,
                                                                           y_val, models[3],
                                                                           models[3]['hyperparameters'])
#Saving the best LightGBM model
joblib.dump(grid_search_LR, 'grid_search_LR.pkl')
print(f"Logistic Regression - Best Hyperparameters: {grid_search_LR.best_params_}")

In [None]:
print(f"Best Logistic Regression model mean CV score: {grid_search_LR.best_score_:.2%})")

In [None]:
results_LR_df = pd.DataFrame(results_LR)
results_LR_df

In [None]:
plot_confusion_matrix(conf_matrix_LR, 'Logistic Regression')