In [1]:
import sys
sys.path.append('..')

import os
import logging
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, GridSearchCV,  ParameterGrid, train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)
from sklearn.multiclass import OneVsRestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import joblib

from utils.data_utils import process_data_for_lr
from utils.evaluation import evaluate_model

from IPython.display import display, Markdown


logger = logging.getLogger('LogisticRegression')
logger.setLevel(logging.INFO)

In [2]:
def load_data(file_path, num_rows=None):
    df = pd.read_csv(file_path)
    return df.sample(n=num_rows, random_state=42) if num_rows else df

In [3]:
def tune_lr(X_train, y_train, cv_folds=5, param_grid=None):
    """
    Train a Logistic Regression model using k-fold cross-validation with hyperparameter tuning.

    Parameters:
    - X_train: Training features (DataFrame or array)
    - y_train: Training labels (Series or array)
    - cv_folds: Number of folds for cross-validation.
    - param_grid: Dictionary of hyperparameters to search over.

    Returns:
    - best_model: The best Logistic Regression model found via GridSearchCV.
    - grid_search: The fitted GridSearchCV object (for further inspection if needed).
    """
    # Set default hyperparameter grid if not provided
    if param_grid is None:
        # Define separate parameter grids for different solvers to ensure compatibility
        param_grid = [
            # liblinear solver - works with l1 and l2 penalties
            {
                'solver': ['liblinear'],
                'penalty': ['l1', 'l2'],
                'C': [0.001, 0.01, 0.1, 1, 10],
                'max_iter': [100, 500, 1000]
            },
            # lbfgs solver - only works with l2 penalty
            {
                'solver': ['lbfgs'],
                'penalty': ['l2'],
                'C': [0.001, 0.01, 0.1, 1, 10],
                'max_iter': [100, 500, 1000]
            }
        ]
        
    # Initialize the logistic regression classifier
    lr = LogisticRegression(random_state=42)

    # Setup k-fold cross-validation
    kfold = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Initialize GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(
        estimator=lr,
        param_grid=param_grid,
        cv=kfold,
        scoring='accuracy',
        n_jobs=-1,
        refit=True,  # refit on the entire training set with the best params
        verbose=2
    )

    # Fit grid search
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best parameters found:", best_params)
    print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

    best_model = grid_search.best_estimator_
    return best_model, grid_search, best_params

In [4]:
# Load data
df_1000 = load_data('../data/train.csv', num_rows=1000)

# Process data
lr_obj = process_data_for_lr(df_1000, mode='train')

X = lr_obj['X']
y = lr_obj['y']
feature_names = lr_obj['feature_names']
preprocessor = lr_obj['preprocessor']
scaler = preprocessor['scaler']
encoder = preprocessor['encoder']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
best_model, grid_search, best_params = tune_lr(X_train, y_train, cv_folds=5)

y_predictions = best_model.predict(X_test)
y_probabilities = best_model.predict_proba(X_test)[:, 1]

lr_metrics, lr_markdown = evaluate_model(
    y_test,
    y_predictions,
    y_probabilities,
    model_name="Logistic Regression"
)

display(Markdown(lr_markdown))

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best parameters found: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.7467


# Logistic Regression Evaluation Results

## Metrics

| Metric                    | Score    |
|---------------------------|----------|
| Validation Accuracy       | 0.7400 |
| Validation F1 Score (weighted) | 0.7344 |
| Validation Precision (weighted) | 0.7346 |
| Validation Recall (class 1) | 0.5676 |
| Binary Precision (class 1) | 0.6774 |
| Binary F1 Score (class 1) | 0.6176 |
| ROC-AUC Score             | 0.8130 |

## Detailed Classification Report

```
              precision    recall  f1-score   support

           0       0.77      0.84      0.80        63
           1       0.68      0.57      0.62        37

    accuracy                           0.74       100
   macro avg       0.72      0.70      0.71       100
weighted avg       0.73      0.74      0.73       100

```


In [5]:
def resample(X, y):
  """Undersample to match classes 1:1"""
  sampler = RandomUnderSampler(random_state=42)
  X_resampled, y_resampled = sampler.fit_resample(X, y)
  logger.info('Class distribution before sampling: %s', np.bincount(y))
  logger.info('Class distribution after sampling: %s', np.bincount(y_resampled))
  return X_resampled, y_resampled

In [6]:
resample_flag = True
drop_features_flag = True

# Load data
df_full = load_data('../data/train.csv')

# Process data
lr_obj = process_data_for_lr(df_full, mode='train')

X = lr_obj['X']
y = lr_obj['y']
feature_names = lr_obj['feature_names']
preprocessor = lr_obj['preprocessor']
scaler = preprocessor['scaler']
encoder = preprocessor['encoder']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

if resample_flag:
    X_train, y_train = resample(X_train, y_train)
    best_params.setdefault('class_weight', 'balanced')

lr_model = LogisticRegression(**best_params)
lr_model.fit(X, y)

y_predictions = lr_model.predict(X_test)
y_probabilities = lr_model.predict_proba(X_test)[:, 1]

lr_metrics, lr_markdown = evaluate_model(
    y_test,
    y_predictions,
    y_probabilities,
    model_name="Logistic Regression"
)


# dump scaler and encoder
os.makedirs("models", exist_ok=True)
joblib.dump(lr_model, "models/logistic_regression_model.pkl")
joblib.dump(scaler, "models/lr_scaler.pkl")
joblib.dump(encoder, "models/lr_encoder.pkl")
joblib.dump(preprocessor, "models/lr_preprocessor.pkl")

display(Markdown(lr_markdown))

# Logistic Regression Evaluation Results

## Metrics

| Metric                    | Score    |
|---------------------------|----------|
| Validation Accuracy       | 0.7276 |
| Validation F1 Score (weighted) | 0.7298 |
| Validation Precision (weighted) | 0.7353 |
| Validation Recall (class 1) | 0.7136 |
| Binary Precision (class 1) | 0.6363 |
| Binary F1 Score (class 1) | 0.6727 |
| ROC-AUC Score             | 0.8029 |

## Detailed Classification Report

```
              precision    recall  f1-score   support

           0       0.80      0.74      0.77     24933
           1       0.64      0.71      0.67     16100

    accuracy                           0.73     41033
   macro avg       0.72      0.73      0.72     41033
weighted avg       0.74      0.73      0.73     41033

```


In [7]:
# Load validation data
val_df = pd.read_csv('../data/validation.csv')

# Process validation data
lr_obj_test = process_data_for_lr(val_df, preprocessor=preprocessor, mode='inference')
lr_X = lr_obj['X']
lr_y = lr_obj['y']


In [8]:

lr_y_predictions = lr_model.predict(lr_X)
lr_y_probabilities = lr_model.predict_proba(lr_X)[:, 1]
# For Logistic Regression


In [9]:
lr_metrics, lr_markdown = evaluate_model(
    lr_y,
    lr_y_predictions,
    y_prob=lr_y_probabilities,
    model_name="Logistic Regression"
)

display(Markdown(lr_markdown))


# Logistic Regression Evaluation Results

## Metrics

| Metric                    | Score    |
|---------------------------|----------|
| Validation Accuracy       | 0.7267 |
| Validation F1 Score (weighted) | 0.7289 |
| Validation Precision (weighted) | 0.7343 |
| Validation Recall (class 1) | 0.7115 |
| Binary Precision (class 1) | 0.6355 |
| Binary F1 Score (class 1) | 0.6714 |
| ROC-AUC Score             | 0.8022 |

## Detailed Classification Report

```
              precision    recall  f1-score   support

           0       0.80      0.74      0.77    249302
           1       0.64      0.71      0.67    161024

    accuracy                           0.73    410326
   macro avg       0.72      0.72      0.72    410326
weighted avg       0.73      0.73      0.73    410326

```


The logistic regression model demonstrates moderate and balanced performance on the validation set. With an accuracy of 73.3%, the model correctly classifies roughly three out of every four instances. Precision (73.0%) and recall (73.3%) indicate that the model maintains a consistent ability to identify positive cases while minimizing false positives. The F1 score of 72.8% reinforces this balance, while a ROC AUC of 0.790 shows that the model has a fair ability to discriminate between classes. Overall, these results justify the decision to adopt this final model for the problem at hand.