In [2]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3


In [3]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import joblib  # For saving models
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [4]:
# Directory that holds this pipeline's artifacts; created on first run and
# left untouched when it already exists.
output_dir = "data/outputs/"
os.makedirs(output_dir, exist_ok=True)

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any solver/convergence
# warnings raised later in the notebook - confirm that is intended.
import warnings
warnings.filterwarnings(action="ignore")

In [5]:
# ---- STEP 1: LOAD DATA ----
# The feature table is read from the same directory this notebook writes to
# (presumably produced by an upstream feature-engineering step - confirm).
features_filename = "features_insurance_data.csv"
file_path = os.path.join(output_dir, features_filename)
df = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Split the frame into the target (y) and the predictors (X): every column
# other than the target is used as a feature.
target_column = "churn"
y = df[target_column]
X = df.drop(columns=[target_column])

In [7]:
# ---- STEP 2: TRAIN-TEST SPLIT (80/20) ----
# Hold out 20% of the rows for testing; the split is stratified on y and
# seeded so it is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")

Training Set: (800, 6), Testing Set: (200, 6)


In [8]:
# ---- STEP 3: HANDLE CLASS IMBALANCE USING SMOTE ----
# Oversample the training split only; the test split is never resampled.
smote = SMOTE(random_state=42)
resampled_parts = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_parts

print(f"Resampled Training Set: {X_train_resampled.shape}")

Resampled Training Set: (1274, 6)


In [9]:
# ---- STEP 4: HYPERPARAMETER TUNING USING CROSS-VALIDATION ----
# Search space for the grid search: every C value is combined with every
# solver, giving 5 x 2 = 10 candidate settings.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],        # Regularization strength
    solver=["liblinear", "lbfgs"],    # Solver methods
)

In [10]:
# Base estimator whose hyperparameters are tuned below.
log_reg = LogisticRegression(max_iter=500, random_state=42)

# Exhaustive search over param_grid, scored by F1 with 5-fold stratified CV,
# using all available cores.
# NOTE(review): the search is fitted on the SMOTE-resampled training data, so
# each validation fold contains synthetic rows; the cross-validated F1 may be
# optimistic compared with the untouched test set. Consider moving the
# resampling inside the CV loop (e.g. an imblearn Pipeline) - confirm.
cv_splitter = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring="f1",
    cv=cv_splitter,
    n_jobs=-1,
)
grid_search.fit(X_train_resampled, y_train_resampled)

In [11]:
# Report the hyperparameter combination the search selected ...
print(f"Best Model Parameters: {grid_search.best_params_}")

# ... and keep a handle on the corresponding estimator for the steps below.
best_model = grid_search.best_estimator_

Best Model Parameters: {'C': 0.01, 'solver': 'liblinear'}


In [12]:
# ---- STEP 5: TRAIN THE FINAL MODEL ----
# Fit the selected estimator on the full resampled training set.
# NOTE(review): GridSearchCV refits its best estimator by default
# (refit=True), so this call is probably redundant - confirm before removing.
best_model.fit(X=X_train_resampled, y=y_train_resampled)

In [13]:
# ---- STEP 6: MAKE PREDICTIONS ----
# Score the held-out test split: keep the second probability column
# (index 1) as the probability score, then the hard class labels.
test_probabilities = best_model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]  # Probability scores
y_pred = best_model.predict(X_test)

In [14]:
# ---- STEP 7: EVALUATE MODEL PERFORMANCE ----
# Metrics computed from the hard label predictions, in display order.
label_scorers = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
]
metrics = {name: scorer(y_test, y_pred) for name, scorer in label_scorers}

# AUC-ROC is the one metric computed from the probability scores instead.
metrics["AUC-ROC"] = roc_auc_score(y_test, y_pred_proba)

In [15]:
# Show each metric on its own line, rounded to four decimal places.
print("\nModel Performance Metrics:")
for label, score in metrics.items():
    print("{}: {:.4f}".format(label, score))


Model Performance Metrics:
Accuracy: 0.4100
Precision: 0.1652
Recall: 0.4634
F1 Score: 0.2436
AUC-ROC: 0.4857


In [16]:
# ---- STEP 8: SAVE MODEL PREDICTIONS ----
# One row per test sample: true label, predicted label and probability score.
predictions_file = os.path.join(output_dir, "lr_model_predictions.csv")
predictions_df = pd.DataFrame(
    {
        "Actual": y_test,
        "Predicted": y_pred,
        "Predicted_Probability": y_pred_proba,
    }
)

# The row index is not written to the CSV.
predictions_df.to_csv(predictions_file, index=False)
print(f"Model Predictions Saved at: {predictions_file}")

Model Predictions Saved at: data/outputs/lr_model_predictions.csv


In [17]:
# ---- STEP 9: SAVE THE TRAINED MODEL ----
# Persist the fitted estimator with joblib next to the other outputs.
model_filename = "logistic_regression_model.pkl"
model_file = os.path.join(output_dir, model_filename)
joblib.dump(best_model, model_file)
print(f"Trained Model Saved at: {model_file}")

# Final status line for the run.
print("\nModel Training Completed Successfully!")

Trained Model Saved at: data/outputs/logistic_regression_model.pkl

Model Training Completed Successfully!
