In [41]:
!pip install imblearn



In [42]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
#import joblib  # For saving models
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [43]:
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Create output directory if not exists
output_dir = "data/outputs/"
os.makedirs(output_dir, exist_ok=True)

In [44]:
# ---- STEP 1: LOAD DATA ----
file_path = os.path.join(output_dir, "features_insurance_data.csv")
df = pd.read_csv(file_path)

In [45]:
df.head()

Unnamed: 0,customer_id,age,annual_premium,claims_count,churn,policy_type_home,policy_type_life
0,1,0.321429,0.419226,0.25,0,False,False
1,2,0.857143,0.485536,0.75,0,True,False
2,3,0.178571,0.705607,1.0,0,False,False
3,4,0.285714,0.663329,0.75,0,False,True
4,5,0.678571,0.655541,0.75,0,False,False


In [46]:
# Define feature columns (X) and target variable (y)
X = df.drop(columns=["churn","customer_id"])
y = df["churn"]

In [47]:
# ---- STEP 2: TRAIN-TEST SPLIT (80/20) ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")

Training Set: (800, 5), Testing Set: (200, 5)


In [48]:
# ---- STEP 3: HANDLE CLASS IMBALANCE USING SMOTE ----
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"Resampled Training Set: {X_train_resampled.shape}")

Resampled Training Set: (1274, 5)


In [49]:
# ---- STEP 4: HYPERPARAMETER TUNING USING CROSS-VALIDATION ----
# Define the hyperparameter grid
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],  # Regularization strength
    "solver": ["liblinear", "lbfgs"]  # Solver methods
}

In [50]:
# Define the logistic regression model
log_reg = LogisticRegression(max_iter=500, random_state=42)

# Perform GridSearchCV with 5-fold stratified cross-validation
grid_search = GridSearchCV(log_reg, param_grid, cv=StratifiedKFold(n_splits=5), scoring="f1", n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

In [51]:
# Get the best model from grid search
best_model = grid_search.best_estimator_
print(f"Best Model Parameters: {grid_search.best_params_}")

Best Model Parameters: {'C': 0.01, 'solver': 'liblinear'}


In [52]:
# ---- STEP 5: TRAIN THE FINAL MODEL ----
best_model.fit(X_train_resampled, y_train_resampled)

In [53]:
# ---- STEP 6: MAKE PREDICTIONS ----
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probability scores

In [54]:
# ---- STEP 7: EVALUATE MODEL PERFORMANCE ----
metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1 Score": f1_score(y_test, y_pred),
    "AUC-ROC": roc_auc_score(y_test, y_pred_proba),
}

In [55]:
# Print evaluation metrics
print("\nModel Performance Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Model Performance Metrics:
Accuracy: 0.4550
Precision: 0.2167
Recall: 0.6341
F1 Score: 0.3230
AUC-ROC: 0.5418


In [56]:
# ---- STEP 8: SAVE MODEL PREDICTIONS ----
predictions_df = pd.concat([X_test,pd.DataFrame({"Actual": y_test, 
                                                 "Predicted": y_pred,
                                                 "Predicted_Probability": y_pred_proba})],
                                                 axis=1
                            )
predictions_file = os.path.join(output_dir, "lr_model_predictions.csv")
predictions_df.to_csv(predictions_file, index=False)
print(f"Model Predictions Saved at: {predictions_file}")

Model Predictions Saved at: data/outputs/lr_model_predictions.csv


In [57]:
# ---- STEP 9: SAVE THE TRAINED MODEL ----
# model_file = os.path.join(output_dir, "logistic_regression_model.pkl")
# joblib.dump(best_model, model_file)
# print(f"Trained Model Saved at: {model_file}")

# Save the model using pickle
model_filename = os.path.join(output_dir, "logistic_regression_model.pkl")
with open(model_filename, "wb") as f:
    pickle.dump(best_model, f)

print("\nModel Training Completed Successfully!")


Model Training Completed Successfully!
