In [1]:
# Parameters
test_data_path = "test_dataset_bb67b6b30aa7405f9ee9762d2c05c3db.csv"
metrics_output_path = "metrics_output.json"


In [2]:
#Customer Retention Prediction Model - AMEX Shadowing Program


# Load libraries
import pandas as pd
from sklearn.ensemble import BaggingClassifier # Bagging Classifier
from sklearn.ensemble import RandomForestClassifier # Random Forest Classifier
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.linear_model import LogisticRegression # Import Logistic Regression Classifier
from sklearn.svm import SVC # Import SVM classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.feature_selection import RFECV


!pip install xgboost
!pip install imbalanced-learn
!pip install lightgbm catboost imbalanced-learn

!source myenv/bin/activate  # Unix/MacOS

!pip install numpy pandas scikit-learn xgboost lightgbm catboost imbalanced-learn

!pip install papermill

!pip install flask nbformat papermill lightgbm imbalanced-learn scikit-learn


# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing and modeling
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Suppress warnings (optional)
import warnings
warnings.filterwarnings('ignore')

# -------------------------
# Parameters
test_data_path = 'Churn_Modelling_Test.csv'  # Default value, overridden by papermill
metrics_output_path = 'metrics_output.json'  # Default value, overridden by papermill

# Feature Engineering Function
def add_features(data):
    data['BalanceSalaryRatio'] = data['Balance'] / (data['EstimatedSalary'] + 1)
    data['CreditScoreAgeRatio'] = data['CreditScore'] / (data['Age'] + 1)
    data['TenureByAge'] = data['Tenure'] / (data['Age'] + 1)
    data['AgeBalanceInteraction'] = data['Age'] * data['Balance']
    data['HasCrCardAndActive'] = data['HasCrCard'] * data['IsActiveMember']
    return data

# Load datasets
train_data = pd.read_csv('Churn_Modelling_Train.csv')
test_data = pd.read_csv(test_data_path)  # Use the parameter for test data path

# Add features
train_data = add_features(train_data)
test_data = add_features(test_data)

# Preprocess data function

# def preprocess_data(data):
#     # Keep CustomerId and Surname separately
#     # customer_info = data[['CustomerId', 'Surname']].copy()
    
#     # Drop unnecessary columns
#     data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, errors='ignore')
    
#     X = data.drop('Exited', axis=1)
#     y = data['Exited']
    
#     return X, y

# def preprocess_data(data):
#     # Keep CustomerId and Surname separately
#     customer_info = data[['CustomerId', 'Surname']].copy()
    
#     # Drop unnecessary columns
#     data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, errors='ignore')
    
#     X = data.drop('Exited', axis=1)
#     y = data['Exited']
    
#     return X, y, customer_info

# # X_train_raw, y_train_raw = preprocess_data(train_data)
# # X_test_raw, y_test = preprocess_data(test_data)

# # Preprocess data
# X_train_raw, y_train_raw, _ = preprocess_data(train_data)  # The underscore (_) ignores the third returned value for training data
# X_test_raw, y_test, test_customers = preprocess_data(test_data)  # Assign the third returned value to test_customers

# Preprocess data function
def preprocess_data(data):
    # Keep CustomerId and Surname separately
    customer_info = data[['CustomerId', 'Surname']].copy()
    
    # Drop unnecessary columns
    data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, errors='ignore')
    
    X = data.drop('Exited', axis=1)
    y = data['Exited']
    
    return X, y, customer_info

# Preprocess data
X_train_raw, y_train_raw, _ = preprocess_data(train_data)  # Ignore customer_info for training data
X_test_raw, y_test, test_customers = preprocess_data(test_data)  # Capture customer_info for test data



# Define categorical and numerical columns
categorical_cols = ['Geography', 'Gender']
numerical_cols = [col for col in X_train_raw.columns if col not in categorical_cols]

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Bundle preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model (LightGBM)
lgb_classifier = lgb.LGBMClassifier(random_state=42, class_weight='balanced')

# Simplified hyperparameter grid
param_dist = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__num_leaves': [31, 50],
    'classifier__min_child_samples': [20, 30],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0],
}

# Custom scorer
f1_scorer = make_scorer(f1_score)

# Pipeline
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('classifier', lgb_classifier)
])

# Cross-validation strategy
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Randomized Search
n_iter_search = 20  # Number of parameter settings that are sampled
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=cv,
    scoring=f1_scorer,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Fit the model
random_search.fit(X_train_raw, y_train_raw)

# Best estimator
best_model = random_search.best_estimator_
print(f"Best parameters for LightGBM: {random_search.best_params_}")
print(f"Best cross-validated F1 score: {random_search.best_score_:.4f}")

# Transform the test data using the fitted preprocessor from the pipeline
X_test_transformed = best_model.named_steps['preprocessor'].transform(X_test_raw)

# Predict probabilities on test data
y_probs = best_model.named_steps['classifier'].predict_proba(X_test_transformed)[:, 1]

# Optimize threshold
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores = [f1_score(y_test, (y_probs >= t).astype(int)) for t in thresholds]
optimal_threshold = thresholds[np.argmax(f1_scores)]
max_f1_score = max(f1_scores)

print(f"\nOptimal Threshold: {optimal_threshold:.2f}")
print(f"Max F1 Score on Test Data: {max_f1_score:.4f}")

# Final predictions
y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

test_customers['Exited_Predicted'] = y_pred_optimal

# Evaluate the model
accuracy_test = accuracy_score(y_test, y_pred_optimal)
precision_test = precision_score(y_test, y_pred_optimal)
recall_test = recall_score(y_test, y_pred_optimal)
f1_test = f1_score(y_test, y_pred_optimal)

print(f"\nOptimized Model Performance on Test Data:")
print(f"Accuracy: {accuracy_test:.4f}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1_score: {f1_test:.4f}")

# Save the metrics to a JSON file
metrics = {
    'Accuracy': accuracy_test,
    'Precision': precision_test,
    'Recall': recall_test,
    'F1_score': f1_test
}

import json

with open(metrics_output_path, 'w') as f:
    json.dump(metrics, f)


# Filter customers who exited
exited_customers = test_customers[test_customers['Exited_Predicted'] == 1][['CustomerId', 'Surname']]
exited_customers.to_json('exited_customers.json', orient = 'records')








'source' is not recognized as an internal or external command,
operable program or batch file.








Fitting 3 folds for each of 20 candidates, totalling 60 fits


[LightGBM] [Info] Number of positive: 6353, number of negative: 6353
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3352
[LightGBM] [Info] Number of data points in the train set: 12706, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Best parameters for LightGBM: {'classifier__subsample': 1.0, 'classifier__num_leaves': 50, 'classifier__n_estimators': 200, 'classifier__min_child_samples': 30, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.05, 'classifier__colsample_bytree': 0.8}
Best cross-validated F1 score: 0.6270



Optimal Threshold: 0.47
Max F1 Score on Test Data: 0.6172

Optimized Model Performance on Test Data:
Accuracy: 0.8400
Precision: 0.5785
Recall: 0.6615
F1_score: 0.6172
