<a href="https://colab.research.google.com/github/jyotheeswar42/customer-churn-predictor/blob/main/Churn_Prediction_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ## Final Project: Optimized Customer Churn Prediction

# First, let's pull in all the libraries we'll need.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report

# Read the Telco churn CSV directly from its public GitHub location.
url = 'https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv'
df = pd.read_csv(url)
print("Dataset loaded successfully.")

# Basic cleanup, written as one chain so every step yields a new frame:
#   1. drop customerID, an identifier with no predictive value;
#   2. coerce TotalCharges to numeric (entries that cannot be parsed become NaN);
#   3. drop every row that contains a NaN;
#   4. encode Churn as 0 for 'No' and 1 for 'Yes' so the model has a numeric target.
churn_codes = {'No': 0, 'Yes': 1}
df = (
    df.drop(columns=['customerID'])
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna()
    .assign(Churn=lambda frame: frame['Churn'].map(churn_codes))
)
print("Initial cleaning is done.")

# Feature engineering: derive two extra columns from the cleaned frame.
print("Starting feature engineering...")

# Bucket tenure into customer stages: New, Established and Loyal.
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bucket is (0, 12] and a tenure of exactly 0 would silently become NaN.
# include_lowest=True closes the left edge so tenure 0 is labelled 'New'.
bins = [0, 12, 48, 72]
labels = ['New', 'Established', 'Loyal']
df['tenure_group'] = pd.cut(df['tenure'], bins=bins, labels=labels, include_lowest=True)

# Count how many of the listed services each customer has set to 'Yes'.
# Comparing the whole sub-frame at once and summing across columns gives the
# same integer count as a row-wise apply, without a Python call per row.
service_cols = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df['total_services'] = (df[service_cols] == 'Yes').sum(axis=1)
print("New features have been created.")

# Separate the target (y) from the predictors (X).
y = df['Churn']
X = df.drop(columns=['Churn'])

# Partition the predictor columns by dtype: text/categorical columns are
# one-hot encoded, numeric columns are standardized.
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include='number').columns

# Bundle both transformations into a single preprocessing step.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that it did not see while fitting.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Hold out 20% of the rows for the final evaluation. The split happens before
# anything is fitted, and stratify=y keeps the churn ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The training labels are imbalanced (far more non-churners than churners),
# which can bias the model toward the majority class. Compute the
# negative-to-positive ratio and hand it to XGBoost as scale_pos_weight so
# churners carry proportionally more weight.
class_counts = y_train.value_counts()
# .loc makes it explicit that 0 and 1 are class labels, not positions.
scale_pos_weight = class_counts.loc[0] / class_counts.loc[1]
print(f"\nCalculated imbalance weight: {scale_pos_weight:.2f}")

# Full pipeline: preprocess the raw columns, then feed them to XGBoost.
# use_label_encoder is deliberately not passed: it is a deprecated argument
# that current XGBoost releases ignore, emitting a
# 'Parameters: { "use_label_encoder" } are not used.' warning on every fit.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight,  # class-imbalance weight computed above
        random_state=42
    ))
])

# Candidate XGBoost settings to search over. The 'classifier__' prefix routes
# each setting to the pipeline step named 'classifier'.
param_grid = {
    f'classifier__{setting}': candidates
    for setting, candidates in {
        'n_estimators': [100, 200],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2],
    }.items()
}

# Exhaustive search over all 2 * 3 * 2 = 12 combinations with 3-fold
# cross-validation (36 fits), ranked by ROC AUC and run on all available cores.
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
)

# Fit every candidate; this is the slow part of the notebook.
print("\nStarting hyperparameter tuning...")
grid_search.fit(X_train, y_train)

# Keep the pipeline refitted with the winning settings and report them.
best_model = grid_search.best_estimator_
print("\nBest parameters found: ", grid_search.best_params_)

# Final evaluation on the held-out test set.
# predict_proba returns one column per class; column 1 is the churn class.
class_probabilities = best_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
final_auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"\nFinal Optimized Model ROC AUC Score: {final_auc_score:.4f}")

# Hard 0/1 predictions for the per-class precision / recall / F1 breakdown.
y_pred_binary = best_model.predict(X_test)
final_report = classification_report(y_test, y_pred_binary)
print("\nFinal Classification Report:")
print(final_report)

Dataset loaded successfully.
Initial cleaning is done.
Starting feature engineering...
New features have been created.

Calculated imbalance weight: 2.76

Starting hyperparameter tuning...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}

Final Optimized Model ROC AUC Score: 0.8390

Final Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1033
           1       0.50      0.81      0.62       374

    accuracy                           0.73      1407
   macro avg       0.70      0.76      0.70      1407
weighted avg       0.80      0.73      0.74      1407



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
