In [1]:
# Customer Churn Prediction Jupyter Notebook

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from imblearn.over_sampling import SMOTE

In [2]:
# Step 2: Load Datasets
# The training and testing sets ship as two separate CSV files; both paths are
# relative to the notebook's working directory.
TRAIN_CSV = "customer_churn_dataset-training-master.csv"
TEST_CSV = "customer_churn_dataset-testing-master.csv"

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [3]:
# Step 3: Explore the Datasets
# Show the first rows and the column/dtype summary of each dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after each report.
print("Training Data Head:")
print(train_data.head())
print("\nTraining Data Info:")
train_data.info()
print("\nTesting Data Head:")
print(test_data.head())
print("\nTesting Data Info:")
test_data.info()

Training Data Head:
   CustomerID   Age  Gender  Tenure  Usage Frequency  Support Calls  \
0         2.0  30.0  Female    39.0             14.0            5.0   
1         3.0  65.0  Female    49.0              1.0           10.0   
2         4.0  55.0  Female    14.0              4.0            6.0   
3         5.0  58.0    Male    38.0             21.0            7.0   
4         6.0  23.0    Male    32.0             20.0            5.0   

   Payment Delay Subscription Type Contract Length  Total Spend  \
0           18.0          Standard          Annual        932.0   
1            8.0             Basic         Monthly        557.0   
2           18.0             Basic       Quarterly        185.0   
3            7.0          Standard         Monthly        396.0   
4            8.0             Basic         Monthly        617.0   

   Last Interaction  Churn  
0              17.0    1.0  
1               6.0    1.0  
2               3.0    1.0  
3              29.0    1.0  
4    

In [4]:
# Step 4: Check for Missing Values
# Print the number of null entries per column, first for the training set and
# then for the testing set.
for dataset_label, dataset in (("Training", train_data), ("Testing", test_data)):
    print(f"\nMissing Values in {dataset_label} Data:")
    print(dataset.isnull().sum())


Missing Values in Training Data:
CustomerID           1
Age                  1
Gender               1
Tenure               1
Usage Frequency      1
Support Calls        1
Payment Delay        1
Subscription Type    1
Contract Length      1
Total Spend          1
Last Interaction     1
Churn                1
dtype: int64

Missing Values in Testing Data:
CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64


In [5]:
# Step 5: Handle Missing Values (if any)
# Rows without a Churn label are dropped rather than imputed: filling the
# target with a median would invent a label that was never observed.
# (The Step 4 output shows exactly one missing value in every training column,
# which is consistent with a single fully empty row.)
target_column = 'Churn'
train_data = train_data.dropna(subset=[target_column]).reset_index(drop=True)
test_data = test_data.dropna(subset=[target_column]).reset_index(drop=True)

# Remaining gaps in feature columns are filled with the median (numeric
# columns) or the mode (all other columns). The fill values are computed from
# the training data only and reused for the test data, so both sets are imputed
# consistently and no test-set statistics enter preprocessing.
fill_values = {}
for column in train_data.columns.drop(target_column):
    observed = train_data[column].dropna()
    if observed.empty:
        continue  # no observed values to derive a fill value from
    if pd.api.types.is_numeric_dtype(observed):
        fill_values[column] = observed.median()
    else:
        fill_values[column] = observed.mode()[0]

# fillna() with a dict returns a new frame that is assigned back. This replaces
# the chained `df[col].fillna(..., inplace=True)` pattern, which pandas
# deprecates and which does not update the parent frame under Copy-on-Write.
train_data = train_data.fillna(fill_values)
test_data = test_data.fillna(fill_values)

In [6]:
# Step 6: Encode Categorical Variables
# For every object-dtype column of the training data: fit a LabelEncoder on the
# training values, apply that same mapping to both datasets, and keep the
# fitted encoder under the column's name.
label_encoders = {}
categorical_columns = train_data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    le = LabelEncoder()
    le.fit(train_data[column])
    train_data[column] = le.transform(train_data[column])
    test_data[column] = le.transform(test_data[column])
    label_encoders[column] = le

In [7]:
# Step 7: Separate Features and Target
# CustomerID is a row identifier, not a customer attribute. Leaving it in the
# feature matrix lets the models key on ID values that carry no meaning for
# unseen customers, so it is excluded together with the target column.
non_feature_columns = ['CustomerID', 'Churn']

X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['Churn']

X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data['Churn']

In [8]:
# Step 8: Handle Class Imbalance
# Resample the training features and labels with SMOTE, using a fixed seed.
# Only the training split is passed in.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [9]:
# Step 9: Scale Features
# Learn the standardization parameters from the resampled training features
# only, then apply that same fitted transformation to both feature matrices.
scaler = StandardScaler()
scaler.fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 10.1: Train and Evaluate Random Forest
print("\nTraining Random Forest...")
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train_resampled)
y_pred_rf = rf_model.predict(X_test_scaled)

# Keep precision and recall on the test set for the results summary cell.
precision_rf, recall_rf = (
    precision_score(y_test, y_pred_rf),
    recall_score(y_test, y_pred_rf),
)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Training Random Forest...


In [None]:
# Step 10.2: Train and Evaluate Logistic Regression
print("\nTraining Logistic Regression...")
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train_resampled
)
y_pred_lr = lr_model.predict(X_test_scaled)

# Keep precision and recall on the test set for the results summary cell.
precision_lr, recall_lr = (
    precision_score(y_test, y_pred_lr),
    recall_score(y_test, y_pred_lr),
)
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

In [None]:
# Step 10.3: Train and Evaluate Decision Tree Classifier
print("\nTraining Decision Tree Classifier...")
# fit() returns the estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train_resampled)
y_pred_dt = dt_model.predict(X_test_scaled)

# Keep precision and recall on the test set for the results summary cell.
precision_dt, recall_dt = (
    precision_score(y_test, y_pred_dt),
    recall_score(y_test, y_pred_dt),
)
print("\nDecision Tree Classifier Classification Report:")
print(classification_report(y_test, y_pred_dt))

In [None]:
# Step 11: Results Summary
# Collect each model's test-set precision and recall into one record per model,
# then show them as a table.
results = [
    {"Model": model_name, "Precision": precision, "Recall": recall}
    for model_name, precision, recall in (
        ("Random Forest", precision_rf, recall_rf),
        ("Logistic Regression", precision_lr, recall_lr),
        ("Decision Tree Classifier", precision_dt, recall_dt),
    )
]
results_df = pd.DataFrame(results)
print("\nModel Performance Summary:")
print(results_df)

In [None]:
# Step 12: Best Model Evaluation
# The best model is the one with the highest recall; on a tie, max() keeps the
# first entry in `results`.
best_result = max(results, key=lambda row: row['Recall'])
best_model_name = best_result['Model']
print(f"\nBest Model: {best_model_name}")

# Look up the fitted model and its test predictions by name; any other name
# falls back to the decision tree.
candidates = {
    "Random Forest": (rf_model, y_pred_rf),
    "Logistic Regression": (lr_model, y_pred_lr),
}
final_model, y_pred_best = candidates.get(best_model_name, (dt_model, y_pred_dt))

# Plot the confusion matrix of the selected model on the test set.
best_confusion = confusion_matrix(y_test, y_pred_best)
sns.heatmap(best_confusion, annot=True, fmt='d', cmap='Blues')
plt.title(f'Confusion Matrix ({best_model_name})')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Step 13: Save the Outputs
# Attach the best model's predictions to a copy of the test set and write it to
# CSV. test_data itself is left unchanged: adding the column in place would make
# Step 7 pick up 'Predicted_Churn' as a feature whenever the cells are re-run,
# which no longer matches the columns the scaler and models were fitted on.
test_predictions = test_data.assign(Predicted_Churn=y_pred_best)
test_predictions.to_csv("test_predictions.csv", index=False)

In [None]:
# Step 14: Save Metrics Summary
# Write the per-model precision/recall table to CSV without the index column.
summary_path = "model_performance_summary.csv"
results_df.to_csv(summary_path, index=False)