# In [None]:  (Jupyter cell marker left over from the notebook export)
# telco_churn_analysis.py

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

from imblearn.over_sampling import SMOTE


# 1. Load dataset
# Path to the Telco customer churn CSV (Kaggle input directory layout).
data_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(data_path)


# 2. Initial data inspection
print("Dataset shape (rows, columns):", df.shape)
print("\nSample data:")
print(df.head())

# Convert 'TotalCharges' to numeric, coercing errors to NaN.
# This runs BEFORE the missing-value report so that entries which are not
# valid numbers (and therefore become NaN here) are counted in that report.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

print("\nMissing values per column:")
print(df.isnull().sum())

print("\nNumerical columns description:")
print(df.describe())

print("\nCategorical columns description:")
print(df.describe(include='object'))

# Show first 5 unique values of each column for quick overview
for col in df.columns:
    print(f"{col}: {df[col].unique()[:5]}")


# 3. Data cleaning and preprocessing

# Service add-on columns. Besides 'Yes'/'No' they may carry a
# 'No internet service' / 'No phone service' level, which is treated as 'No'.
replace_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Kept for backward compatibility with code that refers to this list.
binary_cols = replace_cols + ['Churn']

# Encode the service flags as the STRINGS '0'/'1' rather than integers.
# A string -> string replace never changes the column dtype, so get_dummies
# below always one-hot encodes these columns and (with drop_first=True) emits
# exactly one '<col>_1' indicator per column. Those are the names the EDA and
# feature-selection sections look up. Replacing with integers instead leaves
# the resulting dtype, and therefore the column names, dependent on how the
# installed pandas version infers dtypes after `replace`.
service_flag_map = {
    'No internet service': '0',
    'No phone service': '0',
    'No': '0',
    'Yes': '1',
}
for col in replace_cols:
    df[col] = df[col].replace(service_flag_map)

# The target must stay a single integer column named 'Churn' (it is plotted
# and modelled under that name), so it is mapped and cast explicitly instead
# of going through get_dummies. The cast fails loudly if 'Churn' contains
# anything other than 'Yes'/'No'.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0}).astype(int)

# The customer identifier carries no predictive signal and would otherwise be
# expanded into one dummy column per customer.
df.drop('customerID', axis=1, inplace=True)

# One-hot encode every remaining text column, dropping the first level of each
# to avoid redundant indicators. dtype=int keeps the indicators as 0/1
# numbers regardless of the pandas default indicator dtype.
df = pd.get_dummies(df, drop_first=True, dtype=int)


# 4. EDA
# Class balance of the target.
sns.countplot(data=df, x='Churn')
plt.title('Churn Distribution')
plt.show()

# One box plot per continuous feature, split by churn outcome.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
for num_col in numeric_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x='Churn', y=num_col)
    plt.title(f'{num_col} vs Churn')
    plt.show()

# Indicator columns to compare against the target.
categorical_cols = [
    'gender_Male',
    'MultipleLines_1',
    'InternetService_Fiber optic',
    'OnlineSecurity_1',
    'TechSupport_1',
    'Contract_One year',
    'Contract_Two year',
]

# One bar plot per indicator: bar height is the aggregated 'Churn' value for
# each level of the indicator.
for cat_col in categorical_cols:
    plt.figure(figsize=(6, 4))
    sns.barplot(data=df, x=cat_col, y='Churn')
    plt.title(f'Churn rate by {cat_col}')
    plt.show()


# 5. Correlation and features
# Do not truncate columns when printing wide frames.
pd.set_option('display.max_columns', None)

# Pairwise correlations between all columns of the encoded frame.
corr_matrix = df.corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Reuse the matrix computed above instead of calling df.corr() a second time:
# df is not modified in between, so its 'Churn' column is the same result.
churn_corr = corr_matrix['Churn'].sort_values(ascending=False)
print("\nCorrelation with Churn:")
print(churn_corr)

# Hand-picked model inputs. Names with a '_<level>' suffix are indicator
# columns produced by pd.get_dummies during preprocessing.
selected_features = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
    'InternetService_Fiber optic',
    'InternetService_No',
    'PaymentMethod_Electronic check',
    'Contract_One year',
    'Contract_Two year',
    'PaperlessBilling_Yes',
    'OnlineSecurity_1',
    'TechSupport_1',
    'SeniorCitizen',
    'Partner_Yes',
    'Dependents_Yes',
    'StreamingTV_1',
    'StreamingMovies_1'
]

# 'TotalCharges' values that could not be parsed were coerced to NaN during
# inspection; drop those rows so this feature has no missing values.
df.dropna(subset=['TotalCharges'], inplace=True)

# Feature matrix and target vector used by every model below.
X = df[selected_features]
y = df['Churn']


# 6. Split dataset
# Seed NumPy's global RNG (unchanged from before) for any later code that
# draws from it.
np.random.seed(42)

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle explicitly, so the split no longer depends
#   on the state of the global RNG at the moment of the call.
# - stratify=y keeps the proportion of churners the same in the train and
#   test partitions, so metrics on the minority class are not skewed by an
#   unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# 7. Models

# --- Logistic Regression ---
# Fit on the training partition, then predict labels for the held-out rows.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Compute the four headline metrics first, then report them.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)

print("\nLogistic Regression Performance:")
print(f"Accuracy : {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall   : {lr_recall:.4f}")
print(f"F1 Score : {lr_f1:.4f}")

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
lr_class_labels = ['No Churn', 'Churn']
cm_lr = confusion_matrix(y_test, y_pred_lr)
lr_fig, lr_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=lr_class_labels,
    yticklabels=lr_class_labels,
    ax=lr_ax,
)
lr_ax.set_xlabel('Predicted')
lr_ax.set_ylabel('Actual')
lr_ax.set_title('Confusion Matrix - Logistic Regression')
plt.show()

print("\nClassification Report (Logistic Regression):")
lr_report = classification_report(y_test, y_pred_lr, target_names=lr_class_labels)
print(lr_report)


# --- Random Forest (class-weighted) ---
# class_weight='balanced' is passed so the forest re-weights the two classes.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Compute the four headline metrics first, then report them.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

print("\nRandom Forest Performance:")
print(f"Accuracy : {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall   : {rf_recall:.4f}")
print(f"F1 Score : {rf_f1:.4f}")

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
rf_class_labels = ['No Churn', 'Churn']
cm_rf = confusion_matrix(y_test, y_pred_rf)
rf_fig, rf_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=rf_class_labels,
    yticklabels=rf_class_labels,
    ax=rf_ax,
)
rf_ax.set_xlabel('Predicted')
rf_ax.set_ylabel('Actual')
rf_ax.set_title('Confusion Matrix - Random Forest')
plt.show()

print("\nClassification Report (Random Forest):")
rf_report = classification_report(y_test, y_pred_rf, target_names=rf_class_labels)
print(rf_report)


# --- XGBoost ---
# Gradient-boosted trees evaluated with log-loss, using the same train/test
# partitions as the other models.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Compute the four headline metrics first, then report them.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)

print("\nXGBoost Performance:")
print(f"Accuracy : {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall   : {xgb_recall:.4f}")
print(f"F1 Score : {xgb_f1:.4f}")

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
xgb_class_labels = ['No Churn', 'Churn']
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
xgb_fig, xgb_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_xgb,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=xgb_class_labels,
    yticklabels=xgb_class_labels,
    ax=xgb_ax,
)
xgb_ax.set_xlabel('Predicted')
xgb_ax.set_ylabel('Actual')
xgb_ax.set_title('Confusion Matrix - XGBoost')
plt.show()

print("\nClassification Report (XGBoost):")
xgb_report = classification_report(y_test, y_pred_xgb, target_names=xgb_class_labels)
print(xgb_report)


# GridSearch for Random Forest
# Base estimator whose hyper-parameters are filled in by the search.
rf_base = RandomForestClassifier(random_state=42)

# 2 x 3 x 2 x 2 x 1 = 24 candidate configurations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced'],
}

# Score every candidate by 5-fold cross-validated F1 on the training data,
# using all available cores.
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("\nBest Parameters (GridSearch):")
print(grid_search.best_params_)

# Evaluate the winning configuration on the held-out rows.
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)

best_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
best_rf_precision = precision_score(y_test, y_pred_best_rf)
best_rf_recall = recall_score(y_test, y_pred_best_rf)
best_rf_f1 = f1_score(y_test, y_pred_best_rf)

print("\nBest Random Forest Performance (GridSearch):")
print(f"Accuracy : {best_rf_accuracy:.4f}")
print(f"Precision: {best_rf_precision:.4f}")
print(f"Recall   : {best_rf_recall:.4f}")
print(f"F1 Score : {best_rf_f1:.4f}")

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
best_rf_class_labels = ['No Churn', 'Churn']
cm_best_rf = confusion_matrix(y_test, y_pred_best_rf)
best_rf_fig, best_rf_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_best_rf,
    annot=True,
    fmt='d',
    cmap='BuPu',
    xticklabels=best_rf_class_labels,
    yticklabels=best_rf_class_labels,
    ax=best_rf_ax,
)
best_rf_ax.set_xlabel('Predicted')
best_rf_ax.set_ylabel('Actual')
best_rf_ax.set_title('Confusion Matrix - Tuned Random Forest')
plt.show()

print("\nClassification Report (Tuned Random Forest):")
best_rf_report = classification_report(
    y_test, y_pred_best_rf, target_names=best_rf_class_labels
)
print(best_rf_report)


# SMOTE for balancing
# Resample the TRAINING partition only; the test rows are left untouched so
# the evaluation below is on original data.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train, y_train)

print("\nShape before SMOTE:", X_train.shape)
print("Shape after SMOTE :", X_balanced.shape)

# class_weight=None: no class re-weighting is applied on top of the
# resampled training data.
smote_rf_kwargs = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight=None,
    random_state=42,
)
rf_smote = RandomForestClassifier(**smote_rf_kwargs)
rf_smote.fit(X_balanced, y_balanced)
y_pred_smote = rf_smote.predict(X_test)

# Compute the four headline metrics first, then report them.
smote_accuracy = accuracy_score(y_test, y_pred_smote)
smote_precision = precision_score(y_test, y_pred_smote)
smote_recall = recall_score(y_test, y_pred_smote)
smote_f1 = f1_score(y_test, y_pred_smote)

print("\nRandom Forest Performance with SMOTE Balanced Data:")
print(f"Accuracy : {smote_accuracy:.4f}")
print(f"Precision: {smote_precision:.4f}")
print(f"Recall   : {smote_recall:.4f}")
print(f"F1 Score : {smote_f1:.4f}")

print("\nClassification Report (Random Forest + SMOTE):")
smote_report = classification_report(
    y_test, y_pred_smote, target_names=['No Churn', 'Churn']
)
print(smote_report)


# Feature importance plot
# Pair each input column with the importance reported by the SMOTE-trained
# forest, ordered from most to least important.
importances = rf_smote.feature_importances_
features = X.columns

importance_table = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance_df.head(10))

# Horizontal bars for the 15 highest-ranked features.
top_features = feature_importance_df.head(15)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title("Top 15 Feature Importances (Random Forest with SMOTE)")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
