# Customer Churn Prediction - Jupyter Notebook

**Business Question:** Can we predict which customers are likely to churn?

In [None]:
# 1️⃣ Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# 2️⃣ Load CSV
# Read the raw churn dataset into `df`; the path is relative to the notebook's
# working directory.
csv_path = 'data/telco_churn.csv'
df = pd.read_csv(csv_path)
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
# 3️⃣ Clean Data
# NOTE: run this cell once per load of `df`. `.map` turns every value that is
# not a key of the mapping into NaN, so running the cell a second time on the
# already-encoded columns would blank out `gender` and `Churn`; re-run the
# load cell first if you need to repeat the cleaning.

# Entries in TotalCharges that cannot be parsed as numbers become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Impute the NaNs with the column median. The result is assigned back rather
# than calling fillna(inplace=True) on the selected column: that form acts on
# an intermediate object, triggers a chained-assignment warning in pandas 2.x,
# and does not update `df` at all under Copy-on-Write.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
# Encode the two binary text columns as 0/1 integers.
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
df.head()

In [None]:
# 4️⃣ Explore: Tenure Distribution by Churn
# Stacked histogram of tenure, with each bar split by the Churn value.
tenure_ax = sns.histplot(data=df, x='tenure', hue='Churn', multiple='stack')
# Label the figure through the Axes that seaborn returns.
tenure_ax.set_title('Tenure Distribution by Churn')
tenure_ax.set_xlabel('Tenure (months)')
tenure_ax.set_ylabel('Number of Customers')
plt.show()

In [None]:
# Explore: Monthly Charges by Churn
# One box per Churn value, showing the spread of MonthlyCharges in each group.
charges_ax = sns.boxplot(data=df, x='Churn', y='MonthlyCharges')
charges_ax.set(
    title='Monthly Charges vs Churn',
    xlabel='Churn',
    ylabel='Monthly Charges ($)',
)
plt.show()

In [None]:
# 5️⃣ Statistical Test: T-test on Tenure
# Compare mean tenure between customers with Churn == 1 and Churn == 0.
churned = df.loc[df['Churn'] == 1, 'tenure']
retained = df.loc[df['Churn'] == 0, 'tenure']
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share a common variance (nothing in this notebook establishes that).
t_stat, p_value = ttest_ind(churned, retained, equal_var=False)
# Use a general format so very small p-values print in scientific notation
# instead of being rounded to "0.0000".
print(f"T-test p-value for tenure difference = {p_value:.4g}")

In [None]:
# 6️⃣ Build Logistic Regression Model
# Predict Churn (0/1) from a small set of numeric customer attributes.
features = ['gender', 'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
X = df[features]
y = df['Churn']
# Hold out 30% of the rows for evaluation. stratify=y keeps the proportion of
# churned vs. retained customers the same in both splits, so the test metrics
# are not skewed by an unlucky class mix; random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# max_iter is raised above the library default to give the solver more
# iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Evaluate on the held-out rows only.
y_pred = model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# 7️⃣ Predict Risk Scores for Automation
# Score every row of X with the fitted model. Column 1 of predict_proba is the
# probability of the second class, i.e. Churn == 1 after the 0/1 encoding.
churn_probability = model.predict_proba(X)[:, 1]
df['RiskScore'] = churn_probability
# Show the ten customers with the highest predicted churn probability.
risk_ranking = df[['customerID', 'RiskScore']].sort_values(by='RiskScore', ascending=False)
risk_ranking.head(10)