# Customer Churn Prediction
This notebook demonstrates a simple churn prediction pipeline using a public Telco dataset.
It downloads the dataset from a public raw GitHub URL when run locally.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from src.data_preprocessing import load_data_from_url, basic_cleaning
from src.model_training import train_logistic_regression, train_random_forest, evaluate_model, save_model
from src.visualization import plot_churn_distribution, plot_feature_importance
import os


## 1) Download dataset
The notebook downloads the dataset from a public raw GitHub URL. If the download is not possible, place the CSV at `data/Telco-Customer-Churn.csv` yourself and the notebook will load it from there.

In [None]:
# Public Telco dataset raw URL (IBM sample on GitHub)
DATA_URL = 'https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv'
data_path = 'data/Telco-Customer-Churn.csv'
os.makedirs('data', exist_ok=True)

# Prefer a CSV that is already in data/ (placed manually, or presumably saved
# there by an earlier run via save_path). This lets the notebook run offline
# and avoids re-downloading on every execution. Delete the file to force a
# fresh download.
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
    print('Using local dataset at', data_path)
else:
    try:
        df = load_data_from_url(DATA_URL, save_path=data_path)
    except Exception as e:
        # There is no local copy to fall back on, so tell the user what to do
        # and re-raise: the rest of the notebook cannot run without `df`.
        print('Failed to download dataset automatically. Please download manually and place in data/ folder.', e)
        raise
    print('Dataset downloaded to', data_path)


In [None]:
# Sanity check: show the dimensions of the loaded data (rows, columns)
df.shape


In [None]:
# Basic cleaning
df = basic_cleaning(df)
# Encode the target as 1 (churned) / 0 (stayed). The mapping is applied only
# while the column is still non-numeric: if this cell is re-run, or the target
# has already been encoded, pushing 0/1 through a 'Yes'/'No' dict would replace
# every value with NaN and break the stratified split later on.
if 'Churn' in df.columns and not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# Last expression of the cell: preview the first rows of the cleaned frame
df.head()


In [None]:
# Quick EDA: churn distribution
# plot_churn_distribution is the helper imported from src.visualization; it
# receives the cleaned frame, in which 'Churn' (when present) has been mapped
# to 1/0 by the cleaning cell.
plot_churn_distribution(df)


In [None]:
# Feature preparation: one-hot encode, split, then standardize

# Every object/category column is treated as categorical and one-hot encoded;
# drop_first=True drops one level per column.
cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)
df_enc = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Separate the target from the feature matrix
y = df_enc['Churn']
X = df_enc.drop(columns='Churn')

# 80/20 split with a fixed seed; stratify=y stratifies the split on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Train models
# The logistic regression is fit on the standardized features, the random
# forest on the unscaled ones.
log_model = train_logistic_regression(X_train_scaled, y_train)
rf_model = train_random_forest(X_train, y_train)

# Evaluate: each model is scored on the test split in the same representation
# (scaled / unscaled) it was trained on.
print('Logistic Regression:')
print(evaluate_model(log_model, X_test_scaled, y_test))
print('\nRandom Forest:')
print(evaluate_model(rf_model, X_test, y_test))

# Save models
os.makedirs('models', exist_ok=True)
save_model(rf_model, 'models/rf_model.joblib')
save_model(log_model, 'models/log_model.joblib')
# The logistic model was trained on scaler-transformed inputs, so the saved
# model is only usable together with this exact fitted scaler. Persist it too.
# NOTE(review): assumes save_model serializes any fitted estimator
# (joblib-style dump of the object to the given path) - confirm.
save_model(scaler, 'models/scaler.joblib')


In [None]:
# Feature importance (RF)
# Best-effort: a plotting failure is reported but does not stop the notebook.
try:
    # Column names of the unscaled feature matrix the random forest was fit on
    feats = X.columns.tolist()
    imps = rf_model.feature_importances_
    # Make sure the output directory exists even if the model-saving step of
    # the training cell did not run to completion.
    os.makedirs('models', exist_ok=True)
    plot_feature_importance(feats, imps, top_n=20, save_path='models/feature_importance.png')
except Exception as e:
    print('Could not plot feature importance.', e)


## Conclusion
- This notebook provides a baseline churn prediction pipeline. It can be improved with hyperparameter tuning, feature engineering, and cross-validation.