# Telco Churn — mini project (Hex / Python)

**Goal:** quick churn analysis + simple ML churn prediction (student-style).  
File: `cell2cell_duke.csv` • Label: `Churn` (Yes/No → 1/0)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report, RocCurveDisplay


## 1) Load data

In [None]:
# Location of the raw CSV export; point this somewhere else if needed.
DATA_PATH = "cell2cell_duke.csv"

# Load the whole file into memory and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("Shape:", df.shape)

# Last expression of the cell: the notebook renders the first five rows.
df.head(5)

## 2) Clean + label

Keep it simple:
- drop rows with missing `Churn`
- create `churn_label`
- drop `Churn` and ID columns like `CustomerID` if present


In [None]:
# Empty or whitespace-only cells count as missing values from here on.
df = df.replace(r"^\s*$", np.nan, regex=True).copy()

# A row without a churn answer cannot be labelled, so it is discarded.
df = df.dropna(subset=["Churn"])

# Binary target: 1 when the (whitespace-trimmed) answer is exactly "Yes",
# 0 for every other value.
is_yes = df["Churn"].astype(str).str.strip() == "Yes"
df["churn_label"] = is_yes.astype(int)

# Remove the raw label and the identifier column (only those that exist).
drop_cols = [c for c in ("Churn", "CustomerID") if c in df.columns]
df = df.drop(columns=drop_cols)

print("Rows after cleaning:", len(df))
print("Churn rate:", round(df["churn_label"].mean(), 4))

# Last expression of the cell: the notebook renders the class counts.
df["churn_label"].value_counts()

## 3) Quick EDA (only essentials)

- churn distribution
- churn rate by 1–2 categorical segments (auto-picked)
- 1–2 numeric variables compared by churn (auto-picked)


In [None]:
# Class balance: number of rows per churn_label value, ordered 0 then 1.
label_counts = df["churn_label"].value_counts().sort_index()

plt.figure()
label_counts.plot(kind="bar")
plt.xticks([0, 1], ["No churn (0)", "Churn (1)"], rotation=0)
plt.ylabel("count")
plt.title("Churn distribution")
plt.show()

# Pick at most two segment columns: object-dtype columns that are more than
# 60% populated and have between 2 and 30 distinct values.
cat_cols = [col for col in df.columns if df[col].dtype == "object"]

# (column, share of non-missing rows, number of distinct non-missing values)
profiled = [
    (col, df[col].notna().mean(), df[col].nunique(dropna=True))
    for col in cat_cols
]
good_cats = [
    entry for entry in profiled
    if entry[1] > 0.6 and 2 <= entry[2] <= 30
]
# Best coverage first; among equal coverage, fewer distinct values first.
good_cats.sort(key=lambda entry: (-entry[1], entry[2]))
del good_cats[2:]

for col, _coverage, _cardinality in good_cats:
    # Row count and mean churn per segment value, largest segments first.
    per_segment = df.groupby(col)["churn_label"].agg(n="size", churn_rate="mean")
    seg = per_segment.sort_values("n", ascending=False).head(10)

    print("\nSegment churn (top 10):", col)
    display(seg)

    plt.figure()
    seg["churn_rate"].plot(kind="bar")
    plt.ylabel("churn rate")
    plt.title(f"Churn rate by {col} (top 10 by count)")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

# numeric columns: pick top 2 by absolute correlation with churn_label
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != "churn_label"]
cors = []
for c in num_cols:
    # Only consider columns that are more than 60% populated.
    if df[c].notna().mean() > 0.6:
        # Correlation on the rows where both the feature and the label exist.
        cor = df[[c, "churn_label"]].dropna().corr().iloc[0, 1]
        # Skip undefined correlations (e.g. a column without any variance).
        if not np.isnan(cor):
            cors.append((c, abs(cor)))
# From here on `cors` holds only the names of the (up to) two strongest columns.
cors = [c for c, _ in sorted(cors, key=lambda x: -x[1])[:2]]

for c in cors:
    x0 = df.loc[df["churn_label"] == 0, c].dropna()
    x1 = df.loc[df["churn_label"] == 1, c].dropna()
    plt.figure()
    plt.boxplot([x0, x1])
    # Label the two boxes via xticks (boxes sit at positions 1 and 2 by
    # default) instead of boxplot's `labels=` keyword, which newer Matplotlib
    # releases deprecate in favour of `tick_labels=`; xticks works on both.
    plt.xticks([1, 2], ["No churn", "Churn"])
    plt.title(f"{c} by churn")
    plt.ylabel(c)
    plt.show()


## 4) ML (LogReg baseline + Decision Tree)

We’ll do:
- basic preprocessing (median for numeric, most frequent for categorical + one-hot)
- train/test split 80/20
- evaluate with AUC + accuracy + confusion matrix + classification report + ROC curve


In [None]:
X = df.drop(columns=["churn_label"])
y = df["churn_label"]

# Stratify on the label so train and test keep the same churn proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Split columns into "numeric" vs. "everything else" instead of testing for
# `dtype == "object"`: a text column stored with a dedicated string or
# category dtype would otherwise be sent to the median imputer and fail.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
_numeric = set(num_cols)
cat_cols = [c for c in X.columns if c not in _numeric]


def _make_preprocess():
    """Return a new, unfitted ColumnTransformer for the current feature set.

    Numeric columns get median imputation; all remaining columns get
    most-frequent imputation followed by one-hot encoding, with categories
    unseen during fit ignored at transform time.
    """
    return ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", Pipeline(steps=[
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ]), cat_cols),
        ]
    )


# Each pipeline gets its own transformer instance. Sharing one object between
# both pipelines would make them point at the same fitted state, so refitting
# either model would silently change the other's preprocessing.
preprocess = _make_preprocess()

# NOTE(review): numeric features are imputed but not scaled; if
# LogisticRegression reports a convergence warning, consider adding scaling
# or raising max_iter.
lr = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=300))
])

tree = Pipeline(steps=[
    ("prep", _make_preprocess()),
    ("clf", DecisionTreeClassifier(max_depth=6, min_samples_leaf=50, random_state=42))
])

lr.fit(X_train, y_train)
tree.fit(X_train, y_train)


In [None]:
def evaluate(name, model):
    """Print a short test-set report for a fitted classifier.

    Uses the notebook-level ``X_test`` / ``y_test``. The second column of
    ``model.predict_proba`` is used as the score, and scores of 0.5 or more
    are turned into a predicted label of 1.

    Args:
        name: Heading printed above the metrics.
        model: Fitted estimator exposing ``predict_proba``.

    Returns:
        The array of scores (second ``predict_proba`` column) for ``X_test``.
    """
    scores = model.predict_proba(X_test)[:, 1]
    hard_labels = (scores >= 0.5).astype(int)

    # Compute every metric before printing anything, so a failing metric
    # does not leave a half-printed report.
    auc_value, acc_value, conf_mat = (
        roc_auc_score(y_test, scores),
        accuracy_score(y_test, hard_labels),
        confusion_matrix(y_test, hard_labels),
    )

    print(f"\n{name}")
    print("AUC:", round(auc_value, 4), "| Accuracy:", round(acc_value, 4))
    print("Confusion matrix:\n", conf_mat)
    print(classification_report(y_test, hard_labels, digits=3))
    return scores

# Print the metric report for each model and keep the test-set scores.
p_lr = evaluate("Logistic Regression", lr)
p_tree = evaluate("Decision Tree", tree)

# Draw both ROC curves on one shared Axes. Without an explicit `ax`,
# RocCurveDisplay.from_predictions opens a new figure on every call, which
# leaves an empty figure plus two separate plots instead of one comparison.
fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, p_lr, name="LogReg", ax=ax)
RocCurveDisplay.from_predictions(y_test, p_tree, name="DecisionTree", ax=ax)
ax.set_title("ROC curves")
plt.show()


## 5) Conclusion (short)

- The overall churn rate of this dataset is the `Churn rate:` value printed in section 2.
- We tested 2 simple models (LogReg + Decision Tree).
- If the Decision Tree gives a better AUC, we keep it; otherwise LogReg is fine.
- If performance is only “a bit better than chance” (AUC not far above 0.5), that’s still normal without heavy feature engineering.
