# 01 — Exploratory Data Analysis (EDA)

Explore data quality and churn patterns.

In [None]:
# If running on Colab and you keep data on Drive, you can mount Drive:
# from google.colab import drive
# drive.mount('/content/drive')

import pandas as pd
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
data_full = PROJECT_ROOT.joinpath("data", "WA_Fn-UseC_-Telco-Customer-Churn.csv")
data_sample = PROJECT_ROOT.joinpath("data", "sample_telco.csv")

# Prefer the full dataset; fall back to the sample file when it is absent.
if data_full.exists():
    DATA_PATH = data_full
else:
    DATA_PATH = data_sample
df = pd.read_csv(DATA_PATH)

print("Loaded:", DATA_PATH)
print("Shape:", df.shape)
df.head()


In [None]:
import matplotlib.pyplot as plt

# Shared plotting defaults for every figure drawn below.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "axes.grid": True,
        "font.size": 11,
    }
)

# Column dtypes and non-null counts.
df.info()


In [None]:
# The 15 columns with the most missing values, largest count first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(15)
)

In [None]:
# Absolute number of rows per churn label.
counts = df["Churn"].value_counts()

plt.figure()
plt.bar(counts.index.astype(str), counts.values)
plt.gca().set(
    title="Churn Distribution (Counts)",
    xlabel="Churn",
    ylabel="Count",
)
plt.show()

# Same distribution expressed as percentages of all rows.
churn_pct = 100 * df["Churn"].value_counts(normalize=True)
churn_pct


In [None]:
import numpy as np

def churn_rate_by_category(dataframe, col):
    """Return the mean churn rate for each group of ``col``, highest first.

    The ``Churn`` column is mapped ("No" -> 0, "Yes" -> 1) into a
    ``Churn_bin`` column on a copy of ``dataframe``; the input frame is
    left unmodified.
    """
    churn_flag = dataframe["Churn"].map({"No": 0, "Yes": 1})
    with_flag = dataframe.assign(Churn_bin=churn_flag)
    rate_per_group = with_flag.groupby(col)["Churn_bin"].mean()
    return rate_per_group.sort_values(ascending=False)

# One bar chart of churn rate per category, for each column that is present.
for col in ("Contract", "PaymentMethod", "InternetService"):
    if col not in df.columns:
        continue

    rates = churn_rate_by_category(df, col)

    plt.figure()
    plt.bar(rates.index.astype(str), rates.values)
    plt.gca().set(
        title=f"Churn Rate by {col}",
        xlabel=col,
        ylabel="Churn Rate",
    )
    plt.xticks(rotation=15)
    plt.show()


In [None]:
if "tenure" in df.columns:
    # Work on a copy carrying a 0/1 churn flag ("No" -> 0, "Yes" -> 1).
    tmp = df.assign(Churn_bin=df["Churn"].map({"No": 0, "Yes": 1}))

    # Tenure values split by churn outcome.
    tenure_no = tmp.loc[tmp["Churn_bin"] == 0, "tenure"].values
    tenure_yes = tmp.loc[tmp["Churn_bin"] == 1, "tenure"].values

    # Overlay the two histograms on a single figure.
    plt.figure()
    plt.hist(tenure_no, bins=30, alpha=0.7, label="Churn = No")
    plt.hist(tenure_yes, bins=30, alpha=0.7, label="Churn = Yes")
    plt.gca().set(
        title="Tenure Distribution by Churn",
        xlabel="Tenure",
        ylabel="Count",
    )
    plt.legend()
    plt.show()
