# Bank Customer Churn EDA

Exploratory analysis of the `Churn_Modelling.csv` dataset to surface feature distributions, churn drivers, and data quality concerns.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every figure in this notebook.
sns.set_theme(style="whitegrid")

# Location of the raw churn extract, written relative to the working directory
# and resolved to an absolute path so the displayed value is unambiguous.
_relative_csv = Path("../../data/raw/bank_churn/Churn_Modelling.csv")
DATA_PATH = _relative_csv.resolve()
DATA_PATH

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Read the whole CSV into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

In [None]:
# Split the columns by dtype: everything pandas treats as a number vs. the rest.
# NOTE(review): integer-coded flags and ID columns presumably land in the
# numeric bucket as well — confirm before treating that list as continuous
# features.
numeric_cols = df.select_dtypes(include="number").columns.to_list()
categorical_cols = df.select_dtypes(exclude="number").columns.to_list()
df_shape = df.shape

# Print a short size report, then preview the numeric columns.
for label, value in (
    ("Shape:", df_shape),
    ("Numeric columns:", len(numeric_cols)),
    ("Categorical columns:", len(categorical_cols)),
):
    print(label, value)
df.loc[:, numeric_cols].head()

In [None]:
# Descriptive statistics for the numeric columns, transposed so that each
# feature is a row and each statistic a column.
numeric_frame = df.loc[:, numeric_cols]
numeric_summary = numeric_frame.describe().transpose()
numeric_summary.head()

In [None]:
# Fraction of missing values per column, largest first. Only columns with at
# least one missing value are displayed.
null_counts = df.isna().sum()
missing = (null_counts / len(df)).sort_values(ascending=False)
missing.loc[missing > 0]

In [None]:
# Tabulate the binary `Exited` target and relabel it for plotting.
# `value_counts()` orders its result by descending frequency, so the label
# order would otherwise depend on the data. `sort_index()` pins it to ascending
# code (0 before 1) so the positional colours below always match their labels.
status_labels = {0: "Retained", 1: "Churned"}
status_colors = ["#2b8cbe", "#de2d26"]  # applied positionally: retained, churned

churn_counts = df['Exited'].value_counts().sort_index().rename(status_labels)
churn_ratio = churn_counts / churn_counts.sum()  # share of customers per status

fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: absolute number of customers per status.
churn_counts.plot(kind='bar', ax=ax[0], color=status_colors)
ax[0].set_title("Customer Count by Status")
ax[0].set_ylabel("Count")

# Right panel: the same split expressed as a proportion of all customers.
churn_ratio.plot(kind='bar', ax=ax[1], color=status_colors)
ax[1].set_title("Status Proportion")
ax[1].set_ylabel("Ratio")

plt.tight_layout()
plt.show()

# Echo the underlying numbers beneath the figure.
churn_counts, churn_ratio

In [None]:
# Features compared between the two `Exited` classes.
selected_features = ['Age', 'Balance', 'EstimatedSalary', 'CreditScore']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# One filled density panel per feature. `common_norm=False` normalises each
# class on its own, so the curve shapes are comparable despite class imbalance.
kde_options = dict(hue='Exited', common_norm=False, fill=True)
for panel, feature in zip(axes.ravel(), selected_features):
    sns.kdeplot(data=df, x=feature, ax=panel, **kde_options)
    panel.set_title(f'Distribution by Churn Status: {feature}')

plt.tight_layout()
plt.show()

In [None]:
# Customer counts per country, with one bar per `Exited` class.
geo_fig, geo_ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='Geography', hue='Exited', ax=geo_ax)
geo_ax.set_title('Geography vs Churn Status')
plt.xticks(rotation=30)  # tilt the category labels
plt.tight_layout()
plt.show()

## Candidate Data Quality Checks

- `CustomerId` must be unique.
- `Exited` should only take {0, 1}.
- Mandatory demographic fields must contain no missing values.
- `CreditScore`, `Balance`, and `EstimatedSalary` should stay within the realistic ranges observed in this dataset.
- Categorical domains (`Geography`, `Gender`) should remain consistent with the training taxonomy.