# 02 - Exploratory Data Analysis

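This notebook explores the Telco customer churn dataset: the distribution of the target variable (`Churn`), how the numerical and categorical features relate to churn, and how the numerical features correlate with churn.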


In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults: seaborn 'whitegrid' theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the churn CSV (path is relative to this notebook's directory) and
# show its dimensions followed by the first rows.
csv_path = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
print(f"Dataset shape: {df.shape}")
df.head()


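## Target Variable Distribution

The churn counts, overall churn rate, and bar chart for the target variable are produced by the `# Churn distribution` code cell further down in this notebook.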


## Numerical Features Analysis


In [None]:
# Numerical features versus churn: box plots per churn class plus grouped
# summary statistics.

# Convert TotalCharges to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce'). The conversion is written back to `df`
# because later cells (e.g. the correlation analysis) rely on the numeric column.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# errors='coerce' hides unparseable entries, so report how many values are missing.
n_missing_total_charges = int(df['TotalCharges'].isna().sum())
print(f"TotalCharges values missing after numeric conversion: {n_missing_total_charges}")

# Numerical features distribution by churn
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(1, len(numerical_cols), figsize=(18, 5))
# np.ravel keeps the loop valid even if the list is reduced to one column
# (plt.subplots then returns a single Axes rather than an array).
for ax, col in zip(np.ravel(axes), numerical_cols):
    df.boxplot(column=col, by='Churn', ax=ax)
    ax.set_title(f'{col} by Churn')
    ax.set_xlabel('Churn')
    ax.set_ylabel(col)

# DataFrame.boxplot(by=...) adds a "Boxplot grouped by Churn" suptitle to the
# figure on every call; clear it so it does not crowd the per-panel titles.
fig.suptitle('')
plt.tight_layout()
plt.show()

# Summary statistics by churn. The table is the cell's last expression so the
# notebook renders it as a rich table instead of wrapped plain text.
print("\nSummary Statistics by Churn:")
df.groupby('Churn')[numerical_cols].describe()


## Categorical Features Analysis


In [None]:
# Churn rate (share of 'Yes' in percent) for every level of each categorical
# feature, drawn as one bar chart per feature on a 2x3 grid.
categorical_cols = ['gender', 'Partner', 'Dependents', 'Contract', 'PaymentMethod', 'InternetService']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Boolean churn indicator, computed once and reused for every feature.
is_churned = df['Churn'].eq('Yes')

for ax, col in zip(axes, categorical_cols):
    # A feature missing from the frame leaves its panel blank instead of failing.
    if col not in df.columns:
        ax.axis('off')
        continue

    # Mean of the boolean indicator per group == fraction of churners.
    churn_by_cat = is_churned.groupby(df[col]).mean() * 100
    churn_by_cat.plot(kind='bar', ax=ax, color='coral')
    ax.set_title(f'Churn Rate by {col}')
    ax.set_ylabel('Churn Rate (%)')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


## Correlation Analysis


In [None]:
# Correlation between the numerical features and a 0/1 churn indicator.
# Works on a copy so the encoded column is not added to `df` itself.
# NOTE: assumes TotalCharges was already converted to numeric by an earlier cell.
df_encoded = df.copy()
df_encoded['Churn_encoded'] = df_encoded['Churn'].eq('Yes').astype(int)

# Columns entering the correlation matrix
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn_encoded']
correlation = df_encoded.loc[:, num_cols].corr()

# Annotated heatmap, colour scale centred on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    square=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

# Features ranked from most positively to most negatively correlated with churn
churn_correlation = correlation['Churn_encoded'].sort_values(ascending=False)
print("\nCorrelation with Churn:")
print(churn_correlation)


## Key Insights


In [None]:
# Key insights summary: headline churn figures, the highest-churn contract
# type and payment method, and the tenure gap between churners and non-churners.


def _churn_rate_by(column):
    """Return the churn rate in percent for each level of `column`, highest first."""
    churned = df['Churn'].eq('Yes')
    return churned.groupby(df[column]).mean().mul(100).sort_values(ascending=False)


# Compute the class counts here rather than relying on another cell having
# been run first, so the cell works on a fresh kernel (Restart & Run All).
churn_counts = df['Churn'].value_counts()
n_customers = len(df)
# .get avoids a KeyError when no customer churned.
n_churned = int(churn_counts.get('Yes', 0))

print("="*60)
print("KEY INSIGHTS FROM EDA")
print("="*60)

print(f"\n1. Overall Churn Rate: {n_churned / n_customers * 100:.2f}%")
print(f"2. Total Customers: {n_customers}")
print(f"3. Churned Customers: {n_churned}")

# Top churn risk factors
print("\n4. Top Churn Risk Factors:")
contract_churn = _churn_rate_by('Contract')
print(f"   - Contract Type: {contract_churn.index[0]} has {contract_churn.iloc[0]:.1f}% churn rate")

payment_churn = _churn_rate_by('PaymentMethod')
print(f"   - Payment Method: {payment_churn.index[0]} has {payment_churn.iloc[0]:.1f}% churn rate")

# Average tenure for churners vs non-churners
avg_tenure_churn = df.loc[df['Churn'] == 'Yes', 'tenure'].mean()
avg_tenure_no_churn = df.loc[df['Churn'] == 'No', 'tenure'].mean()
print("\n5. Average Tenure:")
print(f"   - Churners: {avg_tenure_churn:.1f} months")
print(f"   - Non-Churners: {avg_tenure_no_churn:.1f} months")
print(f"   - Difference: {avg_tenure_no_churn - avg_tenure_churn:.1f} months")


In [None]:
# Target distribution: class counts, overall churn percentage, and a bar chart.
churn_counts = df['Churn'].value_counts()
print(churn_counts)

churn_rate_pct = churn_counts['Yes'] / len(df) * 100
print(f"\nChurn rate: {churn_rate_pct:.2f}%")

# Bar chart of the class counts, drawn on an explicit Axes
dist_fig, dist_ax = plt.subplots(figsize=(8, 6))
churn_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=dist_ax)
dist_ax.set_title('Churn Distribution')
dist_ax.set_xlabel('Churn')
dist_ax.set_ylabel('Count')
dist_ax.tick_params(axis='x', rotation=0)  # keep the class labels horizontal
dist_fig.tight_layout()
plt.show()
