In [1]:
# Cell 1: Imports
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence library warnings so cell outputs stay readable.
# NOTE(review): this hides *every* warning, deprecation notices included;
# narrow the filter if a later cell starts behaving unexpectedly.
warnings.filterwarnings('ignore')

# Set style
# The 'seaborn-v0_8-*' sheet names only exist in matplotlib >= 3.6; older
# releases ship the same sheet as 'seaborn-darkgrid'. Pick whichever one is
# installed so the notebook runs from a fresh kernel on either version.
plot_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette("husl")

# Figures are saved under ../results by the plotting cells; create the folder
# up front so plt.savefig does not raise FileNotFoundError on a fresh checkout.
Path('../results').mkdir(parents=True, exist_ok=True)

print("‚úÖ Libraries loaded successfully!")

‚úÖ Libraries loaded successfully!


In [None]:
# Cell 2: Load Data
# Relative path: resolved against the kernel's current working directory.
raw_csv_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)

# Report the size of what was read: the (rows, columns) tuple, then a sentence.
print("üìä Dataset Shape:", df.shape)
print(f"\n‚úÖ Loaded {df.shape[0]:,} customers with {df.shape[1]} features")

# First look -- last expression of the cell, so the notebook renders the rows.
df.head(5)

In [None]:
# Cell 3: Basic Information
section_rule = "=" * 60  # divider line printed under each heading

# df.info() writes its report straight to stdout.
print("üìã Dataset Info:")
print(section_rule)
df.info()

# describe() is the cell's final expression, so its result is displayed
# as the cell output rather than printed.
print("\nüìä Statistical Summary:")
print(section_rule)
df.describe()

In [None]:
# Cell 4: Check Target Variable
# Count each Churn label once; the same Series feeds the printout and the chart.
churn_counts = df['Churn'].value_counts()

print("üéØ Target Variable Distribution:")
print("="*60)
print(churn_counts)
print("\nPercentages:")
print(df['Churn'].value_counts(normalize=True) * 100)

# Visualize
# Colour by label, not by position: value_counts() orders bars by frequency,
# so a positional ['green', 'red'] list would silently swap meanings if the
# churners ever became the majority class. Unknown labels fall back to gray.
label_colors = {'No': 'green', 'Yes': 'red'}
bar_colors = [label_colors.get(label, 'gray') for label in churn_counts.index]

# Explicit figure/axes instead of the pyplot state machine, so every call
# below is guaranteed to target this chart.
fig, ax = plt.subplots(figsize=(8, 5))
churn_counts.plot(kind='bar', ax=ax, color=bar_colors)
ax.set_title('Customer Churn Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Churn')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=0)  # keep the Yes/No labels horizontal
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('../results/churn_distribution.png', dpi=300)
plt.show()
**Mental Model:**
```
Understanding the data = Doctor examining patient

Questions to ask:
1. How much data do we have? (7,043 customers ✓)
2. What features exist? (21 columns)
3. What are we predicting? (Churn: Yes/No)
4. Is data balanced? (26.5% churn - somewhat imbalanced)
5. Any missing values?
6. What types of features? (Numerical, categorical)

This is like a doctor's initial assessment
Before treatment, understand the patient
```

In [None]:
# Cell 5: Missing Values Check
print("üîç Missing Values:")
print("="*60)

# NaN / None entries per column.
nan_counts = df.isnull().sum()

# A whitespace-only CSV field such as " " is loaded as an ordinary string, so
# isnull() alone cannot see it. Count empty / whitespace-only strings in the
# text columns as missing too. astype(str) turns a real NaN into 'nan', so
# no cell is counted twice.
text_columns = df.select_dtypes(include=['object']).columns
blank_counts = pd.Series(
    {col: int(df[col].astype(str).str.strip().eq('').sum()) for col in text_columns},
    dtype='int64',
).reindex(df.columns, fill_value=0)  # zero for non-text columns, original column order

missing = nan_counts + blank_counts
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing,
    'Percentage': missing_pct
})

# Show the table only when there is something to show; otherwise print the
# all-clear message instead of an empty DataFrame.
columns_with_gaps = missing_df[missing_df['Missing_Count'] > 0]
if columns_with_gaps.empty:
    print("‚úÖ No missing values found!")
else:
    print(columns_with_gaps.sort_values('Missing_Count', ascending=False))

In [None]:
# Cell 6: Feature Types
print("üìä Feature Types:")
print("="*60)

# Partition the column names by dtype: NumPy numeric dtypes on one side,
# generic object columns on the other.
numeric_view = df.select_dtypes(include=[np.number])
object_view = df.select_dtypes(include=['object'])
numerical_features = list(numeric_view.columns)
categorical_features = list(object_view.columns)

# Each group: a heading carrying the count, then the list of names.
print(f"\nüî¢ Numerical Features ({len(numerical_features)}):")
print(numerical_features)

print(f"\nüìù Categorical Features ({len(categorical_features)}):")
print(categorical_features)

In [None]:
# Cell 7: Explore Numerical Features
print("üìà Numerical Features Distribution:")

numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# One histogram panel per column, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, col in zip(axes, numerical_cols):
    # TotalCharges may still be text at this point (the insights cell lists
    # "convert to numeric" as a next step). Histogramming strings would plot
    # them as categories instead of binning numbers. Coerce first:
    # unparseable entries become NaN and are removed by dropna(); columns that
    # are already numeric pass through unchanged.
    values = pd.to_numeric(df[col], errors='coerce').dropna()
    ax.hist(values, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/numerical_distributions.png', dpi=300)
plt.show()

In [None]:
# Cell 8: Categorical Features Analysis
categorical_cols = ['Contract', 'PaymentMethod', 'InternetService']

# One bar-chart panel per feature, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, col in zip(axes, categorical_cols):
    # Number of rows per distinct value of this feature.
    level_counts = df[col].value_counts()
    level_counts.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    # Tilt the category labels by 45 degrees.
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/categorical_distributions.png', dpi=300)
plt.show()

In [None]:
# Cell 9: Churn Analysis by Key Features
print("üéØ Churn Rate by Key Features:")


def _bar_churn_rate(ax, feature, title, color):
    """Draw the churn rate (%) per value of `feature` on `ax`.

    The rate is the share of rows whose Churn label equals 'Yes' within each
    group, times 100. Returns the plotted Series (indexed by feature value).
    """
    rate = df.groupby(feature)['Churn'].apply(
        lambda labels: (labels == 'Yes').sum() / len(labels) * 100
    )
    ax.bar(rate.index, rate.values, color=color, edgecolor='black')
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.grid(axis='y', alpha=0.3)
    return rate


def _hist_by_churn(ax, column, title, xlabel):
    """Draw histograms of `column` split by Churn label on `ax`.

    Returns (values for Churn == 'Yes', values for Churn == 'No').
    """
    churned = df.loc[df['Churn'] == 'Yes', column]
    retained = df.loc[df['Churn'] == 'No', column]
    # Passing both series in one call makes them share the same 30 bins.
    ax.hist([retained, churned], bins=30, label=['No Churn', 'Churn'],
            color=['green', 'red'], alpha=0.6, edgecolor='black')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    return churned, retained


# 2x2 grid: churn-rate bars on the top row, split histograms on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Contract Type
contract_churn = _bar_churn_rate(
    axes[0, 0], 'Contract', 'Churn Rate by Contract Type', 'steelblue')

# Internet Service
internet_churn = _bar_churn_rate(
    axes[0, 1], 'InternetService', 'Churn Rate by Internet Service', 'salmon')

# Tenure vs Churn
churn_yes, churn_no = _hist_by_churn(
    axes[1, 0], 'tenure', 'Tenure Distribution by Churn', 'Tenure (months)')

# Monthly Charges vs Churn
churn_yes_charges, churn_no_charges = _hist_by_churn(
    axes[1, 1], 'MonthlyCharges', 'Monthly Charges by Churn', 'Monthly Charges ($)')

plt.tight_layout()
plt.savefig('../results/churn_analysis.png', dpi=300)
plt.show()

In [None]:
# Cell 10: Correlation Heatmap
print("üî• Feature Correlations:")

# Work on a copy so the encodings below never alter df itself.
df_corr = df.copy()
# Binary-encode the target: 'Yes' -> 1, every other label -> 0.
df_corr['Churn'] = (df_corr['Churn'] == 'Yes').astype(int)

# TotalCharges may still be text at this point (the insights cell lists
# "convert to numeric" as a next step); left as-is it would be silently
# excluded by the numeric-dtype filter below. Unparseable entries become NaN,
# which corr() skips pairwise.
if 'TotalCharges' in df_corr.columns:
    df_corr['TotalCharges'] = pd.to_numeric(df_corr['TotalCharges'], errors='coerce')

# Select numerical columns
num_cols = df_corr.select_dtypes(include=[np.number]).columns
correlation_matrix = df_corr[num_cols].corr()

plt.figure(figsize=(10, 8))
# center=0 puts the neutral colour of the diverging map at zero correlation.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../results/correlation_heatmap.png', dpi=300)
plt.show()

# Show top correlations with Churn
# (the list includes Churn's correlation with itself, 1.0, as the first row)
print("\nüìä Top Correlations with Churn:")
churn_corr = correlation_matrix['Churn'].sort_values(ascending=False)
print(churn_corr)

In [None]:
# Cell 11: Key Insights Summary
# Hand-written recap of the exploration. The figures in the text are typed in
# as literals; nothing here is computed from df.
insights_summary = """
1. DATASET:
   - 7,043 customers
   - 26.5% churn rate (somewhat imbalanced)
   - No missing values
   
2. IMPORTANT PATTERNS:
   - Month-to-month contracts have MUCH higher churn
   - Fiber optic internet users churn more
   - New customers (low tenure) churn more
   - Higher monthly charges correlate with churn
   
3. FEATURES TO FOCUS ON:
   - Contract type (very predictive)
   - Tenure (strong signal)
   - Monthly charges (important)
   - Internet service type
   
4. NEXT STEPS:
   - Handle TotalCharges (convert to numeric)
   - Encode categorical variables
   - Handle class imbalance
   - Feature engineering (tenure groups, charge ratios)
"""

# Heading, divider, then the recap text.
print("üí° KEY INSIGHTS FROM EDA:")
print("="*60)
print(insights_summary)
**Mental Model:**
```
EDA = Detective work

You're looking for clues:
✓ Month-to-month → High churn (red flag!)
✓ Long tenure → Low churn (loyalty signal)
✓ High charges → More churn (price sensitivity)

Like a doctor:
  Symptom: "Customer has month-to-month contract"
  Diagnosis: "High risk of churn"
  Treatment: "Retention campaign"

This understanding guides feature engineering!
```