# ChurnGuard - Data Exploration

This notebook explores the customer churn dataset and performs initial data profiling.

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Notebook-wide plotting defaults: seaborn grid style first, then the
# default figure size used when a cell does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Imports successful!")

## Load Data

In [None]:
# Read the processed churn CSV into a polars DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
DATA_PATH = '../data/processed/churn_processed.csv'
df = pl.read_csv(DATA_PATH)

# Report the dimensions and column names, then display the first rows.
print(f"Dataset shape: {df.shape}", f"Columns: {df.columns}", sep="\n")
df.head()

## Data Overview

In [None]:
# Print summary statistics followed by per-column null counts.
# Each table is computed only after its heading has been printed.
overview_sections = (
    ("\nDataset Summary:", df.describe),
    ("\nMissing Values:", df.null_count),
)
for heading, compute_table in overview_sections:
    print(heading)
    print(compute_table())

## Churn Analysis

In [None]:
# Churn rate: class distribution plus the percentage of churned customers.
if 'churn' in df.columns:
    # group_by() gives no ordering guarantee, so sort the result to keep the
    # printed table stable between runs.
    # NOTE(review): newer polars releases deprecate GroupBy.count() in favour
    # of GroupBy.len(); switch once the minimum polars version is confirmed.
    churn_counts = df.group_by('churn').count().sort('churn')
    print("\nChurn Distribution:")
    print(churn_counts)

    # Calculate churn rate
    total = df.height
    # `== True` builds a polars expression here; it is not a Python truth test.
    # NOTE(review): assumes 'churn' is a boolean column - confirm against the data.
    churned = df.filter(pl.col('churn') == True).height  # noqa: E712
    if total:
        churn_rate = (churned / total) * 100
        print(f"\nChurn Rate: {churn_rate:.2f}%")
    else:
        # An empty dataset has no meaningful rate; avoid ZeroDivisionError.
        churn_rate = float('nan')
        print("\nChurn Rate: n/a (dataset is empty)")
else:
    # Say so explicitly instead of silently producing no output.
    print("\nNo 'churn' column found; skipping churn analysis.")

## Feature Distributions

In [None]:
# Plot key features: one histogram per column on a 2x2 grid.
numeric_cols = ['total_day_minutes', 'total_eve_minutes', 'total_night_minutes', 'customer_service_calls']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# The grid has four panels, one for each entry in numeric_cols.
for ax, col in zip(axes, numeric_cols):
    if col not in df.columns:
        # Hide the panel rather than leaving an empty, unlabeled set of axes.
        ax.set_visible(False)
        continue

    # Drop nulls explicitly so the histogram reflects observed values only,
    # and hand matplotlib a NumPy array directly (no pandas round-trip).
    values = df[col].drop_nulls().to_numpy()
    ax.hist(values, bins=30, edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Compute the correlation matrix through pandas so the result carries row and
# column labels that seaborn can use for the heatmap ticks. (polars also
# provides DataFrame.corr(), so this conversion is a choice, not a necessity.)
df_pandas = df.to_pandas()

# Keep numeric columns only.
# NOTE(review): a boolean 'churn' column would be excluded by this filter -
# confirm whether the target should appear in the heatmap.
numeric_df = df_pandas.select_dtypes(include=[np.number])

# Pairwise correlation between the numeric features
corr = numeric_df.corr()

# Draw the heatmap on its own 12x10 figure, colours centred on zero
heat_fig, heat_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0, ax=heat_ax)
heat_ax.set_title('Feature Correlation Heatmap')
heat_fig.tight_layout()
plt.show()

## Insights

This exploration covers the following areas; record the concrete findings for each after running the notebook:
1. Churn rate in the dataset
2. Distribution of key features
3. Correlation between features
4. Potential data quality issues