## Load and Inspect Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot styling shared by every figure in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Read the processed CSV and convert the 'date_nps' column to datetime
# in a single expression, so `df` is never visible in a half-prepared state.
df = (
    pd.read_csv('../data/processed/combined_nps_revenue.csv')
    .assign(date_nps=lambda frame: pd.to_datetime(frame['date_nps']))
)

# Quick structural overview: dimensions, column names, dtypes, null counts.
overview_lines = [
    'Dataset Overview:',
    f'Shape: {df.shape}',
    f'\nColumns: {df.columns.tolist()}',
    f'\nData types:\n{df.dtypes}',
    f'\nMissing values:\n{df.isnull().sum()}',
]
for overview_line in overview_lines:
    print(overview_line)

## Univariate Analysis: NPS Distribution

In [None]:
# NPS score distribution: histogram, box plot, normal Q-Q plot and KDE.
#
# Missing values are dropped once up front. The histogram and KDE tolerate NaN,
# but the box plot and the Q-Q fit do not skip them, so a single missing score
# would otherwise produce an empty box and an undefined Q-Q regression line.
nps = df['nps_mean'].dropna()
nps_avg = nps.mean()
nps_med = nps.median()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Histogram with mean / median reference lines
axes[0, 0].hist(nps, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 0].set_title('Distribution of Mean NPS by Clinic-Month', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('NPS Score')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(nps_avg, color='red', linestyle='--', label=f'Mean: {nps_avg:.1f}')
axes[0, 0].axvline(nps_med, color='green', linestyle='--', label=f'Median: {nps_med:.1f}')
axes[0, 0].legend()

# Box plot
axes[0, 1].boxplot(nps, vert=True)
axes[0, 1].set_title('NPS Box Plot', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('NPS Score')
axes[0, 1].grid(True, alpha=0.3)

# Q-Q plot for normality
stats.probplot(nps, dist="norm", plot=axes[1, 0])
axes[1, 0].set_title('Q-Q Plot: NPS Normality Check', fontsize=12, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# KDE plot
nps.plot(kind='kde', ax=axes[1, 1], color='steelblue', linewidth=2)
axes[1, 1].set_title('NPS Kernel Density Estimate', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('NPS Score')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../data/processed/eda_nps_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# describe / skew / kurtosis already ignore NaN, so these numbers are the same
# as they would be on the raw column.
print('\nNPS Statistics:')
print(nps.describe())
print(f'\nSkewness: {nps.skew():.3f}')
print(f'Kurtosis: {nps.kurtosis():.3f}')

## Univariate Analysis: Revenue Distribution

In [None]:
# Revenue distribution: histogram, box plot, log10 histogram and KDE.
#
# Missing values are dropped once up front because the box plot does not skip
# NaN. The log10 panel additionally needs strictly positive values: log10(0) is
# -inf and log10 of a negative number is NaN, and a non-finite value makes the
# histogram's bin range undefined.
revenue = df['revenue_total'].dropna()
revenue_positive = revenue[revenue > 0]
n_non_positive = len(revenue) - len(revenue_positive)
revenue_avg = revenue.mean()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Histogram with mean reference line
axes[0, 0].hist(revenue, bins=30, edgecolor='black', alpha=0.7, color='darkgreen')
axes[0, 0].set_title('Distribution of Total Revenue by Clinic-Month', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Revenue (R$)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(revenue_avg, color='red', linestyle='--', label=f'Mean: R${revenue_avg:.0f}')
axes[0, 0].legend()

# Box plot
axes[0, 1].boxplot(revenue, vert=True)
axes[0, 1].set_title('Revenue Box Plot', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('Revenue (R$)')
axes[0, 1].grid(True, alpha=0.3)

# Log-scale histogram: a roughly symmetric shape here suggests the raw values
# are log-normally distributed (heavy right tail on the linear scale).
axes[1, 0].hist(np.log10(revenue_positive), bins=30, edgecolor='black', alpha=0.7, color='darkgreen')
axes[1, 0].set_title('Log10(Revenue) Distribution', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Log10(Revenue)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].grid(True, alpha=0.3)

# KDE plot
revenue.plot(kind='kde', ax=axes[1, 1], color='darkgreen', linewidth=2)
axes[1, 1].set_title('Revenue Kernel Density Estimate', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Revenue (R$)')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../data/processed/eda_revenue_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Tell the reader when the log panel shows fewer rows than the other panels.
if n_non_positive:
    print(f'\nNote: {n_non_positive} non-positive revenue value(s) excluded from the log10 histogram')

print('\nRevenue Statistics:')
print(revenue.describe())
print(f'\nSkewness: {revenue.skew():.3f}')
print(f'Kurtosis: {revenue.kurtosis():.3f}')

## Temporal Trends

In [None]:
# Collapse the clinic-level rows to one row per 'date_nps' value:
# mean NPS, summed revenue, and the number of rows (non-null clinic_id) per date.
df_monthly = (
    df.groupby('date_nps')
    .agg(
        avg_nps=('nps_mean', 'mean'),
        total_revenue=('revenue_total', 'sum'),
        clinic_count=('clinic_id', 'count'),
    )
    .reset_index()
    .rename(columns={'date_nps': 'date'})
)

# Dual-axis time series: NPS on the left axis, revenue on the right.
nps_color = 'steelblue'
revenue_color = 'darkgreen'
month_axis = df_monthly['date']

fig, ax1 = plt.subplots(figsize=(14, 6))

# Left axis: average NPS
ax1.plot(month_axis, df_monthly['avg_nps'], marker='o', linewidth=2, color=nps_color, label='Avg NPS')
ax1.set_xlabel('Date', fontsize=11)
ax1.set_ylabel('Average NPS', fontsize=11, color=nps_color)
ax1.tick_params(axis='y', labelcolor=nps_color)
ax1.grid(True, alpha=0.3)

# Right axis: total revenue, sharing the same x axis
ax2 = ax1.twinx()
ax2.plot(month_axis, df_monthly['total_revenue'], marker='s', linewidth=2, color=revenue_color, label='Total Revenue')
ax2.set_ylabel('Total Revenue (R$)', fontsize=11, color=revenue_color)
ax2.tick_params(axis='y', labelcolor=revenue_color)

plt.title('NPS and Revenue Trends Over Time', fontsize=13, fontweight='bold')
# Each axis carries its own legend, placed in opposite corners.
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.tight_layout()
plt.savefig('../data/processed/eda_temporal_trends.png', dpi=300, bbox_inches='tight')
plt.show()

print('Monthly Aggregates:')
print(df_monthly.head(10))

## Clinic-Level Patterns

In [None]:
# Per-clinic summary table. Named aggregation gives each output column its
# final name directly, so no positional column renaming is needed afterwards.
clinic_summary = (
    df.groupby('clinic_id')
    .agg(
        nps_avg=('nps_mean', 'mean'),
        nps_std=('nps_mean', 'std'),
        nps_min=('nps_mean', 'min'),
        nps_max=('nps_mean', 'max'),
        rev_avg=('revenue_total', 'mean'),
        rev_std=('revenue_total', 'std'),
        rev_min=('revenue_total', 'min'),
        rev_max=('revenue_total', 'max'),
        total_responses=('nps_responses', 'sum'),
        months_observed=('year_month', 'count'),
    )
    .round(2)
    # Highest average NPS first.
    .sort_values('nps_avg', ascending=False)
)

top_clinics = clinic_summary.head(10)
n_clinics = len(clinic_summary)

print('\nTop 10 Clinics by Average NPS:')
print(top_clinics)
print(f'\nTotal clinics: {n_clinics}')

In [None]:
# One point per clinic: mean NPS against mean revenue across that clinic's rows.
clinic_agg = (
    df.groupby('clinic_id')[['nps_mean', 'revenue_total']]
    .mean()
    .reset_index()
)
clinic_nps = clinic_agg['nps_mean']
clinic_revenue = clinic_agg['revenue_total']

fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(clinic_nps, clinic_revenue, s=100, alpha=0.6, color='steelblue', edgecolors='black')
ax.set_xlabel('Average NPS Score', fontsize=11)
ax.set_ylabel('Average Monthly Revenue (R$)', fontsize=11)
ax.set_title('Clinic-Level: NPS vs Revenue', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)

# Degree-1 least-squares fit; coefficients are ordered [slope, intercept].
trend_coeffs = np.polyfit(clinic_nps, clinic_revenue, 1)
trend_values = np.polyval(trend_coeffs, clinic_nps)
ax.plot(clinic_nps, trend_values, "r--", linewidth=2, label=f'Trend (slope={trend_coeffs[0]:.1f})')
ax.legend()

plt.tight_layout()
plt.savefig('../data/processed/eda_clinic_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

# Correlation between the two clinic-level averages (pandas default method).
corr_clinic = clinic_nps.corr(clinic_revenue)
print(f'\nClinic-level NPS-Revenue Correlation: {corr_clinic:.3f}')

## Month-over-Month Change Analysis

In [None]:
# Keep only rows where both change columns hold a finite number.
# dropna alone is not enough: a change column can contain +/-inf as well as NaN
# (presumably these are percent changes, where a zero prior value divides by
# zero - TODO confirm against the preprocessing step). A single inf makes the
# histogram's bin range undefined and distorts describe().
change_cols = ['nps_change', 'revenue_change']
finite_rows = (
    df[change_cols]
    .replace([np.inf, -np.inf], np.nan)
    .notna()
    .all(axis=1)
)
df_with_changes = df.loc[finite_rows].copy()

# Report rows that had values in both columns but at least one was infinite.
n_infinite = int(df[change_cols].notna().all(axis=1).sum() - finite_rows.sum())
if n_infinite:
    print(f'Note: {n_infinite} row(s) with infinite change values excluded')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, bar colour, panel title, x-axis label) for each panel, left to right.
change_panels = [
    ('nps_change', 'steelblue', 'Distribution of Month-over-Month NPS Change', 'NPS % Change'),
    ('revenue_change', 'darkgreen', 'Distribution of Month-over-Month Revenue Change', 'Revenue % Change'),
]
for panel_ax, (column, bar_color, panel_title, x_label) in zip(axes, change_panels):
    panel_ax.hist(df_with_changes[column], bins=40, edgecolor='black', alpha=0.7, color=bar_color)
    # Vertical reference at zero separates decreases from increases.
    panel_ax.axvline(0, color='red', linestyle='--', linewidth=2, label='No Change')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../data/processed/eda_changes_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print('\nNPS Change Statistics:')
print(df_with_changes['nps_change'].describe())
print('\nRevenue Change Statistics:')
print(df_with_changes['revenue_change'].describe())

## Initial Correlation Overview

In [None]:
# Candidate columns for the correlation matrix; any that are absent from df
# are skipped rather than raising.
numeric_cols = [
    'nps_mean', 'nps_median', 'nps_std', 'revenue_total',
    'nps_change', 'revenue_change', 'nps_lag1', 'revenue_per_response',
]
available_cols = []
for candidate in numeric_cols:
    if candidate in df.columns:
        available_cols.append(candidate)

# Pairwise correlations (pandas default method) over the available columns.
corr_matrix = df.loc[:, available_cols].corr()

# Annotated heatmap with a diverging palette centred on zero.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix: NPS, Revenue, and Derived Metrics', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('../data/processed/eda_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# The revenue column of the matrix, strongest positive correlation first
# (the first entry is revenue's correlation with itself).
revenue_corrs = corr_matrix['revenue_total'].sort_values(ascending=False)
print('\nKey Correlations with Revenue:')
print(revenue_corrs)

## Summary Statistics Table

In [None]:
# Build the summary table: one row per variable, one column per statistic.
# Maps each source column to the label shown in the 'Variable' column.
summary_labels = {
    'nps_mean': 'NPS Mean',
    'revenue_total': 'Revenue Total',
    'nps_change': 'NPS Change (%)',
    'revenue_change': 'Revenue Change (%)',
    'revenue_per_response': 'Revenue per Response',
}

# Only summarise columns that are actually present, consistent with how the
# correlation cell treats the derived columns as optional.
present_labels = {col: label for col, label in summary_labels.items() if col in df.columns}
missing_cols = [col for col in summary_labels if col not in present_labels]
if missing_cols:
    print(f'Note: columns not found and skipped: {missing_cols}')

# Maps each pandas aggregation name to its output column header.
stat_headers = {
    'mean': 'Mean',
    'median': 'Median',
    'std': 'Std Dev',
    'min': 'Min',
    'max': 'Max',
}

# A single agg call computes every statistic for every column; transposing
# puts variables on the rows. All five statistics skip NaN by default.
summary_stats = (
    df[list(present_labels)]
    .agg(list(stat_headers))
    .T
    .rename(index=present_labels, columns=stat_headers)
    .rename_axis('Variable')
    .reset_index()
)

print('\nEDA Summary Statistics:')
print(summary_stats.to_string(index=False))

# Save summary
summary_stats.to_csv('../data/processed/eda_summary_statistics.csv', index=False)
# The check mark was previously mis-encoded as 'âœ“'.
print('\n✓ Summary statistics saved')