## Load and Prepare Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import warnings

# NOTE(review): this silences every warning category for the whole notebook,
# including any numerical warnings raised by the statistics calls further down.
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Read the combined NPS/revenue table and convert the NPS date column to datetimes.
df = pd.read_csv('../data/processed/combined_nps_revenue.csv')
df = df.assign(date_nps=pd.to_datetime(df['date_nps']))

# Quick sanity summary of what was loaded.
first_date = df['date_nps'].min()
last_date = df['date_nps'].max()
n_clinics = df['clinic_id'].nunique()

print('\n'.join([
    'Data loaded:',
    f'Shape: {df.shape}',
    f'Date range: {first_date} to {last_date}',
    f'Unique clinics: {n_clinics}',
]))

## Overall Correlation: NPS vs Revenue

In [None]:
def _significance_marker(p_value):
    """Return '***', '**', '*' or 'ns' for p < 0.001, p < 0.01, p < 0.05, otherwise."""
    for cutoff, marker in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p_value < cutoff:
            return marker
    return 'ns'


# Keep only rows where both variables are present.
pair_cols = ['nps_mean', 'revenue_total']
df_clean = df.dropna(subset=pair_cols)[pair_cols]

nps_values = df_clean['nps_mean']
revenue_values = df_clean['revenue_total']

# Pearson: linear association. Spearman: rank-based, non-parametric.
pearson_r, pearson_p = pearsonr(nps_values, revenue_values)
spearman_r, spearman_p = spearmanr(nps_values, revenue_values)

# Share of variance explained by a straight-line fit.
r_squared = pearson_r ** 2

banner = '=' * 70
print(banner)
print('OVERALL NPS-REVENUE CORRELATION')
print(banner)

print('\nPearson Correlation:')
print(f'  r = {pearson_r:.4f}')
print(f'  p-value = {pearson_p:.6f}')
print(f'  R² = {r_squared:.4f} ({r_squared*100:.2f}% variance explained)')
print(f'  Significance: {_significance_marker(pearson_p)}')

print('\nSpearman Correlation (rank-based):')
print(f'  ρ = {spearman_r:.4f}')
print(f'  p-value = {spearman_p:.6f}')
print(f'  Significance: {_significance_marker(spearman_p)}')

print(f'\nn = {len(df_clean)} clinic-months')

## Scatter Plot with Regression Line

In [None]:
# Scatter plot of NPS against revenue with a least-squares line and a 95% prediction band.
fig, ax = plt.subplots(figsize=(12, 7))

ax.scatter(df_clean['nps_mean'], df_clean['revenue_total'], s=80, alpha=0.5, color='steelblue', edgecolors='black')

# Degree-1 least-squares fit of revenue on NPS.
z = np.polyfit(df_clean['nps_mean'], df_clean['revenue_total'], 1)
p = np.poly1d(z)
nps_range = np.linspace(df_clean['nps_mean'].min(), df_clean['nps_mean'].max(), 100)
# `+.0f` prints the intercept with its own sign, so a negative intercept
# reads "x-123" instead of "x+-123".
ax.plot(nps_range, p(nps_range), 'r-', linewidth=2.5, label=f'Linear fit: y={z[0]:.0f}x{z[1]:+.0f}')

# 95% band around the fitted line.
# The leading `1 +` under the square root adds the variance of a single new
# observation, so this is a prediction interval (where individual clinic-months
# are expected to fall), not a confidence interval for the mean line.
n_obs = len(df_clean)
nps_mean_value = df_clean['nps_mean'].mean()
nps_sum_sq = np.sum((df_clean['nps_mean'] - nps_mean_value) ** 2)
residuals = df_clean['revenue_total'] - p(df_clean['nps_mean'])
std_err = np.sqrt(np.sum(residuals**2) / (n_obs - 2))
pred_err = std_err * np.sqrt(1 + 1/n_obs + (nps_range - nps_mean_value)**2 / nps_sum_sq)
t_val = stats.t.ppf(0.975, n_obs - 2)
ax.fill_between(nps_range, p(nps_range) - t_val * pred_err, p(nps_range) + t_val * pred_err,
                alpha=0.2, color='red', label='95% prediction interval')

ax.set_xlabel('Average NPS Score', fontsize=12, fontweight='bold')
ax.set_ylabel('Monthly Revenue (R$)', fontsize=12, fontweight='bold')
ax.set_title(f'NPS vs Revenue Correlation\nPearson r={pearson_r:.3f} (p={pearson_p:.6f}), R²={r_squared:.3f}',
            fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)
# Build the legend only after every labelled artist exists; calling it before
# fill_between left the band out of the legend.
ax.legend(fontsize=11)

plt.tight_layout()
plt.savefig('../data/processed/corr_nps_revenue_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

## Lagged Correlation Analysis

Does NPS in month *t* predict revenue in month *t+k*?

In [None]:
# Lagged correlation: Pearson r between NPS in month t-k and revenue in month t,
# for k = 0..6, pooled over all clinics.
lag_results = []

# groupby().shift() moves values by row position, so each clinic's rows must be
# in chronological order for "shift by k" to mean "k periods earlier".
# Work on a sorted copy so the shared `df` is neither reordered nor given a
# scratch column that later cells could pick up by accident.
# NOTE(review): this still assumes one row per clinic per consecutive month;
# gaps in a clinic's history would make a k-row shift span more than k months.
df_sorted = df.sort_values(['clinic_id', 'date_nps'])
nps_by_clinic = df_sorted.groupby('clinic_id')['nps_mean']

for lag in range(0, 7):  # 0 to 6 months
    # Pair each revenue value with the same clinic's NPS from `lag` rows earlier.
    df_lag = pd.DataFrame({
        'nps_lag': nps_by_clinic.shift(lag),
        'revenue_total': df_sorted['revenue_total'],
    }).dropna()

    # Skip lags with too few paired observations to be worth reporting.
    if len(df_lag) > 10:
        r, p_val = pearsonr(df_lag['nps_lag'], df_lag['revenue_total'])
        lag_results.append({
            'Lag (months)': lag,
            'Correlation': r,
            'P-value': p_val,
            'Significant': 'Yes' if p_val < 0.05 else 'No',
            'N': len(df_lag)
        })

# Explicit columns keep the frame well-formed even if no lag qualified.
lag_df = pd.DataFrame(
    lag_results,
    columns=['Lag (months)', 'Correlation', 'P-value', 'Significant', 'N'],
)
print('\nLagged Correlation Analysis:')
print(lag_df.to_string(index=False))

# Save results
lag_df.to_csv('../data/processed/corr_lagged_results.csv', index=False)

In [None]:
# Two-panel view of the lagged correlations: trend line (left) and
# significance-coloured bars (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

lags = lag_df['Lag (months)']
lag_corrs = lag_df['Correlation']
label_font = {'fontsize': 11, 'fontweight': 'bold'}
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# Left panel: correlation as a function of lag, shaded down to zero.
ax1.plot(lags, lag_corrs, marker='o', linewidth=2.5, markersize=8, color='steelblue')
ax1.axhline(0, color='red', linestyle='--', alpha=0.5)
ax1.fill_between(lags, lag_corrs, alpha=0.3, color='steelblue')
ax1.set_xlabel('Lag (months)', **label_font)
ax1.set_ylabel('Pearson Correlation', **label_font)
ax1.set_title('NPS-Revenue Correlation by Time Lag', **title_font)
ax1.grid(True, alpha=0.3)

# Right panel: one bar per lag, green when p < 0.05 and gray otherwise.
colors = []
for p_value in lag_df['P-value']:
    colors.append('green' if p_value < 0.05 else 'gray')
ax2.bar(lags, lag_corrs, color=colors, alpha=0.7, edgecolor='black')
ax2.axhline(0, color='black', linestyle='-', linewidth=0.8)
ax2.set_xlabel('Lag (months)', **label_font)
ax2.set_ylabel('Correlation Coefficient', **label_font)
ax2.set_title('NPS-Revenue Correlation (Green = p<0.05)', **title_font)
ax2.grid(True, alpha=0.3, axis='y')

# Both panels share the same integer lag ticks.
for panel in (ax1, ax2):
    panel.set_xticks(range(0, 7))

plt.tight_layout()
plt.savefig('../data/processed/corr_lagged_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## Segmented Analysis: By Clinic

In [None]:
# Pearson correlation between NPS and revenue, computed separately per clinic.
clinic_corr = []

# One pass over the groups instead of re-filtering the whole frame per clinic.
# sort=False keeps clinics in first-appearance order, as `unique()` did.
for clinic, clinic_rows in df.groupby('clinic_id', sort=False):
    clinic_data = clinic_rows[['nps_mean', 'revenue_total']].dropna()

    if len(clinic_data) < 5:  # Minimum 5 observations
        continue

    # Pearson r is undefined when either series is constant; pearsonr would
    # return NaN, and that row would then be counted in the "x / N clinics"
    # denominators below. Leave such clinics out instead.
    if clinic_data['nps_mean'].nunique() < 2 or clinic_data['revenue_total'].nunique() < 2:
        continue

    r, p_val = pearsonr(clinic_data['nps_mean'], clinic_data['revenue_total'])
    clinic_corr.append({
        'Clinic': clinic,
        'Correlation': r,
        'P-value': p_val,
        'Significant': 'Yes' if p_val < 0.05 else 'No',
        'N': len(clinic_data)
    })

# Explicit columns so sorting works even when no clinic met the criteria.
clinic_corr_df = pd.DataFrame(
    clinic_corr,
    columns=['Clinic', 'Correlation', 'P-value', 'Significant', 'N'],
).sort_values('Correlation', ascending=False)

print('\nClinic-Level Correlation Analysis (Top 15):')
print(clinic_corr_df.head(15).to_string(index=False))

print('\nCorrelation Summary:')
print(f'  Mean correlation: {clinic_corr_df["Correlation"].mean():.4f}')
print(f'  Median correlation: {clinic_corr_df["Correlation"].median():.4f}')
print(f'  Clinics with positive correlation: {(clinic_corr_df["Correlation"] > 0).sum()}/{len(clinic_corr_df)}')
print(f'  Clinics with significant correlation (p<0.05): {(clinic_corr_df["Significant"] == "Yes").sum()}/{len(clinic_corr_df)}')

# Save results
clinic_corr_df.to_csv('../data/processed/corr_by_clinic.csv', index=False)

In [None]:
# Clinic-level correlations: the 15 highest as horizontal bars (top panel)
# and the distribution over all clinics as a histogram (bottom panel).
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
bar_ax, hist_ax = axes[0], axes[1]

label_font = {'fontsize': 11, 'fontweight': 'bold'}
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# Top panel: first 15 rows of the (already sorted) clinic table.
top_clinics = clinic_corr_df.head(15)
colors = []
for flag in top_clinics['Significant']:
    colors.append('green' if flag == 'Yes' else 'gray')
bar_positions = range(len(top_clinics))

bar_ax.barh(bar_positions, top_clinics['Correlation'], color=colors, alpha=0.7, edgecolor='black')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(top_clinics['Clinic'])
bar_ax.set_xlabel('Correlation Coefficient', **label_font)
bar_ax.set_title('Top 15 Clinics: NPS-Revenue Correlation (Green = p<0.05)', **title_font)
bar_ax.axvline(0, color='red', linestyle='--', alpha=0.5)
bar_ax.grid(True, alpha=0.3, axis='x')

# Bottom panel: histogram of every clinic's correlation with mean/median markers.
all_corrs = clinic_corr_df['Correlation']
corr_mean = all_corrs.mean()
corr_median = all_corrs.median()

hist_ax.hist(all_corrs, bins=20, edgecolor='black', alpha=0.7, color='steelblue')
hist_ax.axvline(corr_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {corr_mean:.3f}')
hist_ax.axvline(corr_median, color='green', linestyle='--', linewidth=2, label=f'Median: {corr_median:.3f}')
hist_ax.set_xlabel('Correlation Coefficient', **label_font)
hist_ax.set_ylabel('Number of Clinics', **label_font)
hist_ax.set_title('Distribution of Clinic-Level NPS-Revenue Correlations', **title_font)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../data/processed/corr_by_clinic_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

## Segmented Analysis: By Time Period

In [None]:
# Calendar year of each NPS observation (stored on df as a 'year' column).
df['year'] = df['date_nps'].dt.year

# One Pearson correlation per calendar year, pooled over clinics.
year_corr = []
value_cols = ['nps_mean', 'revenue_total']

for year in sorted(df['year'].unique()):
    in_year = df['year'] == year
    year_data = df.loc[in_year, value_cols].dropna()

    # Years with fewer than 10 complete rows are not reported.
    if len(year_data) < 10:
        continue

    r, p_val = pearsonr(year_data['nps_mean'], year_data['revenue_total'])
    is_significant = 'Yes' if p_val < 0.05 else 'No'
    year_corr.append({
        'Year': year,
        'Correlation': r,
        'P-value': p_val,
        'Significant': is_significant,
        'N': len(year_data),
    })

year_corr_df = pd.DataFrame(year_corr)

print('\nYear-Level Correlation Analysis:')
print(year_corr_df.to_string(index=False))

# Persist the per-year table next to the other correlation outputs.
year_corr_df.to_csv('../data/processed/corr_by_year.csv', index=False)

In [None]:
# Bar chart of the yearly NPS-revenue correlation, coloured by significance.
fig, ax = plt.subplots(figsize=(12, 6))

colors = ['green' if sig == 'Yes' else 'gray' for sig in year_corr_df['Significant']]
# Years are cast to strings so the x axis is categorical (bar i sits at x = i).
ax.bar(year_corr_df['Year'].astype(str), year_corr_df['Correlation'], color=colors, alpha=0.7, edgecolor='black', width=0.6)
ax.axhline(0, color='red', linestyle='-', linewidth=1, alpha=0.5)
ax.set_xlabel('Year', fontsize=12, fontweight='bold')
ax.set_ylabel('Correlation Coefficient', fontsize=12, fontweight='bold')
ax.set_title('NPS-Revenue Correlation Stability Over Time (Green = p<0.05)', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim([-1, 1])

# Add value labels just beyond the end of each bar. The offset follows the
# sign of the correlation: a fixed +0.05 would put the label for a negative
# bar inside the bar itself.
for i, corr in enumerate(year_corr_df['Correlation']):
    if corr >= 0:
        label_y, label_va = corr + 0.05, 'bottom'
    else:
        label_y, label_va = corr - 0.05, 'top'
    ax.text(i, label_y, f'{corr:.3f}', ha='center', va=label_va, fontweight='bold')

plt.tight_layout()
plt.savefig('../data/processed/corr_by_year_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

## Effect Size and Practical Significance

In [None]:
# Interpret correlation strength
def interpret_correlation(r):
    """Label the strength of a correlation coefficient using Cohen's thresholds.

    Parameters
    ----------
    r : float
        Correlation coefficient; only its absolute value is used.

    Returns
    -------
    str
        'Negligible' for |r| < 0.1, 'Small' for |r| < 0.3, 'Medium' for
        |r| < 0.5, 'Large' otherwise, or 'Undefined' when r is NaN.
    """
    # NaN fails every `<` comparison below and would otherwise fall through
    # to 'Large', which misreports an undefined correlation as a strong one.
    if np.isnan(r):
        return 'Undefined'

    r_abs = abs(r)
    if r_abs < 0.1:
        return 'Negligible'
    elif r_abs < 0.3:
        return 'Small'
    elif r_abs < 0.5:
        return 'Medium'
    else:
        return 'Large'

# Text report: strength of the pooled correlation and spread of the clinic-level ones.
rule = '=' * 70
print('\n' + rule)
print('EFFECT SIZE INTERPRETATION')
print(rule)

# Coarse practical-significance label from |r|.
abs_r = abs(pearson_r)
if abs_r > 0.5:
    practical_label = 'Strong'
elif abs_r > 0.3:
    practical_label = 'Moderate'
else:
    practical_label = 'Weak'

print('\nOverall NPS-Revenue Correlation:')
print(f'  Pearson r = {pearson_r:.4f}')
print(f'  Strength: {interpret_correlation(pearson_r)}')
print(f'  R² = {r_squared:.4f}')
print(f'  Interpretation: {r_squared*100:.1f}% of revenue variance explained by NPS')
print(f'  Practical significance: {practical_label}')

# Spread of the per-clinic coefficients.
clinic_r = clinic_corr_df['Correlation']
n_clinics = len(clinic_corr_df)
n_meaningful = (clinic_r.abs() > 0.5).sum()

print('\nClinic-Level Variation:')
print(f'  Range: {clinic_r.min():.4f} to {clinic_r.max():.4f}')
print(f'  Mean: {clinic_r.mean():.4f}')
print(f'  Std Dev: {clinic_r.std():.4f}')
print(f'  Clinics with meaningful correlation (|r| > 0.5): {n_meaningful}/{n_clinics}')

## Summary: Key Findings

In [None]:
# Assemble the headline numbers from the earlier cells into one two-column table.

# Strongest lagged effect = largest |r|. A signed maximum would never select a
# strong negative correlation, and would pick the value closest to zero when
# every lag is negative.
lag_corrs = lag_df.get('Correlation', pd.Series(dtype=float)).dropna()
if lag_corrs.empty:
    strongest_lag = 'n/a (no lag had enough observations)'
else:
    strongest_idx = lag_corrs.abs().idxmax()
    strongest_lag = (
        f'Lag {lag_df.loc[strongest_idx, "Lag (months)"]} months '
        f'(r={lag_corrs.loc[strongest_idx]:.3f})'
    )

# Clinic-level counts and shares.
n_clinics = len(clinic_corr_df)
n_positive = (clinic_corr_df['Correlation'] > 0).sum()
n_significant = (clinic_corr_df['Significant'] == 'Yes').sum()

# Year-to-year spread. With fewer than two years the standard deviation is
# NaN, and `NaN < 0.2` is False, which would be reported as "high variability".
year_corrs = year_corr_df.get('Correlation', pd.Series(dtype=float)).dropna()
if len(year_corrs) < 2:
    temporal_stability = 'Insufficient data (fewer than 2 years)'
elif year_corrs.std() < 0.2:
    temporal_stability = 'Moderate consistency across years'
else:
    temporal_stability = 'High variability across years'

if pearson_p < 0.001:
    overall_significance = 'Yes***'
elif pearson_p < 0.01:
    overall_significance = 'Yes**'
elif pearson_p < 0.05:
    overall_significance = 'Yes*'
else:
    overall_significance = 'No'

summary_findings = {
    'Metric': [
        'Overall Pearson Correlation',
        'P-value (Overall)',
        'Effect Size (R²)',
        'Statistical Significance',
        'Strongest Lagged Effect',
        'Clinics with Positive Correlation',
        'Clinics with Significant Correlation',
        'Temporal Stability',
        'Practical Recommendation'
    ],
    'Finding': [
        f'{pearson_r:.4f}',
        f'{pearson_p:.6f}',
        f'{r_squared:.4f} ({r_squared*100:.2f}%)',
        overall_significance,
        strongest_lag,
        f'{n_positive}/{n_clinics} ({100*n_positive/n_clinics:.1f}%)',
        f'{n_significant}/{n_clinics} ({100*n_significant/n_clinics:.1f}%)',
        temporal_stability,
        # NOTE(review): this recommendation is fixed text and does not depend
        # on any of the values computed above.
        'NPS has modest predictive power; clinic-specific and temporal factors matter'
    ]
}

summary_df = pd.DataFrame(summary_findings)

print('\n' + '='*70)
print('CORRELATION ASSESSMENT SUMMARY')
print('='*70)
print(summary_df.to_string(index=False))

# Save summary
summary_df.to_csv('../data/processed/corr_summary_findings.csv', index=False)
print('\n✓ All correlation results saved to processed/ folder')