# Notebook 9: Feature Correlations

## Purpose
Understand relationships between features and identify multicollinearity issues.

## Key Questions
- Which feature pairs are highly correlated?
- Are there redundant features?
- Which features are most correlated with the target?
- Do correlations change seasonally?

## Key Observations to Look For
- **High Correlations**: |r| > 0.8 indicates potential redundancy
- **VIF > 10**: Severe multicollinearity problem
- **Target Correlation**: Identifies most predictive features
- **Non-linear Relationships**: May need polynomial terms

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import pointbiserialr
import os
from pathlib import Path
import warnings
# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Fixed seed plus shared plot styling for every figure below
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})

# Candidate project roots, tried in order: the working directory itself,
# its parent (running from notebooks/), and its grandparent (a subdirectory)
possible_roots = [Path('.'), Path('..'), Path('../..')]

# Use the first candidate that contains data/features; fall back to ../data
data_root = next(
    (candidate / 'data' for candidate in possible_roots
     if (candidate / 'data' / 'features').exists()),
    Path('../data'),
)

# Output directories live next to the input data
figures_dir = data_root / 'figures'
reports_dir = data_root / 'reports'
for out_dir in (figures_dir, reports_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

print('✓ Setup complete')
print(f'  Output directory: {data_root.absolute()}')

## 1. Load Data

In [None]:
# Load data from the data directory located during setup, so the notebook
# works no matter which directory it is launched from (a hard-coded
# 'data/features/...' path only resolves when run from the project root).
df = pd.read_csv(data_root / 'features' / 'complete_context.csv')

# Detect key columns
# Timestamp: try the most specific keyword first and take the first column
# that matches it, so a generic column that merely contains "time" cannot
# displace a real timestamp column (and then be overwritten by to_datetime).
timestamp_col = None
for keyword in ('timestamp', 'date', 'time'):
    timestamp_col = next(
        (col for col in df.columns if keyword in col.lower()), None
    )
    if timestamp_col is not None:
        break

# Target: a column whose lower-cased name exactly matches a known label name
# (if several match, the last one in column order is used)
presence_col = None
for col in df.columns:
    if col.lower() in ['presence', 'target', 'label', 'is_presence']:
        presence_col = col

if timestamp_col:
    # Unparseable values become NaT instead of raising
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    df['month'] = df[timestamp_col].dt.month

print(f'Dataset shape: {df.shape}')
print(f'Presence column: {presence_col}')

# Select numeric columns (this includes the derived 'month' column, if any)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f'Numeric columns: {len(numeric_cols)}')

## 2. Correlation Matrix

In [None]:
# Pairwise Pearson correlations between all numeric columns
corr_matrix = df[numeric_cols].corr()

# Heatmap settings: diverging palette centred on zero, fixed to the [-1, 1] range
heatmap_options = {
    'cmap': 'RdBu_r',
    'center': 0,
    'vmin': -1,
    'vmax': 1,
    'square': True,
    'linewidths': 0.5,
    'cbar_kws': {'label': 'Pearson Correlation', 'shrink': 0.8},
}

# Draw the full matrix on a large canvas (cells are colour-coded, not annotated)
plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix', fontsize=16, pad=20)
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(rotation=0, fontsize=8)
plt.tight_layout()

matrix_png = figures_dir / 'correlation_matrix.png'
plt.savefig(matrix_png, dpi=300, bbox_inches='tight')
plt.show()

print('✓ Saved correlation matrix')

In [None]:
# Collect every feature pair above the diagonal whose |r| exceeds 0.8
feature_names = corr_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)

high_corr_pairs = [
    {
        'feature1': feature_names[r],
        'feature2': feature_names[c],
        'correlation': corr_matrix.iloc[r, c],
    }
    for r, c in zip(upper_rows, upper_cols)
    if abs(corr_matrix.iloc[r, c]) > 0.8
]

if high_corr_pairs:
    # Strongest relationships first, regardless of sign
    high_corr_df = pd.DataFrame(high_corr_pairs).sort_values(
        'correlation', key=abs, ascending=False
    )
    print(f'\n⚠ Found {len(high_corr_pairs)} feature pairs with |r| > 0.8:')
    print(high_corr_df)

    for advice_line in (
        '\nThese features may be redundant. Consider:',
        '- Dropping one feature from each pair',
        '- Using PCA for dimensionality reduction',
        '- Using regularization (L1/L2) in modeling',
    ):
        print(advice_line)
else:
    print('\n✓ No feature pairs with |r| > 0.8 (no severe multicollinearity)')

## 3. Key Feature Pair Scatter Plots

In [None]:
# Scatter plots (with a fitted line) for up to six of the strongest pairs
if high_corr_pairs:
    top_pairs = high_corr_df.head(6)

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for ax, pair in zip(axes, top_pairs.itertuples(index=False)):
        feat1, feat2, corr = pair.feature1, pair.feature2, pair.correlation

        # Keep only complete rows; cap at 5000 points to keep the plot readable
        plot_df = df[[feat1, feat2]].dropna()
        if len(plot_df) > 5000:
            plot_df = plot_df.sample(n=5000, random_state=42)

        xs, ys = plot_df[feat1], plot_df[feat2]
        ax.scatter(xs, ys, alpha=0.3, s=10)

        # Least-squares straight line across the observed x range
        slope, intercept = np.polyfit(xs, ys, 1)
        x_line = np.linspace(xs.min(), xs.max(), 100)
        ax.plot(x_line, slope * x_line + intercept, 'r-', linewidth=2, alpha=0.7)

        ax.set_xlabel(feat1, fontsize=9)
        ax.set_ylabel(feat2, fontsize=9)
        ax.set_title(f'{feat1} vs {feat2}\nr = {corr:.3f}', fontsize=10)
        ax.grid(alpha=0.3)

    # Blank out any panels that did not receive a pair
    for unused_ax in axes[len(top_pairs):]:
        unused_ax.axis('off')

    plt.suptitle('Key Feature Pair Correlations', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'key_correlations.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved key correlation scatter plots')
else:
    print('No high correlations to plot')

## 4. Seasonal Correlations

In [None]:
# One correlation heatmap per meteorological season
if timestamp_col and 'month' in df.columns:
    # Month numbers belonging to each season
    seasons = {
        'Winter': [12, 1, 2],
        'Spring': [3, 4, 5],
        'Summer': [6, 7, 8],
        'Fall': [9, 10, 11],
    }

    fig, axes = plt.subplots(2, 2, figsize=(16, 14))
    axes = axes.flatten()

    for ax, (season_name, months) in zip(axes, seasons.items()):
        # Restrict to the rows of this season and recompute the matrix
        in_season = df['month'].isin(months)
        season_df = df[in_season]
        season_corr = season_df[numeric_cols].corr()

        # Tick labels are suppressed: the panels are compared by pattern only
        sns.heatmap(
            season_corr,
            ax=ax,
            cmap='RdBu_r',
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            xticklabels=False,
            yticklabels=False,
            cbar_kws={'label': 'Correlation'},
        )
        ax.set_title(f'{season_name} (n={len(season_df):,})', fontsize=13)

    plt.suptitle('Seasonal Correlation Matrices', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'seasonal_correlations.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved seasonal correlation matrices')
else:
    print('⚠ Cannot analyze seasonal correlations without timestamp column')

## 5. Multicollinearity Detection (VIF)

In [None]:
# Calculate Variance Inflation Factor
try:
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Select features for VIF (exclude target, timestamps, etc.)
    excluded_cols = {presence_col, 'month', 'year'}
    vif_features = [col for col in numeric_cols
                    if col not in excluded_cols][:20]  # Limit for performance

    # VIF needs complete rows
    vif_data = df[vif_features].dropna()

    if vif_features and len(vif_data) > 0:
        # variance_inflation_factor regresses one column on all the others and
        # does not add an intercept itself. Without a constant column the
        # auxiliary regressions are forced through the origin, which inflates
        # the VIF of any feature with a non-zero mean. Prepend a column of
        # ones so the usual (centered) VIF is obtained.
        exog = np.column_stack(
            [np.ones(len(vif_data)), vif_data.to_numpy(dtype=float)]
        )

        vif_results = []

        # Column 0 of exog is the intercept, so feature k sits at index k + 1
        for i, col in enumerate(vif_features, start=1):
            try:
                vif = variance_inflation_factor(exog, i)
            except Exception as err:
                # Best effort: keep the feature in the table, but say why it
                # has no value instead of failing silently
                print(f'⚠ VIF failed for {col}: {err}')
                vif = np.nan
            vif_results.append({'feature': col, 'VIF': vif})

        vif_df = pd.DataFrame(vif_results).sort_values('VIF', ascending=False)

        print('\nVariance Inflation Factors:')
        print(vif_df)

        # Plot VIF
        plt.figure(figsize=(12, 8))
        plt.barh(range(len(vif_df)), vif_df['VIF'], color='steelblue', alpha=0.7)
        plt.axvline(10, color='red', linestyle='--', linewidth=2, label='VIF=10 threshold')
        plt.yticks(range(len(vif_df)), vif_df['feature'])
        plt.xlabel('VIF', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        plt.title('Variance Inflation Factors (VIF > 10 indicates multicollinearity)', fontsize=14, pad=20)
        plt.legend()
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.savefig(figures_dir / 'vif_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        print('\n✓ Saved VIF analysis')

        # Flag high VIF
        high_vif = vif_df[vif_df['VIF'] > 10]
        if len(high_vif) > 0:
            print(f'\n⚠ WARNING: {len(high_vif)} features with VIF > 10:')
            for _, row in high_vif.iterrows():
                print(f"  - {row['feature']}: VIF = {row['VIF']:.2f}")
        else:
            print('\n✓ No features with VIF > 10')
    else:
        print('⚠ Insufficient data for VIF calculation')

except ImportError:
    print('⚠ statsmodels not available for VIF calculation')
except Exception as e:
    # Notebook-level boundary: report the problem and let later cells run
    print(f'⚠ Could not calculate VIF: {e}')

## 6. Target Correlation

In [None]:
# Calculate correlation with target variable
if presence_col:
    target_corr = []

    for col in numeric_cols:
        if col == presence_col:
            continue

        # Use point-biserial correlation for binary target
        # NOTE(review): nothing here checks that the target is actually
        # binary — confirm against the dataset.
        data = df[[col, presence_col]].dropna()

        # A correlation needs at least two complete observations
        if len(data) < 2:
            continue

        try:
            corr, p_value = pointbiserialr(data[presence_col], data[col])
        except (ValueError, TypeError) as err:
            # Report the skipped feature instead of dropping it silently
            print(f'⚠ Skipping {col}: {err}')
            continue

        target_corr.append({
            'feature': col,
            'correlation': corr,
            'p_value': p_value,
            'abs_correlation': abs(corr)
        })

    if len(target_corr) > 0:
        # Rank by strength of association, ignoring sign
        target_corr_df = pd.DataFrame(target_corr).sort_values('abs_correlation', ascending=False)

        print('\nFeature correlation with target:')
        print(target_corr_df.head(20))

        # Plot top correlations
        top_n = min(15, len(target_corr_df))
        top_corr = target_corr_df.head(top_n)

        plt.figure(figsize=(12, 8))
        # Green bars for positive correlations, red for the rest
        colors = ['green' if x > 0 else 'red' for x in top_corr['correlation']]
        plt.barh(range(len(top_corr)), top_corr['correlation'], color=colors, alpha=0.7)
        plt.yticks(range(len(top_corr)), top_corr['feature'])
        plt.xlabel('Point-Biserial Correlation with Target', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        plt.title(f'Top {top_n} Features by Target Correlation', fontsize=14, pad=20)
        plt.axvline(0, color='black', linewidth=1)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.savefig(figures_dir / 'target_correlations.png', dpi=300, bbox_inches='tight')
        plt.show()

        print('\n✓ Saved target correlation plot')

        # Identify top predictive features
        print(f'\nTop 10 most correlated features with target:')
        for _, row in target_corr_df.head(10).iterrows():
            print(f"  - {row['feature']}: r = {row['correlation']:.3f}, p = {row['p_value']:.2e}")
    else:
        print('⚠ Could not calculate target correlations')
else:
    print('⚠ Cannot calculate target correlation without presence column')

## 7. Non-linear Relationships

In [None]:
# Examine non-linear relationships with target
if presence_col and len(target_corr) > 0:
    # The six features most strongly correlated with the target
    top_features = target_corr_df.head(6)['feature'].tolist()

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for idx, feat in enumerate(top_features):
        ax = axes[idx]

        # Sample data
        plot_df = df[[feat, presence_col]].dropna()
        if len(plot_df) > 5000:
            plot_df = plot_df.sample(n=5000, random_state=42)

        # Scatter plot with jitter on y-axis so overlapping points stay visible
        jitter = np.random.normal(0, 0.02, len(plot_df))
        ax.scatter(plot_df[feat], plot_df[presence_col] + jitter, alpha=0.1, s=5)

        # Overlay the mean target value within quantile bins of the feature
        # (a simple binned trend line, not a LOESS fit)
        try:
            # Sort by feature value
            sorted_df = plot_df.sort_values(feat)

            # Bin and calculate mean presence rate
            n_bins = 20
            bins = pd.qcut(sorted_df[feat], n_bins, duplicates='drop')
            binned = sorted_df.groupby(bins)[presence_col].mean()
            bin_centers = sorted_df.groupby(bins)[feat].mean()

            ax.plot(bin_centers, binned, 'r-', linewidth=3, label='Binned mean')
            # Only draw a legend when there is a labelled line to describe
            ax.legend()
        except (ValueError, TypeError) as err:
            # Keep the scatter panel, but report why the trend line is missing
            print(f'⚠ Could not compute binned mean for {feat}: {err}')

        ax.set_xlabel(feat, fontsize=10)
        ax.set_ylabel('Presence', fontsize=10)
        ax.set_title(f'{feat}', fontsize=11)
        ax.set_ylim(-0.1, 1.1)
        ax.grid(alpha=0.3)

    # Hide extra subplots when fewer than six features are available
    for extra_ax in axes[len(top_features):]:
        extra_ax.axis('off')

    plt.suptitle('Non-linear Relationships with Target', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'nonlinear_relationships.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved non-linear relationship plots')
else:
    print('⚠ Cannot analyze non-linear relationships')

## 8. Correlation Summary

In [None]:
# Gather headline numbers, print them, and write the CSV reports
# Target results exist only when a target column was found and at least one
# correlation was computed (the check short-circuits if there is no target)
has_target_results = bool(presence_col) and len(target_corr) > 0
banner = '=' * 70

summary = {
    'high_correlations': len(high_corr_pairs),
    'features_analyzed': len(numeric_cols),
}

if has_target_results:
    summary['top_predictive_features'] = target_corr_df.head(10)['feature'].tolist()
    summary['strongest_correlation'] = target_corr_df.iloc[0]['correlation']

print('\n' + banner)
print('CORRELATION ANALYSIS SUMMARY')
print(banner)
print(f"\nFeatures analyzed: {summary['features_analyzed']}")
print(f"High correlation pairs (|r|>0.8): {summary['high_correlations']}")

if has_target_results:
    print("\nTop 10 predictive features:")
    # target_corr_df is already sorted, so its first rows are the top features
    top_rows = target_corr_df.head(10)
    ranked = zip(top_rows['feature'], top_rows['correlation'])
    for rank, (feat, corr_val) in enumerate(ranked, 1):
        print(f"  {rank:2d}. {feat}: r = {corr_val:.3f}")

print('\n' + banner)

# Save correlation analysis report (only when there are pairs to report)
if high_corr_pairs:
    pairs_csv = reports_dir / 'correlation_analysis.csv'
    high_corr_df.to_csv(pairs_csv, index=False)
    print(f'\n✓ Saved correlation analysis to {pairs_csv}')

if has_target_results:
    target_csv = reports_dir / 'target_correlations.csv'
    target_corr_df.to_csv(target_csv, index=False)
    print(f'✓ Saved target correlations to {target_csv}')

## Summary

This notebook analyzed feature correlations:

1. **Correlation Matrix**: Identified highly correlated feature pairs
2. **Key Pairs**: Visualized strongest correlations
3. **Seasonal Correlations**: Examined how relationships change by season
4. **Multicollinearity (VIF)**: Flagged features with VIF > 10
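5. **Target Correlation**: Ranked features by the strength of their point-biserial correlation with the target (a measure of linear association, which suggests but does not guarantee predictive power)
6. **Non-linear Relationships**: Plotted binned target rates against the top features to reveal relationships that may need polynomial terms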

**Key Findings**:
- Review `data/reports/correlation_analysis.csv` for redundant features
- Review `data/reports/target_correlations.csv` for feature selection
- Consider dropping one feature from highly correlated pairs
- Use regularization to handle multicollinearity

**Recommendations**:
- Features with VIF > 10 should be dropped or combined
- Top correlated features are good candidates for modeling
- Non-linear relationships may benefit from feature engineering

**Next Steps**:
- Proceed to Notebook 10 for heuristic validation