# Notebook 7: Feature Distributions

## Purpose
Understand the distribution and characteristics of each feature in the PathWild dataset.

## Key Questions
- Which features are normally distributed vs heavily skewed?
- Do any features need transformation?
- Which features show strongest separation between presence/absence?
- Are there unexpected bimodal distributions?

## Key Observations to Look For
- **Normal vs Skewed**: Identify features needing transformation
- **Seasonal Patterns**: NDVI should peak in summer, snow in winter
- **Presence/Absence Separation**: Features with clear separation are good predictors
- **Statistical Significance**: Use p-values and effect sizes (Cohen's d) to judge how well each feature discriminates presence from absence

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro, ttest_ind
import os
from pathlib import Path
import warnings
# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Fixed seed and a consistent plotting style for every figure below.
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})

# Candidate project roots, in priority order: the working directory (run from
# the project root), its parent (run from the notebooks directory) and its
# grandparent (run from a subdirectory).
possible_roots = [Path('.'), Path('..'), Path('../..')]

# Use the data directory of the first candidate that contains data/features;
# fall back to ../data when none of them does.
data_root = next(
    (root / 'data' for root in possible_roots
     if (root / 'data' / 'features').exists()),
    Path('../data'),
)

# Figures and reports are written below the data directory.
figures_dir = data_root / 'figures'
reports_dir = data_root / 'reports'
for out_dir in (figures_dir, reports_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

print('✓ Setup complete')
print(f'  Output directory: {data_root.absolute()}')

## 1. Load Data and Detect Columns

In [None]:
# Load data
from pathlib import Path

# Candidate locations of the combined feature file, relative to the working
# directory.
possible_paths = [
    Path('data/features/complete_context.csv'),  # From project root
    Path('../data/features/complete_context.csv'),  # From notebooks directory
    Path('../../data/features/complete_context.csv'),  # From subdirectory
]

# First candidate that exists on disk, or None.
data_path = next((candidate for candidate in possible_paths if candidate.exists()), None)

if data_path is None:
    tried = [str(p) for p in possible_paths]
    raise FileNotFoundError(
        f'Data file not found. Tried: {tried}\n'
        'Please run: python scripts/combine_feature_files.py\n'
        'Or ensure you are running the notebook from the project root directory.'
    )

print(f'Loading data from: {data_path}')
df = pd.read_csv(data_path)

# Timestamp: the first column whose lower-cased name contains one of these
# substrings (None when no column matches).
timestamp_tokens = ('timestamp', 'date', 'time')
timestamp_col = next(
    (name for name in df.columns
     if any(token in name.lower() for token in timestamp_tokens)),
    None,
)

# Month: use an existing 'month' column as-is; otherwise parse the timestamp
# column (unparseable values become NaT) and derive the month from it.
month_col = None
if 'month' in df.columns:
    month_col = 'month'
elif timestamp_col:
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    df['month'] = df[timestamp_col].dt.month
    month_col = 'month'

# Presence/target: the first column whose lower-cased name is exactly one of
# the known target names.
presence_names = ('presence', 'target', 'label', 'is_presence', 'elk_present')
presence_col = next(
    (name for name in df.columns if name.lower() in presence_names),
    None,
)

print(f'Dataset shape: {df.shape}')
print(f'Timestamp column: {timestamp_col}')
print(f'Month column: {month_col}')
print(f'Presence column: {presence_col}')

# Columns that are numeric but are identifiers rather than features.
exclude_cols = ['point_index']
has_numeric_year = 'year' in df.columns and df['year'].dtype in [np.int64, np.float64]
if has_numeric_year:
    exclude_cols.append('year')

all_numeric = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [name for name in all_numeric if name not in exclude_cols]
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print(f'\nNumeric columns: {len(numeric_cols)}')
print(f'Categorical columns: {len(categorical_cols)}')

## 2. Univariate Distributions

In [None]:
# Create histograms (with a KDE overlay) for all numeric features.
# Each panel shows the density histogram, the KDE curve when one can be
# fitted, and dashed lines at the mean and median.
from scipy.stats import gaussian_kde

if not numeric_cols:
    # plt.subplots cannot build a grid with zero rows.
    print('No numeric features found; skipping feature distribution plots')
else:
    n_cols = 3
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of axes, so flattening works
    # the same way for one row as for many.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numeric_cols):
        data = df[col].dropna()

        if data.empty:
            # Keep the panel identifiable even though there is nothing to draw.
            ax.set_title(f'{col}\n(no data)', fontsize=10)
            continue

        ax.hist(data, bins=50, alpha=0.7, color='steelblue', edgecolor='black', density=True)

        # KDE overlay. Fitting fails for degenerate input (a single value or a
        # constant column); in that case only the histogram is shown.
        try:
            kde = gaussian_kde(data)
        except (np.linalg.LinAlgError, ValueError):
            kde = None
        if kde is not None:
            x_range = np.linspace(data.min(), data.max(), 100)
            ax.plot(x_range, kde(x_range), 'r-', linewidth=2, label='KDE')

        # Summary statistics
        mean_val = data.mean()
        median_val = data.median()
        std_val = data.std()

        ax.axvline(mean_val, color='red', linestyle='--', linewidth=1, alpha=0.7, label='Mean')
        ax.axvline(median_val, color='blue', linestyle='--', linewidth=1, alpha=0.7, label='Median')

        ax.set_title(f'{col}\nMean: {mean_val:.2f}, Std: {std_val:.2f}', fontsize=10)
        ax.set_xlabel(col, fontsize=9)
        ax.set_ylabel('Density', fontsize=9)
        ax.legend(fontsize=8)
        ax.grid(alpha=0.3)

    # Hide the unused panels at the end of the grid
    for ax in axes[len(numeric_cols):]:
        ax.axis('off')

    plt.suptitle('Feature Distributions (Histograms with KDE)', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'feature_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved feature distributions')

In [None]:
# Identify skewed features
# One row per numeric feature that has at least one non-missing value.
skewness_data = []
for col in numeric_cols:
    data = df[col].dropna()
    if len(data) > 0:
        skewness_data.append({'feature': col, 'skewness': data.skew()})

# Explicit columns and dtype keep the frame well-formed (and sortable) even
# when no feature produced a row.
skewness_df = (
    pd.DataFrame(skewness_data, columns=['feature', 'skewness'])
    .astype({'skewness': 'float64'})
    .sort_values('skewness', key=abs, ascending=False)
)

print('\nFeature skewness (sorted by absolute value):')
print(skewness_df)

# |skewness| > 1 is the threshold used throughout this notebook for
# "highly skewed".
highly_skewed = skewness_df[skewness_df['skewness'].abs() > 1]
print(f'\n⚠ {len(highly_skewed)} features with |skewness| > 1 (may need transformation):')
for row in highly_skewed.itertuples(index=False):
    print(f"  - {row.feature}: {row.skewness:.2f}")

## 3. Categorical Features

In [None]:
# Analyze categorical features: print the category counts and save one bar
# chart per column.
if not categorical_cols:
    print('No categorical features found')
else:
    for col in categorical_cols:
        if col == timestamp_col:
            # The timestamp column is not a category.
            continue

        value_counts = df[col].value_counts()
        print(f'\n{col}:')
        print(value_counts)

        # Bar chart of the counts, most frequent category first
        positions = range(len(value_counts))
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(positions, value_counts.values, color='steelblue', alpha=0.7)
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.set_title(f'{col} Distribution', fontsize=14)
        ax.set_xticks(positions)
        ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
        ax.grid(axis='y', alpha=0.3)
        fig.tight_layout()
        fig.savefig(figures_dir / f'{col}_distribution.png', dpi=300, bbox_inches='tight')
        plt.show()

## 4. Seasonal Patterns

In [None]:
# Analyze seasonal patterns: monthly mean of up to 12 numeric features.
# `month_col` is set by the data-loading cell when the data has a 'month'
# column or one could be derived from a timestamp, so it is the right guard
# here: a dataset with a native month column but no timestamp still qualifies.
if not month_col:
    print('Cannot analyze seasonal patterns without a month or timestamp column')
else:
    # Select key features for seasonal analysis (the month itself and the year
    # are not features here).
    seasonal_features = [col for col in numeric_cols if col not in (month_col, 'year')][:12]

    if not seasonal_features:
        # plt.subplots cannot build a grid with zero rows.
        print('No numeric features available for seasonal analysis')
    else:
        n_cols = 3
        n_rows = (len(seasonal_features) + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D array of axes.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.ravel()

        # Group once for all features instead of once per panel.
        monthly_means = df.groupby(month_col)[seasonal_features].mean()

        for ax, col in zip(axes, seasonal_features):
            monthly_mean = monthly_means[col]

            ax.plot(monthly_mean.index, monthly_mean.values, marker='o', linewidth=2, markersize=8)
            ax.set_xlabel('Month', fontsize=10)
            ax.set_ylabel(f'Mean {col}', fontsize=10)
            ax.set_title(col, fontsize=11)
            ax.set_xticks(range(1, 13))
            ax.set_xticklabels(['J', 'F', 'M', 'A', 'M', 'J', 'J', 'A', 'S', 'O', 'N', 'D'])
            ax.grid(alpha=0.3)

        # Hide the unused panels at the end of the grid
        for ax in axes[len(seasonal_features):]:
            ax.axis('off')

        plt.suptitle('Seasonal Patterns (Monthly Means)', fontsize=16, y=1.00)
        plt.tight_layout()
        plt.savefig(figures_dir / 'seasonal_patterns.png', dpi=300, bbox_inches='tight')
        plt.show()

        print('✓ Saved seasonal patterns')

## 5. Presence vs Absence Distributions

In [None]:
# Compare distributions for presence vs absence.
# Rows are split on the target being equal to 1 (presence) or 0 (absence).
if presence_col:
    # The target is itself a numeric column; leave it out so it cannot take
    # one of the eight feature panels.
    candidate_cols = [col for col in numeric_cols if col != presence_col]

    # Select top features by variance for visualization
    feature_variance = df[candidate_cols].var().sort_values(ascending=False)
    top_features = feature_variance.head(8).index.tolist()

    # Class masks are the same for every feature, so build them once.
    is_presence = df[presence_col] == 1
    is_absence = df[presence_col] == 0

    fig, axes = plt.subplots(4, 2, figsize=(14, 16))
    axes = axes.flatten()

    for ax, col in zip(axes, top_features):
        presence_data = df.loc[is_presence, col].dropna()
        absence_data = df.loc[is_absence, col].dropna()

        # Use the same bin edges for both classes so the overlapping bars are
        # directly comparable; fall back to a bin count when there is no data.
        combined = np.concatenate([presence_data.to_numpy(), absence_data.to_numpy()])
        bins = np.histogram_bin_edges(combined, bins=30) if combined.size > 0 else 30

        # Overlapping histograms
        ax.hist(presence_data, bins=bins, alpha=0.5, color='blue', label='Presence', density=True)
        ax.hist(absence_data, bins=bins, alpha=0.5, color='red', label='Absence', density=True)

        # Class means
        presence_mean = presence_data.mean()
        absence_mean = absence_data.mean()
        ax.axvline(presence_mean, color='blue', linestyle='--', linewidth=2)
        ax.axvline(absence_mean, color='red', linestyle='--', linewidth=2)

        mean_diff = presence_mean - absence_mean

        ax.set_xlabel(col, fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        ax.set_title(f'{col}\nMean Diff: {mean_diff:.2f}', fontsize=11)
        ax.legend()
        ax.grid(alpha=0.3)

    # Hide the unused panels when fewer than eight features are available
    for ax in axes[len(top_features):]:
        ax.axis('off')

    plt.suptitle('Presence vs Absence Distributions', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'presence_absence_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved presence vs absence distributions')
else:
    print('Cannot compare presence/absence without target column')

## 6. Statistical Tests

In [None]:
# Perform t-tests for presence vs absence and compute an effect size
# (Cohen's d) for every numeric feature. Results are printed, written to
# feature_discrimination.csv, and kept in `results_df`.
if presence_col:
    result_columns = ['feature', 'p_value', 'cohens_d',
                      'presence_mean', 'absence_mean', 'mean_diff']

    # Rows are split on the target being equal to 1 (presence) or 0 (absence).
    is_presence = df[presence_col] == 1
    is_absence = df[presence_col] == 0

    test_results = []

    for col in numeric_cols:
        if col == presence_col:
            # Testing the target against itself only yields a degenerate row.
            continue

        presence_data = df.loc[is_presence, col].dropna()
        absence_data = df.loc[is_absence, col].dropna()

        if presence_data.empty or absence_data.empty:
            continue

        # Two-sample t-test with scipy's defaults (equal variances assumed).
        p_value = ttest_ind(presence_data, absence_data).pvalue

        # Cohen's d (effect size), using the root of the mean of the two
        # sample variances as the pooled standard deviation.
        presence_mean = presence_data.mean()
        absence_mean = absence_data.mean()
        mean_diff = presence_mean - absence_mean
        pooled_std = np.sqrt((presence_data.std()**2 + absence_data.std()**2) / 2)
        # The effect size is undefined when the pooled spread is zero or
        # cannot be estimated (a single observation in a group); report NaN
        # rather than 0, which would read as "no effect".
        cohens_d = mean_diff / pooled_std if pooled_std > 0 else np.nan

        test_results.append({
            'feature': col,
            'p_value': p_value,
            'cohens_d': cohens_d,
            'presence_mean': presence_mean,
            'absence_mean': absence_mean,
            'mean_diff': mean_diff
        })

    # Explicit columns and float dtypes keep the frame sortable and filterable
    # even when no feature could be tested.
    results_df = (
        pd.DataFrame(test_results, columns=result_columns)
        .astype({name: 'float64' for name in result_columns if name != 'feature'})
        .sort_values('p_value')
    )

    print('\nStatistical test results (sorted by p-value):')
    print(results_df)

    # Save results
    results_df.to_csv(reports_dir / 'feature_discrimination.csv', index=False)
    print('\n✓ Saved feature discrimination results')

    # Flag highly discriminative features (NaN effect sizes never qualify)
    significant = results_df[(results_df['p_value'] < 0.01) & (results_df['cohens_d'].abs() > 0.5)]
    print(f'\n✓ {len(significant)} features with p<0.01 and |Cohen\'s d|>0.5:')
    for row in significant.itertuples(index=False):
        print(f"  - {row.feature}: p={row.p_value:.2e}, d={row.cohens_d:.2f}")
else:
    print('Cannot perform statistical tests without target column')

## 7. Normality Tests

In [None]:
# Create Q-Q plots (against a normal distribution) for up to six features
# with |skewness| > 1, taken in the order of the skewness table.
from scipy.stats import probplot

strongly_skewed = skewness_df[skewness_df['skewness'].abs() > 1]
highly_skewed_features = strongly_skewed['feature'].head(6).tolist()

if not highly_skewed_features:
    print('No highly skewed features found for Q-Q plots')
else:
    n_cols = 3
    n_rows = (len(highly_skewed_features) + n_cols - 1) // n_cols

    # Request a 2-D grid and flatten it to walk the panels in reading order.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, highly_skewed_features):
        data = df[col].dropna()
        if len(data) == 0:
            continue

        probplot(data, dist="norm", plot=ax)
        ax.set_title(f'{col}\n(Skew: {data.skew():.2f})', fontsize=11)
        ax.grid(alpha=0.3)

    # Switch off the panels that have no feature assigned
    for unused_ax in axes[len(highly_skewed_features):]:
        unused_ax.axis('off')

    plt.suptitle('Q-Q Plots for Skewed Features (Normal Distribution)', fontsize=16, y=1.00)
    plt.tight_layout()
    plt.savefig(figures_dir / 'normality_qqplots.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved Q-Q plots for skewed features')


In [None]:
# Test normality for each feature with the Shapiro-Wilk test.
# Results are collected in `normality_results` (one dict per tested feature).

# Largest sample passed to shapiro(); larger features are tested on a random
# subsample of this size instead of being skipped.
MAX_SHAPIRO_N = 5000

normality_results = []

for col in numeric_cols[:20]:  # Limit to first 20 for performance
    data = df[col].dropna()

    # Too few observations, or a constant column: the test is not meaningful.
    if len(data) <= 3 or data.nunique() < 2:
        continue

    if len(data) > MAX_SHAPIRO_N:
        # Fixed random_state keeps the subsample (and the result) reproducible.
        data = data.sample(n=MAX_SHAPIRO_N, random_state=42)

    stat, p_value = shapiro(data)
    normality_results.append({
        'feature': col,
        'shapiro_stat': stat,
        'p_value': p_value,
        'is_normal': p_value > 0.05
    })

if len(normality_results) > 0:
    normality_df = pd.DataFrame(normality_results).sort_values('p_value', ascending=False)

    print('\nNormality test results:')
    print(normality_df)

    non_normal = normality_df[normality_df['p_value'] < 0.05]
    print(f'\n⚠ {len(non_normal)} features fail normality test (p<0.05)')
    print('These may benefit from transformation (log, sqrt, box-cox)')
else:
    print('No features eligible for the normality test')

## 8. Feature Summary Table

In [None]:
# Create comprehensive feature summary: descriptive statistics for every
# numeric feature with data, plus the normality and discrimination results
# where they exist.

# Normality p-value per feature (the first entry wins if a feature repeats).
normality_p_by_feature = {}
for result in normality_results:
    normality_p_by_feature.setdefault(result['feature'], result['p_value'])

summary_data = []

for col in numeric_cols:
    column = df[col]
    data = column.dropna()

    if data.empty:
        continue

    # Key order here determines the column order of the saved table.
    summary = {
        'feature': col,
        'dtype': column.dtype,
        'missing_pct': (column.isnull().sum() / len(df) * 100),
        'min': data.min(),
        'max': data.max(),
        'mean': data.mean(),
        'std': data.std(),
        'skewness': data.skew(),
        'kurtosis': data.kurtosis()
    }

    # Normality test result, if this feature was tested
    if col in normality_p_by_feature:
        summary['normality_p'] = normality_p_by_feature[col]

    # Discrimination metrics, if a target column exists and the feature was tested
    if presence_col:
        matches = results_df[results_df['feature'] == col]
        if not matches.empty:
            first_match = matches.iloc[0]
            summary['discrimination_p'] = first_match['p_value']
            summary['cohens_d'] = first_match['cohens_d']

    summary_data.append(summary)

summary_df = pd.DataFrame(summary_data)

print('\nFeature summary:')
print(summary_df)

# Save summary
summary_path = reports_dir / 'feature_summary.csv'
summary_df.to_csv(summary_path, index=False)
print(f'\n✓ Saved feature summary to {summary_path}')

## Summary

This notebook analyzed feature distributions:

1. **Univariate Distributions**: Identified normal vs skewed features
2. **Categorical Features**: Analyzed category distributions
3. **Seasonal Patterns**: Examined temporal variation
4. **Presence/Absence**: Compared distributions between classes
5. **Statistical Tests**: Identified discriminative features
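6. **Normality Tests**: Flagged features needing transformation
7. **Feature Summary Table**: Combined per-feature statistics, normality results, and discrimination metrics into a single report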

**Key Outputs**:
- Review `data/reports/feature_discrimination.csv` for predictive features
- Review `data/reports/feature_summary.csv` for comprehensive statistics
- Features with high |Cohen's d| separate presence from absence well on their own, which makes them strong candidate predictors

**Next Steps**:
- Proceed to Notebook 08 for spatial-temporal analysis