In [None]:
# Imports and setup
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

# Create the output tree up front so later cells can write CSVs, maps and
# figures without checking for the folders first. exist_ok=True keeps the
# cell safe to re-run.
for out_dir in (
    '/home/jovyan/work/outputs',
    '/home/jovyan/work/outputs/maps',
    '/home/jovyan/work/outputs/figures',
):
    os.makedirs(out_dir, exist_ok=True)

print('Environment ready.')


In [None]:
# Load datasets
# Reads the two CSV inputs and, when present, the municipality boundaries.
# `geo` is left as None if the GeoJSON file is missing or cannot be read, so
# downstream cells can skip the map step instead of failing.
energy_path = '/home/jovyan/work/datasets/energyreporter_municipality_historized.csv'
wealth_path = '/home/jovyan/work/datasets/data_7354970.csv'
geojson_path = '/home/jovyan/work/datasets/zh-municipalities.geojson'

# parse_dates gives energyreporter_date a datetime dtype (later cells use .dt.year)
energy = pd.read_csv(energy_path, parse_dates=['energyreporter_date'])
# the wealth file is read as semicolon-separated
wealth = pd.read_csv(wealth_path, sep=';')

geo = None
if os.path.exists(geojson_path):
    try:
        geo = gpd.read_file(geojson_path)
        # This notebook treats an 'id' attribute column as the BFS municipality
        # number (assumption about the input file; confirm against the data).
        # If there is no such column, the next cell looks for other candidates.
        if 'id' in geo.columns:
            geo['bfs_nr'] = geo['id']
        print(f'GeoJSON loaded: {len(geo)} features')
    except Exception as e:
        # Deliberately best-effort: a broken boundaries file only disables the maps
        print(f'Warning: Failed to load GeoJSON: {e}')
        geo = None
else:
    print('Info: GeoJSON not found; maps will be skipped until provided.')

print(energy.shape, wealth.shape, 'geo loaded' if geo is not None else 'no geo')


In [None]:
# Filter to Zurich and essential columns - including both total and household electricity
energy_zh = energy.loc[energy['canton'] == 'ZH', [
    'bfs_nr',
    'energyreporter_date',
    'elec_consumption_mwh_per_year_per_capita',
    'elec_consumption_households_mwh_per_year_per_capita'
]].copy()
# Normalize the join key to plain integers (raises if a BFS number is missing)
energy_zh['bfs_nr'] = energy_zh['bfs_nr'].astype(int)

# Keep only the wealth columns used downstream; same integer key as the energy table
wealth_small = wealth.loc[:, ['BFS_NR', 'INDIKATOR_ID', 'INDIKATOR_NAME', 'INDIKATOR_JAHR', 'INDIKATOR_VALUE']].copy()
wealth_small['BFS_NR'] = wealth_small['BFS_NR'].astype(int)

if geo is not None:
    # Make sure the boundaries carry a 'bfs_nr' join key
    if 'bfs_nr' not in geo.columns:
        # Try common BFS field names; the first one present wins
        for cand in ['id', 'BFS', 'BFS_NUMMER', 'BFS_NUM', 'GEMEINDENO', 'GMDNR']:
            if cand in geo.columns:
                geo = geo.rename(columns={cand: 'bfs_nr'})
                break
        if 'bfs_nr' not in geo.columns:
            # Last resort: use the frame's index values as the key.
            # NOTE(review): the index is probably just the row position rather
            # than a real BFS number, in which case the map join will not match
            # - confirm against the file. Warn so the fallback is not silent.
            print('Warning: no BFS number column found in GeoJSON; '
                  'falling back to the row index, map joins may not match.')
            # assign() works regardless of whether the index has a name
            geo = geo.assign(bfs_nr=geo.index)

    # Nullable integer dtype: values that cannot be parsed become <NA> instead of raising
    geo['bfs_nr'] = pd.to_numeric(geo['bfs_nr'], errors='coerce').astype('Int64')
    print(f'GeoJSON: {len(geo)} features, bfs_nr range: {geo["bfs_nr"].min()}-{geo["bfs_nr"].max()}')

print(energy_zh.shape, wealth_small.shape)


In [None]:
# Collapse the 2024 energy records to one row per municipality by averaging
# both per-capita electricity series (total and household).
is_2024 = energy_zh['energyreporter_date'].dt.year == 2024
energy_2024 = energy_zh[is_2024].copy()

# Named aggregation computes the means and assigns the output column names in one step
consum = energy_2024.groupby('bfs_nr', as_index=False).agg(
    total_elec_per_capita_mwh_2024=('elec_consumption_mwh_per_year_per_capita', 'mean'),
    household_elec_per_capita_mwh_2024=('elec_consumption_households_mwh_per_year_per_capita', 'mean'),
)

consum.head()


In [None]:
# Wealth 2022: select median/avg/total-per-capita for income and capital, pivot wide
wealth_2022 = wealth_small[wealth_small['INDIKATOR_JAHR'] == 2022].copy()

# Output column name -> regex matched against INDIKATOR_NAME.
# Alternations use non-capturing groups (?:...) so str.contains does not warn
# about match groups; the set of matching names is unchanged.
patterns = {
    'income_median_2022': r'^Steuerb\. Einkommen.*Median',
    'income_avg_2022': r'^Steuerb\. Einkommen.*Durchschn',
    'income_total_per_capita_2022': r'^Steuerb\. Einkommen.*(?:je Einwohner|pro Kopf)',
    'capital_median_2022': r'^Steuerb\. Vermögen.*Median',
    'capital_avg_2022': r'^Steuerb\. Vermögen.*Durchschn',
    'capital_total_per_capita_2022': r'^Steuerb\. Vermögen.*(?:je Einwohner|pro Kopf)'
}

# Start with an object column of None: the labels written below are strings,
# and writing strings into a float (NaN) column is a deprecated dtype change.
wealth_2022['metric'] = None
for col, pat in patterns.items():
    m = wealth_2022['INDIKATOR_NAME'].str.contains(pat, regex=True, na=False)
    # If a name matches several patterns, the later pattern in the dict wins
    wealth_2022.loc[m, 'metric'] = col

# Rows that matched no pattern are dropped; one column per metric, one row per municipality.
# aggfunc='first' keeps the first value if a (municipality, metric) pair occurs more than once.
wide = (wealth_2022.dropna(subset=['metric'])
        .pivot_table(index='BFS_NR', columns='metric', values='INDIKATOR_VALUE', aggfunc='first')
        .reset_index()
        .rename(columns={'BFS_NR': 'bfs_nr'}))

wide.head()


In [None]:
# Join consumption with wealth and derive consumption-to-wealth ratios.
# The inner join keeps only municipalities present in both tables.
merged = consum.merge(wide, on='bfs_nr', how='inner')

# Coerce the wealth metrics to numbers and turn zeros into NaN so the
# divisions below produce NaN instead of dividing by zero.
wealth_metric_cols = [
    'income_median_2022', 'income_avg_2022', 'income_total_per_capita_2022',
    'capital_median_2022', 'capital_avg_2022', 'capital_total_per_capita_2022',
]
for wealth_metric in wealth_metric_cols:
    if wealth_metric not in merged.columns:
        continue
    as_number = pd.to_numeric(merged[wealth_metric], errors='coerce')
    merged[wealth_metric] = as_number.replace(0, np.nan)

# Ratio column -> wealth column used as denominator. The numerator is always
# household electricity (kept for backward compatibility). Dict order fixes
# the order in which the ratio columns are appended.
ratio_denominators = {
    'ratio_consum_to_capital_median': 'capital_median_2022',
    'ratio_consum_to_capital_avg': 'capital_avg_2022',
    'ratio_consum_to_capital_totalpc': 'capital_total_per_capita_2022',
    'ratio_consum_to_income_median': 'income_median_2022',
    'ratio_consum_to_income_avg': 'income_avg_2022',
    'ratio_consum_to_income_totalpc': 'income_total_per_capita_2022',
}
for ratio_name, denominator in ratio_denominators.items():
    if denominator in merged.columns:
        merged[ratio_name] = merged['household_elec_per_capita_mwh_2024'] / merged[denominator]

# Legacy alias of the household series, still used by later cells
merged['consum_per_capita_mwh_2024'] = merged['household_elec_per_capita_mwh_2024']

# Persist the merged table, then preview it
merged.to_csv('/home/jovyan/work/outputs/merged_zh_wealth_energy.csv', index=False)
merged.head()


In [None]:
# Choropleth maps (if geo available)
# One quantile-classified map per variable, written to outputs/maps/<var>.png.
if geo is not None:
    # Left join keeps every boundary; municipalities without data are drawn hatched
    geo_merged = geo.merge(merged, on='bfs_nr', how='left')
    vars_to_map = [
        'household_elec_per_capita_mwh_2024',
        'total_elec_per_capita_mwh_2024',
        'capital_median_2022',
        'income_median_2022',
        'ratio_consum_to_capital_median'
    ]
    saved_maps = 0  # count what is actually written, not what was requested
    for var in vars_to_map:
        if var not in geo_merged.columns:
            continue
        if not geo_merged[var].notna().any():
            # Nothing to classify (e.g. the join matched no municipality)
            print(f'Skipping {var}: no data after join.')
            continue
        ax = geo_merged.plot(column=var, scheme='Quantiles', k=5, legend=True, figsize=(8,8),
                             cmap='viridis', missing_kwds={'color': 'lightgrey', 'hatch': '///', 'label': 'No data'})
        ax.set_axis_off()
        ax.set_title(var.replace('_', ' ').title())
        fig = ax.figure
        fig.tight_layout()
        outpath = f'/home/jovyan/work/outputs/maps/{var}.png'
        fig.savefig(outpath, dpi=150)
        # Close each figure explicitly so figures do not accumulate across the loop
        plt.close(fig)
        saved_maps += 1
    print(f'Saved {saved_maps} choropleth maps.')
else:
    print('GeoJSON not available; skipping maps.')


In [None]:
# Correlations between household electricity (legacy column) and each wealth metric
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression  # used by the outlier-detection cell further down

corr_rows = []
# Only the wealth metrics that the pivot actually produced
wealth_cols = [c for c in [
    'income_median_2022','income_avg_2022','income_total_per_capita_2022',
    'capital_median_2022','capital_avg_2022','capital_total_per_capita_2022'
] if c in merged.columns]

for col in wealth_cols:
    # Pairwise-complete observations for this metric
    df = merged[['consum_per_capita_mwh_2024', col]].dropna()
    # Same minimum sample size as the comprehensive analysis; this also avoids
    # the error pearsonr raises when given fewer than two observations.
    if len(df) < 3:
        continue
    pr, pp = pearsonr(df['consum_per_capita_mwh_2024'], df[col])
    sr, sp = spearmanr(df['consum_per_capita_mwh_2024'], df[col])
    corr_rows.append({'metric': col, 'pearson_r': pr, 'pearson_p': pp, 'spearman_r': sr, 'spearman_p': sp, 'n': len(df)})

# Fix the columns up front so sorting also works when no metric qualified
corr_columns = ['metric', 'pearson_r', 'pearson_p', 'spearman_r', 'spearman_p', 'n']
corr_df = pd.DataFrame(corr_rows, columns=corr_columns).sort_values('pearson_r', ascending=False)
corr_df.to_csv('/home/jovyan/work/outputs/correlations.csv', index=False)
corr_df


In [None]:
# Comprehensive correlation analysis: every electricity measure against every
# wealth metric.
#   energy -> household, total
#   wealth -> income / capital, each as median, average and total per capita

from scipy.stats import pearsonr, spearmanr

# Columns taking part in the analysis; wealth metrics only if present in `merged`
energy_cols = ['household_elec_per_capita_mwh_2024', 'total_elec_per_capita_mwh_2024']
candidate_wealth_cols = [
    'income_median_2022', 'income_avg_2022', 'income_total_per_capita_2022',
    'capital_median_2022', 'capital_avg_2022', 'capital_total_per_capita_2022',
]
wealth_cols = [c for c in candidate_wealth_cols if c in merged.columns]

# One result row per (energy, wealth) pair with at least three complete observations
all_corr_rows = []
for energy_col in energy_cols:
    for wealth_col in wealth_cols:
        df = merged[[energy_col, wealth_col]].dropna()
        if len(df) >= 3:
            pr, pp = pearsonr(df[energy_col], df[wealth_col])
            sr, sp = spearmanr(df[energy_col], df[wealth_col])
            # Human-readable labels derived from the column names
            energy_name = energy_col.replace('_per_capita_mwh_2024', '').replace('_', ' ').title()
            wealth_name = wealth_col.replace('_2022', '').replace('_', ' ').title()
            all_corr_rows.append(dict(
                energy_type=energy_name,
                wealth_metric=wealth_name,
                pearson_r=pr,
                pearson_p=pp,
                spearman_r=sr,
                spearman_p=sp,
                n=len(df),
            ))

# Strongest relationships first, regardless of sign
all_corr_df = pd.DataFrame(all_corr_rows)
all_corr_df = all_corr_df.sort_values('pearson_r', key=abs, ascending=False)
all_corr_df.to_csv('/home/jovyan/work/outputs/comprehensive_correlations.csv', index=False)

print("Comprehensive Correlation Analysis")
print("=" * 80)
print(all_corr_df.to_string(index=False))
all_corr_df

In [None]:
# Scatter plots: Income/Capital (Y-axis) vs Energy Consumption (X-axis)
# One figure per (energy, wealth) combination, for both household and total
# electricity, each with a fitted regression line and Pearson r in the title.
# Relies on `energy_cols`, `wealth_cols`, `merged` and `pearsonr` from earlier cells.

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
sns.set_style('whitegrid')

# Axis/title labels per column
energy_labels = {
    'household_elec_per_capita_mwh_2024': 'Household Electricity (MWh/capita)',
    'total_elec_per_capita_mwh_2024': 'Total Electricity (MWh/capita)'
}

wealth_labels = {
    'income_median_2022': 'Median Income (CHF)',
    'income_avg_2022': 'Average Income (CHF)',
    'income_total_per_capita_2022': 'Income per Capita (CHF)',
    'capital_median_2022': 'Median Capital (CHF)',
    'capital_avg_2022': 'Average Capital (CHF)',
    'capital_total_per_capita_2022': 'Capital per Capita (CHF)'
}

saved_plots = 0  # count figures actually written; skipped combinations are not counted
for energy_col in energy_cols:
    for wealth_col in wealth_cols:
        if wealth_col not in merged.columns:
            continue

        df = merged[[energy_col, wealth_col]].dropna()
        # Same minimum sample size as the correlation table; this also avoids
        # the error pearsonr raises when given fewer than two observations.
        if len(df) < 3:
            continue

        # Correlation shown in the title
        pr, _ = pearsonr(df[energy_col], df[wealth_col])

        x_label = energy_labels.get(energy_col, energy_col)
        y_label = wealth_labels.get(wealth_col, wealth_col)

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.regplot(data=df, x=energy_col, y=wealth_col,
                    scatter_kws={'alpha':0.6, 's':50},
                    line_kws={'color':'red', 'linewidth':2},
                    ax=ax)

        ax.set_xlabel(x_label, fontsize=11)
        ax.set_ylabel(y_label, fontsize=11)
        ax.set_title(f'{y_label} vs {x_label}\n' +
                     f'Pearson r = {pr:.3f}, n = {len(df)}', fontsize=12, pad=15)
        fig.tight_layout()

        # Save with descriptive filename
        filename = f'scatter_{wealth_col}_vs_{energy_col}.png'
        fig.savefig(f'/home/jovyan/work/outputs/figures/{filename}', dpi=150, bbox_inches='tight')
        # Close each figure explicitly so figures do not accumulate across the loop
        plt.close(fig)
        saved_plots += 1

print(f'Saved {saved_plots} scatter plots to outputs/figures/')

In [None]:
# Correlation heatmap: Pearson r for each electricity measure (rows) against
# each wealth metric (columns). Relies on `energy_cols`, `wealth_cols`,
# `wealth_labels`, `merged` and `pearsonr` from earlier cells.
import matplotlib.pyplot as plt
import seaborn as sns


def _pearson_or_nan(energy_col, wealth_col):
    """Return Pearson r for the two `merged` columns, or NaN with fewer than 3 complete rows."""
    pair = merged[[energy_col, wealth_col]].dropna()
    if len(pair) < 3:
        return np.nan
    r_value, _ = pearsonr(pair[energy_col], pair[wealth_col])
    return r_value


# Short display labels; the row labels are positional and must follow the order of energy_cols
energy_labels_short = ['Household Elec', 'Total Elec']
wealth_labels_short = []
for col in wealth_cols:
    label = wealth_labels.get(col, col)
    wealth_labels_short.append(label.replace(' (CHF)', '').replace(' 2022', ''))

# Rows follow energy_cols, columns follow wealth_cols
corr_matrix = [
    [_pearson_or_nan(e_col, w_col) for w_col in wealth_cols]
    for e_col in energy_cols
]
corr_df_heatmap = pd.DataFrame(corr_matrix,
                               index=energy_labels_short,
                               columns=wealth_labels_short)

# Diverging palette centred on zero, fixed to the full [-1, 1] range
fig, ax = plt.subplots(figsize=(10, 4))
heatmap_options = dict(
    annot=True, fmt='.3f', cmap='RdBu_r',
    center=0, vmin=-1, vmax=1,
    cbar_kws={'label': 'Pearson r'},
    linewidths=0.5,
)
sns.heatmap(corr_df_heatmap, ax=ax, **heatmap_options)

plt.title('Correlation Matrix: Energy Consumption vs Wealth Metrics\n(Pearson correlation coefficients)',
          fontsize=13, pad=15)
plt.xlabel('Wealth Metrics', fontsize=11)
plt.ylabel('Energy Type', fontsize=11)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('/home/jovyan/work/outputs/figures/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

print('Correlation heatmap saved.')

In [None]:
# Legacy scatter plots: household electricity (legacy column name) on the
# x-axis against each wealth metric, with a fitted regression line.
# Written to outputs/figures/scatter_<metric>.png.
sns.set_context('notebook')
for col in wealth_cols:
    if col not in merged.columns:
        continue
    pair = merged[['consum_per_capita_mwh_2024', col]].dropna()
    if pair.empty:
        continue
    fig, ax = plt.subplots(figsize=(6,5))
    sns.regplot(data=pair, x='consum_per_capita_mwh_2024', y=col,
                scatter_kws={'alpha':0.6}, ax=ax)
    ax.set_title(f'Energy per capita vs {col}')
    ax.set_xlabel('Energy per capita (MWh) 2024')
    ax.set_ylabel(col)
    fig.tight_layout()
    fig.savefig(f'/home/jovyan/work/outputs/figures/scatter_{col}.png', dpi=150)
    # Close each figure so figures do not accumulate across the loop
    plt.close(fig)
print('Saved scatter plots.')


In [None]:
# Outlier detection: regress household electricity (legacy column) on each
# wealth metric and standardize the residuals. Relies on `LinearRegression`
# imported in an earlier cell.
from sklearn.preprocessing import StandardScaler

outlier_rows = []
for col in wealth_cols:
    sample = merged[['bfs_nr','consum_per_capita_mwh_2024', col]].dropna()
    # Skip metrics with fewer than 10 complete observations
    if len(sample) < 10:
        continue
    X = sample[[col]].to_numpy()  # single-column frame -> array of shape (n, 1)
    y = sample['consum_per_capita_mwh_2024'].to_numpy()
    model = LinearRegression().fit(X, y)
    y_pred = model.predict(X)
    resid = y - y_pred
    # Sample standard deviation of the residuals; fall back to 1.0 when it is
    # zero so the division below stays defined
    spread = resid.std(ddof=1)
    if not spread:
        spread = 1.0
    z = (resid - resid.mean()) / spread
    outlier_rows.append(pd.DataFrame({
        'bfs_nr': sample['bfs_nr'].values,
        'metric': col,
        'residual': resid,
        'zscore': z,
    }))

if outlier_rows:
    outliers = pd.concat(outlier_rows, ignore_index=True)
else:
    # Keep the expected columns even when no metric had enough data
    outliers = pd.DataFrame(columns=['bfs_nr','metric','residual','zscore'])

# Largest |z| first for display; note that the CSV receives the unsorted frame
abs_z_desc = outliers['zscore'].abs().sort_values(ascending=False)
outliers_sorted = outliers.reindex(abs_z_desc.index)
outliers.to_csv('/home/jovyan/work/outputs/outliers.csv', index=False)
outliers_sorted.head(10)


## Summary of Analysis

This notebook analyzes the relationship between wealth metrics and energy consumption in Zurich municipalities.

### Data Sources
- **Energy**: Total and household electricity consumption per capita (MWh; mean of all 2024 records per municipality)
- **Wealth**: Taxable income and taxable capital metrics (median, average, total per capita) from 2022
- **Geography**: Municipality boundaries for Zurich canton

### Analysis Performed

1. **Comprehensive Correlation Analysis**
   - Calculated Pearson and Spearman correlations for all combinations
   - Energy types: Household electricity, Total electricity
   - Wealth metrics: Income (median/avg/per capita), Capital (median/avg/per capita)
   - Results saved to `comprehensive_correlations.csv`

2. **Visualizations**
   - Scatter plots with wealth metrics on Y-axis and energy consumption on X-axis
   - Correlation heatmap showing all relationships at once
   - Choropleth maps showing spatial distribution of metrics
   - All figures saved to `outputs/figures/` and `outputs/maps/`

3. **Outlier Detection**
   - Flags municipalities whose household electricity consumption deviates most from a linear fit on each wealth metric
   - Based on regression residuals standardized to z-scores; results saved to `outliers.csv`

### Key Findings
This summary does not hard-code any results. After running all cells, check the correlation tables and plots to identify:
- Which wealth metrics correlate most strongly with energy consumption
- Whether household or total electricity shows stronger correlations
- Spatial patterns in the relationship between wealth and energy use