# Daten analysieren und auswerten

Dieses Notebook lädt die vorbereiteten Daten und berechnet Korrelationen sowie Ausreißerlisten.


In [None]:

from pathlib import Path
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Locate the project directories relative to the notebook's working directory.
project_root = Path('..').resolve()
output_dir = project_root / 'outputs'
processed_dir = output_dir / 'processed'

# Read the merged energy/wealth table and report its (rows, columns) shape.
merged = pd.read_csv(processed_dir.joinpath('merged_energy_wealth.csv'))
print('Merged dataset loaded:', merged.shape)


In [None]:

# Simple correlation analysis for household electricity.
# For every wealth metric present in `merged`, compute Pearson and Spearman
# correlations against per-capita household electricity and write the table
# to outputs/correlations.csv.
household_col = 'household_elec_per_capita_mwh_2024'
# Fixed output schema so the CSV always carries a header row, even when no
# correlation could be computed.
corr_columns = ['metric', 'pearson_r', 'pearson_p', 'spearman_r', 'spearman_p', 'n']

# Keep only the wealth metrics that actually exist in the merged dataset;
# later cells reuse `wealth_cols`.
wealth_cols = [c for c in [
    'income_median_2022','income_avg_2022','income_total_per_capita_2022',
    'capital_median_2022','capital_avg_2022','capital_total_per_capita_2022'
] if c in merged.columns]

corr_rows = []
# A missing energy column yields an empty result instead of a KeyError,
# matching how missing wealth columns are skipped above.
if household_col in merged.columns:
    for col in wealth_cols:
        # Correlate only rows where both values are present.
        df = merged[[household_col, col]].dropna()
        if len(df) < 2:
            # Fewer than two observations cannot be correlated.
            continue
        pr, pp = pearsonr(df[household_col], df[col])
        sr, sp = spearmanr(df[household_col], df[col])
        corr_rows.append({'metric': col, 'pearson_r': pr, 'pearson_p': pp, 'spearman_r': sr, 'spearman_p': sp, 'n': len(df)})

corr_df = pd.DataFrame(corr_rows, columns=corr_columns)
if not corr_df.empty:
    # Strongest positive Pearson correlation first.
    corr_df = corr_df.sort_values('pearson_r', ascending=False)
corr_df.to_csv(output_dir / 'correlations.csv', index=False)
corr_df


In [None]:

# Comprehensive correlation matrix for both energy variants.
# Every available energy column is paired with every available wealth column;
# results are written to outputs/comprehensive_correlations.csv.
energy_cols = ['household_elec_per_capita_mwh_2024', 'total_elec_per_capita_mwh_2024']
# Fixed output schema so the CSV always carries a header row, even when no
# pair produced a result.
all_corr_columns = [
    'energy_metric', 'wealth_metric',
    'pearson_r', 'pearson_p', 'spearman_r', 'spearman_p', 'n',
]
all_corr_rows = []
for energy_col in energy_cols:
    if energy_col not in merged.columns:
        continue
    for wealth_col in wealth_cols:
        if wealth_col not in merged.columns:
            continue
        # Correlate only rows where both values are present.
        df = merged[[energy_col, wealth_col]].dropna()
        if len(df) < 2:
            # Fewer than two observations cannot be correlated.
            continue
        pr, pp = pearsonr(df[energy_col], df[wealth_col])
        sr, sp = spearmanr(df[energy_col], df[wealth_col])
        all_corr_rows.append({
            'energy_metric': energy_col,
            'wealth_metric': wealth_col,
            'pearson_r': pr,
            'pearson_p': pp,
            'spearman_r': sr,
            'spearman_p': sp,
            'n': len(df)
        })

all_corr_df = pd.DataFrame(all_corr_rows, columns=all_corr_columns)
if not all_corr_df.empty:
    # Rank by absolute Pearson correlation, strongest relationship first.
    all_corr_df = all_corr_df.sort_values('pearson_r', key=lambda s: s.abs(), ascending=False)
all_corr_df.to_csv(output_dir / 'comprehensive_correlations.csv', index=False)
all_corr_df


In [None]:

# Outlier detection via regression residuals.
# Each wealth metric is regressed on the available per-capita electricity
# columns; the residuals are then standardized into z-scores.
features = ['household_elec_per_capita_mwh_2024', 'total_elec_per_capita_mwh_2024']
model = LinearRegression()
outlier_rows = []

for col in wealth_cols:
    cols_available = [name for name in features if name in merged.columns]
    if len(cols_available) == 0:
        continue
    # Use only rows with the identifier, the target and all predictors present.
    df = merged[['bfs_nr', col] + cols_available].dropna()
    if df.empty:
        continue
    X, y = df[cols_available], df[col]
    # Residual = observed wealth metric minus the linear fit on the energy columns.
    resid = y - model.fit(X, y).predict(X)
    z = StandardScaler().fit_transform(resid.to_frame(name='residual')).flatten()
    outlier_rows.append(pd.DataFrame({'bfs_nr': df['bfs_nr'].values, 'metric': col, 'residual': resid, 'zscore': z}))

if outlier_rows:
    outliers = pd.concat(outlier_rows, ignore_index=True)
else:
    outliers = pd.DataFrame(columns=['bfs_nr','metric','residual','zscore'])

# Rank rows by absolute z-score for the preview; the CSV keeps the unsorted order.
ranking = outliers['zscore'].abs().sort_values(ascending=False).index
outliers_sorted = outliers.reindex(ranking)
outliers.to_csv(output_dir / 'outliers.csv', index=False)
outliers_sorted.head(10)
