# Daten vorbereiten

Dieses Notebook bereitet die geladenen Datensätze auf, erzeugt Aggregationen und erstellt eine gemeinsame Tabelle.


In [None]:

from pathlib import Path
import numpy as np
import pandas as pd


def resolve_project_root():
    """Locate the project root by searching for the energy reporter dataset.

    Starting points are, in order: the current working directory, the
    directory containing this file (only when ``__file__`` is defined), and
    ``<cwd>/zhaw-scripting``. From each starting point the search walks
    upwards through all parent directories.

    Returns:
        The first directory that contains
        ``datasets/energyreporter_municipality_historized.csv``.

    Raises:
        FileNotFoundError: If no searched directory contains the dataset.
            The message lists every ``datasets`` directory that was checked,
            each one only once.
    """
    marker_name = 'energyreporter_municipality_historized.csv'

    candidates = [Path.cwd()]
    if '__file__' in globals():
        candidates.append(Path(__file__).resolve().parent)
    candidates.append(Path.cwd() / 'zhaw-scripting')

    checked = []
    seen = set()
    for start in candidates:
        # A starting point that is not an existing directory falls back to
        # its parent, so the upward walk always begins at a directory path.
        base = start if start.is_dir() else start.parent
        for candidate in (base, *base.parents):
            data_dir = candidate / 'datasets'
            # The starting points share ancestors; a directory that was
            # already probed did not contain the dataset (otherwise we would
            # have returned), so probing it again is pointless.
            if data_dir in seen:
                continue
            seen.add(data_dir)
            checked.append(data_dir)
            if (data_dir / marker_name).exists():
                return candidate
    raise FileNotFoundError(
        "Project root could not be resolved. Checked: "
        + ", ".join(str(p) for p in checked)
    )


# Resolve the project layout and make sure the processed-data folder exists.
project_root = resolve_project_root()
output_dir = project_root / 'outputs'
processed_dir = output_dir / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

# Load the prepared inputs from the processed-data folder; the energy file's
# date column is parsed into datetimes on read.
energy_zh = pd.read_csv(
    processed_dir.joinpath('energy_zh.csv'),
    parse_dates=['energyreporter_date'],
)
wealth_small = pd.read_csv(processed_dir.joinpath('wealth_small.csv'))

print('Loaded prepared inputs', *(frame.shape for frame in (energy_zh, wealth_small)))


In [None]:

# Aggregate the 2024 energy indicators: keep only rows dated in 2024 and
# average both per-capita consumption columns per municipality (bfs_nr).
energy_2024 = energy_zh.loc[energy_zh['energyreporter_date'].dt.year.eq(2024)].copy()
consum = energy_2024.groupby('bfs_nr', as_index=False).agg(
    total_elec_per_capita_mwh_2024=(
        'elec_consumption_mwh_per_year_per_capita', 'mean'
    ),
    household_elec_per_capita_mwh_2024=(
        'elec_consumption_households_mwh_per_year_per_capita', 'mean'
    ),
)

# Persist the aggregate and show a preview as the cell output.
consum.to_csv(processed_dir / 'energy_consumption_2024.csv', index=False)
consum.head()


In [None]:

# Extract the 2022 wealth indicators
wealth_2022 = wealth_small[wealth_small['INDIKATOR_JAHR'] == 2022].copy()

# Target column name -> regex matched against INDIKATOR_NAME.
# The alternations use non-capturing groups so that `str.contains` does not
# warn about match groups; the set of matching rows is unchanged.
patterns = {
    'income_median_2022': r'^Steuerb\. Einkommen.*Median',
    'income_avg_2022': r'^Steuerb\. Einkommen.*Durchschn',
    'income_total_per_capita_2022': r'^Steuerb\. Einkommen.*(?:je Einwohner|pro Kopf)',
    'capital_median_2022': r'^Steuerb\. Vermögen.*Median',
    'capital_avg_2022': r'^Steuerb\. Vermögen.*Durchschn',
    'capital_total_per_capita_2022': r'^Steuerb\. Vermögen.*(?:je Einwohner|pro Kopf)',
}

# Start with an all-missing *object* column: the labels assigned below are
# strings, and writing strings into a float column (as `= np.nan` would
# create) is deprecated in recent pandas versions.
wealth_2022['metric'] = pd.Series(np.nan, index=wealth_2022.index, dtype=object)
for col, pattern in patterns.items():
    match_mask = wealth_2022['INDIKATOR_NAME'].str.contains(pattern, regex=True, na=False)
    # If a row matches several patterns, the pattern listed last wins.
    wealth_2022.loc[match_mask, 'metric'] = col

# One row per municipality, one column per recognised metric; rows whose
# indicator name matched no pattern are dropped first. When a municipality
# has several rows for the same metric, the first value is kept.
wide = (
    wealth_2022.dropna(subset=['metric'])
    .pivot_table(index='BFS_NR', columns='metric', values='INDIKATOR_VALUE', aggfunc='first')
    .reset_index()
    .rename(columns={'BFS_NR': 'bfs_nr'})
)
wide.to_csv(processed_dir / 'wealth_metrics_2022.csv', index=False)
wide.head()


In [None]:

# Combine energy and wealth data (only municipalities present in both)
merged = consum.merge(wide, on='bfs_nr', how='inner')

# Municipality names come from the energy data when that column exists.
# The lookup is guarded so that the wealth-file fallback below is actually
# reachable instead of failing with a KeyError on a missing column.
if 'municipality' in energy_zh.columns:
    municipality_lookup = energy_zh[['bfs_nr', 'municipality']].drop_duplicates(subset='bfs_nr')
    merged = merged.merge(municipality_lookup, on='bfs_nr', how='left')

# Fallback for municipality names taken from the wealth file
wealth_names = (
    wealth_small[['BFS_NR', 'GEBIET_NAME']]
    .drop_duplicates(subset='BFS_NR')
    .rename(columns={'BFS_NR': 'bfs_nr', 'GEBIET_NAME': 'municipality_wealth'})
)
merged = merged.merge(wealth_names, on='bfs_nr', how='left')
if 'municipality' in merged.columns:
    # Only fill the gaps; names from the energy data take precedence.
    merged['municipality'] = merged['municipality'].fillna(merged['municipality_wealth'])
else:
    merged['municipality'] = merged['municipality_wealth']
merged = merged.drop(columns=['municipality_wealth'])

# Clean numeric columns and build ratios
value_cols = [
    'income_median_2022', 'income_avg_2022', 'income_total_per_capita_2022',
    'capital_median_2022', 'capital_avg_2022', 'capital_total_per_capita_2022',
]
for col in value_cols:
    if col in merged.columns:
        # Unparseable values and zeros both become NaN, so the ratios below
        # yield NaN rather than dividing by zero.
        merged[col] = pd.to_numeric(merged[col], errors='coerce').replace(0, np.nan)

# Ratio column -> capital column used as denominator. The numerator is always
# the 2024 household electricity consumption per capita. A ratio is only
# created when its capital column exists in the merged table.
ratio_sources = {
    'ratio_consum_to_capital_median': 'capital_median_2022',
    'ratio_consum_to_capital_avg': 'capital_avg_2022',
    'ratio_consum_to_capital_total': 'capital_total_per_capita_2022',
}
for ratio_col, capital_col in ratio_sources.items():
    if capital_col in merged.columns:
        merged[ratio_col] = merged['household_elec_per_capita_mwh_2024'] / merged[capital_col]

merged_path = processed_dir / 'merged_energy_wealth.csv'
merged.to_csv(merged_path, index=False)
print(f"Merged dataset saved to {merged_path}")
merged.head()
