# Imputation Evaluation Notebook
In diesem Notebook vergleichen wir verschiedene Imputationsmethoden für Wetterzeitreihen.
## 1. Setup und Daten laden

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.interpolate import interp1d
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer, KNNImputer

# Load the station observations, order them per station and time,
# and use the timestamp as the index.
df = (
    pd.read_csv(
        '../data/wetterdaten_top150.csv.gz',
        compression='gzip',
        parse_dates=['timestamp'],
    )
    .sort_values(['station_id', 'timestamp'])
    .set_index('timestamp')
)
df.head()

timestamp,station_id,tl,rf,ff,ffx,cglo,so_h,rr,rrm,tb10,tb20
2014-01-01 00:00:00+00:00,2,-2.7,93.0,1.7,2.6,0.0,0.0,0.0,0.0,0.5,1.1
2014-01-01 01:00:00+00:00,2,-2.6,92.0,1.5,3.2,0.0,0.0,0.0,0.0,0.5,1.1
2014-01-01 02:00:00+00:00,2,-2.3,92.0,1.0,2.6,0.0,0.0,0.0,0.0,0.5,1.1
2014-01-01 03:00:00+00:00,2,-2.2,90.0,1.2,3.4,0.0,0.0,0.0,0.0,0.5,1.1
2014-01-01 04:00:00+00:00,2,-2.4,90.0,2.4,3.6,0.0,0.0,0.0,0.0,0.5,1.1


## 2. Künstliche Lücken erzeugen
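Die Funktion `create_gaps` ersetzt zufällig platzierte, zusammenhängende Abschnitte einer Zeitreihe durch fehlende Werte. Es gibt drei Szenarien: `kurz` (5 Lücken mit je 1–3 Werten), `mittel` (3 Lücken mit je 4–24 Werten) und `lang` (2 Lücken mit je 24–167 Werten).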

In [2]:
def create_gaps(series, gap_type, seed=None):
    """Return a copy of ``series`` with randomly placed runs of NaN.

    Parameters
    ----------
    series : pandas.Series
        Series to mask. It is copied, not modified.
    gap_type : str
        ``'kurz'`` inserts 5 gaps of 1-3 values, ``'mittel'`` inserts 3 gaps
        of 4-24 values; any other value (``'lang'`` in this notebook)
        inserts 2 gaps of 24-167 values.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState`` so the masking can
        be reproduced. With the default ``None`` the global NumPy random
        state is used.

    Returns
    -------
    pandas.Series
        The masked copy. Gaps may overlap, and the last value is never
        masked because a gap always ends before the final position.

    Notes
    -----
    A gap longer than ``len(series) - 1`` is shortened to that length, so
    a short series does not make ``randint`` raise ``ValueError``. A series
    with fewer than two values is returned unmasked.
    """
    s = series.copy()
    n = len(s)
    # Both the module and a RandomState instance expose ``randint``.
    rs = np.random if seed is None else np.random.RandomState(seed)
    if gap_type == 'kurz':
        lengths = rs.randint(1, 4, size=5)
    elif gap_type == 'mittel':
        lengths = rs.randint(4, 25, size=3)
    else:
        lengths = rs.randint(24, 168, size=2)
    for L in lengths:
        # Keep at least one position free so randint(0, n - L) has a
        # non-empty range.
        L = min(int(L), n - 1)
        if L < 1:
            continue
        start = rs.randint(0, n - L)
        s.iloc[start:start + L] = np.nan
    return s

## 3. Imputationsmethoden definieren

In [3]:
def _time_features(index):
    """Return numeric time covariates for a ``DatetimeIndex``.

    Columns, in order: hours elapsed since the first timestamp, sine and
    cosine of the hour of day, sine and cosine of the day of year.
    """
    elapsed_h = (index - index[0]).total_seconds().to_numpy() / 3600.0
    hour_of_day = index.hour.to_numpy() + index.minute.to_numpy() / 60.0
    day_angle = 2 * np.pi * hour_of_day / 24.0
    year_angle = 2 * np.pi * index.dayofyear.to_numpy() / 365.25
    return np.column_stack([
        elapsed_h,
        np.sin(day_angle), np.cos(day_angle),
        np.sin(year_angle), np.cos(year_angle),
    ])


def _impute_with_time_features(x, imputer):
    """Fill the NaNs of ``x`` with a scikit-learn imputer.

    The imputer is fitted on the time covariates of ``x.index`` plus the
    values of ``x``. Fed a single column, IterativeImputer and KNNImputer
    have no predictor to work with and fall back to the column mean, which
    would make both methods identical.

    Parameters
    ----------
    x : pandas.Series
        Series with a ``DatetimeIndex`` and NaN at the positions to fill.
    imputer : object
        Unfitted imputer exposing ``fit_transform``.

    Returns
    -------
    pandas.Series
        Imputed values on the index of ``x``. If ``x`` has no observed
        value (including an empty series) a copy of ``x`` is returned.
    """
    if x.notna().sum() == 0:
        # Nothing to learn from; the imputers would drop the empty column.
        return x.copy()
    X = np.column_stack([_time_features(x.index), x.to_numpy(dtype=float)])
    # The series is the last column of the design matrix.
    return pd.Series(imputer.fit_transform(X)[:, -1], index=x.index)


# Each entry maps a method name to a function Series -> Series.
methods = {
    # Time-weighted linear interpolation. limit=3 fills at most three
    # consecutive NaNs, so longer gaps stay partly missing.
    'linear': lambda x: x.interpolate(method='time', limit=3),
    'spline2': lambda x: x.interpolate(method='spline', order=2),
    'iterative': lambda x: _impute_with_time_features(
        x, IterativeImputer(max_iter=10, random_state=0)),
    'knn': lambda x: _impute_with_time_features(
        x, KNNImputer(n_neighbors=5)),
}

## 4. Lücken füllen und Performance messen

In [None]:
# create_gaps draws from NumPy's global random state; seed it so that a
# full re-run reproduces the same gaps.
SEED = 42
np.random.seed(SEED)

# The 'lang' scenario masks up to 167 consecutive values; a shorter series
# cannot host such a gap.
MIN_OBSERVATIONS = 168

results = []
stations = df['station_id'].unique()[:5]
variables = ['tl', 'rf', 'ff', 'cglo']
scenarios = ['kurz', 'mittel', 'lang']

for station in stations:
    df_s = df[df['station_id'] == station]
    for var in variables:
        orig = df_s[var].dropna()
        if len(orig) < MIN_OBSERVATIONS:
            print(f'Skipping station {station}, variable {var}: '
                  f'only {len(orig)} observations')
            continue
        for scen in scenarios:
            # Every method is scored on the same masked series.
            masked = create_gaps(orig, scen)
            gap_idx = masked.index[masked.isna().to_numpy()]

            for name, method in methods.items():
                filled = method(masked)

                # Score only gap positions the method actually filled.
                # The share of filled positions is recorded as `coverage`,
                # because a method that fills only part of a gap is
                # evaluated on fewer points than the others.
                valid = gap_idx.intersection(filled.dropna().index)
                coverage = len(valid) / len(gap_idx) if len(gap_idx) else np.nan
                if len(valid) == 0:
                    mae = np.nan
                    rmse = np.nan
                else:
                    # MAE and RMSE computed directly with NumPy: the
                    # scikit-learn metric functions are not imported in
                    # this notebook.
                    err = filled.loc[valid].to_numpy() - orig.loc[valid].to_numpy()
                    mae = np.abs(err).mean()
                    rmse = np.sqrt(np.mean(err ** 2))

                results.append((station, var, scen, name, mae, rmse, coverage))

results_df = pd.DataFrame(
    results,
    columns=['station', 'var', 'scenario', 'method', 'MAE', 'RMSE', 'coverage']
)
results_df.head()


In [None]:
# Overall mean MAE per scenario (rows) and method (columns). pivot_table
# sorts the rows alphabetically (kurz, lang, mittel); reindex restores the
# order of increasing gap length.
pivot = (
    results_df
    .pivot_table(index='scenario', columns='method', values='MAE', aggfunc='mean')
    .reindex(scenarios)
)

# Plot one panel per variable instead of the overall table: the variables
# are on very different numeric scales (sample rows: tl around -2, rf
# around 90), so an average over all of them is dominated by the largest.
plot_vars = list(results_df['var'].unique())
fig, axes = plt.subplots(
    1, len(plot_vars), figsize=(4.5 * len(plot_vars), 3.5), squeeze=False
)
for ax, plot_var in zip(axes.ravel(), plot_vars):
    var_pivot = (
        results_df[results_df['var'] == plot_var]
        .pivot_table(index='scenario', columns='method', values='MAE', aggfunc='mean')
        .reindex(scenarios)
    )
    sns.heatmap(var_pivot, annot=True, fmt='.2f', ax=ax)
    ax.set_title(plot_var)
fig.suptitle('Durchschnittlicher MAE pro Szenario und Methode')
fig.tight_layout()
plt.show()