# Regressionsmodell: Quadratmeterpreis vs. Endbestand

Dieses Notebook erstellt ein Regressionsmodell zur Vorhersage von `ds10200_quadratmeterpreis_chf` basierend auf `ds10680_endbestand`.

## 1. Import erforderlicher Bibliotheken

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.api as sm
from scipy import stats
import joblib
import warnings
# Suppress every warning category so the notebook output stays uncluttered
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn grid theme plus a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Daten laden

In [None]:
# Load the master table (absolute path inside the project workspace)
df = pd.read_csv('/workspaces/ARM_Gruppe-5/data/ARM_Master_Table_final.csv')

# Keep only the regression target and the single predictor.
# .copy() decouples df_model from df, so later column assignments on
# df_model never write back into the full table.
columns = ['ds10200_quadratmeterpreis_chf', 'ds10680_endbestand']
df_model = df[columns].copy()

print("Datensatz geladen:")
print(f"Form: {df_model.shape}")
# Label corrected: the dump below shows the first five rows ("Erste"),
# the previous wording "Erstellt" means "created".
print("\nErste 5 Zeilen:")
print(df_model.head())

## 3. Datenüberblick & Bereinigung

In [None]:
# Overview of the raw frame. DataFrame.info() writes its report itself and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line to the output.
print("Dateninfo:")
df_model.info()
print("\n" + "="*50 + "\n")

# Coerce both columns to numeric BEFORE summarising them; entries that cannot
# be parsed become NaN. Without this, describe() on text-typed columns would
# report count/unique/top instead of mean/std/quantiles.
for numeric_col in ['ds10200_quadratmeterpreis_chf', 'ds10680_endbestand']:
    df_model[numeric_col] = pd.to_numeric(df_model[numeric_col], errors='coerce')

# Descriptive statistics on the numeric columns
print("Beschreibende Statistik:")
print(df_model.describe())
print("\n" + "="*50 + "\n")

# Missing values per column (includes values that failed the coercion above)
print("Fehlende Werte:")
print(df_model.isnull().sum())
print("\n" + "="*50 + "\n")

# Drop rows where either column is missing. The explicit copy makes the
# result an independent frame, so adding columns to it later is unambiguous.
df_model_clean = df_model.dropna(
    subset=['ds10200_quadratmeterpreis_chf', 'ds10680_endbestand']
).copy()

print(f"Datensatz nach Bereinigung: {df_model_clean.shape}")
print(f"Entfernte Zeilen: {df_model.shape[0] - df_model_clean.shape[0]}")

## 4. Explorative Datenanalyse (EDA)

In [None]:
# Shorthand for the two analysed columns
price_series = df_model_clean['ds10200_quadratmeterpreis_chf']
stock_series = df_model_clean['ds10680_endbestand']

# 2x2 overview figure: scatter, two histograms, combined boxplot
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: predictor on the x-axis, target on the y-axis
scatter_ax = axes[0, 0]
scatter_ax.scatter(stock_series, price_series, alpha=0.6)
scatter_ax.set_xlabel('Endbestand (ds10680_endbestand)')
scatter_ax.set_ylabel('Quadratmeterpreis CHF (ds10200_quadratmeterpreis_chf)')
scatter_ax.set_title('Scatterplot: Quadratmeterpreis vs. Endbestand')
scatter_ax.grid(True, alpha=0.3)

# Top-right and bottom-left: distributions of target and predictor.
# Each spec is (axis, data, x label, title, extra hist keyword arguments).
hist_specs = [
    (axes[0, 1], price_series, 'Quadratmeterpreis CHF', 'Verteilung Quadratmeterpreis', {}),
    (axes[1, 0], stock_series, 'Endbestand', 'Verteilung Endbestand', {'color': 'orange'}),
]
for hist_ax, hist_data, hist_xlabel, hist_title, hist_kwargs in hist_specs:
    hist_ax.hist(hist_data, bins=30, edgecolor='black', alpha=0.7, **hist_kwargs)
    hist_ax.set_xlabel(hist_xlabel)
    hist_ax.set_ylabel('Häufigkeit')
    hist_ax.set_title(hist_title)

# Bottom-right: boxplot of every column currently in the cleaned frame
box_ax = axes[1, 1]
df_model_clean.boxplot(ax=box_ax)
box_ax.set_title('Boxplot: Beide Variablen')

plt.tight_layout()
plt.show()

print("\n" + "="*50 + "\n")

# Pearson correlation, once via pandas and once via scipy (which also
# returns the p-value)
correlation = price_series.corr(stock_series)
pearson_r, p_value = stats.pearsonr(price_series, stock_series)

# Significance is judged at the 5 % level
significance_label = 'Ja' if p_value < 0.05 else 'Nein'

print(f"Pearson-Korrelation: {correlation:.4f}")
print(f"Pearson-Korrelationskoeffizient: {pearson_r:.4f}")
print(f"P-Wert: {p_value:.4e}")
print(f"Signifikant: {significance_label}")

## 5. Feature Engineering

In [None]:
# Report the value ranges of the cleaned data before any further filtering
print("Originalwerte:")
print(f"Quadratmeterpreis - Min: {df_model_clean['ds10200_quadratmeterpreis_chf'].min()}, "
      f"Max: {df_model_clean['ds10200_quadratmeterpreis_chf'].max()}")
print(f"Endbestand - Min: {df_model_clean['ds10680_endbestand'].min()}, "
      f"Max: {df_model_clean['ds10680_endbestand'].max()}")

# Keep only rows with a strictly positive price BEFORE deriving features.
# Previously this filter ran after the log transform and the correlation
# comparison, so rows that are discarded anyway still influenced the reported
# correlations, and non-positive prices were fed into log1p.
# .copy() yields an independent frame so the column assignments below are
# unambiguous (no chained-assignment warning).
df_model_clean = df_model_clean[df_model_clean['ds10200_quadratmeterpreis_chf'] > 0].copy()

# Log-transformed features (optional, may improve linearity); log1p = log(1 + x)
df_model_clean['log_quadratmeterpreis'] = np.log1p(df_model_clean['ds10200_quadratmeterpreis_chf'])
df_model_clean['log_endbestand'] = np.log1p(df_model_clean['ds10680_endbestand'])

# Compare the correlation on the original and on the log scale, both computed
# on exactly the rows that are used for modelling
corr_original = df_model_clean['ds10200_quadratmeterpreis_chf'].corr(df_model_clean['ds10680_endbestand'])
corr_log = df_model_clean['log_quadratmeterpreis'].corr(df_model_clean['log_endbestand'])

print(f"\nKorrelation (Original): {corr_original:.4f}")
print(f"Korrelation (Log-transformiert): {corr_log:.4f}")

print(f"\nDatensatz nach Ausreißer-Entfernung (Preis > 0): {df_model_clean.shape}")

## 6. Train-/Test-Split

In [None]:
# Model inputs on the original (untransformed) scale:
# y is the target vector, X a single-column feature matrix
y = df_model_clean['ds10200_quadratmeterpreis_chf'].values
X = df_model_clean[['ds10680_endbestand']].values

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the sizes of both partitions and their ratio
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training-Set Größe: {n_train_rows}")
print(f"Test-Set Größe: {n_test_rows}")
print(f"Train/Test Ratio: {n_train_rows / n_test_rows:.2f}")

## 7. Lineares Regressionsmodell (Baseline)

In [None]:
# Fit the ordinary least-squares baseline on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Fitted parameters of the single-feature model
lr_intercept = lr_model.intercept_
lr_slope = lr_model.coef_[0]

print("Lineares Regressionsmodell")
print("="*50)
print("Modellgleichung: y = β₀ + β₁*x + ε")
print(f"Intercept (β₀): {lr_intercept:.4f}")
print(f"Koeffizient (β₁): {lr_slope:.6f}")

print("\nInterpretation:")
print(f"- Pro 1 Einheit Endbestand-Erhöhung steigt Preis um {lr_slope:.4f} CHF/m²")

# Predict the held-out rows
y_pred_lr = lr_model.predict(X_test)

# Show up to ten actual/predicted pairs with their residual (actual - predicted)
print("\nVorhersagen auf Test-Set (erste 10):")
print(f"{'Tatsächlich':<12} {'Vorhersage':<12} {'Fehler':<12}")
for actual_value, predicted_value in zip(y_test[:10], y_pred_lr[:10]):
    residual = actual_value - predicted_value
    print(f"{actual_value:<12.2f} {predicted_value:<12.2f} {residual:<12.2f}")

## 8. Regularisierte Modelle (Ridge & Lasso)

In [None]:
# Ridge regression with a fixed penalty of alpha=1.0
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Lasso regression with a fixed penalty of alpha=0.1
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

print("Ridge & Lasso Modelle")
print("="*50)

# Intercept and slope of each regularised model
for reg_label, reg_model in (("Ridge (alpha=1.0)", ridge_model),
                             ("Lasso (alpha=0.1)", lasso_model)):
    print(f"\n{reg_label}:")
    print(f"  Intercept: {reg_model.intercept_:.4f}")
    print(f"  Koeffizient: {reg_model.coef_[0]:.6f}")

# Side-by-side table of the slope of all three models fitted so far
print("\nKoeffizientenvergleich:")
print(f"{'Modell':<20} {'Koeffizient':<15}")
print("-" * 35)
for table_label, table_model in (("Linear Regression", lr_model),
                                 ("Ridge", ridge_model),
                                 ("Lasso", lasso_model)):
    print(f"{table_label:<20} {table_model.coef_[0]:<15.6f}")

## 9. Hyperparameter-Tuning (GridSearchCV)

In [None]:
# Candidate penalty strengths, shared by both searches
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
alpha_grid = {'alpha': alphas}

# 5-fold cross-validated search over alpha for Ridge, scored by R²
ridge_grid = GridSearchCV(Ridge(), param_grid=alpha_grid, cv=5, scoring='r2')
ridge_grid.fit(X_train, y_train)

# Same search for Lasso, with a raised iteration limit for the solver
lasso_grid = GridSearchCV(Lasso(max_iter=10000), param_grid=alpha_grid, cv=5, scoring='r2')
lasso_grid.fit(X_train, y_train)

print("GridSearchCV Ergebnisse")
print("="*50)
for search_label, search in (("Ridge", ridge_grid), ("Lasso", lasso_grid)):
    print(f"\n{search_label} - Beste Alpha: {search.best_params_['alpha']}")
    print(f"{search_label} - Beste CV-Score (R²): {search.best_score_:.4f}")

# Take the best estimator of each search and predict the held-out rows
best_ridge_model = ridge_grid.best_estimator_
y_pred_ridge_best = best_ridge_model.predict(X_test)

best_lasso_model = lasso_grid.best_estimator_
y_pred_lasso_best = best_lasso_model.predict(X_test)

print(f"\nBeste Ridge Modell - Koeffizient: {best_ridge_model.coef_[0]:.6f}")
print(f"Beste Lasso Modell - Koeffizient: {best_lasso_model.coef_[0]:.6f}")

## 10. Modellbewertung & Metriken

In [None]:
# Modellbewertung auf Test-Set
def evaluate_model(y_true, y_pred, model_name):
    """Compute RMSE, MAE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label stored under the ``'Modell'`` key of the result.

    Returns:
        A dict with the keys ``'Modell'``, ``'RMSE'``, ``'MAE'`` and ``'R²'``.
    """
    metrics = {'Modell': model_name}
    # RMSE is the square root of the mean squared error
    metrics['RMSE'] = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics['MAE'] = mean_absolute_error(y_true, y_pred)
    metrics['R²'] = r2_score(y_true, y_pred)
    return metrics

# Pair every fitted model's label with its test-set predictions
ridge_best_alpha = ridge_grid.best_params_["alpha"]
lasso_best_alpha = lasso_grid.best_params_["alpha"]
labelled_predictions = [
    ('Linear Regression', y_pred_lr),
    ('Ridge (alpha=1.0)', y_pred_ridge),
    ('Lasso (alpha=0.1)', y_pred_lasso),
    (f'Ridge (best, alpha={ridge_best_alpha})', y_pred_ridge_best),
    (f'Lasso (best, alpha={lasso_best_alpha})', y_pred_lasso_best),
]

# One metrics dict per model, in the order listed above
results = [
    evaluate_model(y_test, model_predictions, model_label)
    for model_label, model_predictions in labelled_predictions
]

# Tabulate and rank by R², best model first
results_df = pd.DataFrame(results).sort_values('R²', ascending=False)

print("Modellbewertung - Test-Set Metriken")
print("="*80)
print(results_df.to_string(index=False))
print("\n" + "="*80)

# The top row after sorting is the model with the highest test-set R²
top_row = results_df.iloc[0]
print(f"\nBestes Modell: {top_row['Modell']}")
print(f"R²-Score: {top_row['R²']:.4f}")
print(f"RMSE: {top_row['RMSE']:.2f} CHF/m²")
print(f"MAE: {top_row['MAE']:.2f} CHF/m²")

## 11. Residuenanalyse & Annahmenprüfung