In [4]:
import pandas as pd
import numpy as np  # Importieren von numpy
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# Daten laden
df = pd.read_csv("bfs_municipality_and_tax_data.csv")

# Vorverarbeitung
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
# Clean the data
df['pop'] = df['pop'].str.replace("'", "").astype(float)
df['tax_income'] = df['tax_income'].str.replace("'", "").astype(float)

# Feature Engineering
df['log_pop_dens'] = np.log1p(df['pop_dens'])  # Berechnung des logarithmierten 'pop_dens'

# Daten aufteilen
X = df[['pop', 'log_pop_dens']]
y = df['tax_income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modellpipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Training und Kreuzvalidierung
pipeline.fit(X_train, y_train)
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("MSE:", -scores.mean())

# RandomForest für den Vergleich
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("Random Forest MSE:", -rf_scores.mean())

AttributeError: Can only use .str accessor with string values!