# In [None]:
# -------------------------------
# 1. Import libraries
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# -------------------------------
# 2. Load dataset (example)
# -------------------------------
# You can replace this with your dataset
from sklearn.datasets import fetch_california_housing
# Fetch the California housing data as pandas objects; the combined
# features-plus-target table is stored under the "frame" key of the result.
data = fetch_california_housing(as_frame=True)
df = data["frame"]

# -------------------------------
# 3. Select a small sample (10% for initial testing)
# -------------------------------
# Draw a reproducible 10% random sample of the rows (fixed seed).
sample_df = df.sample(random_state=42, frac=0.1)

# Separate the regression target from the remaining feature columns.
target_col = 'MedHouseVal'
y = sample_df[target_col]
X = sample_df.drop(columns=[target_col])

# -------------------------------
# 4. Identify numerical and categorical columns
# -------------------------------
# In this dataset all columns are numeric
# Columns with a numeric dtype go to the scaler.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
# Columns routed to the one-hot encoder. 'category' and 'bool' are selected in
# addition to 'object': a ColumnTransformer drops every column that is not
# listed in one of its transformers (default remainder='drop'), so a
# categorical feature stored with one of those dtypes would otherwise be
# discarded without any warning.
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# -------------------------------
# 5. Preprocessing
# -------------------------------
# Numeric columns are standardized.
numeric_step = ('num', StandardScaler(), num_cols)
# Categorical columns are one-hot encoded into a dense array; categories that
# were not seen during fit are ignored at transform time instead of raising.
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# -------------------------------
# 6. Define models to compare
# -------------------------------
# Candidate regressors, keyed by the label shown in the results table.
models = dict([
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0)),
    ('Lasso', Lasso(alpha=0.1)),
    ('RandomForest', RandomForestRegressor(random_state=42, n_estimators=100)),
])

# -------------------------------
# 7. Cross-validation
# -------------------------------
# A single seeded, shuffled 5-fold splitter is shared by all models, so every
# model is evaluated on the same train/validation partitions.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_results = []
for name, model in models.items():
    # Preprocessing is part of the pipeline, so cross-validation fits it on
    # the training portion of each fold only.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # The scorer returns MSE negated (higher is better); flip the sign back
    # and take the square root to get one RMSE value per fold.
    scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=kf)
    rmse_scores = np.sqrt(-scores)

    cv_results.append({
        'Model': name,
        'RMSE Mean': rmse_scores.mean(),
        'RMSE Std': rmse_scores.std(),
    })

# -------------------------------
# 8. Show results
# -------------------------------
# Tabulate the per-model summaries, ordered from lowest to highest mean RMSE.
cv_df = pd.DataFrame(cv_results)
cv_df = cv_df.sort_values(by='RMSE Mean')
print(cv_df)
