# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor

# --- Data loading and train/test split ---
# Read the dataset from the CSV file into a DataFrame.
df = pd.read_csv('your_data.csv')

# Quick look at what was loaded: dimensions and the leading rows.
print(f"Dataset Shape: {df.shape}")
print(f"First 5 Rows:\n{df.head()}")

# The target is a single column; every remaining column is used as a feature.
y = df['target_column']
X = df.drop(columns=['target_column'])

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Feature preprocessing ---
# Group the feature columns by dtype: numeric columns on one side, object
# (string-like) columns on the other. Columns of any other dtype end up in
# neither group.
categorical_features = X.select_dtypes(include=[np.object_]).columns
numeric_features = X.select_dtypes(include=[np.number]).columns

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode while dropping the first level of each feature.
# NOTE: with handle_unknown='ignore' and drop='first', a category unseen
# during fit is encoded as all zeros, i.e. the same as the dropped level.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
]
cat_transformer = Pipeline(categorical_steps)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features),
])

# --- Candidate models ---
# Regressors to compare, keyed by display name. Insertion order is the order
# in which they are evaluated and plotted.
regressors = {
    'LinearRegression': LinearRegression(),
    'RidgeRegression': Ridge(),
    'LassoRegression': Lasso(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
}

# Wrap every regressor in the same two-step pipeline: shared preprocessing
# followed by the estimator under the step name 'model'.
models = {
    name: Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', regressor),
    ])
    for name, regressor in regressors.items()
}

# Cross-validation and model evaluation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model_scores = {}    # model name -> mean cross-validated MSE (used for ranking)
cv_rmse_scores = {}  # model name -> per-fold RMSE (reused by the boxplot below)

print("\nEvaluating models using cross-validation...\n")
for model_name, model_pipeline in models.items():
    # The 'neg_mean_squared_error' scorer returns MSE negated; flip the sign
    # exactly once so cv_scores holds plain, non-negative MSE values.
    cv_scores = -1 * cross_val_score(
        model_pipeline, X_train, y_train,
        cv=kf, scoring='neg_mean_squared_error', n_jobs=-1,
    )
    model_scores[model_name] = np.mean(cv_scores)

    # cv_scores is already positive here; negating it again before the square
    # root would produce NaN for every fold.
    rmse_scores = np.sqrt(cv_scores)
    cv_rmse_scores[model_name] = rmse_scores

    print(f"{model_name}:")
    print(f"  Mean RMSE: {np.mean(rmse_scores):.4f}")
    print(f"  Standard Deviation RMSE: {np.std(rmse_scores):.4f}")
    # Empirical 2.5%-97.5% quantile range of the per-fold RMSE values.
    print(f"  95% Confidence Interval RMSE: {np.quantile(rmse_scores, [0.025, 0.975])}")

# Select the best model based on CV scores (lowest mean MSE wins)
best_model_name = min(model_scores, key=model_scores.get)
print(f"\nBest model based on cross-validation scores: {best_model_name}")

# Plot boxplot for cross-validation results.
# The per-fold scores collected above are reused instead of re-running the
# whole cross-validation a second time for every model.
plotted_names = list(cv_rmse_scores)
plt.figure(figsize=(10, 6))
plt.boxplot([cv_rmse_scores[name] for name in plotted_names])
# Boxes are drawn at positions 1..N by default; label them explicitly so the
# plot does not depend on the boxplot label keyword, which was renamed in
# newer Matplotlib releases.
plt.xticks(range(1, len(plotted_names) + 1), plotted_names)
plt.title('Model Performance - Cross-Validation (RMSE)')
plt.ylabel('Root Mean Squared Error (RMSE)')
plt.show()

# Hyperparameter tuning
# Search space per model; keys use the '<step>__<param>' form to reach the
# estimator stored under the pipeline step named 'model'. An empty dict means
# the model has nothing to tune.
param_grid = {
    'LinearRegression': {},
    'RidgeRegression': {'model__alpha': [0.1, 1.0, 10.0]},
    'LassoRegression': {'model__alpha': [0.1, 1.0, 10.0]},
    'RandomForestRegressor': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5]
    },
    'DecisionTreeRegressor': {
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5]
    }
}

# Branch on whether there is actually something to search. A plain membership
# test is always true for the models defined above, which sent the empty
# LinearRegression grid through a pointless grid search and left the
# "nothing to tune" branch unreachable.
best_param_grid = param_grid.get(best_model_name)
if best_param_grid:
    print(f"\nPerforming grid search for {best_model_name}...")
    grid_search = GridSearchCV(
        models[best_model_name],
        best_param_grid,
        cv=kf,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )
    grid_search.fit(X_train, y_train)

    print(f"\nBest parameters for {best_model_name}: {grid_search.best_params_}")
    # best_score_ is negated MSE, so flip the sign before taking the root.
    print(f"Best cross-validated RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

    # The search refits the best configuration on the full training set.
    final_model = grid_search.best_estimator_
else:
    print("No hyperparameters to tune for the selected model.")
    # Nothing to search: fit the selected pipeline as-is on the training set
    # so it can still be evaluated below.
    final_model = models[best_model_name].fit(X_train, y_train)

# Evaluate the final model on the held-out test set
y_pred = final_model.predict(X_test)

# Calculate all metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display metrics
print(f"\nTest Set Metrics for {best_model_name}:")
print(f"  Mean Squared Error (MSE): {mse:.4f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"  Mean Absolute Error (MAE): {mae:.4f}")
print(f"  R2 Score: {r2:.4f}")