# In [None]:
# --- 1. Imports ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import randint
import joblib
import matplotlib.pyplot as plt

# --- 2. Load Data ---
# Read the raw table, then hold out 20% of the rows as a test set. The fixed
# random_state keeps the split reproducible from run to run.
data = pd.read_csv("your_dataset.csv")   # replace with your dataset
train_set, test_set = train_test_split(data, random_state=42, test_size=0.2)

# Separate the label from the predictors in each split.
# "target_column" is a placeholder: replace it in all four places below.
y_train = train_set["target_column"].copy()
X_train = train_set.drop(columns="target_column")

y_test = test_set["target_column"].copy()
X_test = test_set.drop(columns="target_column")

# --- 3. Preprocessing ---
# Split the predictors by dtype: numeric columns go through one pipeline and
# every non-numeric column is treated as categorical.
num_attribs = X_train.select_dtypes(include=np.number).columns
cat_attribs = X_train.select_dtypes(exclude=np.number).columns

# Numeric columns: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" lets transform() accept categories that
# were not present when the encoder was fitted.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own pipeline and concatenate the outputs.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit on the training predictors only; the test predictors are transformed
# with the statistics learned from the training set.
# NOTE(review): the cross-validated searches later in this file run on
# X_train_prepared, so every validation fold has already contributed to the
# imputer/scaler statistics fitted here.
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# --- 4. Train Baseline Model ---
# Random forest with default hyperparameters and a fixed seed. fit() returns
# the estimator itself, so forest_reg is bound to the fitted model.
forest_reg = RandomForestRegressor(random_state=42).fit(X_train_prepared, y_train)

# --- 5. Fine-Tuning (Grid Search) ---
# Exhaustive search over 2 x 3 = 6 hyperparameter combinations, each scored
# with 5-fold cross-validation on the prepared training data.
param_grid = [
    {
        "n_estimators": [30, 100],
        "max_features": [4, 6, 8],
    },
]
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",  # negated MSE: higher is better
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train_prepared, y_train)

print("Best Params (GridSearch):", grid_search.best_params_)

# --- 6. Fine-Tuning (Randomized Search) ---
# Sample 50 hyperparameter combinations instead of enumerating a grid.
# scipy's randint excludes the upper bound, so n_estimators is drawn from
# 10..199 and max_features from 2..9.
param_distribs = {
    "n_estimators": randint(10, 200),
    "max_features": randint(2, 10),
}
rnd_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=50,
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=42,  # fixed seed so the sampled candidates are reproducible
)
rnd_search.fit(X_train_prepared, y_train)

print("Best Params (RandomizedSearch):", rnd_search.best_params_)

# --- 7. Analyze Feature Importances ---
# The winner of the randomized search is used as the final model.
final_model = rnd_search.best_estimator_
feature_importances = final_model.feature_importances_

# Match importance with feature names.
# When no categorical columns were selected, ColumnTransformer leaves the
# "cat" pipeline unfitted, and asking its encoder for feature names would
# raise; in that case there are simply no one-hot columns to name.
if len(cat_attribs) > 0:
    cat_onehot_features = list(
        full_pipeline.named_transformers_["cat"]["onehot"]
        .get_feature_names_out(cat_attribs)
    )
else:
    cat_onehot_features = []

# Output column order follows the ColumnTransformer: numeric first, then the
# one-hot columns.
attributes = list(num_attribs) + cat_onehot_features

# zip() would silently truncate on a length mismatch and pair importances
# with the wrong names, so fail loudly instead.
if len(attributes) != len(feature_importances):
    raise ValueError(
        f"Feature name count ({len(attributes)}) does not match the number of "
        f"feature importances ({len(feature_importances)}); the preprocessing "
        "output columns do not line up with num_attribs/cat_attribs."
    )

# Highest importance first (ties fall back to comparing the names).
sorted_features = sorted(zip(feature_importances, attributes), reverse=True)

print("\nTop Features:")
for score, name in sorted_features[:10]:
    print(f"{name}: {score:.4f}")

# --- 8. Evaluate on Test Set ---
# Score the tuned model once on the held-out rows, using features produced
# by the already-fitted preprocessing pipeline.
final_predictions = final_model.predict(X_test_prepared)

final_mae = mean_absolute_error(y_true=y_test, y_pred=final_predictions)
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)  # back in the units of the target

print("\nTest RMSE:", final_rmse)
print("Test MAE:", final_mae)

# Confidence Interval
# 95% t-interval for the test RMSE. The interval is built on the *squared*
# errors (whose mean is the MSE) and then square-rooted; building it on the
# signed errors would instead bound the mean bias, which is not the quantity
# reported next to the RMSE above.
from scipy import stats
errors = final_predictions - y_test
squared_errors = errors ** 2
confidence = 0.95
mse_interval = stats.t.interval(
    confidence, len(squared_errors) - 1,
    loc=squared_errors.mean(), scale=stats.sem(squared_errors)
)
# With few or very noisy samples the lower MSE bound can dip below zero;
# clamp it so the square root stays defined. Kept as a (low, high) tuple.
interval = tuple(np.sqrt(np.maximum(mse_interval, 0.0)))
print("95% Confidence Interval:", interval)

# --- 9. Save Model + Pipeline ---
# Persist the tuned model together with the fitted preprocessing pipeline as
# one (model, pipeline) tuple, so both can be restored from a single file.
joblib.dump(
    value=(final_model, full_pipeline),
    filename="final_model.pkl",
)
