In [None]:
# Cell 1 - Install XGBoost (optional)
!conda install -y -c conda-forge xgboost

In [None]:
# Cell 2 - Configuration

# --- S3 source data -----------------------------------------------------------
BUCKET_NAME = "cloud-project-cloutserfers"
HOUSING_KEY = "part-0.parquet"
ELECTRICITY_KEY = "electricity_all_cleaned.parquet"

# --- Local output and reproducibility -----------------------------------------
MODEL_DIR = "backend/app/models"   # directory the joblib bundles are written to
RANDOM_SEED = 123                  # seed for the train/test split and the models
TEST_SIZE = 0.2                    # fraction of rows held out for evaluation

# One entry per dataset to train on: a short name, the parquet object key inside
# BUCKET_NAME, and the column to predict.
datasets = [
    dict(name="housing", s3_key=HOUSING_KEY, target="price"),
    dict(name="electricity", s3_key=ELECTRICITY_KEY, target="demand_mw"),
]

In [None]:
import os
import boto3
import numpy as np
import pandas as pd

from pandas.api.types import is_string_dtype, is_bool_dtype

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor
import joblib

# Create the output directory for the saved model bundles up front;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(name=MODEL_DIR, exist_ok=True)

print(f"Imports OK; model dir: {MODEL_DIR}")


In [None]:
def clean_nullable_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with pandas nullable dtypes converted to plain
    NumPy-backed dtypes and ``pd.NA`` replaced by ``np.nan``.

    Per-column conversions:
      * bool / nullable boolean -> ``float64`` (missing values become NaN)
      * nullable numeric extension dtypes (e.g. ``Int64``, ``Float64``)
        -> their NumPy equivalent when the column has no missing values,
        otherwise ``float64`` so the gaps can be represented as NaN
      * string dtypes -> ``object``

    All other columns are left as they are.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A cleaned copy of ``df``.
    """
    df = df.copy()

    for col in df.columns:
        series = df[col]
        # getattr: with duplicate column labels df[col] is a DataFrame, which
        # has no single dtype; such columns are skipped by every branch below.
        dtype = getattr(series, "dtype", None)
        # Only masked/Arrow-backed dtypes expose `numpy_dtype`.
        numpy_dtype = getattr(dtype, "numpy_dtype", None)

        if is_bool_dtype(series):
            # Bool is tested first because it also counts as a numeric dtype.
            df[col] = series.astype("float64")
        elif (
            numpy_dtype is not None
            and isinstance(dtype, pd.api.extensions.ExtensionDtype)
            and pd.api.types.is_numeric_dtype(dtype)
        ):
            # Nullable numeric column: pd.NA cannot live in an int array, so
            # fall back to float64 whenever a value is missing.
            if series.isna().any() or numpy_dtype.kind not in "iuf":
                df[col] = series.astype("float64")
            else:
                df[col] = series.astype(numpy_dtype)
        elif is_string_dtype(series):
            df[col] = series.astype("object")

    # Replace any remaining pandas NA (e.g. in object columns) with numpy nan
    df = df.replace({pd.NA: np.nan})
    return df

print("clean_nullable_dtypes() is ready.")


In [None]:
# Training cell: for every entry in `datasets`, download its parquet file from
# S3, build a fixed set of raw feature columns, fit a LinearRegression pipeline
# and a tuned XGBRegressor pipeline, evaluate both on a held-out split, and
# save each as a joblib bundle in MODEL_DIR. Metrics are collected in `results`.

def _make_preprocessor(cat_cols, num_cols):
    """
    Build a new, unfitted ColumnTransformer for a single pipeline.

    Columns in ``cat_cols`` are one-hot encoded with ``handle_unknown="ignore"``;
    columns in ``num_cols`` are passed through unchanged. Each pipeline gets
    its own instance so that fitting one pipeline never touches another.
    """
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ]
    )


s3 = boto3.client("s3")

results = []  # to store metrics for later display

for ds in datasets:
    name   = ds["name"]
    s3_key = ds["s3_key"]
    target = ds["target"]

    print("\n==============================")
    print(f"Dataset: {name}")
    print(f"S3 path: s3://{BUCKET_NAME}/{s3_key}")
    print(f"Target : {target}")
    print("==============================\n")

    # 1) Download parquet from S3
    local_parquet = f"/tmp/{name}.parquet"
    print("Downloading parquet from S3...")
    s3.download_file(BUCKET_NAME, s3_key, local_parquet)
    print("Downloaded to:", local_parquet)

    # 2) Read parquet
    print("Reading parquet into pandas...")
    df = pd.read_parquet(local_parquet)
    print("Raw shape:", df.shape)
    print("Raw columns:", list(df.columns))

    # 3) Clean nullable dtypes
    df = clean_nullable_dtypes(df)

    # 4) Ensure target exists and is numeric.
    #    Done before feature engineering because the preview prints below index
    #    the target column; checking here gives a clear error instead of a
    #    bare KeyError.
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not in columns: {list(df.columns)}")
    df[target] = pd.to_numeric(df[target])

    # 5) Dataset-specific feature engineering and fixed feature list
    if name == "housing":
        # Rename duration -> tenure when only the old column name is present
        if "tenure" not in df.columns and "duration" in df.columns:
            df = df.rename(columns={"duration": "tenure"})

        # Required raw input columns
        feature_cols = ["region", "property_type", "tenure", "year", "month", "is_new_build"]
        missing = [c for c in feature_cols if c not in df.columns]
        if missing:
            raise ValueError(f"Housing dataset is missing required columns: {missing}")

        # Define which are categorical vs numeric
        cat_cols = ["region", "property_type", "tenure", "is_new_build"]
        num_cols = ["year", "month"]

        # Categoricals are left as they are (the encoder handles them);
        # only the numeric columns are forced to a numeric dtype.
        for col in num_cols:
            df[col] = pd.to_numeric(df[col])

        print("Housing feature columns:", feature_cols)
        print(df[feature_cols + [target]].head())

    elif name == "electricity":
        # Locate the timestamp column under any of its accepted names
        ts_candidates = ["ts", "timestamp", "datetime"]
        ts_col = None
        for c in ts_candidates:
            if c in df.columns:
                ts_col = c
                break
        if ts_col is None:
            raise ValueError(
                f"Electricity dataset: expected one of {ts_candidates}, found {list(df.columns)}"
            )

        df[ts_col] = pd.to_datetime(df[ts_col])

        # Required raw inputs for electricity, all derived from the timestamp:
        # year, month, day, hour, is_weekend (dayofweek 5/6 = Saturday/Sunday)
        df["year"]       = df[ts_col].dt.year
        df["month"]      = df[ts_col].dt.month
        df["day"]        = df[ts_col].dt.day
        df["hour"]       = df[ts_col].dt.hour
        df["is_weekend"] = (df[ts_col].dt.dayofweek >= 5).astype(int)

        feature_cols = ["year", "month", "day", "hour", "is_weekend"]
        cat_cols = []                      # all numeric here
        num_cols = feature_cols[:]         # all numeric

        for col in num_cols:
            df[col] = pd.to_numeric(df[col])

        print("Electricity feature columns:", feature_cols)
        print(df[feature_cols + [target]].head())

    else:
        raise ValueError(f"Unknown dataset name: {name}")

    # 6) Drop incomplete rows BEFORE splitting.
    #    Rows with a missing feature OR a missing target are removed: a NaN
    #    target would make the estimators fail on fit, and dropping before the
    #    split keeps the held-out fraction equal to TEST_SIZE.
    df_model = df.dropna(subset=feature_cols + [target])
    n_dropped = len(df) - len(df_model)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing feature/target values.")
    if df_model.empty:
        raise ValueError(
            f"Dataset '{name}' has no complete rows for columns {feature_cols + [target]}"
        )

    # 7) Build final X, y (with fixed raw feature columns)
    X = df_model[feature_cols].copy()
    y = df_model[target].copy()

    print("Final X shape:", X.shape)
    print("Final y length:", len(y))

    # 8) Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
    )

    print("Train shape:", X_train.shape)
    print("Test shape :", X_test.shape)

    # ---------- 9) Linear Regression pipeline ----------
    preprocessor = _make_preprocessor(cat_cols, num_cols)
    lin_pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", LinearRegression()),
        ]
    )

    lin_pipeline.fit(X_train, y_train)
    y_pred_lin = lin_pipeline.predict(X_test)

    mse_lin  = mean_squared_error(y_test, y_pred_lin)
    rmse_lin = mse_lin ** 0.5
    mae_lin  = mean_absolute_error(y_test, y_pred_lin)
    r2_lin   = r2_score(y_test, y_pred_lin)

    print(f"[{name}] Linear Regression - RMSE: {rmse_lin:.3f}, MAE: {mae_lin:.3f}, R2: {r2_lin:.3f}")

    lin_bundle = {
        "model": lin_pipeline,        # pipeline with preprocessing + model
        "features": feature_cols,     # RAW inputs backend must provide
        "target": target,
        "model_type": "linear_regression",
    }

    lin_filename = os.path.join(MODEL_DIR, f"{name}_linear.joblib")
    joblib.dump(lin_bundle, lin_filename)
    print(f"Saved LinearRegression bundle to: {lin_filename}")

    results.append({
        "dataset": name,
        "model_type": "linear_regression",
        "rmse": rmse_lin,
        "mae": mae_lin,
        "r2": r2_lin,
        "file": lin_filename,
    })

    # ---------- 10) XGBoost pipeline ----------
    xgb = XGBRegressor(
        objective="reg:squarederror",
        random_state=RANDOM_SEED,
        tree_method="hist",
        eval_metric="rmse",
    )

    xgb_pipeline = Pipeline(
        steps=[
            # Own preprocessor instance, independent of the linear pipeline's.
            ("preprocessor", _make_preprocessor(cat_cols, num_cols)),
            ("model", xgb),
        ]
    )

    # Candidate values sampled by the randomized search ("model__" routes each
    # parameter to the pipeline step named "model").
    param_dist = {
        "model__n_estimators": [200, 400, 600],
        "model__max_depth": [4, 6, 8],
        "model__learning_rate": [0.01, 0.05, 0.1],
        "model__subsample": [0.7, 0.85, 1.0],
        "model__colsample_bytree": [0.7, 0.85, 1.0],
        "model__reg_lambda": [0.5, 1.0, 2.0],
    }

    tuner = RandomizedSearchCV(
        estimator=xgb_pipeline,
        param_distributions=param_dist,
        n_iter=10,
        scoring="neg_mean_squared_error",
        cv=3,
        verbose=1,
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )

    print(f"Starting XGBoost hyperparameter search for {name}...")
    tuner.fit(X_train, y_train)
    print("Best params:", tuner.best_params_)

    best_xgb_pipeline = tuner.best_estimator_
    y_pred_xgb = best_xgb_pipeline.predict(X_test)

    mse_xgb  = mean_squared_error(y_test, y_pred_xgb)
    rmse_xgb = mse_xgb ** 0.5
    mae_xgb  = mean_absolute_error(y_test, y_pred_xgb)
    r2_xgb   = r2_score(y_test, y_pred_xgb)

    print(f"[{name}] XGBRegressor - RMSE: {rmse_xgb:.3f}, MAE: {mae_xgb:.3f}, R2: {r2_xgb:.3f}")

    xgb_bundle = {
        "model": best_xgb_pipeline,   # pipeline (preprocessor + XGB)
        "features": feature_cols,     # RAW inputs backend must provide
        "target": target,
        "model_type": "xgboost",
    }

    xgb_filename = os.path.join(MODEL_DIR, f"{name}_xgb.joblib")
    joblib.dump(xgb_bundle, xgb_filename)
    print(f"Saved XGBRegressor bundle to: {xgb_filename}")

    results.append({
        "dataset": name,
        "model_type": "xgboost",
        "rmse": rmse_xgb,
        "mae": mae_xgb,
        "r2": r2_xgb,
        "file": xgb_filename,
    })

# The count is derived from `results` so the message stays correct when
# `datasets` changes.
print(f"\nTraining finished for all datasets. All {len(results)} models saved in:", MODEL_DIR)


In [None]:
# Interactive prediction for a single house using the housing XGB model

import os
import joblib
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_bool_dtype
from IPython.display import display

# 1) Load housing XGB bundle from backend/app/models
housing_xgb_path = os.path.join(MODEL_DIR, "housing_xgb.joblib")
print("Loading housing XGB bundle from:", housing_xgb_path)
# NOTE(review): joblib.load unpickles arbitrary objects - only load bundles
# written by the training cell of this notebook.
housing_bundle = joblib.load(housing_xgb_path)

model = housing_bundle["model"]       # this is a Pipeline(preprocessor + XGB)
feature_names = housing_bundle["features"]  # ['region','property_type','tenure','year','month','is_new_build']
target_col = housing_bundle["target"]

print("Model type:", housing_bundle.get("model_type"))
print("Features expected by the model:", feature_names)
print("Target:", target_col)

# 2) Reload housing dataset to compute sensible defaults
housing_cfg = [d for d in datasets if d["name"] == "housing"][0]
local_parquet = "/tmp/housing_for_prediction.parquet"
print("\nDownloading housing parquet again for defaults...")
s3.download_file(BUCKET_NAME, housing_cfg["s3_key"], local_parquet)
df = pd.read_parquet(local_parquet)
df = clean_nullable_dtypes(df)

# Make sure 'tenure' exists (from 'duration' if needed)
if "tenure" not in df.columns and "duration" in df.columns:
    df = df.rename(columns={"duration": "tenure"})

required_cols = ["region", "property_type", "tenure", "year", "month", "is_new_build"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Housing parquet is missing required columns {missing} for interactive defaults.")

# Only numeric columns need to be numeric; leave categoricals as strings/ints.
# Unparseable values become NaN here because the data is only used for defaults.
numeric_cols = ["year", "month"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

feature_df = df[feature_names].copy()

print("\nSample of training features:")
display(feature_df.head())

# 3) Ask the user for input values for each feature
print("\n=== Interactive input for a single house ===")
print("For each feature, you'll see:")
print(" - Expected or possible values")
print(" - A default (from training data)")
print("Press ENTER to accept the default.\n")

row = {}

for col in feature_names:
    series = feature_df[col].dropna()

    # pandas reports bool columns as numeric too, so bool must be decided
    # first; otherwise the bool-specific handling below could never run.
    col_is_bool = is_bool_dtype(series)
    col_is_numeric = is_numeric_dtype(series) and not col_is_bool

    if series.empty:
        default = 0.0
        expected_info = "No data available, defaulting to 0.0."
    else:
        if col_is_bool:
            default = bool(series.mode().iloc[0])
        elif col_is_numeric:
            default = float(series.median())
        else:
            default = str(series.mode().iloc[0])

        unique_vals = series.unique()
        if len(unique_vals) <= 20:
            # Sort for a stable display; mixed-type object columns cannot be
            # ordered, so fall back to order of appearance for those.
            try:
                shown_vals = sorted(unique_vals.tolist())
            except TypeError:
                shown_vals = unique_vals.tolist()
            expected_info = f"Expected values: {shown_vals}"
        else:
            if col_is_numeric:
                expected_info = (
                    f"Expected numeric range: approx [{series.min():.3g}, {series.max():.3g}] "
                    f"(median={series.median():.3g})"
                )
            else:
                top_vals = series.value_counts().head(10).index.tolist()
                expected_info = f"Most common categories: {top_vals} (total unique={len(unique_vals)})"

    print(f"\nFeature: '{col}'")
    print(" ", expected_info)
    print("  Default:", default)

    user_in = input(f"Enter value for '{col}' (press ENTER for default): ").strip()

    if user_in == "":
        value = default
    elif col_is_bool:
        value = user_in.lower() in ["1", "true", "yes", "y", "t"]
    elif col_is_numeric:
        # Integer columns get an int, everything else a float.
        wants_int = series.dtype.kind in ["i", "u"]
        try:
            number = float(user_in)
            # Reject nan/inf: int(inf) raises OverflowError and non-finite
            # values are not meaningful model inputs.
            if not np.isfinite(number):
                raise ValueError(f"non-finite value: {user_in!r}")
            value = int(number) if wants_int else number
        except (ValueError, OverflowError):
            if wants_int:
                print(f"  Could not parse integer, using default for '{col}'.")
            else:
                print(f"  Could not parse number, using default for '{col}'.")
            value = default
    else:
        value = user_in

    row[col] = value

# 4) Build DataFrame and predict
# Column order is forced to match the feature order stored in the bundle.
input_df = pd.DataFrame([row])[feature_names]
print("\n=== You entered the following values ===")
display(input_df)

pred = model.predict(input_df)[0]
print(f"\nEstimated {target_col} for this house: {pred:,.2f}")
