In [None]:
# Run this ONLY if xgboost is not installed in your environment.
# After running, restart the kernel once.
!conda install -y -c conda-forge xgboost


In [None]:
# ==============================
# CONFIGURATION – EDIT THIS PART
# ==============================

# S3 bucket that holds both the input parquet files and the exported models.
BUCKET_NAME = "cloudserfers"   # double-check: no leading/trailing whitespace

# Object keys of the parquet files inside the bucket, e.g.
#   s3://cloudserfers/uk_housing.parquet
#   s3://cloudserfers/uk_electricity.parquet
HOUSING_KEY     = "uk_housing.parquet"       # <-- set to your housing object key
ELECTRICITY_KEY = "uk_electricity.parquet"   # <-- set to your electricity object key

# Seed shared by the split / search / model, and the held-out fraction.
RANDOM_SEED = 123
TEST_SIZE   = 0.2  # 20% of rows go to the test set

# Known schemas (taken from an earlier run):
#   housing     -> ['transaction_unique_identifier', 'price', 'date_of_transfer',
#                   'property_type', 'oldnew', 'duration', 'towncity', 'district',
#                   'county', 'ppdcategory_type', 'record_status__monthly_file_only',
#                   'year', 'month', 'region', 'is_new_build']
#   electricity -> ['ts', 'demand_mw']

# One entry per dataset: where it lives, what to predict, what to drop up front.
housing_dataset = {
    "name": "housing",
    "s3_key": HOUSING_KEY,
    "target": "price",   # sale price column
    "drop_cols": [
        # Optional: non-numeric columns are discarded later anyway, e.g.
        # "transaction_unique_identifier"
    ],
}

electricity_dataset = {
    "name": "electricity",
    "s3_key": ELECTRICITY_KEY,
    "target": "demand_mw",
    "drop_cols": [
        # no explicit drops required
    ],
}

datasets = [housing_dataset, electricity_dataset]

# Echo the effective configuration so typos are visible before any training starts.
print("Bucket:", BUCKET_NAME)
print("Configured datasets:")
for ds in datasets:
    print(f" - {ds['name']}: s3://{BUCKET_NAME}/{ds['s3_key']} (target='{ds['target']}')")


In [None]:
import os
import boto3
import numpy as np
import pandas as pd

from pandas.api.types import is_string_dtype, is_bool_dtype

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from xgboost import XGBRegressor
import joblib

print("Imports OK")


In [None]:
def clean_nullable_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with pandas nullable dtypes turned into plain dtypes.

    Conversions applied, column by column:
      * string-like columns (incl. pandas StringDtype) -> ``object``
      * bool columns (NumPy ``bool`` or nullable ``boolean``) -> ``float64``
      * nullable numeric extension columns (``Int64``, ``Float64``, ...) ->
        their NumPy dtype when nothing is missing, otherwise ``float64`` so the
        missing entries become ``np.nan``
      * any remaining ``pd.NA`` is replaced with ``np.nan``

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        Converted copy of the input.

    Notes
    -----
    Nullable integers that contain missing values are widened to ``float64``,
    which cannot represent every integer above 2**53 exactly.
    """
    df = df.copy()

    # Convert pandas StringDtype (and other string-like columns) to object
    for col in df.columns:
        if is_string_dtype(df[col]):
            df[col] = df[col].astype("object")

    # Convert bool columns to float so missing values can be stored as NaN
    for col in df.columns:
        if is_bool_dtype(df[col]):
            df[col] = df[col].astype("float64")

    # Convert nullable numeric extension dtypes (Int64, Float64, ...) to NumPy.
    # Only dtypes that expose a numeric `numpy_dtype` are touched, so
    # categoricals, datetimes and other extension types are left alone.
    for col in df.columns:
        dtype = df[col].dtype
        numpy_dtype = getattr(dtype, "numpy_dtype", None)
        if (
            isinstance(dtype, pd.api.extensions.ExtensionDtype)
            and numpy_dtype is not None
            and np.issubdtype(numpy_dtype, np.number)
        ):
            has_missing = bool(df[col].isna().any())
            # Integers cannot hold NaN, so fall back to float64 when values are missing.
            df[col] = df[col].astype("float64" if has_missing else numpy_dtype)

    # Replace pandas NA with numpy nan
    df = df.replace({pd.NA: np.nan})
    return df

print("clean_nullable_dtypes() is ready.")


In [None]:
# Train, tune, evaluate and persist one XGBoost regressor per configured dataset.
# Every pass appends one metrics record to `results`.
s3 = boto3.client("s3")
results = []

for ds in datasets:
    name      = ds["name"]
    key       = ds["s3_key"]
    target    = ds["target"]
    drop_cols = ds.get("drop_cols", [])

    print("\n==============================")
    print(f"Dataset: {name}")
    print(f"S3 path: s3://{BUCKET_NAME}/{key}")
    print(f"Target : {target}")
    print("==============================\n")

    # 1) Download parquet from S3 to the notebook instance
    local_parquet = f"/tmp/{name}.parquet"
    print("Downloading parquet from S3...")
    s3.download_file(BUCKET_NAME, key, local_parquet)
    print("Downloaded to:", local_parquet)

    # 2) Read parquet
    print("Reading parquet into pandas...")
    df = pd.read_parquet(local_parquet)
    print("Raw shape:", df.shape)
    print("Raw dtypes:\n", df.dtypes)

    # 3) Clean nullable dtypes
    df = clean_nullable_dtypes(df)
    print("Dtypes after cleaning:\n", df.dtypes)

    # 3a) Fail early with a clear message if the target is absent; the preview
    #     prints below index the frame by `target` and would otherwise raise a
    #     bare KeyError before the later target check is reached.
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not found in columns: {list(df.columns)}")

    # 3b) Dataset-specific feature engineering
    # ----------------------------------------
    if name == "housing":
        # The housing dataset is expected to ship with 'year' and 'month' columns.
        # Just make sure they are numeric and use them as features.
        if "year" not in df.columns or "month" not in df.columns:
            raise ValueError(
                f"Housing dataset: expected 'year' and 'month' columns, "
                f"found columns: {list(df.columns)}"
            )

        df["year"] = pd.to_numeric(df["year"])
        df["month"] = pd.to_numeric(df["month"])

        print("Housing: using existing 'year' and 'month' columns as features.")
        print(df[["year", "month", target]].head())

    elif name == "electricity":
        # Electricity: create time-based features from the first time column found
        ts_candidates = ["ts", "timestamp", "datetime"]
        ts_col = None
        for c in ts_candidates:
            if c in df.columns:
                ts_col = c
                break

        if ts_col is None:
            raise ValueError(
                f"Electricity dataset: expected a time column in {ts_candidates}, "
                f"found columns: {list(df.columns)}"
            )

        df[ts_col] = pd.to_datetime(df[ts_col])
        df["year"]      = df[ts_col].dt.year
        df["month"]     = df[ts_col].dt.month
        df["dayofweek"] = df[ts_col].dt.dayofweek
        df["hour"]      = df[ts_col].dt.hour

        # Drop original time column (non-numeric, replaced by the parts above)
        df = df.drop(columns=[ts_col])

        print(f"Electricity: created year/month/dayofweek/hour from '{ts_col}'.")
        print(df[["year", "month", "dayofweek", "hour", target]].head())

    # 4) Drop unwanted columns from config (optional)
    drop_now = [c for c in drop_cols if c in df.columns]
    if drop_now:
        df = df.drop(columns=drop_now)
        print("Dropped columns:", drop_now)

    # 5) Make sure target still exists (it may have been listed in drop_cols)
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not found in columns: {list(df.columns)}")

    # 6) Keep only numeric columns (simple approach)
    numeric_df = df.select_dtypes(include=["number"]).copy()
    if target not in numeric_df.columns:
        raise ValueError(
            f"After selecting numeric columns, target '{target}' is missing. "
            f"Numeric columns: {list(numeric_df.columns)}"
        )

    # 6b) Rows without a target value can be neither trained on nor scored,
    #     and a NaN label would make the fit / metrics fail further down.
    missing_target = numeric_df[target].isna()
    if missing_target.any():
        print(f"Dropping {int(missing_target.sum())} rows with missing target '{target}'.")
        numeric_df = numeric_df.loc[~missing_target]
    if numeric_df.empty:
        raise ValueError(
            f"Dataset '{name}': no rows left after removing missing target values."
        )

    print("Numeric df shape:", numeric_df.shape)

    # 7) Build X and y
    X = numeric_df.drop(columns=[target])
    y = numeric_df[target]

    print("Feature columns used for model:", list(X.columns))
    if X.shape[1] == 0:
        raise ValueError(
            f"Dataset '{name}': no feature columns left after preprocessing. "
            f"Check drop_cols and feature engineering."
        )

    # 8) Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
    )

    print("Train shape:", X_train.shape)
    print("Test shape :", X_test.shape)

    # Drop rows with NaNs in features if needed
    train_nulls = X_train.isna().sum().sum()
    test_nulls  = X_test.isna().sum().sum()
    if train_nulls or test_nulls:
        print(f"Found NaNs in features (train={train_nulls}, test={test_nulls}). Dropping rows with NaNs.")
        train_mask = ~X_train.isna().any(axis=1)
        test_mask  = ~X_test.isna().any(axis=1)
        X_train, y_train = X_train[train_mask], y_train[train_mask]
        X_test, y_test   = X_test[test_mask], y_test[test_mask]
        print("After dropping NaNs:")
        print("Train shape:", X_train.shape)
        print("Test shape :", X_test.shape)

    # 9) Base XGB model
    base_xgb = XGBRegressor(
        objective="reg:squarederror",
        random_state=RANDOM_SEED,
        tree_method="hist",     # faster on CPUs
        eval_metric="rmse",
    )

    # 10) Hyperparameter search space (small to avoid burning credits)
    param_dist = {
        "n_estimators": [200, 400, 600],
        "max_depth": [4, 6, 8],
        "learning_rate": [0.01, 0.05, 0.1],
        "subsample": [0.7, 0.85, 1.0],
        "colsample_bytree": [0.7, 0.85, 1.0],
        "reg_lambda": [0.5, 1.0, 2.0],
    }

    # Use neg_mean_squared_error so it works with older sklearn
    tuner = RandomizedSearchCV(
        estimator=base_xgb,
        param_distributions=param_dist,
        n_iter=10,   # 10 x 3-fold = 30 fits per dataset
        scoring="neg_mean_squared_error",
        cv=3,
        verbose=1,
        random_state=RANDOM_SEED,
        n_jobs=-1,
    )

    print("Starting hyperparameter search...")
    tuner.fit(X_train, y_train)
    print("Best params:", tuner.best_params_)

    best_model = tuner.best_estimator_

    # 11) Evaluate on test set
    y_pred = best_model.predict(X_test)

    mse  = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5      # RMSE
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)

    print(f"Test RMSE: {rmse:.3f}")
    print(f"Test MAE : {mae:.3f}")
    print(f"Test R²  : {r2:.3f}")

    # 12) Save model bundle (model + feature names + target) locally and upload to S3
    feature_names = X.columns.tolist()
    model_bundle = {
        "model": best_model,
        "features": feature_names,
        "target": target,
    }
    model_filename = f"xgb_tuned_{name}_bundle.joblib"
    joblib.dump(model_bundle, model_filename)
    model_s3_key = f"models/{model_filename}"
    s3.upload_file(model_filename, BUCKET_NAME, model_s3_key)
    print(f"Saved model bundle to s3://{BUCKET_NAME}/{model_s3_key}")

    results.append({
        "dataset": name,
        "model_type": "XGBRegressor_tuned",
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
        "best_params": tuner.best_params_,
        "model_s3_uri": f"s3://{BUCKET_NAME}/{model_s3_key}",
    })

print("\nTraining loop finished.")


In [None]:
# Summarise the per-dataset metrics, keep a CSV copy locally and, best effort, on S3.
if not results:
    print("No results – check training cell for errors.")
else:
    metrics_df = pd.DataFrame(results)
    display(metrics_df)
    metrics_df.to_csv("aws_xgb_metrics.csv", index=False)
    print("Saved metrics to aws_xgb_metrics.csv")

    # The S3 copy is optional: report a failed upload instead of aborting the cell.
    try:
        s3.upload_file("aws_xgb_metrics.csv", BUCKET_NAME, "models/aws_xgb_metrics.csv")
        print(f"Uploaded metrics to s3://{BUCKET_NAME}/models/aws_xgb_metrics.csv")
    except Exception as upload_error:
        print("Could not upload metrics to S3 (optional):", upload_error)


In [None]:
# Export copies of both trained model bundles to a local folder
# so you can download them from the Jupyter / SageMaker UI.

import os
import boto3

EXPORT_DIR = "/home/ec2-user/SageMaker/model_exports"
os.makedirs(EXPORT_DIR, exist_ok=True)

s3 = boto3.client("s3")

for ds in datasets:
    name = ds["name"]
    # Same file name / key scheme the training cell used when uploading.
    model_filename = f"xgb_tuned_{name}_bundle.joblib"
    model_s3_key = f"models/{model_filename}"
    local_path = os.path.join(EXPORT_DIR, model_filename)

    print(f"\nDownloading {model_filename} from S3...")
    # download_file writes straight to disk (as the training cell does for the
    # parquet inputs) instead of reading the whole object into memory first.
    s3.download_file(BUCKET_NAME, model_s3_key, local_path)

    print(f"Exported model saved at:\n  {local_path}")

print("\nAll model files exported.")
print("You can download them from the left sidebar in Jupyter → Files → model_exports/")


In [None]:
# Interactive prediction for a single house using the trained XGBoost housing model

import boto3
import joblib
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_bool_dtype
from IPython.display import display

print("=== Loading trained housing model bundle from S3 ===")

HOUSING_DATASET_NAME = "housing"  # must match the 'name' in datasets

# 1) Find housing config from the global `datasets` list.
#    Raise a descriptive error instead of an IndexError when it is not configured.
housing_cfg = next(
    (d for d in datasets if d.get("name") == HOUSING_DATASET_NAME), None
)
if housing_cfg is None:
    raise ValueError(
        f"No dataset named '{HOUSING_DATASET_NAME}' found in `datasets`: "
        f"{[d.get('name') for d in datasets]}"
    )
housing_target = housing_cfg["target"]
housing_key = housing_cfg["s3_key"]
housing_drop_cols = housing_cfg.get("drop_cols", [])

# 2) Load model bundle (model + feature names + target) from S3
model_filename = f"xgb_tuned_{HOUSING_DATASET_NAME}_bundle.joblib"
model_s3_key   = f"models/{model_filename}"

s3 = boto3.client("s3")
print(f"Downloading model bundle from s3://{BUCKET_NAME}/{model_s3_key} ...")

# download_file writes straight to disk, matching how the training cell fetches
# its inputs, rather than buffering the whole object in memory.
local_model_path = f"/tmp/{model_filename}"
s3.download_file(BUCKET_NAME, model_s3_key, local_model_path)

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load bundles from a bucket/key you control and trust.
bundle = joblib.load(local_model_path)
model = bundle["model"]
feature_names = bundle["features"]
target_col = bundle["target"]

print("Model bundle loaded.")
print("Feature columns used by the model:", feature_names)
print("Target column:", target_col)

# 3) Fetch the housing data again and run it through the same preprocessing
#    steps as training, so defaults and value ranges come from comparable columns.
print("\n=== Loading housing dataset for defaults & value ranges ===")
local_parquet = "/tmp/housing_for_prediction.parquet"
s3.download_file(BUCKET_NAME, housing_key, local_parquet)
df = clean_nullable_dtypes(pd.read_parquet(local_parquet))

# 'year' and 'month' must both be present; otherwise derive them from the
# first recognised date column.
has_year_and_month = "year" in df.columns and "month" in df.columns
if not has_year_and_month:
    date_candidates = ["date_of_transfer", "date", "Date", "transfer_date"]
    date_col = next((c for c in date_candidates if c in df.columns), None)
    if date_col is None:
        raise ValueError(
            f"Housing dataset: expected 'year'/'month' or a date column in {date_candidates}, "
            f"found columns: {list(df.columns)}"
        )
    df[date_col] = pd.to_datetime(df[date_col])
    df["year"] = df[date_col].dt.year
    df["month"] = df[date_col].dt.month

# Coerce both calendar columns to numeric, as the training cell does.
for time_part in ("year", "month"):
    df[time_part] = pd.to_numeric(df[time_part])

# Remove the columns listed in the housing config, if any are present.
drop_now = [c for c in housing_drop_cols if c in df.columns]
if drop_now:
    df = df.drop(columns=drop_now)

# Only numeric columns take part; the target has to be among them.
numeric_df = df.select_dtypes(include=["number"]).copy()
if target_col not in numeric_df.columns:
    raise ValueError(
        f"After filtering numeric columns, target '{target_col}' is missing. "
        f"Numeric columns: {list(numeric_df.columns)}"
    )

# Strip the target, then select exactly the bundle's feature columns in its order.
feature_df = numeric_df.drop(columns=[target_col])[feature_names]

print("Sample of numeric training features:")
display(feature_df.head())

# 4) Interactively ask the user for each feature.
#    For every model feature a default and a hint about plausible values are
#    derived from the reloaded data; the answers are collected in `row`.
print("\n=== Interactive input for a single house ===")
print("For each feature, you’ll see:")
print(" - A suggested default value (from the training data)")
print(" - The expected or possible values")
print("Press ENTER to accept the default.\n")

row = {}

for col in feature_names:
    series = feature_df[col].dropna()

    # If column somehow empty, just default to 0
    if series.empty:
        default = 0.0
        expected_info = "No data available (defaulting to 0.0)."
    else:
        # Choose sensible default from training data
        if is_numeric_dtype(series):
            default = float(series.median())
        elif is_bool_dtype(series):
            default = bool(series.mode().iloc[0])
        else:
            default = str(series.mode().iloc[0])

        # Build "expected values" info
        unique_vals = np.unique(series.values)
        if len(unique_vals) <= 20:
            # Few distinct values: list them all
            expected_info = f"Expected values: {list(unique_vals)}"
        else:
            if is_numeric_dtype(series):
                expected_info = (
                    f"Expected numeric range: approx [{series.min():.3g}, {series.max():.3g}] "
                    f"(median={series.median():.3g})"
                )
            else:
                top_vals = series.value_counts().head(10).index.tolist()
                expected_info = f"Most common categories: {top_vals} (total unique={len(unique_vals)})"

    print(f"\nFeature: '{col}'")
    print(f"  {expected_info}")
    print(f"  Default: {default}")

    user_in = input(f"Enter value for '{col}' (press ENTER for default): ").strip()

    if user_in == "":
        value = default
    else:
        # Try to cast to appropriate type
        if is_numeric_dtype(series):
            # Decide int vs float based on dtype kind
            kind = series.dtype.kind
            if kind in ["i", "u"]:  # integer types
                try:
                    value = int(float(user_in))
                except (ValueError, OverflowError):
                    # OverflowError: inputs such as "inf" or "1e999" parse as an
                    # infinite float, which int() cannot convert.
                    print(f"  Could not parse integer, falling back to default for '{col}'.")
                    value = default
            else:
                try:
                    value = float(user_in)
                except ValueError:
                    print(f"  Could not parse number, falling back to default for '{col}'.")
                    value = default
        elif is_bool_dtype(series):
            value = user_in.lower() in ["1", "true", "yes", "y", "t"]
        else:
            value = user_in  # keep as string (shouldn't happen here; model uses numeric)

    row[col] = value

# 5) Turn the collected answers into a one-row frame (columns in the bundle's
#    feature order), show it, and run it through the model.
input_df = pd.DataFrame([row]).loc[:, feature_names]
print("\n=== You entered the following values ===")
display(input_df)

predictions = model.predict(input_df)
pred = predictions[0]

print(f"\nEstimated {target_col} for this house: {pred:,.2f}")
