
# Train & Test (SageMaker SKLearn) — **Updated full demo**

This notebook trains a scikit-learn model on Amazon SageMaker using the provided `src/train_script.py`, then
**tests predictions via Batch Transform** (cheap & no always-on endpoint).

> If you still want a *real-time endpoint*, a ready-to-run cell is included but **left disabled by default** to avoid charges.


In [None]:
# (Optional) Ensure the required libraries are installed. In SageMaker Studio they are usually preinstalled.
# Uncomment if you see import errors.
# %pip install --upgrade sagemaker boto3 pandas scikit-learn


In [None]:
import os, json, boto3, time, logging, pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn

# ---------- Configuration ----------
# REQUIRED: S3 bucket name WITHOUT the s3:// prefix, e.g. "harry-ml-demo-ca-central-1".
# Read from the SM_DEMO_BUCKET environment variable when it is set; otherwise the
# "your-bucket" placeholder below is used and has to be edited by hand.
S3_BUCKET = os.environ.get("SM_DEMO_BUCKET", "your-bucket")

# Object key of the training CSV inside the bucket (a later cell uploads it if missing).
S3_DATA_KEY = "datasets/house_prices.csv"

# Local copy of the CSV in this repo, relative to this notebook (notebooks/).
LOCAL_RELATIVE_CSV = "../data/house_prices.csv"

# Training code: the directory handed to the estimator and the script inside it.
SOURCE_DIR = "../src"
ENTRY_POINT = "train_script.py"

# SageMaker SKLearn image version, and the training instance type
# (ml.m5.large was verified to work in ca-central-1).
FRAMEWORK_VERSION = "1.2-1"
TRAIN_INSTANCE_TYPE = "ml.m5.large"

# ---------- Session / IAM role ----------
sess = sagemaker.Session()
region = sess.boto_region_name

try:
    role = get_execution_role()
except Exception:
    # Fallback for local testing (not expected to trigger in SageMaker Studio):
    # resolve the ARN of the IAM role named "SageMakerExecutionRole".
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="SageMakerExecutionRole")
    role = role_description["Role"]["Arn"]

# Echo the effective settings so a misconfiguration is visible before training.
for label, value in (
    ("Region:", region),
    ("Role:", role),
    ("S3 bucket:", S3_BUCKET),
    ("S3 key:", S3_DATA_KEY),
):
    print(label, value)


In [None]:
# Module-level S3 client bound to the session's region; ensure_data_in_s3 uses it.
s3 = boto3.client("s3", region_name=region)

def ensure_data_in_s3(bucket, key, local_path, s3_client=None):
    """Make sure ``s3://<bucket>/<key>`` exists, uploading ``local_path`` if not.

    The object is probed with ``head_object``. Only a "not found" answer
    triggers an upload; any other client error (for example a 403 access
    denied) is re-raised instead of being mistaken for a missing object.

    Args:
        bucket: Name of the S3 bucket (no ``s3://`` prefix).
        key: Object key inside the bucket.
        local_path: Path of the local file to upload when the object is missing.
        s3_client: Optional S3 client to use. Defaults to the module-level
            ``s3`` client, which keeps existing three-argument calls working.

    Raises:
        FileNotFoundError: The object is missing and ``local_path`` is not an
            existing file.
        ClientError: ``head_object`` failed for a reason other than
            "not found" (the client's own ``exceptions.ClientError`` type).
    """
    client = s3 if s3_client is None else s3_client

    try:
        client.head_object(Bucket=bucket, Key=key)
    except client.exceptions.ClientError as e:
        # HEAD responses carry no body, so a missing key is reported with the
        # bare HTTP status "404"; the named codes are accepted as well.
        error_code = str(e.response.get("Error", {}).get("Code", ""))
        if error_code not in ("404", "NoSuchKey", "NotFound"):
            raise
        print("S3 object not found, will upload local file...", e)
    else:
        print(f"Found existing S3 object: s3://{bucket}/{key}")
        return

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not os.path.isfile(local_path):
        raise FileNotFoundError(f"Local file not found: {local_path}")

    # Reuse the same client for the upload instead of building a second resource.
    client.upload_file(local_path, bucket, key)
    print(f"Uploaded to s3://{bucket}/{key}")

# Check that the training CSV is in S3, uploading the local copy if it is not there yet.
ensure_data_in_s3(S3_BUCKET, S3_DATA_KEY, LOCAL_RELATIVE_CSV)



## Train

We use an **SKLearn** estimator with image version `1.2-1`. The training code lives in `../src/train_script.py`.
The training job runs on an **ml.m5.large** instance (verified available in ca-central-1).


In [None]:
# S3 location of the training CSV, passed to fit() under the 'train' key.
inputs = {'train': f's3://{S3_BUCKET}/{S3_DATA_KEY}'}

# Collect the estimator settings in one place before constructing it.
estimator_kwargs = dict(
    entry_point=ENTRY_POINT,
    source_dir=SOURCE_DIR,
    framework_version=FRAMEWORK_VERSION,
    role=role,
    instance_count=1,
    instance_type=TRAIN_INSTANCE_TYPE,  # training
)
estimator = SKLearn(**estimator_kwargs)

# Launch the training job.
estimator.fit(inputs)



## Test using Batch Transform (recommended for demo)

This creates a one-off transform job that runs the model on a small CSV input and writes predictions to S3.
It's cheaper than creating a real-time endpoint and avoids instance-type quirks.


In [None]:
import os
import tempfile
import uuid

import boto3

# Tiny test payload, one sample per line (adapt the format to what
# train_script.py expects). Example row: '1200,3'  =>  area=1200, rooms=3
sample_rows = ["1200,3", "1500,4"]
test_payload_local = os.path.join(tempfile.gettempdir(), "bt_input.csv")
with open(test_payload_local, "w") as payload_file:
    payload_file.writelines(f"{row}\n" for row in sample_rows)

# Upload the payload under a fresh random prefix so repeated runs do not collide.
bt_input_key = f"batch-inputs/{uuid.uuid4().hex}/input.csv"
demo_bucket = boto3.resource("s3", region_name=region).Bucket(S3_BUCKET)
demo_bucket.upload_file(test_payload_local, bt_input_key)
s3_input_uri = f"s3://{S3_BUCKET}/{bt_input_key}"
print("Batch Transform input:", s3_input_uri)

# Build the transformer from the trained estimator (short-lived job).
transformer_kwargs = dict(
    accept="text/csv",
    assemble_with="Line",
    instance_type="ml.m5.large",
    instance_count=1,
)
transformer = estimator.transformer(**transformer_kwargs)

# Start the transform job on the uploaded CSV and block until it has finished.
transform_kwargs = dict(
    data=s3_input_uri,
    content_type="text/csv",
    split_type="Line",
)
transformer.transform(**transform_kwargs)
transformer.wait()

print("Transform output S3:", transformer.output_path)



## (Optional) Real-time endpoint

> **Not run by default** to avoid charges. If you want to try it, uncomment the cells below and run them in order (deploy, test, delete).
> Instance types like `ml.t2.micro` are **not** supported for SKLearn in many regions; `ml.m5.large` is safe but costlier.


In [None]:
# --- UNCOMMENT TO DEPLOY A REAL-TIME ENDPOINT ---
# from sagemaker.serializers import CSVSerializer
# predictor = estimator.deploy(
#     initial_instance_count=1,
#     instance_type="ml.m5.large",
# )
# predictor.serializer = CSVSerializer()
# print("Endpoint name:", predictor.endpoint_name)


In [None]:
# --- UNCOMMENT TO TEST THE ENDPOINT ---
# payload = "1200,3"
# result = predictor.predict(payload)
# print("Prediction:", result)


In [None]:
# --- UNCOMMENT TO DELETE THE ENDPOINT WHEN DONE ---
# try:
#     predictor.delete_endpoint()
#     print("Endpoint deleted:", predictor.endpoint_name)
# except Exception as e:
#     print("Nothing to delete or error deleting endpoint:", e)
