## AAI-540 ML OPS
## Final Project : YELP REVIEWS 
## Team ::: Group 9 
## 1) Sandeep Kumar Jakkaraju
## 2) Harish Archarya
## 3) Angshuman Roy

# 📓 Yelp Reviews: Integrated Pipeline (Supervised Rating + KMeans Analysis)

This single notebook **merges** the two prior notebooks into one coherent workflow that **ends in a trained, evaluated ML model** for **predicting star ratings** from review **Text**.  
It also includes an **optional KMeans analysis** section (from your previous KMeans notebook) for exploration—**it does not alter the supervised pipeline** unless you choose to.

**What you get end-to-end:**
- Data Sources
- Data Engineering (Processing)
- Feature Engineering (TF‑IDF)
- **Supervised Model Training & Evaluation** (Logistic Regression multiclass)
- (Optional) **KMeans** clustering for insight
- (Optional) Feature Store ingest
- (Optional) Deployment + Monitoring
- CI/CD‑friendly layout via helper scripts

> Columns used: **`Text`** (features) and **`Stars (Review)`** (label in 1–5).


## 🔧 Setup

In [150]:
import os, boto3, sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# Resolve the AWS / SageMaker context that the later cells read.
sess = Session()
region = boto3.Session().region_name
role = get_execution_role()
bucket = sess.default_bucket()  # <<< change if you want a fixed bucket
prefix = "yelp-integrated-pipeline"  # <<< tweak as needed

# Local folder for helper scripts.
SRC_DIR = "src"
os.makedirs(SRC_DIR, exist_ok=True)

# Echo the resolved settings, one per line.
for label, value in (
    ("region", region),
    ("role", role),
    ("bucket", bucket),
    ("prefix", prefix),
):
    print(f"{label}:", value)

region: eu-central-1
role: arn:aws:iam::495599743560:role/WNTMLSagemakerTrainingRole
bucket: sagemaker-eu-central-1-495599743560
prefix: yelp-integrated-pipeline


## 🧩 Write Helper Scripts (Processing, Training, Inference, Evaluation)

We generate consistent helper scripts under `src_scripts/` so SageMaker jobs can run them.


In [151]:
import os
from pathlib import Path

# Directory for helper scripts
# NOTE(review): this rebinds SRC_DIR. The setup cell bound it to the string "src";
# from this point on it is a pathlib.Path pointing at "src_scripts".
SRC_DIR = Path("src_scripts")
SRC_DIR.mkdir(parents=True, exist_ok=True)

# -------------------------------------------------------------
# 1️⃣ Processing Script
# -------------------------------------------------------------
# Source of processing_supervised.py. The script downloads the raw CSV from S3,
# cleans review text and star labels, fits TF-IDF on the training split, and
# uploads the matrices, labels, vectorizer and a features.csv table to the
# output prefix. (Comments inside the literal use '#' only, because a
# triple-quoted docstring would terminate the enclosing string.)
processing_script = """\
import argparse
import hashlib
import os
import re
import string
import time

import boto3
import joblib
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

PROCESSING_ROOT = "/opt/ml/processing"
OUTPUT_DIR = PROCESSING_ROOT + "/output"
ARTIFACT_FILES = ["X_train.npz", "X_test.npz", "y_train.npy", "y_test.npy", "features.csv"]
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def basic_clean(s):
    # Lower-case, drop ASCII punctuation and collapse whitespace.
    # Non-string input yields an empty string.
    if not isinstance(s, str):
        return ""
    s = s.lower().translate(_PUNCT_TABLE)
    # Collapse whitespace last so removed punctuation cannot leave double spaces.
    return re.sub(r"\\s+", " ", s).strip()


def mk_review_id(text):
    # Deterministic identifier: SHA-1 hex digest of the UTF-8 encoded text.
    return hashlib.sha1(text.encode("utf-8")).hexdigest()


def split_s3_uri(uri):
    # "s3://bucket/some/key" -> ("bucket", "some/key"); raises ValueError otherwise.
    if not uri.startswith("s3://"):
        raise ValueError(f"Expected an s3:// URI, got: {uri}")
    bucket, _, key = uri[len("s3://"):].partition("/")
    return bucket, key.strip("/")


def clean_labels(df, label_col):
    # Return a copy of df whose label column holds integer stars clipped to 1..5.
    # Rows with a missing or non-numeric label are dropped: astype(int) raises on
    # NaN, which would otherwise abort the whole job on a single bad row.
    stars = pd.to_numeric(df[label_col], errors="coerce")
    valid = stars.notna()
    out = df.loc[valid].copy()
    out[label_col] = stars[valid].clip(1, 5).astype(int)
    return out


def build_fs_table(df, text_col, label_col):
    # Per-review summary features for the Feature Store; one shared event_time.
    event_time = int(time.time())
    words = df[text_col].str.split()
    return pd.DataFrame({
        "review_id": df[text_col].astype(str).map(mk_review_id),
        "event_time": event_time,
        "num_chars": df[text_col].str.len(),
        "num_words": words.map(len),
        "avg_word_len": words.map(lambda w: np.mean([len(x) for x in w]) if w else 0),
        "stars": df[label_col].astype(int),
    })


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--input_data", required=True)
    p.add_argument("--output_data", required=True)
    p.add_argument("--text_column", default="Text")
    p.add_argument("--label_column", default="Stars (Review)")
    p.add_argument("--max_features", type=int, default=30000)
    p.add_argument("--test_size", type=float, default=0.2)
    p.add_argument("--random_state", type=int, default=42)
    args = p.parse_args()

    for sub in ("input", "output", "temp"):
        os.makedirs(os.path.join(PROCESSING_ROOT, sub), exist_ok=True)

    s3 = boto3.client("s3")
    in_bucket, in_key = split_s3_uri(args.input_data)
    local_csv = os.path.join(PROCESSING_ROOT, "input", "raw.csv")
    s3.download_file(in_bucket, in_key, local_csv)

    df = pd.read_csv(local_csv)
    # Drop missing reviews first: astype(str) would turn NaN into the literal text "nan".
    df = df.dropna(subset=[args.text_column]).copy()
    df[args.text_column] = df[args.text_column].astype(str).map(basic_clean)
    df = df[df[args.text_column].str.len() > 0]
    # Clean labels on the frame itself so the model targets and features.csv agree.
    df = clean_labels(df, args.label_column)
    y = df[args.label_column]

    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        df[args.text_column],
        y,
        test_size=args.test_size,
        random_state=args.random_state,
        stratify=y,
    )

    # Fit on the training split only so test-set vocabulary does not leak in.
    tfidf = TfidfVectorizer(max_features=args.max_features, ngram_range=(1, 2))
    X_train = tfidf.fit_transform(X_train_txt)
    X_test = tfidf.transform(X_test_txt)

    model_dir = os.path.join(OUTPUT_DIR, "model")
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(tfidf, os.path.join(model_dir, "tfidf.joblib"))

    sparse.save_npz(os.path.join(OUTPUT_DIR, "X_train.npz"), X_train)
    sparse.save_npz(os.path.join(OUTPUT_DIR, "X_test.npz"), X_test)
    np.save(os.path.join(OUTPUT_DIR, "y_train.npy"), y_train.to_numpy())
    np.save(os.path.join(OUTPUT_DIR, "y_test.npy"), y_test.to_numpy())

    fs_df = build_fs_table(df, args.text_column, args.label_column)
    fs_df.to_csv(os.path.join(OUTPUT_DIR, "features.csv"), index=False)

    out_bucket, out_prefix = split_s3_uri(args.output_data)
    s3.upload_file(os.path.join(model_dir, "tfidf.joblib"), out_bucket, f"{out_prefix}/tfidf.joblib")
    for name in ARTIFACT_FILES:
        s3.upload_file(os.path.join(OUTPUT_DIR, name), out_bucket, f"{out_prefix}/{name}")


if __name__ == "__main__":
    main()
"""

# -------------------------------------------------------------
# 2️⃣ Training Script
# -------------------------------------------------------------
# Source of training_supervised.py. The script:
#   * parses --C, --max_iter, --processed_data (required S3 prefix) and
#     --model_dir (default: the SM_MODEL_DIR env var, else /opt/ml/model);
#   * downloads X_train.npz, y_train.npy and tfidf.joblib from that prefix to /tmp;
#   * fits LogisticRegression (saga solver, multinomial) on the sparse matrix;
#   * writes model.joblib and tfidf.joblib into --model_dir.
# NOTE(review): the embedded script imports shutil but never uses it.
training_script = """\
import argparse, os, boto3, joblib, numpy as np, shutil
from scipy import sparse
from sklearn.linear_model import LogisticRegression

def main():
    p = argparse.ArgumentParser()
    p.add_argument("--C", type=float, default=1.0)
    p.add_argument("--max_iter", type=int, default=1000)
    p.add_argument("--processed_data", required=True)
    p.add_argument("--model_dir", default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
    args = p.parse_args()

    s3 = boto3.client("s3")
    bucket = args.processed_data.split("/")[2]
    prefix = "/".join(args.processed_data.split("/")[3:])

    tmp_dir = "/tmp/"
    os.makedirs(tmp_dir, exist_ok=True)

    # Download training artifacts
    X_local = "/tmp/X_train.npz"
    y_local = "/tmp/y_train.npy"
    tfidf_local = "/tmp/tfidf.joblib"
    s3.download_file(bucket, f"{prefix}/X_train.npz", X_local)
    s3.download_file(bucket, f"{prefix}/y_train.npy", y_local)
    s3.download_file(bucket, f"{prefix}/tfidf.joblib", tfidf_local)

    X_train = sparse.load_npz(X_local)
    y_train = np.load(y_local)
    tfidf = joblib.load(tfidf_local)

    clf = LogisticRegression(C=args.C, max_iter=args.max_iter, solver="saga", n_jobs=-1, multi_class="multinomial")
    clf.fit(X_train, y_train)

     # Save model and vectorizer directly to SM_MODEL_DIR
    os.makedirs(args.model_dir, exist_ok=True)
    model_path = os.path.join(args.model_dir, "model.joblib")
    tfidf_path = os.path.join(args.model_dir, "tfidf.joblib")
    
    joblib.dump(clf, model_path)
    joblib.dump(tfidf, tfidf_path)
    
    print(f"✅ Saved model to {model_path}")
    print(f"✅ Saved TF-IDF vectorizer to {tfidf_path}")


if __name__ == "__main__":
    main()
"""

# -------------------------------------------------------------
# 3️⃣ Inference Script
# -------------------------------------------------------------
# Source of inference_supervised.py: the model_fn / input_fn / predict_fn /
# output_fn hooks used by the serving container. (Comments inside the literal
# use '#' only, because a triple-quoted docstring would terminate the string.)
inference_script = """\
import json
import os

import joblib

DEFAULT_MODEL_DIR = "/opt/ml/model"
FALLBACK_DIR = "/tmp"


def find_model_file(filename, model_dir=DEFAULT_MODEL_DIR):
    # Return the first existing path for filename, preferring model_dir over /tmp.
    # Raises FileNotFoundError when neither location has the file.
    for folder in (model_dir, FALLBACK_DIR):
        candidate = os.path.join(folder, filename)
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"Missing file: {filename}")


def model_fn(model_dir):
    # Load the classifier and the TF-IDF vectorizer from the directory the
    # container passes in. Files are loaded in place: copying them to /tmp first
    # raised shutil.SameFileError whenever the artifact was already under /tmp.
    clf = joblib.load(find_model_file("model.joblib", model_dir))
    vect = joblib.load(find_model_file("tfidf.joblib", model_dir))
    return {"model": clf, "vectorizer": vect}


def input_fn(request_body, request_content_type="application/json"):
    # Turn a request body into a list of review texts.
    #   application/json: {"texts": [...]} or {"texts": "..."}, or a bare list/string
    #   text/csv:         one review per non-blank line
    # Raises ValueError for any other content type.
    if isinstance(request_body, (bytes, bytearray)):
        # Defensive: decode raw bytes so splitlines() yields str, not bytes.
        request_body = request_body.decode("utf-8")
    if request_content_type == "application/json":
        payload = json.loads(request_body)
        texts = payload.get("texts", []) if isinstance(payload, dict) else payload
        if isinstance(texts, str):
            texts = [texts]
        return texts
    if request_content_type == "text/csv":
        return [line for line in request_body.splitlines() if line.strip()]
    raise ValueError(f"Unsupported content type: {request_content_type}")


def predict_fn(inputs, model):
    # Vectorize the texts and return predicted stars plus class probabilities
    # (None when the classifier has no predict_proba).
    vect = model["vectorizer"]
    clf = model["model"]
    has_proba = hasattr(clf, "predict_proba")
    if not inputs:
        # Nothing to score; skip the estimator, which cannot handle zero samples.
        return {"predictions": [], "probabilities": [] if has_proba else None}
    X = vect.transform(inputs)
    preds = clf.predict(X).tolist()
    proba = clf.predict_proba(X).tolist() if has_proba else None
    return {"predictions": preds, "probabilities": proba}


def output_fn(prediction, accept="application/json"):
    # Always answers with JSON, whatever the requested accept type.
    return json.dumps(prediction), "application/json"
"""

# -------------------------------------------------------------
# 4️⃣ Evaluation Script
# -------------------------------------------------------------
# Source of evaluation_supervised.py. The script:
#   * downloads the model.tar.gz named by --model_artifact, extracts it under
#     /opt/ml/processing/temp/model and loads model.joblib from it;
#   * downloads X_test.npz and y_test.npy from the --processed_data S3 prefix;
#   * computes accuracy, macro F1, the confusion matrix and the classification
#     report, prints them as JSON and writes them to <--output>/evaluation.json.
# NOTE(review): tarfile.extractall is called without member filtering, so it
# should only be pointed at archives produced by our own training job.
# NOTE(review): no cell visible in this notebook runs this script.
evaluation_script = """\
import argparse, os, json, boto3, joblib, numpy as np, tarfile
from scipy import sparse
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

def extract_model_tar(model_artifact, local_dir):
    os.makedirs(local_dir, exist_ok=True)
    tar_path = os.path.join(local_dir, "model.tar.gz")
    s3 = boto3.client("s3")
    b = model_artifact.split("/")[2]
    k = "/".join(model_artifact.split("/")[3:])
    s3.download_file(b, k, tar_path)
    with tarfile.open(tar_path) as t:
        t.extractall(local_dir)
    return os.path.join(local_dir, "model.joblib")

def main():
    p = argparse.ArgumentParser()
    p.add_argument("--processed_data", required=True)
    p.add_argument("--model_artifact", required=True)
    p.add_argument("--output", default="/opt/ml/processing/output/evaluation")
    args = p.parse_args()

    os.makedirs(args.output, exist_ok=True)
    model_path = extract_model_tar(args.model_artifact, "/opt/ml/processing/temp/model")
    model = joblib.load(model_path)

    s3 = boto3.client("s3")
    pb = args.processed_data.split("/")[2]
    pp = "/".join(args.processed_data.split("/")[3:])
    Xte = "/opt/ml/processing/temp/X_test.npz"
    yte = "/opt/ml/processing/temp/y_test.npy"
    os.makedirs(os.path.dirname(Xte), exist_ok=True)
    s3.download_file(pb, f"{pp}/X_test.npz", Xte)
    s3.download_file(pb, f"{pp}/y_test.npy", yte)

    X_test = sparse.load_npz(Xte)
    y_test = np.load(yte)
    y_pred = model.predict(X_test)

    acc = float(accuracy_score(y_test, y_pred))
    f1m = float(f1_score(y_test, y_pred, average="macro"))
    cm = confusion_matrix(y_test, y_pred).tolist()
    report = classification_report(y_test, y_pred, output_dict=True)

    results = {"accuracy": acc, "f1_macro": f1m, "confusion_matrix": cm, "classification_report": report}
    with open(os.path.join(args.output, "evaluation.json"), "w") as f:
        json.dump(results, f)
    print(json.dumps(results, indent=2))

if __name__ == "__main__":
    main()
"""

# -------------------------------------------------------------
# 5️⃣ Write all scripts to src_scripts/
# -------------------------------------------------------------
# Target filename -> script source. Each file is written with surrounding
# whitespace stripped and exactly one trailing newline.
script_sources = {
    "processing_supervised.py": processing_script,
    "training_supervised.py": training_script,
    "inference_supervised.py": inference_script,
    "evaluation_supervised.py": evaluation_script,
}
for filename, source_text in script_sources.items():
    target = SRC_DIR / filename
    target.write_text(source_text.strip() + "\n", encoding="utf-8")

# List whatever is now present in the script folder.
print("✅ All helper scripts written successfully:")
for script_path in SRC_DIR.iterdir():
    print("  -", script_path.name)


✅ All helper scripts written successfully:
  - inference_supervised.py
  - processing_supervised.py
  - training_supervised.py
  - evaluation_supervised.py


## 📦 Data Sources

In [152]:
# S3 locations: the raw input CSV and the prefix that receives processed artifacts.
_pipeline_root = f"s3://{bucket}/{prefix}"
raw_s3 = f"{_pipeline_root}/raw/yelp_reviews.csv"   # Must contain 'Text' and 'Stars (Review)'
proc_s3 = f"{_pipeline_root}/processed"
for tag, uri in (("RAW", raw_s3), ("PROC", proc_s3)):
    print(f"{tag}:", uri)

RAW: s3://sagemaker-eu-central-1-495599743560/yelp-integrated-pipeline/raw/yelp_reviews.csv
PROC: s3://sagemaker-eu-central-1-495599743560/yelp-integrated-pipeline/processed


## 🧹 Processing (TF‑IDF + Splits + features.csv)

In [153]:
from sagemaker.processing import ScriptProcessor
import sagemaker

# Container image for the scikit-learn 1.2-1 environment in this region.
sklearn_image_uri = sagemaker.image_uris.retrieve("sklearn", region, version="1.2-1")

script_processor = ScriptProcessor(
    command=["python3"],
    image_uri=sklearn_image_uri,
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

# Command-line flags forwarded to processing_supervised.py, in flag/value order.
processing_flags = {
    "--input_data": raw_s3,
    "--output_data": proc_s3,
    "--text_column": "Text",
    "--label_column": "Stars (Review)",
    "--max_features": "30000",
    "--test_size": "0.2",
    "--random_state": "42",
}
job_arguments = [token for flag, val in processing_flags.items() for token in (flag, val)]

# The script downloads and uploads through boto3 itself, so no job-level
# inputs or outputs are declared.
script_processor.run(
    code=os.path.join(SRC_DIR, "processing_supervised.py"),
    arguments=job_arguments,
    inputs=[],
    outputs=[],
)
print("Processing job complete.")

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2025-10-20-11-25-10-201


...............
..Processing job complete.


## 🔬 Optional: KMeans Analysis (from the unsupervised notebook)

This section reproduces a KMeans view purely for **insight**. It downloads TF‑IDF artifacts from S3,
trains KMeans **locally in the notebook kernel** on a sample (or full set if memory allows), and prints
top terms per cluster. It **does not modify** the supervised training inputs unless you explicitly change that.


In [154]:
import os, io, joblib, boto3, numpy as np
from scipy import sparse
from sklearn.cluster import KMeans

# Fetch the fitted TF-IDF vectorizer and the training matrix from the processed prefix.
s3 = boto3.client("s3")
_uri_parts = proc_s3.split("/")
b, p = _uri_parts[2], "/".join(_uri_parts[3:])
local_tfidf, local_Xtr = "tfidf.joblib", "X_train.npz"
for artifact in (local_tfidf, local_Xtr):
    s3.download_file(b, f"{p}/{artifact}", artifact)

tfidf = joblib.load(local_tfidf)
X_train = sparse.load_npz(local_Xtr)

# Cap the clustering input so speed and memory stay manageable; the row sample
# is drawn with a fixed seed.
total_rows = X_train.shape[0]
n_samples = min(total_rows, 20000)   # <<< adjust
if n_samples >= total_rows:
    X_km = X_train
else:
    idx = np.random.RandomState(42).choice(total_rows, size=n_samples, replace=False)
    X_km = X_train[idx]

k = 20  # <<< number of clusters
km = KMeans(n_clusters=k, max_iter=300, n_init="auto", random_state=42).fit(X_km)

# Print the ten highest-weighted terms of every cluster centre, strongest first.
terms = tfidf.get_feature_names_out()
centers = km.cluster_centers_
for cluster_id, center in enumerate(centers):
    strongest = center.argsort()[:-11:-1]
    top_terms = [terms[j] for j in strongest]
    print(f"Cluster {cluster_id}: {', '.join(top_terms)}")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Cluster 0: pizza, the, the pizza, and, was, is, of, to, it, crust
Cluster 1: ramen, the, and, is, broth, was, noodles, to, pork, the ramen
Cluster 2: the, and, was, menu, to, the menu, on, food, of, is
Cluster 3: you, the, and, to, is, if, if you, your, are, for
Cluster 4: the, to, and, back, was, go, this, place, it, for
Cluster 5: the, to, and, it, was, that, for, of, my, me
Cluster 6: the, and, to, of, we, was, in, with, were, for
Cluster 7: the, is, and, place, this, of, in, to, you, this place
Cluster 8: great, and, the, food, friendly, service, staff, is, very, friendly staff
Cluster 9: the, was, and, it, of, to, with, we, for, were
Cluster 10: thai, pad, pad thai, the, and, thai food, was, food, is, curry
Cluster 11: chicken, the, was, and, the chicken, it, of, to, fried, with
Cluster 12: the, is, and, are, of, to, food, always, in, they
Cluster 13: sushi, the, and, is, was, of, the sushi, to, roll, rolls
Cluster 14: good, the, and, is, but, pretty, food, are, for, service
Clust

## 🗂️ Feature Store (Optional)

Create a Feature Group from the engineered features (`features.csv`) and ingest to **Online + Offline**.


In [155]:
import io, time, pandas as pd
import inspect
from sagemaker.feature_store.feature_group import FeatureGroup
from time import gmtime, strftime

# Load features.csv. The processing script uploads it directly under the
# processed prefix ("<prefix>/features.csv"), so read it from that same key.
s3 = boto3.client("s3", region_name=region)
pb = proc_s3.split("/")[2]
pp = "/".join(proc_s3.split("/")[3:])
feat_key = f"{pp}/features.csv"
obj = s3.get_object(Bucket=pb, Key=feat_key)
fs_df = pd.read_csv(io.BytesIO(obj["Body"].read()))

# Enforce FS-friendly dtypes
fs_df["review_id"] = fs_df["review_id"].astype(str)
numeric_dtypes = {
    "event_time": "float64",
    "num_chars": "int64",
    "num_words": "int64",
    "avg_word_len": "float64",
    "stars": "int64",
}
for col in numeric_dtypes:
    fs_df[col] = pd.to_numeric(fs_df[col], errors="coerce")
# Coercion turns unparseable cells into NaN, and an int64 cast raises on NaN,
# so drop incomplete rows before casting.
fs_df = fs_df.dropna(subset=list(numeric_dtypes)).astype(numeric_dtypes)

fg_name = "yelp-review-features-" + strftime("%d-%H-%M-%S", gmtime())
fg = FeatureGroup(name=fg_name, sagemaker_session=sess)
fg.load_feature_definitions(data_frame=fs_df)
fg.create(
    s3_uri=f"s3://{bucket}/{prefix}/feature-store",
    record_identifier_name="review_id",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True,
)


def wait_fg(feature_group, poll_seconds=5, timeout_seconds=900):
    """Block until the feature group leaves the "Creating" state.

    Args:
        feature_group: object exposing ``describe()`` and ``name``.
        poll_seconds: pause between status checks.
        timeout_seconds: maximum total time to keep polling.

    Raises:
        RuntimeError: if the group is still "Creating" after the timeout, or
            ends in any status other than "Created".
    """
    deadline = time.monotonic() + timeout_seconds
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        if time.monotonic() > deadline:
            raise RuntimeError(
                f"Feature Group still Creating after {timeout_seconds}s"
            )
        print("Waiting for Feature Group Creation...")
        time.sleep(poll_seconds)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Feature Group not created. Status={status}")
    print("Feature Group created:", feature_group.name)


def _resolve_target_stores():
    """Return [online, offline] TargetStore members, or None if the SDK lacks them."""
    try:
        from sagemaker.feature_store.inputs import TargetStore
    except ImportError:
        return None
    # SDK enums vary by version, so probe the known spellings.
    for online, offline in [
        ("ONLINE", "OFFLINE"),
        ("ONLINE_STORE", "OFFLINE_STORE"),
        ("OnlineStore", "OfflineStore"),
    ]:
        if hasattr(TargetStore, online) and hasattr(TargetStore, offline):
            return [getattr(TargetStore, online), getattr(TargetStore, offline)]
    return None


wait_fg(fg)

# Ingest exactly once. Target stores are passed only when this SDK version
# supports them, so a genuine ingestion failure surfaces instead of silently
# triggering a second full ingest.
ingest_kwargs = {"data_frame": fs_df, "max_workers": 4, "wait": True}
target_stores = _resolve_target_stores()
if target_stores and "target_stores" in inspect.signature(fg.ingest).parameters:
    ingest_kwargs["target_stores"] = target_stores
fg.ingest(**ingest_kwargs)

print("FS ingestion complete:", fg_name)

Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Feature Group created: yelp-review-features-20-11-28-06


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.feature_store.feature_group:Started ingesting index 2977 to 5954
INFO:sagemaker.feature_store.feature_group:Started ingesting index 8931 to 11906
INFO:sagemaker.feature_store.feature_group:Started ingesting index 5954 to 8931
INFO:sagemaker.feature_store.feature_group:Started ingesting index 0 to 2977
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 5954 to 8931
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 2977 to 5954
INFO:sagemaker.feature_store.feature_group:Successfully ingested row 8931 to 11906
INFO:sagemaker.feature_store.feature_group:Successfully

FS ingestion complete: yelp-review-features-20-11-28-06


## 🏋️ Supervised Training — Logistic Regression (Multinomial)

In [156]:
from sagemaker.sklearn.estimator import SKLearn

# Train with the helper script generated earlier in this notebook.
# source_dir follows SRC_DIR (the folder the scripts were written to) rather
# than a hard-coded "src", which could hold a stale copy or no script at all.
# No model_dir override is passed: the script already defaults to SM_MODEL_DIR,
# the directory SageMaker packages into model.tar.gz.
est = SKLearn(
    entry_point="training_supervised.py",
    source_dir=str(SRC_DIR),  # Folder containing the script
    role=role,
    instance_type="ml.m5.large",
    framework_version="1.2-1",
    py_version="py3",
    base_job_name="yelp-rating-train",
)

# Forwarded to the script as --processed_data / --C / --max_iter.
est.set_hyperparameters(
    processed_data=proc_s3,   # e.g. 's3://my-bucket/yelp/processed'
    C=1.0,
    max_iter=1000,
)

est.fit()

# S3 URI of the packaged model; the deployment cell consumes it.
model_artifact = est.model_data
print("Model artifact:", model_artifact)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: yelp-rating-train-2025-10-20-11-28-53-296


2025-10-20 11:28:54 Starting - Starting the training job...
2025-10-20 11:29:08 Starting - Preparing the instances for training...
  import pkg_resources[0m
[34m2025-10-20 11:31:04,531 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-10-20 11:31:04,536 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-20 11:31:04,539 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-10-20 11:31:04,556 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-10-20 11:31:04,801 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-20 11:31:04,804 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-10-20 11:31:04,823 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-10-20 11:31

## 📊 Evaluation — Accuracy & Macro F1

In [157]:
# Show the helper-script folder and the S3 URI of the trained model artifact.
# NOTE(review): this cell only prints; it does not run evaluation_supervised.py,
# so no accuracy / macro-F1 figures are produced here.
print(SRC_DIR)
print(model_artifact)

src_scripts
s3://sagemaker-eu-central-1-495599743560/yelp-rating-train-2025-10-20-11-28-53-296/output/model.tar.gz


## 🚀 Deployment — Real-time Endpoint

In [160]:
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.model_monitor import DataCaptureConfig
import os
from pathlib import Path

# -------------------------------------------------------------
# Directories
# -------------------------------------------------------------
# Helper-script folder (same path the script-writing cell uses).
SRC_DIR = Path("src_scripts")
SRC_DIR.mkdir(parents=True, exist_ok=True)

# Value exported to the serving container as MODEL_DIR.
MODEL_DIR = "/tmp/"

# Environment variables set on the serving container.
# NOTE(review): the inference script generated in this notebook does not read
# TFIDF_S3_URI or MODEL_DIR — confirm whether they are still needed.
serving_env = {
    "TFIDF_S3_URI": f"{proc_s3}/tfidf.joblib",
    "MODEL_DIR": MODEL_DIR,
}

# Model definition: trained artifact plus the custom inference handlers.
sk_model = SKLearnModel(
    role=role,
    framework_version="1.2-1",
    model_data=model_artifact,  # e.g., "s3://<bucket>/<prefix>/model.tar.gz"
    entry_point=str(SRC_DIR / "inference_supervised.py"),
    env=serving_env,
)

# Capture every request/response pair to S3.
capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    destination_s3_uri=f"s3://{bucket}/{prefix}/datacapture/",
)

# Real-time endpoint on a single instance.
predictor = sk_model.deploy(
    instance_type="ml.m5.large",
    initial_instance_count=1,
    data_capture_config=capture_config,
)

print("✅ Endpoint name:", predictor.endpoint_name)


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-10-20-11-57-07-239
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2025-10-20-11-57-07-903
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2025-10-20-11-57-07-903


------!✅ Endpoint name: sagemaker-scikit-learn-2025-10-20-11-57-07-903


### 🔎 Quick Probe
### Predict ratings for a few review texts

In [166]:
import json
import boto3

runtime = boto3.client('sagemaker-runtime')

# Two sample reviews, sent in the {"texts": [...]} shape the inference script parses.
sample_reviews = [
    "Amazing service and delicious food. Will come back!",
    "The burger was dry and the fries were soggy. Not impressed.",
]

# Target the endpoint created by the deployment cell. A name copied from an
# earlier run goes stale as soon as the model is re-deployed.
response = runtime.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType='application/json',
    Body=json.dumps({"texts": sample_reviews}),
)

print(response['Body'].read().decode('utf-8'))


{"predictions": [5, 2], "probabilities": [[0.00928452605770668, 0.007932621288482582, 0.006602629918243063, 0.15182515104165684, 0.8243550716939109], [0.11524447288971221, 0.5435778677067528, 0.1977567268045214, 0.08386447610440634, 0.0595564564946072]]}


## 📈 Monitoring — Baseline & Schedule

In [167]:
from sagemaker.model_monitor import DefaultModelMonitor, DatasetFormat, CronExpressionGenerator

# S3 layout for monitoring artifacts.
monitoring_root = f"s3://{bucket}/{prefix}/monitoring"
baseline_uri = f"{monitoring_root}/baseline"

# Monitor definition: one ml.m5.large, 20 GB volume, one-hour runtime cap.
monitor = DefaultModelMonitor(
    sagemaker_session=sess,
    role=role,
    base_job_name="yelp-rating-monitor",
    instance_type="ml.m5.large",
    instance_count=1,
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)

# Derive baseline statistics and constraints from the raw CSV.
# NOTE(review): the baseline comes from the raw CSV, while the endpoint receives
# JSON {"texts": [...]} payloads — confirm the two schemas are comparable.
monitor.suggest_baseline(
    baseline_dataset=raw_s3,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_uri,
    wait=True,
)

# Hourly schedule that checks the endpoint against that baseline.
monitor.create_monitoring_schedule(
    endpoint_input=predictor.endpoint_name,
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    statistics=f"{baseline_uri}/statistics.json",
    constraints=f"{baseline_uri}/constraints.json",
    output_s3_uri=f"{monitoring_root}/reports",
)
print("Monitoring schedule created.")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name yelp-rating-monitor-2025-10-20-12-05-30-479


................[34m2025-10-20 12:08:05.692618: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory[0m
[34m2025-10-20 12:08:05.692654: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.[0m
[34m2025-10-20 12:08:07.349500: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory[0m
[34m2025-10-20 12:08:07.349547: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)[0m
[34m2025-10-20 12:08:07.349578: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-10-0-122-34.eu-central-1.compute.internal): /proc/driver/nvidi

INFO:sagemaker.model_monitor.model_monitoring:Creating Monitoring Schedule with name: yelp-rating-monitor-2025-10-20-12-10-58-124


Monitoring schedule created.


## 🧹 Cleanup 

In [None]:
# predictor.delete_endpoint()
# monitor.delete_monitoring_schedule()
# fg.delete()