### Purpose: Human-in-the-loop logging + review-ready feedback

#### High-Level Steps
1. Load base DistilBERT (feature extractor only)

2. Load the saved logreg_model.joblib and label_encoder.joblib

3. Define predict(text) → returns label + confidence

4. Load 10 complaint samples from complaints_train.csv

5. Loop through and run predict() on each

6. Print results clearly (index, predicted label, confidence)

7. (Optional) Wrap with agent-style print formatting

8. Confirm inference works end-to-end — no missing artifacts, no crashes

9. Package model: distilbert_model/, label_encoder.joblib into a .tar.gz

10. Upload .tar.gz to S3

11. Deploy as SageMaker endpoint with proper inference handler (real-time API)

In [2]:
#!pip install torch --quiet
#!pip install transformers --quiet

In [22]:
# Step 1 — Load base DistilBERT feature extractor

import torch
from transformers import DistilBertModel, DistilBertTokenizer

# One checkpoint name shared by tokenizer and encoder so they cannot drift apart.
MODEL_NAME = "distilbert-base-uncased"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
bert = DistilBertModel.from_pretrained(MODEL_NAME)
bert.to(device)
# Switch to evaluation mode; left as the final expression so the notebook
# still echoes the module summary.
bert.eval()



DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [23]:
# Step 2 — Load classifier and label encoder
import joblib

# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
clf, le = (
    joblib.load(artifact_file)
    for artifact_file in ("logreg_model.joblib", "label_encoder.joblib")
)



In [24]:
#Step 3 — Define predict(text) function
def predict(text):
    """Classify one complaint narrative.

    Tokenizes ``text``, embeds it with the module-level ``bert`` encoder
    (mean of ``last_hidden_state`` over dim 1), and scores that embedding
    with the module-level ``clf`` classifier.

    Args:
        text: The complaint text to classify (a single string).

    Returns:
        A ``(label, confidence)`` tuple: the label decoded through ``le`` and
        the highest class probability as a plain Python ``float``.
    """
    # Tokenize (truncation is enabled, so over-long input is cut rather than rejected)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # BERT embedding — inference only, so gradient tracking is switched off
    with torch.no_grad():
        outputs = bert(**inputs)
        # Mean pooling, moved to CPU/NumPy for the scikit-learn-style classifier
        pooled = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    # Score once and take label and confidence from the same probability row,
    # instead of running the classifier twice (predict + predict_proba).
    probabilities = clf.predict_proba(pooled)[0]
    best = probabilities.argmax()
    pred_idx = clf.classes_[best]
    label = le.inverse_transform([pred_idx])[0]

    # Plain float keeps downstream rounding/CSV output free of NumPy scalar types.
    return label, float(probabilities[best])


#### Classification and Prep for Agentifying Loop

In [25]:
#Step 1: Set up logging function
import csv
import os
from datetime import datetime

LOG_FILE = "agent_log.csv"
# Column order of every row written to LOG_FILE.
LOG_COLUMNS = ["timestamp", "text", "predicted_label", "confidence", "reviewed", "true_label"]


def log_prediction(text, predicted_label, confidence, true_label="", *, review_threshold=0.6):
    """Append one prediction to the CSV review log.

    The header row is written first whenever the log file is missing or
    empty, so the log is well-formed no matter which notebook cell ran first.

    Args:
        text: The input text that was classified.
        predicted_label: The label assigned to ``text``.
        confidence: Probability of the predicted label; stored rounded to
            four decimal places.
        true_label: Ground-truth label if known; empty string otherwise.
        review_threshold: Rows with confidence strictly below this value are
            marked ``"No"`` (awaiting human review); all others ``"N/A"``.
    """
    # Normalize NumPy scalars and the like to a plain float before comparing/rounding.
    confidence = float(confidence)
    reviewed = "No" if confidence < review_threshold else "N/A"

    # Decide before opening: append mode creates the file, which would hide
    # the "missing" case from a later check.
    needs_header = not os.path.exists(LOG_FILE) or os.path.getsize(LOG_FILE) == 0

    # newline='' lets the csv module control line endings.
    with open(LOG_FILE, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(LOG_COLUMNS)
        writer.writerow([
            datetime.now().isoformat(),
            text,
            predicted_label,
            round(confidence, 4),
            reviewed,
            true_label,
        ])

In [26]:
# Step 2: If the log file doesn't exist (or exists but is empty), add the header
import os

# An existing zero-byte file has no header either, so treat it like a missing
# one; rewriting an empty file in 'w' mode loses nothing.
if not os.path.exists(LOG_FILE) or os.path.getsize(LOG_FILE) == 0:
    with open(LOG_FILE, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["timestamp", "text", "predicted_label", "confidence", "reviewed", "true_label"])

In [30]:
# Step 4 — Run the classifier over sample complaints and log each result
import pandas as pd

# Upper bound on how many narratives are scored in this smoke test.
SAMPLE_LIMIT = 1000

df = pd.read_csv("downloads/complaints_train.csv")
# Drop missing narratives, keep the first SAMPLE_LIMIT, then convert only those to str.
narratives = df["narrative"].dropna().head(SAMPLE_LIMIT)
texts = narratives.astype(str).tolist()

for i, text in enumerate(texts):
    label, conf = predict(text)
    log_prediction(text, label, conf)
    print(f"[{i+1}] → {label} (confidence: {conf:.2f})")

In [33]:
!find /home/ec2-user -name "logreg_model.joblib"

find: ‘/home/ec2-user/SageMaker/lost+found’: Permission denied
/home/ec2-user/SageMaker/complaint-classifier/notebooks/logreg_model.joblib


In [34]:
### Copy the golden model artifacts to S3: complaint-classifier-jp2025/models
import boto3

s3 = boto3.client('s3')
bucket = 'complaint-classifier-jp2025'
prefix = 'models/'

# Absolute paths of the artifacts on the notebook instance
logreg_path = '/home/ec2-user/SageMaker/complaint-classifier/notebooks/logreg_model.joblib'
label_path = '/home/ec2-user/SageMaker/complaint-classifier/notebooks/label_encoder.joblib'

# Object name under `prefix` -> local file to upload
artifacts = {
    'logreg_model.joblib': logreg_path,
    'label_encoder.joblib': label_path,
}

# Upload to S3 (dicts preserve insertion order, so the classifier goes first)
for object_name, local_path in artifacts.items():
    s3.upload_file(local_path, bucket, prefix + object_name)
