In [8]:
import sys, sagemaker, pandas as pd
sess = sagemaker.Session()
bucket = sess.default_bucket()
print("Python:", sys.version)
print("Bucket:", bucket)

# Check if s3fs exists
try:
    import s3fs
    print("s3fs installed ✅")
except Exception as e:
    print("s3fs missing ❌", e)

# Confirm the file exists in S3
!aws s3 ls s3://{bucket}/processed/
!aws s3 ls s3://{bucket}/raw/


Python: 3.12.12 | packaged by conda-forge | (main, Oct 22 2025, 23:25:55) [GCC 14.3.0]
Bucket: sagemaker-us-east-1-341104199580
s3fs installed ✅
2026-02-02 00:56:24       1785 model_profiles.csv
2026-02-02 01:03:35    1322198 synthetic_requests_labeled.csv
2026-02-02 01:05:44    2690432 synthetic_requests_labeled_v2.csv
2026-02-02 00:53:53       7312 aimodelpoll.csv
2026-02-02 00:53:53      19349 lifearchitectmodels.csv


In [9]:
import pandas as pd, sagemaker
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Load the per-model profile table from the processed/ prefix and preview it:
# shape, column names, then the first ten rows.
profiles = pd.read_csv(f"s3://{bucket}/processed/model_profiles.csv")
print("profiles shape:", profiles.shape)
print(list(profiles.columns))
display(profiles.iloc[:10])


profiles shape: (17, 9)
['Model', 'quality_score_mean', 'quality_score_max', 'achievements_count', 'domains_count', 'peer_reviewed_rate', 'outperforms_rate', 'domains_covered', 'quality_tier']


Unnamed: 0,Model,quality_score_mean,quality_score_max,achievements_count,domains_count,peer_reviewed_rate,outperforms_rate,domains_covered,quality_tier
0,Claude 3.6S,5.415792,5.415792,1,1,1.0,1.0,Persuasion,5
1,o3-mini-high,5.360643,5.360643,1,1,1.0,1.0,Health reviews,5
2,o1,5.040036,5.5,2,2,1.0,1.0,"Maths, Medicine",5
3,"GPT-4, etc",4.694612,4.694612,1,1,1.0,1.0,Emotional intelligence,5
4,o4-mini,4.61401,4.61401,1,1,1.0,1.0,Finance,4
5,Gemini 3,4.47688,4.47688,1,1,0.0,1.0,Transcription,4
6,GPT-4.5,4.355234,4.355234,1,1,1.0,1.0,Being human,4
7,Bing Chat,3.885513,4.067345,2,2,0.5,1.0,"Japan: National Medical Licensure Examination,...",3
8,davinci,3.718992,4.02434,4,3,0.25,1.0,"General knowledge, IQ (Binet-Simon Scale, verb...",3
9,GPT-4,3.13128,5.168894,16,15,0.75,0.9375,"Academia, Aerospace, Art (via prompting Midjou...",3


In [10]:
import numpy as np
import pandas as pd
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib

sess = sagemaker.Session()
bucket = sess.default_bucket()

# -------------------------
# Load model profiles
# -------------------------
profiles = pd.read_csv(f"s3://{bucket}/processed/model_profiles.csv")

# Drop rows without a model name *before* casting to str. Casting first turns
# NaN into text (or keeps it missing, depending on the pandas version), which
# makes a later notna()/"nan" comparison unreliable.
profiles = profiles.dropna(subset=["Model"]).copy()
profiles["Model"] = profiles["Model"].astype(str).str.strip()
# A name that is blank after stripping is unusable as a routing label.
profiles = profiles[profiles["Model"] != ""].copy()

# Using top 6 models: highest quality_tier first, quality_score_mean as the
# tie-breaker within a tier.
# NOTE(review): ranking by tier first means the candidates come only from the
# top tiers, so later tier-derived traits (cost, latency) barely differ
# between them — confirm this is the intended candidate set.
profiles = profiles.sort_values(["quality_tier","quality_score_mean"], ascending=[False, False]).head(6).copy()
models = profiles["Model"].tolist()
print("Using models:", models)


Using models: ['Claude 3.6S', 'o3-mini-high', 'o1', 'GPT-4, etc', 'o4-mini', 'Gemini 3']


In [11]:
# -------------------------
# Improved synthetic traits (wider spread)
# -------------------------
# All traits below are synthetic and derived only from quality_tier, so two
# models in the same tier get identical cost and latency parameters.

# cost rises with quality tier (more realistic):
# 6e-7 per token at tier 1, multiplied by 1.8 for every tier above that.
profiles["cost_per_token"] = 0.0000006 * (1.8 ** (profiles["quality_tier"] - 1))   # bigger gap

# latency increases with quality tier but not crazy
profiles["base_latency_ms"] = 60 + 55 * (profiles["quality_tier"] - 1)            # 60 at tier 1, +55 per tier (280 at tier 5)
profiles["per_token_ms"] = 0.010 + 0.008 * (profiles["quality_tier"] - 1)         # 0.010 at tier 1 .. 0.042 at tier 5

# quality score normalized: min-max scaling over the rows currently in
# `profiles`; the 1e-9 avoids division by zero when all scores are equal.
# NOTE(review): because the scaling is relative to the selected rows only, the
# weakest selected model always gets ~0.0 regardless of its raw score. The
# request generator uses quality requirements starting at 0.35, so any model
# whose normalized score is below that can never be labelled feasible.
q = profiles["quality_score_mean"].astype(float)
profiles["quality_score_norm"] = ((q - q.min()) / (q.max() - q.min() + 1e-9)).clip(0, 1)

# mp: {model name -> {trait name -> value}} lookup used by the labelling,
# gating and evaluation cells.
mp = profiles.set_index("Model")[["base_latency_ms","per_token_ms","cost_per_token","quality_score_norm","quality_tier"]].to_dict("index")
# Fallback = the model with the highest normalized quality score.
fallback_model = profiles.sort_values("quality_score_norm", ascending=False).iloc[0]["Model"]
print("Fallback model:", fallback_model)


Fallback model: Claude 3.6S


In [12]:
# -------------------------
# Generate synthetic requests (rebalanced)
# -------------------------
# Fixed seed: the dataset is fully determined by the order of the rng calls
# below, so reordering or inserting draws changes every generated value.
rng = np.random.default_rng(7)
N = 40000

# Prompt-length mixture: 50% short (20..499), 40% medium (500..2499),
# 10% long (2500..9999). rng.integers excludes the upper bound.
# All three ranges are drawn for every row; np.where then keeps the one that
# matches the row's mixture component.
mix = rng.choice([0,1,2], size=N, p=[0.50,0.40,0.10])
prompt_tokens = np.where(
    mix==0, rng.integers(20, 500, size=N),
    np.where(mix==1, rng.integers(500, 2500, size=N), rng.integers(2500, 10000, size=N))
)
# Output length: 15%..70% of the prompt length plus 20..249 extra tokens,
# truncated to int and then clipped to [30, 3000].
output_tokens = (prompt_tokens * rng.uniform(0.15,0.7,size=N) + rng.integers(20,250,size=N)).astype(int)
output_tokens = np.clip(output_tokens, 30, 3000)

# Request domain and latency budget (ms), each sampled from a fixed categorical distribution.
domains = rng.choice(["chat","writing","code","qa","reasoning"], size=N, p=[0.28,0.20,0.18,0.20,0.14])
lat_req = rng.choice([200,400,800,1500,3000,6000], size=N, p=[0.10,0.15,0.25,0.25,0.18,0.07])

# KEY CHANGE: quality requirement is softened (more feasible competition)
# Instead of mapping tier -> 0..1, map tier -> 0.35..0.85
# (0.125 per tier step: 0.35, 0.475, 0.60, 0.725, 0.85).
q_tier = rng.choice([1,2,3,4,5], size=N, p=[0.25,0.28,0.25,0.15,0.07])
q_req = 0.35 + (q_tier - 1) * (0.50/4.0)   # 0.35..0.85

requests = pd.DataFrame({
    "prompt_tokens": prompt_tokens,
    "output_tokens": output_tokens,
    "tokens_total": prompt_tokens + output_tokens,
    "domain": domains,
    "latency_requirement_ms": lat_req,
    "quality_req_tier": q_tier,
    "quality_requirement": q_req,
})
# Derived features: log-scaled size and a flag for budgets of 400 ms or less.
requests["log_tokens_total"] = np.log1p(requests["tokens_total"])
requests["strict_latency"] = (requests["latency_requirement_ms"] <= 400).astype(int)

# Domain penalty still matters but slightly lower so mid models can compete.
# The penalty is subtracted from a model's normalized quality when checking
# feasibility. fillna(0.05) is the default for a domain missing from the map;
# all five sampled domains are listed, so it is not triggered here.
domain_penalty = {"chat":0.00,"writing":0.04,"qa":0.03,"code":0.07,"reasoning":0.08}
requests["domain_penalty"] = requests["domain"].map(domain_penalty).fillna(0.05)

print("Requests:", requests.shape)
display(requests.head())


Requests: (40000, 10)


Unnamed: 0,prompt_tokens,output_tokens,tokens_total,domain,latency_requirement_ms,quality_req_tier,quality_requirement,log_tokens_total,strict_latency,domain_penalty
0,2103,1217,3320,writing,200,4,0.725,8.108021,1,0.04
1,2353,1094,3447,reasoning,3000,3,0.6,8.14555,0,0.08
2,511,475,986,chat,400,1,0.35,6.89467,1,0.0
3,204,191,395,code,800,4,0.725,5.981414,0,0.07
4,121,141,262,writing,400,3,0.6,5.572154,1,0.04


In [13]:
# -------------------------
# Label policy: cheapest feasible model
# -------------------------
def label_cheapest_feasible(req_df, model_names, model_params, fallback,
                            latency_margin=1.25, long_token_threshold=4000, long_penalty=0.08):
    """Label every request with its cheapest feasible model (vectorised).

    A model is feasible for a request when
      * (base_latency_ms + per_token_ms * tokens) * latency_margin
        <= latency_requirement_ms, and
      * clip(quality_score_norm - domain_penalty - length penalty, 0, 1)
        >= quality_requirement,
    where the length penalty is `long_penalty` for requests above
    `long_token_threshold` total tokens and 0 otherwise.

    Among feasible models the one with the lowest cost_per_token * tokens wins;
    equal costs are resolved by model name in ascending string order, which is
    the ordering the (cost, name) tuple sort gives. Requests with no feasible
    model get `fallback`.

    Parameters
    ----------
    req_df : DataFrame with columns tokens_total, latency_requirement_ms,
        quality_requirement and domain_penalty.
    model_names : iterable of candidate model names (keys of `model_params`).
    model_params : dict mapping name -> dict with base_latency_ms,
        per_token_ms, cost_per_token and quality_score_norm.
    fallback : label used when nothing is feasible.

    Returns
    -------
    list of str, one label per row of `req_df`, in row order.
    """
    tokens = req_df["tokens_total"].to_numpy()
    latency_limit = req_df["latency_requirement_ms"].to_numpy()
    quality_floor = req_df["quality_requirement"].to_numpy()
    dom_pen = req_df["domain_penalty"].to_numpy()
    length_pen = np.where(tokens > long_token_threshold, long_penalty, 0.0)

    n_rows = len(req_df)
    chosen = np.full(n_rows, fallback, dtype=object)
    best_cost = np.full(n_rows, np.inf)

    # Visit models in name order and replace the incumbent only when strictly
    # cheaper: on equal cost the alphabetically earlier model is kept.
    for name in sorted(model_names):
        spec = model_params[name]
        pred_lat = (spec["base_latency_ms"] + spec["per_token_ms"] * tokens) * latency_margin
        pred_q = np.clip(spec["quality_score_norm"] - dom_pen - length_pen, 0, 1)
        pred_c = spec["cost_per_token"] * tokens

        better = (pred_lat <= latency_limit) & (pred_q >= quality_floor) & (pred_c < best_cost)
        chosen[better] = name
        best_cost[better] = pred_c[better]

    return chosen.tolist()


requests["label_model"] = label_cheapest_feasible(requests, models, mp, fallback_model)

label_share = requests["label_model"].value_counts(normalize=True)
print("\nLabel distribution (should be multi-class):")
print(label_share)

# Surface a degenerate labelling loudly: with a single label there is no
# routing decision left for the classifier to learn.
if len(label_share) < 2:
    import warnings
    warnings.warn(
        f"Label policy produced a single class ({list(label_share.index)}). "
        "No other candidate is ever the cheapest feasible model; check that the "
        "cheaper models' quality_score_norm can reach the generated "
        "quality_requirement values and that candidate costs actually differ."
    )



Label distribution (should be multi-class):
label_model
Claude 3.6S    1.0
Name: proportion, dtype: float64


In [14]:
# -------------------------
# Train router (RandomForest)
# -------------------------
# Numeric request features plus a one-hot encoding of the request domain.
X = requests[[
    "prompt_tokens","output_tokens","tokens_total","log_tokens_total",
    "latency_requirement_ms","quality_requirement","quality_req_tier","strict_latency"
]].copy()
X = pd.get_dummies(pd.concat([X, requests[["domain"]]], axis=1), columns=["domain"], drop_first=False)

# Encode the label as integer category codes; label_names[code] maps back to the model name.
y = requests["label_model"].astype("category")
y_codes = y.cat.codes
label_names = list(y.cat.categories)

# A single-class target trains without error and reports perfect scores, which
# is easy to misread as a good router. Warn instead of staying silent.
if len(label_names) < 2:
    import warnings
    warnings.warn(
        f"label_model has only one class ({label_names}); the classifier below is "
        "trivial and its metrics are meaningless. Revisit the labelling step first."
    )

# Stratified 80/20 split. X_test keeps the row index of `requests`, which
# identifies the held-out requests.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_codes, test_size=0.2, random_state=42, stratify=y_codes
)

rf = RandomForestClassifier(n_estimators=600, min_samples_leaf=2, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
proba = rf.predict_proba(X_test)

print("\nRouter report:")
# Pass `labels` explicitly so target_names always lines up with the category
# codes, even if a rare class has no rows in y_test or pred; otherwise the
# report raises on a length mismatch.
print(classification_report(
    y_test, pred,
    labels=np.arange(len(label_names)),
    target_names=label_names,
))



Router report:
              precision    recall  f1-score   support

 Claude 3.6S       1.00      1.00      1.00      8000

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000



In [15]:
# -------------------------
# Enhancement: feasibility-gated inference + confidence fallback
# -------------------------
# Gate: if predicted model is infeasible, choose cheapest feasible.
# If nothing feasible, choose fallback.
# Also: if confidence < threshold, choose cheapest feasible.
conf_thresh = 0.55

# Rebuild request slice for test set.
# X_test carries the row labels of `requests`, so the held-out rows are read
# from its index. Re-running train_test_split with copied arguments would
# drift silently if the training cell's split settings ever changed.
idx = np.arange(len(X))
idx_test = requests.index.get_indexer(X_test.index)  # positions, in the same order as `pred`
req_test = requests.iloc[idx_test].reset_index(drop=True)

# Map predicted category codes back to model names; confidence is the
# highest class probability of each prediction.
pred_codes = pred
pred_models = np.array([label_names[c] for c in pred_codes])
conf = proba.max(axis=1)

def cheapest_feasible(row, model_params=None, candidates=None, fallback=None,
                      latency_margin=1.25, long_token_threshold=4000, long_penalty=0.08):
    """Return the lowest-cost model that satisfies one request's constraints.

    A candidate is feasible when its padded latency estimate
    ``(base_latency_ms + per_token_ms * tokens) * latency_margin`` is within
    ``row["latency_requirement_ms"]`` and its quality estimate
    ``clip(quality_score_norm - domain_penalty - length penalty, 0, 1)`` is at
    least ``row["quality_requirement"]``. The length penalty is `long_penalty`
    when the request has more than `long_token_threshold` total tokens.

    Parameters
    ----------
    row : mapping or Series with tokens_total, domain_penalty,
        latency_requirement_ms and quality_requirement.
    model_params : optional dict name -> trait dict; defaults to the notebook's `mp`.
    candidates : optional iterable of model names; defaults to the notebook's `models`.
    fallback : optional label returned when nothing is feasible; defaults to
        the notebook's `fallback_model`.
    latency_margin, long_token_threshold, long_penalty : policy constants;
        the defaults reproduce the labelling policy.

    Returns
    -------
    The name of the cheapest feasible model. Equal costs are resolved by model
    name in ascending string order. Returns the fallback if none is feasible.
    """
    params = mp if model_params is None else model_params
    names = models if candidates is None else candidates

    tokens = row["tokens_total"]
    length_penalty = long_penalty if tokens > long_token_threshold else 0.0

    feasible = []
    for name in names:
        spec = params[name]
        pred_lat = (spec["base_latency_ms"] + spec["per_token_ms"] * tokens) * latency_margin
        pred_q = np.clip(spec["quality_score_norm"] - row["domain_penalty"] - length_penalty, 0, 1)
        if (pred_lat <= row["latency_requirement_ms"]) and (pred_q >= row["quality_requirement"]):
            feasible.append((spec["cost_per_token"] * tokens, name))

    if feasible:
        # Tuples compare by cost first, then by name.
        return min(feasible)[1]
    return fallback_model if fallback is None else fallback

# Apply gating
def _prediction_is_feasible(row, model_name):
    """Check one request against the latency and quality estimates of `model_name`."""
    spec = mp[model_name]
    n_tokens = row["tokens_total"]
    length_penalty = 0.08 if n_tokens > 4000 else 0.0
    est_latency = (spec["base_latency_ms"] + spec["per_token_ms"] * n_tokens) * 1.25
    est_quality = np.clip(spec["quality_score_norm"] - row["domain_penalty"] - length_penalty, 0, 1)
    return (est_latency <= row["latency_requirement_ms"]) and (est_quality >= row["quality_requirement"])


gated_models = []
for pos in range(len(req_test)):
    request_row = req_test.loc[pos]
    predicted = pred_models[pos]
    # Keep the router's choice only when it is confident enough AND the choice
    # is feasible for this request; otherwise use the cheapest feasible model
    # (which itself falls back when nothing is feasible).
    keep_prediction = (not (conf[pos] < conf_thresh)) and _prediction_is_feasible(request_row, predicted)
    gated_models.append(predicted if keep_prediction else cheapest_feasible(request_row))

gated_models = np.array(gated_models)


In [16]:
# -------------------------
# Evaluate cost + SLA for ungated vs gated vs baselines
# -------------------------
def eval_policy(chosen_models, df_req, model_params=None, fallback=None,
                latency_margin=1.25, long_token_threshold=4000, long_penalty=0.08):
    """Score a routing policy on a set of requests.

    Parameters
    ----------
    chosen_models : sequence (list, tuple or 1-D array) with one model name
        per request, aligned with the rows of `df_req`.
    df_req : DataFrame with columns tokens_total, latency_requirement_ms,
        quality_requirement and domain_penalty.
    model_params : optional dict name -> trait dict; defaults to the notebook's `mp`.
    fallback : optional model name counted by ``fallback_rate``; defaults to
        the notebook's `fallback_model`.
    latency_margin, long_token_threshold, long_penalty : policy constants;
        the defaults match the labelling policy.

    Returns
    -------
    dict with
      avg_cost       – mean of cost_per_token * tokens_total
      sla_rate       – share of requests whose chosen model meets both the
                       latency and the quality requirement
      p95_latency_ms – 95th percentile of the padded latency estimate
      avg_quality    – mean penalised quality estimate
      fallback_rate  – share of requests whose chosen model *equals* the
                       fallback model. This counts deliberate picks of that
                       model too, not only "nothing was feasible" cases.
    All values are NaN when there are no requests.

    Raises
    ------
    ValueError if `chosen_models` is not 1-D or its length differs from the
    number of requests (a silent broadcast would corrupt every metric).
    """
    params = mp if model_params is None else model_params
    fallback_name = fallback_model if fallback is None else fallback

    # Accept plain lists as well as arrays: `list == str` is a single bool,
    # so the element-wise comparison below needs an ndarray.
    chosen = np.asarray(chosen_models)
    if chosen.ndim != 1 or len(chosen) != len(df_req):
        raise ValueError(
            "chosen_models must be 1-D with one entry per request "
            f"(got shape {chosen.shape} for {len(df_req)} requests)"
        )
    if len(chosen) == 0:
        nan = float("nan")
        return {
            "avg_cost": nan,
            "sla_rate": nan,
            "p95_latency_ms": nan,
            "avg_quality": nan,
            "fallback_rate": nan,
        }

    base = np.array([params[m]["base_latency_ms"] for m in chosen])
    per  = np.array([params[m]["per_token_ms"] for m in chosen])
    cpt  = np.array([params[m]["cost_per_token"] for m in chosen])
    qn   = np.array([params[m]["quality_score_norm"] for m in chosen])

    tokens = np.asarray(df_req["tokens_total"])
    lat_req = np.asarray(df_req["latency_requirement_ms"])
    q_req = np.asarray(df_req["quality_requirement"])
    dom_pen = np.asarray(df_req["domain_penalty"])
    long_pen = (tokens > long_token_threshold).astype(float) * long_penalty

    pred_lat = (base + per * tokens) * latency_margin
    pred_q = np.clip(qn - dom_pen - long_pen, 0, 1)
    pred_cost = cpt * tokens
    feasible = (pred_lat <= lat_req) & (pred_q >= q_req)

    return {
        "avg_cost": float(pred_cost.mean()),
        "sla_rate": float(feasible.mean()),
        "p95_latency_ms": float(np.percentile(pred_lat, 95)),
        "avg_quality": float(pred_q.mean()),
        "fallback_rate": float((chosen == fallback_name).mean())
    }

cheapest_model = profiles.sort_values("cost_per_token").iloc[0]["Model"]
bestq_model = profiles.sort_values("quality_score_norm", ascending=False).iloc[0]["Model"]

res_ungated = eval_policy(pred_models[:len(req_test)], req_test)
res_gated   = eval_policy(gated_models, req_test)
res_cheapest= eval_policy(np.array([cheapest_model]*len(req_test)), req_test)
res_bestq   = eval_policy(np.array([bestq_model]*len(req_test)), req_test)

summary = pd.DataFrame([
    {"policy":"router_raw", **res_ungated},
    {"policy":"router_gated_conf", **res_gated},
    {"policy":"always_cheapest", **res_cheapest},
    {"policy":"always_best_quality", **res_bestq},
])
display(summary)

# -------------------------
# Save artifacts
# -------------------------
requests.to_csv("synthetic_requests_labeled_v2.csv", index=False)
joblib.dump(rf, "rf_router_v2.joblib")

!aws s3 cp synthetic_requests_labeled_v2.csv s3://{bucket}/processed/synthetic_requests_labeled_v2.csv
!aws s3 cp rf_router_v2.joblib s3://{bucket}/models/rf_router_v2.joblib

print("Saved v2 dataset + model")

Unnamed: 0,policy,avg_cost,sla_rate,p95_latency_ms,avg_quality,fallback_rate
0,router_raw,0.012987,0.814375,830.241125,0.953622,1.0
1,router_gated_conf,0.012987,0.814375,830.241125,0.953622,1.0
2,always_cheapest,0.007215,0.0,670.016625,0.0,0.0
3,always_best_quality,0.012987,0.814375,830.241125,0.953622,1.0


upload: ./synthetic_requests_labeled_v2.csv to s3://sagemaker-us-east-1-341104199580/processed/synthetic_requests_labeled_v2.csv
upload: ./rf_router_v2.joblib to s3://sagemaker-us-east-1-341104199580/models/rf_router_v2.joblib
Saved v2 dataset + model
