In [5]:
import pandas as pd
import numpy as np
import sagemaker

# --- S3 paths ---
# All inputs/outputs live in the SageMaker session's default bucket.
sess = sagemaker.Session()
bucket = sess.default_bucket()
raw_path = f"s3://{bucket}/raw/lifearchitectmodels.csv"
out_local = "model_profiles.csv"
out_s3 = f"s3://{bucket}/processed/model_profiles.csv"

print("Reading:", raw_path)

df = pd.read_csv(raw_path)

# --- basic cleaning ---
# Normalize column names: collapse every run of whitespace (embedded "\n",
# "\r\n", tabs, repeated spaces) into a single space and trim both ends.
# Replacing "\n" and "\r" one-for-one would leave double spaces behind, so a
# header like "Human \nresult" would never match the "Human result" lookup
# used by later cells.
df.columns = [" ".join(c.split()) for c in df.columns]

# Find the peer-review column (the raw header is the odd "Peer-\nreviewed?").
# Both keywords are matched case-insensitively; the first matching column wins
# and peer_col stays None when the file has no such column.
peer_col = next(
    (c for c in df.columns if "peer" in c.lower() and "review" in c.lower()),
    None,
)


Reading: s3://sagemaker-us-east-1-341104199580/raw/lifearchitectmodels.csv


In [6]:
# Text values (after strip + lowercase) that count as "yes" for boolean flags.
TRUTHY_VALUES = ["yes", "y", "true", "1"]

# normalize text fields
# astype(str) turns missing values into the literal string "nan", which would
# later show up as a fake model / domain and defeat dropna()/nunique().
# Strip the text, then put NaN back wherever the original value was missing.
for col in ["Model", "Field", "Outperforms human avg?"]:
    if col in df.columns:
        stripped = df[col].astype(str).str.strip()
        df[col] = stripped.where(df[col].notna())

# coerce numeric result fields if present
# errors="coerce" maps anything non-numeric to NaN instead of raising.
for col in ["Result", "Human result"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# normalize peer-reviewed to boolean-ish
# The source column is lowercased in place (missing values stay NaN); the flag
# is True only for an explicit truthy value, so missing/unknown counts as False.
if peer_col:
    peer_text = df[peer_col].astype(str).str.strip().str.lower()
    df[peer_col] = peer_text.where(df[peer_col].notna())
    df["peer_reviewed_flag"] = df[peer_col].isin(TRUTHY_VALUES)
else:
    df["peer_reviewed_flag"] = False

# normalize "outperforms" to boolean-ish
# Same rule as above: anything that is not an explicit truthy value is False.
if "Outperforms human avg?" in df.columns:
    outperforms_text = df["Outperforms human avg?"].astype(str).str.strip().str.lower()
    df["outperforms_flag"] = outperforms_text.isin(TRUTHY_VALUES)
else:
    df["outperforms_flag"] = False



In [7]:

# --- Build a "quality score" ---
# Additive, explainable per-row score:
#   outperforms human avg   -> +2
#   peer-reviewed           -> +1
#   numeric Result          -> 0..2 (scaled between the 5th and 95th percentile, clipped)
#   Human result present    -> +0.5 (a comparable human benchmark exists;
#                                    the original note calls this a "proxy for design doc")
outperforms_points = 2.0 * df["outperforms_flag"].astype(float).to_numpy()
peer_review_points = 1.0 * df["peer_reviewed_flag"].astype(float).to_numpy()

# score stays a plain float ndarray, combined with the frame by position.
score = np.zeros(len(df), dtype=float) + outperforms_points + peer_review_points

if "Result" in df.columns:
    r = df["Result"].copy()

    # Percentile bounds are robust to outliers; fall back to 0..100 when the
    # column holds no finite number at all.
    if np.isfinite(r).any():
        r_min = np.nanpercentile(r, 5)
        r_max = np.nanpercentile(r, 95)
    else:
        r_min = 0
        r_max = 100

    # Avoid dividing by zero when both percentiles coincide.
    spread = r_max - r_min
    denom = spread if spread != 0 else 1.0

    # Rows without a numeric Result contribute nothing.
    r_norm = ((r - r_min) / denom).clip(0, 1)
    score += 2.0 * r_norm.fillna(0).to_numpy()

if "Human result" in df.columns:
    score += 0.5 * df["Human result"].notna().astype(float).to_numpy()

df["quality_score"] = score



In [None]:

# --- Aggregate to model-level profiles ---
# We keep: avg score, count of achievements, domains covered, peer-reviewed ratio
agg = df.groupby("Model", dropna=False).agg(
    quality_score_mean=("quality_score", "mean"),
    quality_score_max=("quality_score", "max"),
    achievements_count=("Achievement", "count"),
    domains_count=("Field", lambda x: x.nunique()),
    peer_reviewed_rate=("peer_reviewed_flag", "mean"),
    outperforms_rate=("outperforms_flag", "mean"),
).reset_index()

# domains list (optional but useful)
domains = df.groupby("Model")["Field"].apply(lambda s: ", ".join(sorted(set([str(x).strip() for x in s.dropna()])))).reset_index()
domains = domains.rename(columns={"Field": "domains_covered"})
model_profiles = agg.merge(domains, on="Model", how="left")

# --- Create tiers (1..5) for routing constraints ---
# Use quantiles so tiers are balanced
model_profiles["quality_tier"] = pd.qcut(
    model_profiles["quality_score_mean"].rank(method="first"),
    q=min(5, model_profiles.shape[0]),
    labels=False
) + 1

# sort helpful
model_profiles = model_profiles.sort_values(["quality_tier", "quality_score_mean"], ascending=[False, False])

print("Model profiles preview:")
display(model_profiles.head(10))

# --- Save locally and upload to S3 ---
model_profiles.to_csv(out_local, index=False)
print("Saved:", out_local)

# upload (works in SageMaker notebooks)
!aws s3 cp model_profiles.csv {out_s3}
!aws s3 ls s3://{bucket}/processed/

Model profiles preview:


Unnamed: 0,Model,quality_score_mean,quality_score_max,achievements_count,domains_count,peer_reviewed_rate,outperforms_rate,domains_covered,quality_tier
3,Claude 3.6S,5.415792,5.415792,1,1,1.0,1.0,Persuasion,5
14,o3-mini-high,5.360643,5.360643,1,1,1.0,1.0,Health reviews,5
13,o1,5.040036,5.5,2,2,1.0,1.0,"Maths, Medicine",5
5,"GPT-4, etc",4.694612,4.694612,1,1,1.0,1.0,Emotional intelligence,5
15,o4-mini,4.61401,4.61401,1,1,1.0,1.0,Finance,4
8,Gemini 3,4.47688,4.47688,1,1,0.0,1.0,Transcription,4
6,GPT-4.5,4.355234,4.355234,1,1,1.0,1.0,Being human,4
0,Bing Chat,3.885513,4.067345,2,2,0.5,1.0,"Japan: National Medical Licensure Examination,...",3
10,davinci,3.718992,4.02434,4,3,0.25,1.0,"General knowledge, IQ (Binet-Simon Scale, verb...",3
4,GPT-4,3.13128,5.168894,16,15,0.75,0.9375,"Academia, Aerospace, Art (via prompting Midjou...",3


Saved: model_profiles.csv
upload: ./model_profiles.csv to s3://sagemaker-us-east-1-341104199580/processed/model_profiles.csv
