In [1]:
# Install strands library for mistral
!pip install strands-agents[mistral] python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Collecting strands-agents[mistral]
  Downloading strands_agents-1.14.0-py3-none-any.whl.metadata (13 kB)
Collecting docstring-parser<1.0,>=0.15 (from strands-agents[mistral])
  Downloading docstring_parser-0.17.0-py3-none-any.whl.metadata (3.5 kB)
Collecting mcp<2.0.0,>=1.11.0 (from strands-agents[mistral])
  Downloading mcp-1.19.0-py3-none-any.whl.metadata (85 kB)
Collecting opentelemetry-api<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-instrumentation-threading<1.00b0,>=0.51b0 (from strands-agents[mistral])
  Downloading opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl.metadata (2.1 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.30.0 (from strands-agents[mistral])
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mistralai>=1.8.2 (from strands-agent

In [3]:
# Core imports
import json
import os
import sys
import dotenv

from pathlib import Path
from typing import Dict, List, Tuple, TypeVar
from tqdm import tqdm

# Add parent directory to import our utilities
sys.path.append('..')
from src.utils import (
    save_json,
    read_json,
    load_file_content,
    get_job_paths,
    get_training_paths,
    sanity_check,
	chat_with_persona,
    track_api_call,  # Cost tracking from utils
    print_cost_summary,  # Cost summary from utils
    reset_cost_tracker  # Reset cost tracker from utils
)

# Pydantic for structured data
from pydantic import BaseModel, Field

# Strands for AI agents
from strands.agent import Agent
from strands.models.mistral import MistralModel

# Type hints
# Generic type variable restricted to pydantic BaseModel subclasses.
M = TypeVar('M', bound=BaseModel)

# Set up submission directory
# Created up front (including parents); exist_ok makes re-running the cell harmless.
SUBMISSION_DIR = Path('../submissions')
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)

# Load environment
# NOTE(review): the file is addressed as "../env" (no leading dot) - confirm this
# is the intended file name; the return value of load_dotenv is not checked here.
dotenv.load_dotenv("../env")

# The check mark is spelled as a Unicode escape so the message cannot be garbled
# when the notebook file is re-encoded.
print("\u2705 Setup complete")
# Last expression of the cell: its return value is shown as the cell output.
sanity_check()

✅ Setup complete
✅ API request successful


True

In [12]:
import pandas as pd


# Load the final submission file into a DataFrame for a quick tabular look.
results_df = pd.read_json("../submissions/results_final.json")
# Bare last expression: the notebook renders the frame as the cell output.
results_df

Unnamed: 0,persona_id,predicted_type,predicted_items,trainings,jobs
0,persona_001,awareness,too_young,,
1,persona_002,trainings_only,,[tr179],
2,persona_003,jobs+trainings,,,"[{'job_id': 'j65', 'suggested_trainings': []}]"
3,persona_004,jobs+trainings,,,"[{'job_id': 'j65', 'suggested_trainings': ['tr..."
4,persona_005,trainings_only,,"[tr169, tr163, tr177, tr159, tr165]",
...,...,...,...,...,...
95,persona_096,jobs+trainings,,,"[{'job_id': 'j115', 'suggested_trainings': []}]"
96,persona_097,jobs+trainings,,,"[{'job_id': 'j124', 'suggested_trainings': ['t..."
97,persona_098,jobs+trainings,,,"[{'job_id': 'j125', 'suggested_trainings': []}]"
98,persona_099,trainings_only,,"[tr313, tr309, tr327, tr321]",


In [13]:
# --- Configuration: adjust as needed ---
# Input JSON files. The paths are relative, so they resolve against the
# notebook's current working directory.
RESULTS_PATH = "../submissions/results_final.json"                  # your results_json file
PERSONAS_PATH = "../processed_data/personas_merged_reassigned_domains_21_10_2025_2.json"         # dict-style personas keyed by persona_id
EXTRACTED_JOBS_PATH = "../processed_data/extracted_jobs_merged_2025-10-21.json"  # dict-style jobs keyed by job_id

# Folders holding one Markdown file per item, named "<item id>.md".
JOBS_DIR = "../../GDSC-8/data/jobs"                         # where j65.md, j101.md, etc. live
TRAININGS_DIR = "../../GDSC-8/data/trainings"               # where tr177.md, tr1.md, etc. live

# UI settings
# Passed as the open/closed default to several collapsible report sections.
OPEN_SECTIONS_BY_DEFAULT = False               # True = expanded <details>, False = collapsed

# --- Imports ---
import os
import json
import html
from pathlib import Path
from typing import Dict, Any, List, Tuple, Optional
from IPython.display import display, HTML
import pandas as pd

# Echo every configured location as an absolute path so that a wrong relative
# path is noticed before any file is read.
print(
    f"Jobs dir:       {Path(JOBS_DIR).resolve()}",
    f"Trainings dir:  {Path(TRAININGS_DIR).resolve()}",
    f"Results path:   {Path(RESULTS_PATH).resolve()}",
    f"Personas path:  {Path(PERSONAS_PATH).resolve()}",
    f"Jobs JSON path: {Path(EXTRACTED_JOBS_PATH).resolve()}",
    sep="\n",
)

Jobs dir:       /home/ec2-user/SageMaker/GDSC-8/data/jobs
Trainings dir:  /home/ec2-user/SageMaker/GDSC-8/data/trainings
Results path:   /home/ec2-user/SageMaker/GDSC/submissions/results_final.json
Personas path:  /home/ec2-user/SageMaker/GDSC/processed_data/personas_merged_reassigned_domains_21_10_2025_2.json
Jobs JSON path: /home/ec2-user/SageMaker/GDSC/processed_data/extracted_jobs_merged_2025-10-21.json


In [14]:
def load_json(path: str) -> Any:
    """Open the file at ``path`` as UTF-8 text and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def md_path_for(dir_path: str, item_id: str) -> str:
    """
    Build the path of an item's Markdown file inside ``dir_path``.

    ``item_id`` may be a bare ID such as 'j65' or 'tr177', or may already end
    in '.md'; the suffix is appended only when it is absent.
    """
    file_name = str(item_id)
    if not file_name.endswith(".md"):
        file_name = f"{item_id}.md"
    return os.path.join(dir_path, file_name)

def read_md(dir_path: str, item_id: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Read the Markdown file that belongs to ``item_id`` inside ``dir_path``.

    Returns a ``(content, error_message)`` pair in which exactly one element is
    None: the file text on success, otherwise a message describing why nothing
    was read (falsy ID, file not present, or an exception while reading).
    """
    if not item_id:
        return None, "Empty or invalid ID."

    md_file = md_path_for(dir_path, item_id)
    if not os.path.exists(md_file):
        return None, f"Not found: {md_file}"

    try:
        with open(md_file, "r", encoding="utf-8") as handle:
            text = handle.read()
    except Exception as exc:
        # Any read failure is reported through the error slot, never raised.
        return None, f"Error reading {md_file}: {exc}"
    else:
        return text, None

def esc(s: Any) -> str:
    """Convert ``s`` to text and HTML-escape it, quote characters included."""
    as_text = str(s)
    return html.escape(as_text, quote=True)

def details(summary_html: str, body_html: str, open_default: bool = False) -> str:
    """
    Wrap ``body_html`` in a collapsible ``<details>`` element titled by ``summary_html``.

    Both arguments are inserted verbatim (no escaping is applied here). The
    element carries the ``open`` attribute when ``open_default`` is truthy.
    """
    open_attr = ""
    if open_default:
        open_attr = " open"
    return (
        f"<details{open_attr}><summary>{summary_html}</summary>"
        f"<div style='margin-left:1rem'>{body_html}</div></details>"
    )

def codeblock(md_text: str) -> str:
    """
    Return ``md_text`` as an HTML ``<pre>`` box.

    The text is HTML-escaped via ``esc``, so the Markdown is shown as source
    rather than rendered.
    """
    escaped_source = esc(md_text)
    return f"<pre style='white-space:pre-wrap; background:#f6f8fa; padding:0.75rem; border-radius:6px; border:1px solid #eaecef'>{escaped_source}</pre>"

def json_pretty(obj: Any) -> str:
    """
    Return ``obj`` as indented JSON inside an HTML ``<pre>`` box.

    Non-ASCII characters are kept as-is (``ensure_ascii=False``) and the JSON
    text is HTML-escaped via ``esc`` before being embedded.
    """
    dumped = json.dumps(obj, ensure_ascii=False, indent=2)
    escaped_json = esc(dumped)
    return f"<pre style='white-space:pre-wrap; background:#fff; padding:0.75rem; border-radius:6px; border:1px solid #eaecef'>{escaped_json}</pre>"

In [15]:
def _index_by_id(raw: Any, id_key: str, source_name: str) -> Dict[str, Dict[str, Any]]:
    """
    Return records as a dict keyed by ``id_key``.

    A dict is returned unchanged (it is taken to be keyed by ID already). A list
    is indexed by each dict element's ``id_key`` value; elements that are not
    dicts or have a falsy ID are skipped.

    Raises:
        ValueError: if ``raw`` is neither a dict nor a list.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, list):
        return {rec[id_key]: rec for rec in raw if isinstance(rec, dict) and rec.get(id_key)}
    raise ValueError(f"Unsupported {source_name} structure. Expected dict or list.")


# Load input files
results = load_json(RESULTS_PATH)
raw_personas = load_json(PERSONAS_PATH)
raw_jobs = load_json(EXTRACTED_JOBS_PATH)

# Normalize personas and jobs into dicts keyed by persona_id / job_id
persona_by_id: Dict[str, Dict[str, Any]] = _index_by_id(raw_personas, "persona_id", "personas_merged.json")
job_by_id: Dict[str, Dict[str, Any]] = _index_by_id(raw_jobs, "job_id", "extracted_jobs_merged.json")

# Warnings for missing ids
missing_personas = [r.get("persona_id") for r in results if r.get("persona_id") not in persona_by_id]
if missing_personas:
    # IDs are stringified before joining: a result without persona_id yields
    # None here, and str.join raises TypeError on non-string items.
    print("Warning: personas not found in personas_merged.json:\n  - " + "\n  - ".join(str(pid) for pid in missing_personas))

all_job_ids_from_results = [
    j.get("job_id")
    for r in results
    for j in (r.get("jobs") or [])
    if j.get("job_id")
]
missing_job_ids = [jid for jid in all_job_ids_from_results if jid not in job_by_id]
if missing_job_ids:
    # De-duplicate and stringify first so sorting and joining cannot fail on
    # non-string IDs.
    print("Warning: job_ids from results not found in extracted_jobs_merged.json:\n  - " + "\n  - ".join(sorted({str(jid) for jid in missing_job_ids})))

In [16]:
def job_quick_facts(jinfo: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pick the headline fields out of a job record.

    Returns an empty dict for a falsy ``jinfo``; otherwise a dict with a fixed
    set of keys in a fixed order, where fields absent from ``jinfo`` map to None.
    """
    if not jinfo:
        return {}
    headline_fields = (
        "title",
        "job_role",
        "domain",
        "work_type",
        "location_city",
        "education_level_required",
        "years_of_experience_required",
        "languages_required",
        "required_skills",
    )
    return {field: jinfo.get(field) for field in headline_fields}

def render_job_block(job: Dict[str, Any]) -> str:
    """
    Render one job entry of a result record as a collapsible HTML block.

    Args:
        job: dict read for the keys ``job_id`` and ``suggested_trainings``
            (a missing or falsy training list is treated as empty).

    Returns:
        A closed ``<details>`` element containing, in order: the job's JSON
        metadata from ``job_by_id`` (quick facts plus full record), the job's
        Markdown file from ``JOBS_DIR``, and the suggested trainings. Missing
        metadata or files are reported inline in red instead of raising.
    """
    job_id = job.get("job_id")
    suggested_trainings = job.get("suggested_trainings", []) or []

    # Load job Markdown and JSON metadata
    job_md, job_md_err = read_md(JOBS_DIR, job_id)
    jinfo = job_by_id.get(job_id)

    body_parts = []

    # JSON quick facts and full job record
    if jinfo:
        body_parts.append(details("<i>Job quick facts (from extracted_jobs_merged.json)</i>", json_pretty(job_quick_facts(jinfo)), open_default=OPEN_SECTIONS_BY_DEFAULT))
        body_parts.append(details("<i>Full job JSON</i>", json_pretty(jinfo), open_default=False))
    else:
        body_parts.append(f"<div style='color:#d33'><b>Job metadata not found</b> in extracted_jobs_merged.json for {esc(job_id)}</div>")

    # Job Markdown content.
    # Compare against None rather than truthiness: an existing but empty file
    # yields "" with no error, and must not be reported as a missing file.
    if job_md is not None:
        body_parts.append(details("<i>Job file content (.md)</i>", codeblock(job_md), open_default=False))
    else:
        body_parts.append(f"<div style='color:#d33'><b>Missing job file:</b> {esc(md_path_for(JOBS_DIR, job_id))}<br>{esc(job_md_err)}</div>")

    # Suggested trainings content (Markdown); the per-training rendering is
    # shared with the direct-trainings list instead of being duplicated here.
    if suggested_trainings:
        body_parts.append(details(f"<i>Suggested trainings</i> ({len(suggested_trainings)})", render_training_list_block(suggested_trainings), open_default=False))
    else:
        body_parts.append("<div><i>No suggested trainings</i></div>")

    return details(f"<b>Job:</b> {esc(job_id)}", "<br>".join(body_parts), open_default=False)

def render_training_list_block(trainings: List[str]) -> str:
    """
    Render a list of training IDs as HTML, one block per training.

    Args:
        trainings: training IDs whose Markdown files are looked up in
            ``TRAININGS_DIR``.

    Returns:
        A placeholder ``<div>`` when the list is empty; otherwise the
        per-training blocks joined with ``<br>``. A readable file becomes a
        closed ``<details>`` with its content; an unreadable one becomes a red
        notice carrying the expected path and the error message.
    """
    if not trainings:
        return "<div><i>No trainings</i></div>"
    blocks = []
    for tr in trainings:
        tr_md, tr_err = read_md(TRAININGS_DIR, tr)
        # Compare against None rather than truthiness: an existing but empty
        # file yields "" with no error, and must not be shown as missing.
        if tr_md is not None:
            blocks.append(details(f"Training {esc(tr)}", codeblock(tr_md), open_default=False))
        else:
            blocks.append(f"<div style='color:#d33'><b>Missing training file:</b> {esc(md_path_for(TRAININGS_DIR, tr))}<br>{esc(tr_err)}</div>")
    return "<br>".join(blocks)

def render_persona_section(res: Dict[str, Any]) -> str:
    """
    Render the HTML ``<section>`` for one result record.

    Args:
        res: dict read for the keys ``persona_id``, ``predicted_type``,
            ``predicted_items``, ``jobs`` and ``trainings`` (missing or falsy
            job/training lists are treated as empty).

    Returns:
        A bordered ``<section>`` holding, in order: a header with the persona
        ID and predicted type, the persona record from ``persona_by_id`` (or a
        red "not found" notice), the predicted items when present, the jobs,
        and the direct trainings.
    """
    pid = res.get("persona_id")
    pinfo = persona_by_id.get(pid)

    predicted_type = res.get("predicted_type")
    predicted_items = res.get("predicted_items")  # e.g., "too_young" for awareness
    jobs = res.get("jobs", []) or []
    direct_trainings = res.get("trainings", []) or []

    # The dash is written as the &mdash; entity so it renders correctly no
    # matter how this source file or the saved report is encoded.
    header = (
        f"<h3 style='margin-bottom:0.25rem'>Persona: {esc(pid)} "
        f"<span style='font-weight:normal; color:#666'>&mdash; predicted_type: {esc(predicted_type)}</span></h3>"
    )

    # Persona info (quick facts + full record)
    if pinfo is not None:
        quick = {
            "full_name": pinfo.get("full_name"),
            "age": pinfo.get("age"),
            "location_city": pinfo.get("location_city"),
            "current_focus": pinfo.get("current_focus"),
            "preferred_work_type": pinfo.get("preferred_work_type"),
            "education_level": pinfo.get("education_level"),
            "top_domain": pinfo.get("top_domain")
        }
        persona_block = (
            details("Persona quick facts", json_pretty(quick), open_default=OPEN_SECTIONS_BY_DEFAULT) +
            details("Full persona record", json_pretty(pinfo), open_default=False)
        )
    else:
        persona_block = "<div style='color:#d33'><b>Persona not found</b> in personas_merged.json</div>"

    # Predicted items (e.g., "too_young"); omitted entirely when absent
    misc_block = ""
    if predicted_items is not None:
        misc_block = details("Other predicted items", json_pretty({"predicted_items": predicted_items}), open_default=False)

    # Jobs section (with JSON metadata + MD content)
    if jobs:
        job_blocks = [render_job_block(j) for j in jobs]
        jobs_block = details(f"Jobs ({len(jobs)})", "<br>".join(job_blocks), open_default=OPEN_SECTIONS_BY_DEFAULT)
    else:
        jobs_block = "<div><b>Jobs:</b> <i>none</i></div>"

    # Direct trainings section (Markdown content); always present, count may be 0
    trainings_block = details(f"Direct trainings ({len(direct_trainings)})", render_training_list_block(direct_trainings), open_default=OPEN_SECTIONS_BY_DEFAULT)

    return f"<section style='padding:1rem; border:1px solid #eaecef; border-radius:8px; margin:1rem 0'>{header}{persona_block}{misc_block}{jobs_block}{trainings_block}</section>"

def build_report_html(results: List[Dict[str, Any]]) -> str:
    sections = [render_persona_section(r) for r in results]
    body = "\n".join(sections)
    style = """
    <style>
      body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; line-height: 1.45; }
      summary { cursor:pointer; padding:0.25rem 0; }
      details { margin: 0.25rem 0; }
      h2, h3 { margin-top:0.75rem; }
    </style>
    """
    header = "<h2>Manual Evaluation Report</h2>"
    return f"<!doctype html><html><head><meta charset='utf-8'>{style}</head><body>{header}{body}</body></html>"

In [17]:
# Build a summary table enriched with job titles when available
# Column order is declared explicitly so the frame has these columns even when
# `results` is empty; otherwise sorting by "persona_id" would raise KeyError.
summary_columns = [
    "persona_id",
    "predicted_type",
    "n_jobs",
    "n_direct_trainings",
    "n_suggested_trainings",
    "persona_found",
    "job_ids",
    "job_titles",
    "full_name",
    "age",
    "location_city",
    "current_focus",
]

summary_rows = []
for r in results:
    pid = r.get("persona_id")
    ptype = r.get("predicted_type")
    jobs = r.get("jobs", []) or []
    direct_trainings = r.get("trainings", []) or []
    n_jobs = len(jobs)
    n_direct_trainings = len(direct_trainings)
    n_suggested_trainings = sum(len((j.get("suggested_trainings") or [])) for j in jobs)

    # Collect job titles for convenience (None when the job has no metadata)
    job_ids = [j.get("job_id") for j in jobs if j.get("job_id")]
    job_titles = [(job_by_id.get(jid) or {}).get("title") for jid in job_ids]

    # `or {}` also covers a persona entry that is present but None
    pinfo = persona_by_id.get(pid) or {}
    summary_rows.append({
        "persona_id": pid,
        "predicted_type": ptype,
        "n_jobs": n_jobs,
        "n_direct_trainings": n_direct_trainings,
        "n_suggested_trainings": n_suggested_trainings,
        "persona_found": pid in persona_by_id,
        "job_ids": ", ".join(job_ids),  # falsy ids were already dropped above
        "job_titles": ", ".join([t for t in job_titles if t]),
        # a few quick persona columns if available
        "full_name": pinfo.get("full_name"),
        "age": pinfo.get("age"),
        "location_city": pinfo.get("location_city"),
        "current_focus": pinfo.get("current_focus"),
    })

summary_df = (
    pd.DataFrame(summary_rows, columns=summary_columns)
    .sort_values(by=["persona_id"])
    .reset_index(drop=True)
)
display(summary_df)

# Build and display the full HTML report
report_html = build_report_html(results)
display(HTML(report_html))

# Save to file for sharing
out_path = "../processed_data/evaluation_report.html"
# Make sure the target folder exists so the write cannot fail on a fresh checkout.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    f.write(report_html)
print(f"Saved report to: {Path(out_path).resolve()}")

Unnamed: 0,persona_id,predicted_type,n_jobs,n_direct_trainings,n_suggested_trainings,persona_found,job_ids,job_titles,full_name,age,location_city,current_focus
0,persona_001,awareness,0,0,0,True,,,Rafael,21,São Paulo,awareness
1,persona_002,trainings_only,0,1,0,True,,,Mariana,21,Recife,training_only
2,persona_003,jobs+trainings,1,0,0,True,j65,Detailed Job Description: Junior Food Manufact...,Mateus,26,Recife,jobs+trainings
3,persona_004,jobs+trainings,1,0,2,True,j65,Detailed Job Description: Junior Food Manufact...,Rafael,27,Rio de Janeiro,jobs+trainings
4,persona_005,trainings_only,0,5,0,True,,,Pedro,16,Brasília,training_only
...,...,...,...,...,...,...,...,...,...,...,...,...
95,persona_096,jobs+trainings,1,0,0,True,j115,Detailed Job Description: Ground Logistics Ana...,Beatriz,18,Brasília,jobs+trainings
96,persona_097,jobs+trainings,5,0,15,True,"j124, j125, j126, j128, j129",Job Description: Onboard Safety Supervisor – C...,André,28,unknown,jobs+trainings
97,persona_098,jobs+trainings,1,0,0,True,j125,Analyst – Maritime Operations & Maintenance: H...,Isabela,29,Curitiba,jobs+trainings
98,persona_099,trainings_only,0,4,0,True,,,Patrícia,28,Salvador,training_only


Saved report to: /home/ec2-user/SageMaker/GDSC/processed_data/evaluation_report.html
