In [1]:
import sys
sys.path.append("..")

In [3]:
import networkx as nx

from src.modules.knowledge_graph.graph import (
    build_kg, save_graph, load_graph, add_node_once
)
from src.modules.knowledge_graph.utils import (
    _city_key, _domain_key, _job_key, _lang_key, _norm_text, _norm_key,
    _persona_key, _skill_key, _training_key, load_bundle
)
from src.modules.knowledge_graph.matching import batch_recommendations

In [7]:
# --- Cell 3: Load real data bundle ---
personas_path = "../submissions/personas_merged_reassigned_domains_21_10_2025_2.json"
jobs_path = "../submissions/extracted_jobs_merged_2025-10-21.json"
trainings_path = "../submissions/extracted_trainings_backfilled_2025-10-21_2.json"

personas, jobs, trainings = load_bundle(personas_path, jobs_path, trainings_path)
print(f"personas={len(personas)}, jobs={len(jobs)}, trainings={len(trainings)}")

# Build graph
G = build_kg(jobs, trainings, personas)
print(f"Graph built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")
assert isinstance(G, (nx.MultiGraph, nx.MultiDiGraph)), "Expected a MultiGraph-like structure"

Loaded ‚Üí personas=100 jobs=200 trainings=497
personas=100, jobs=200, trainings=497
Graph built: 1023 nodes, 3442 edges.


In [8]:
from collections import defaultdict

def edges_with(G, u, v=None, **attr_eq):
    """Find edges from u (optionally to v) where edge data exactly matches attr_eq."""
    matches = []
    if v is None:
        for nb in G.neighbors(u):
            for k, data in G.get_edge_data(u, nb).items():
                if all(data.get(k2) == v2 for k2, v2 in attr_eq.items()):
                    matches.append((u, nb, data))
    else:
        if G.has_edge(u, v):
            for k, data in G.get_edge_data(u, v).items():
                if all(data.get(k2) == v2 for k2, v2 in attr_eq.items()):
                    matches.append((u, v, data))
    return matches

# Count by node 'type' attribute
by_type = defaultdict(int)
for n, d in G.nodes(data=True):
    by_type[d.get("type", "_unknown")] += 1

print("Node counts by type:")
for k, v in sorted(by_type.items(), key=lambda x: x[0]):
    print(f"  {k:>10}: {v}")

print("\nSample nodes:")
for i, (n, d) in enumerate(G.nodes(data=True)):
    if i >= 10: break
    print(n, "->", d)

print("\nSample edges:")
for i, (u, v, d) in enumerate(G.edges(data=True)):
    if i >= 10: break
    print(u, "<->", v, ":", d)

Node counts by type:
        city: 11
      domain: 20
         job: 200
    language: 3
     persona: 100
       skill: 192
    training: 497

Sample nodes:
persona:persona_001 -> {'type': 'persona', 'full_name': 'Rafael', 'age': 21, 'location_city': 'S√£o Paulo', 'open_to_relocate': False, 'preferred_work_type': 'either', 'education_level': 'Ensino Fundamental', 'years_experience': 0.0, 'languages': ['pt-br'], 'current_focus': 'awareness', 'top_domain': 'Food Industry'}
domain:food industry -> {'type': 'domain', 'name': 'Food Industry'}
city:sao paulo -> {'type': 'city', 'name': 'Sao Paulo'}
language:pt br -> {'type': 'language', 'code': 'pt-br'}
skill:food safety food safety sanitation -> {'type': 'skill', 'name': 'Food Safety (Food Safety Sanitation)'}
skill:food innovation and development -> {'type': 'skill', 'name': 'Food Innovation and Development'}
skill:packaging technology -> {'type': 'skill', 'name': 'Packaging Technology'}
skill:manufacturing line operations -> {'type': 'sk

In [9]:
# --- Cell: run batch and write results ---
import os, json
from datetime import date

JOBS_TOP_K = 10
TOP_PER_LEVEL = 1
# TRAININGS_PER_LEVEL_GAP = 2
# TRAININGS_STANDALONE_K  = 3

results = batch_recommendations(G, jobs_top_k=JOBS_TOP_K, top_per_level=TOP_PER_LEVEL)

current_date = date.today().isoformat()
out_path = f"../submissions/results_{current_date}.json"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Wrote {len(results)} persona recommendations ‚Üí {os.path.abspath(out_path)}")
print(json.dumps(results[:5], ensure_ascii=False, indent=2))

‚úÖ Wrote 100 persona recommendations ‚Üí /home/ec2-user/SageMaker/GDSC/submissions/results_2025-10-29.json
[
  {
    "persona_id": "persona_001",
    "predicted_type": "awareness",
    "predicted_items": "info"
  },
  {
    "persona_id": "persona_002",
    "predicted_type": "trainings_only",
    "trainings": [
      "tr179"
    ]
  },
  {
    "persona_id": "persona_003",
    "predicted_type": "jobs+trainings",
    "jobs": [
      {
        "job_id": "j65",
        "suggested_trainings": []
      }
    ]
  },
  {
    "persona_id": "persona_004",
    "predicted_type": "jobs+trainings",
    "jobs": [
      {
        "job_id": "j65",
        "suggested_trainings": [
          "tr171",
          "tr156"
        ]
      }
    ]
  },
  {
    "persona_id": "persona_005",
    "predicted_type": "trainings_only",
    "trainings": [
      "tr169",
      "tr163",
      "tr177",
      "tr159",
      "tr165"
    ]
  }
]


## Submit to leaderboard

In [11]:
import json

# Path to your JSON file
file_path = out_path

# Read the JSON file and load its content into a variable
with open(file_path, "r", encoding="utf-8") as file:
    results = json.load(file)

# Now `data` is a Python dictionary (or list, depending on the JSON structure)
print(results)

[{'persona_id': 'persona_001', 'predicted_type': 'awareness', 'predicted_items': 'info'}, {'persona_id': 'persona_002', 'predicted_type': 'trainings_only', 'trainings': ['tr179']}, {'persona_id': 'persona_003', 'predicted_type': 'jobs+trainings', 'jobs': [{'job_id': 'j65', 'suggested_trainings': []}]}, {'persona_id': 'persona_004', 'predicted_type': 'jobs+trainings', 'jobs': [{'job_id': 'j65', 'suggested_trainings': ['tr171', 'tr156']}]}, {'persona_id': 'persona_005', 'predicted_type': 'trainings_only', 'trainings': ['tr169', 'tr163', 'tr177', 'tr159', 'tr165']}, {'persona_id': 'persona_006', 'predicted_type': 'trainings_only', 'trainings': ['tr458', 'tr460', 'tr463', 'tr468']}, {'persona_id': 'persona_007', 'predicted_type': 'trainings_only', 'trainings': ['tr469', 'tr463', 'tr474', 'tr459', 'tr453']}, {'persona_id': 'persona_008', 'predicted_type': 'awareness', 'predicted_items': 'info'}, {'persona_id': 'persona_009', 'predicted_type': 'trainings_only', 'trainings': ['tr461', 'tr466'

In [None]:
from src.utils import make_submission

# Submit
response = make_submission(results)

if response and response.status_code == 200:
    print("üéâ Submission successful! Check the leaderboard!")
else:
    print(f"‚ùå Submission failed: {response.text if response else 'No response'}")

# Final cost report
print("\n" + "="*50)
print_cost_summary()