In [1]:
# Ensure both project root and src/ are on sys.path
import sys
import os
import platform
from pathlib import Path

# Resolve the notebook's working directory, its parent (treated as the
# project root) and the root's src/ folder.
NB_DIR = Path.cwd().resolve()
ROOT = NB_DIR.parent.resolve()
SRC = ROOT.joinpath("src")

# Prepend each location unless it is already importable. ROOT is handled
# first and SRC second, so SRC ends up ahead of ROOT on sys.path.
for _location in (ROOT, SRC):
    _location_str = str(_location)
    if _location_str not in sys.path:
        sys.path.insert(0, _location_str)

print("Notebook dir:", NB_DIR)
print("Project root:", ROOT)
print("Python:", platform.python_version())
print("sys.path head:", sys.path[:3])

# Core pipeline imports
from src.modules.training_info_extraction.pipeline import (
    extract_trainings_one_skill,
    clean_extracted_csv,
    relabel_domains_by_index_ranges,
    label_triplets_from_csv,
)
from src.modules.training_info_extraction.io import (
    derive_allowed_domains_from_jobs_csv,
)
from src.common.canonicalization import canonicalize_domain_or_raise, to_bia_skill_level
from src.common.env import load_project_dotenv, project_root



# Optional smoke test: load the project dotenv, report the project root, and
# show what one sample skill level maps to.
dotenv_result = load_project_dotenv()
detected_root = project_root()
print("Loaded:", dotenv_result, "ROOT:", detected_root)

sample_level = to_bia_skill_level("fundamentals")
print("to_bia_skill_level('fundamentals') ->", sample_level)

Notebook dir: /home/ec2-user/SageMaker/GDSC/notebooks
Project root: /home/ec2-user/SageMaker/GDSC
Python: 3.10.18
sys.path head: ['/home/ec2-user/SageMaker/GDSC/src', '/home/ec2-user/SageMaker/GDSC', '/home/ec2-user/anaconda3/envs/python3/lib/python310.zip']
Loaded: /home/ec2-user/SageMaker/GDSC/env ROOT: /home/ec2-user/SageMaker/GDSC
to_bia_skill_level('fundamentals') -> Basic


In [4]:
import os
from src.modules.training_info_extraction.llm_client import call_llm


MODEL = "mistral-medium-latest"

# Report only whether the key is present, never its value.
print("MISTRAL_API_KEY set:", bool(os.environ.get("MISTRAL_API_KEY")))

# Minimal round trip: request a trivial JSON payload and show what came back.
ping = call_llm('Return ONLY this: {"ok": true}', model=MODEL)
if isinstance(ping, dict):
    ping_summary = ping.keys()
else:
    ping_summary = ping
print("LLM ping returned:", type(ping), ping_summary)

MISTRAL_API_KEY set: True
LLM ping returned: <class 'dict'> dict_keys(['content', 'model', 'duration', 'input_tokens', 'output_tokens', 'total_tokens'])


In [6]:
import os
from pathlib import Path

# ---- Inputs / Outputs ----
# Directory holding the training Markdown files.
TRAININGS_DIR = Path("/home/ec2-user/SageMaker/GDSC-8/data/trainings")

# Jobs CSV and the column the allowed domains are derived from.
JOBS_CSV = ROOT.joinpath("processed_data", "jobs_extracted_labels_en_blocked.csv")
JOBS_DOMAIN_COL = "libelle_domaine_professionel_en"

# Everything this notebook writes goes into a dedicated test subfolder so
# other outputs are not overwritten.
OUT_DIR = ROOT.joinpath("processed_data", "outputs", "nb_test").resolve()
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- LLM settings ----
# The model id can be overridden through the environment.
MODEL = os.environ.get("MISTRAL_MODEL_ID", "mistral-medium-latest")
HAS_MISTRAL_KEY = bool(os.environ.get("MISTRAL_API_KEY"))

# ---- Run controls (keep small while testing) ----
MAX_FILES = 5             # 0 = all files; a small N keeps test runs quick
SLEEP_SECONDS = 0.1
RESUME = False            # False keeps test runs predictable

print("TRAININGS_DIR:", TRAININGS_DIR, "exists:", TRAININGS_DIR.exists())
print("JOBS_CSV:", JOBS_CSV, "exists:", JOBS_CSV.exists())
print("OUT_DIR:", OUT_DIR)
print("MODEL:", MODEL, "HAS_MISTRAL_KEY:", HAS_MISTRAL_KEY)

TRAININGS_DIR: /home/ec2-user/SageMaker/GDSC-8/data/trainings exists: True
JOBS_CSV: /home/ec2-user/SageMaker/GDSC/processed_data/jobs_extracted_labels_en_blocked.csv exists: True
OUT_DIR: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test
MODEL: mistral-medium-latest HAS_MISTRAL_KEY: True


In [7]:
# Derive allowed domains from the jobs CSV (as the original notebook does)
if not JOBS_CSV.exists():
    raise FileNotFoundError(f"Jobs CSV not found: {JOBS_CSV}")

raw_domains = derive_allowed_domains_from_jobs_csv(JOBS_CSV, JOBS_DOMAIN_COL)

# Canonicalize every raw domain. Unknown values are tolerated on purpose:
# a non-empty label that cannot be canonicalized is kept as-is, but it is
# recorded so the fallback is visible instead of silently swallowed.
allowed_canon = set()
uncanonicalized = []  # (raw label, error) pairs that fell back to the raw label
for d in raw_domains:
    try:
        allowed_canon.add(canonicalize_domain_or_raise(d))
    except Exception as exc:
        if d:
            allowed_canon.add(d)
            uncanonicalized.append((d, exc))

if uncanonicalized:
    print(f"WARNING: {len(uncanonicalized)} domain(s) could not be canonicalized and were kept as-is:")
    for raw_label, error in uncanonicalized:
        print(f"  - {raw_label!r}: {error}")

print(f"Allowed domains derived: {len(allowed_canon)}")
print(sorted(allowed_canon)[:10])

Allowed domains derived: 20
['Accounting & Management', 'Banking & Insurance Administration', 'Culture & Document Management', 'Design, Research, Studies & Development', 'Electronics & Electrical', 'Fibers & Paper Industry', 'Food Industry', 'Hospitality Reception', 'Industrial HSE', 'Insurance']


In [36]:
from pathlib import Path
from src.modules.training_info_extraction.io import find_markdown_files, load_resume_set

from datetime import datetime

# Pre-flight diagnostics, run BEFORE the extraction cell. The extraction cell
# is what assigns `jsonl_path` / `csv_path`, so reading those names here would
# raise NameError on a fresh kernel. The expected output locations are rebuilt
# from OUT_DIR and today's date instead, using the same file-name pattern the
# extraction cell passes in.
run_stamp = datetime.now().strftime('%Y%m%d')
expected_jsonl_path = OUT_DIR / f"trainings_extracted_one_skill_{run_stamp}.jsonl"
expected_csv_path = OUT_DIR / f"trainings_extracted_one_skill_{run_stamp}.csv"

print("CSV path (should exist):", expected_csv_path)
print("Exists?", Path(expected_csv_path).exists())

files = find_markdown_files(TRAININGS_DIR)
print("Markdown files found:", len(files))
print("First few files:", files[:5])

# Jupyter autosave copies live under `.ipynb_checkpoints`; flag them because
# they duplicate real training files.
# NOTE(review): confirm whether the pipeline's own file discovery should skip them.
checkpoint_files = [f for f in files if ".ipynb_checkpoints" in Path(f).parts]
if checkpoint_files:
    print(f"WARNING: {len(checkpoint_files)} of the files found are under .ipynb_checkpoints")

done_set, known_prev = load_resume_set(expected_jsonl_path)
print("Resume enabled?:", RESUME)
print("JSONL path:", expected_jsonl_path, "exists?", Path(expected_jsonl_path).exists())
print("Already processed (resume set) size:", len(done_set))

CSV path (should exist): /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.csv
Exists? False
Markdown files found: 692
First few files: [PosixPath('/home/ec2-user/SageMaker/GDSC-8/data/trainings/.ipynb_checkpoints/tr0-checkpoint.md'), PosixPath('/home/ec2-user/SageMaker/GDSC-8/data/trainings/.ipynb_checkpoints/tr1-checkpoint.md'), PosixPath('/home/ec2-user/SageMaker/GDSC-8/data/trainings/.ipynb_checkpoints/tr10-checkpoint.md'), PosixPath('/home/ec2-user/SageMaker/GDSC-8/data/trainings/.ipynb_checkpoints/tr100-checkpoint.md'), PosixPath('/home/ec2-user/SageMaker/GDSC-8/data/trainings/.ipynb_checkpoints/tr101-checkpoint.md')]
Resume enabled?: False
JSONL path: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.jsonl exists? False
Already processed (resume set) size: 0


In [37]:
from datetime import datetime

# Compute the date stamp once so the JSONL and CSV file names always share it
# (two separate datetime.now() calls could disagree if the cell runs across
# midnight).
run_stamp = datetime.now().strftime('%Y%m%d')

jsonl_path, csv_path = extract_trainings_one_skill(
    trainings_dir=TRAININGS_DIR,
    out_dir=OUT_DIR,
    model=MODEL,
    # pass the allowed domains we derived to ensure consistent canonicalization
    allowed_domains=allowed_canon,
    jobs_csv_path=None,  # not needed since we pass allowed_domains
    jsonl_name=f"trainings_extracted_one_skill_{run_stamp}.jsonl",
    csv_name=f"trainings_extracted_one_skill_{run_stamp}.csv",
    skill_catalog_name="skill_catalog.json",
    max_files=MAX_FILES,
    sleep_seconds=SLEEP_SECONDS,
    resume=RESUME,
)

print("JSONL path:", jsonl_path)
print("CSV path:", csv_path)

# Load the freshly written CSV for inspection and for the assertions further down.
import pandas as pd
df_extracted = pd.read_csv(csv_path)
print("Rows extracted:", len(df_extracted))
df_extracted.head(10)

[1/5] tr0-checkpoint.md domain='Accounting & Management' skill='Accounting Software' level=Basic
[2/5] tr1-checkpoint.md domain='Accounting & Management' skill='Accounting Software' level=Intermediate
[3/5] tr10-checkpoint.md domain='Accounting & Management' skill='Accounting Software' level=Intermediate
[4/5] tr100-checkpoint.md domain='Electronics & Electrical' skill='Embedded C' level=Intermediate
[5/5] tr101-checkpoint.md domain='Electronics & Electrical' skill='Embedded C' level=Advanced
Saved CSV -> /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.csv
Saved JSONL -> /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.jsonl
JSONL path: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.jsonl
CSV path: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.csv
Rows extracted: 5


Unnamed: 0,file,course_title,domain,skill_name,skill_level,evidence,confidence,model,duration,input_tokens,output_tokens,total_tokens,timestamp
0,/home/ec2-user/SageMaker/GDSC-8/data/trainings...,Master Financial Software Tools - Beginner Level,Accounting & Management,Accounting Software,Basic,develop proficiency in financial software appl...,0.95,mistral-medium-latest,1.109547,691,94,785,1761606000.0
1,/home/ec2-user/SageMaker/GDSC-8/data/trainings...,Intermediate Financial Software Training,Accounting & Management,Accounting Software,Intermediate,advance your ability to manage complex account...,0.98,mistral-medium-latest,1.094504,706,89,795,1761606000.0
2,/home/ec2-user/SageMaker/GDSC-8/data/trainings...,Intermediate Financial Cost Analysis Training,Accounting & Management,Accounting Software,Intermediate,master intermediate-level techniques for exami...,0.85,mistral-medium-latest,0.873496,706,83,789,1761606000.0
3,/home/ec2-user/SageMaker/GDSC-8/data/trainings...,Intermediate Programming for Embedded Systems,Electronics & Electrical,Embedded C,Intermediate,Master programming embedded controllers on an ...,0.95,mistral-medium-latest,1.08931,674,92,766,1761606000.0
4,/home/ec2-user/SageMaker/GDSC-8/data/trainings...,Advanced Programming for Embedded Systems,Electronics & Electrical,Embedded C,Advanced,develop expertise in programming embedded cont...,0.95,mistral-medium-latest,1.443816,685,90,775,1761606000.0


In [39]:
# Write the cleaned copy next to the extracted CSV, suffixed with "_cleaned".
clean_csv_path = OUT_DIR / (csv_path.stem + "_cleaned.csv")
# NOTE(review): n_rows is presumably the cleaned row count; confirm against the pipeline.
n_rows = clean_extracted_csv(csv_path, clean_csv_path)

import pandas as pd
df_clean = pd.read_csv(clean_csv_path)
print("Rows in cleaned CSV:", len(df_clean))

# An empty cleaned file makes every later step (relabel, triplet labeling)
# run on nothing, so call it out here instead of letting it pass unnoticed.
if df_clean.empty:
    print("WARNING: cleaning removed every row; downstream steps will operate on an empty table.")

df_clean.head(10)

✅ Cleaned CSV saved to /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027_cleaned.csv with 0 rows.
Rows in cleaned CSV: 0


Unnamed: 0,file,course_title,domain,skill_name,skill_level,evidence,confidence,model,duration,input_tokens,output_tokens,total_tokens,timestamp,index


In [40]:
import pandas as pd

# Input is the cleaned CSV; output sits beside it with a "_relabelled" suffix.
relabel_in = clean_csv_path
relabel_out = OUT_DIR / f"{clean_csv_path.stem}_relabelled.csv"

# Index ranges to relabel, as (start_idx, end_idx, "Canonical Domain Label").
# NOTE(review): these ranges reach index 359 while the config cell limits test
# runs to MAX_FILES = 5; confirm they are intended for the full run.
mapping_ranges = [
    (210, 239, "Insurance"),
    (270, 299, "Maritime & Inland Waterway Sedentary Personnel"),
    (330, 359, "Production & Collective Equipment"),
]

# Optional label-to-canonical-label overrides; empty for now, examples below.
canonical_map = {
    # "Insurance": "Insurance",
    # "Maritime and River Transport Sedentary Personnel": "Maritime & Inland Waterway Sedentary Personnel",
    # "Production and Collective Equipment": "Production & Collective Equipment",
}

# With no ranges there is nothing to relabel, so the cleaned CSV is passed on.
if not mapping_ranges:
    print("No mapping_ranges provided → skip relabel. Using cleaned CSV for next step.")
    relabelled_path = relabel_in
else:
    relabel_domains_by_index_ranges(relabel_in, relabel_out, mapping_ranges, canonical_map=canonical_map)
    relabelled_path = relabel_out

df_relabel = pd.read_csv(relabelled_path)
print("Rows after relabel step:", len(df_relabel))
df_relabel.head(10)

✅ Saved relabelled file -> /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027_cleaned_relabelled.csv
Rows after relabel step: 0


Unnamed: 0,file,course_title,domain,skill_name,skill_level,evidence,confidence,model,duration,input_tokens,output_tokens,total_tokens,timestamp,index


In [41]:
# Name the triplet output after the relabel-step CSV and keep the skill cache
# in the same output folder.
relabelled_csv = Path(relabelled_path)
triplet_out = OUT_DIR / f"{relabelled_csv.stem}_triplets.csv"
cache_path = OUT_DIR.joinpath("triplet_skill_cache.json")

label_triplets_from_csv(
    in_csv=relabelled_csv,
    out_csv=triplet_out,
    cache_path=cache_path,
    model=MODEL,
)

# Read the result back for a quick look.
import pandas as pd
df_triplets = pd.read_csv(triplet_out)
print("Rows after triplet labeling:", len(df_triplets))
df_triplets.head(12)

Processed triplets: 0; errors: 0
Saved: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027_cleaned_relabelled_triplets.csv
Rows after triplet labeling: 0


Unnamed: 0,file,course_title,domain,skill_name,skill_level,evidence,confidence,model,duration,input_tokens,output_tokens,total_tokens,timestamp,index,skill


In [42]:
import pandas as pd

# The extracted table must expose at least these columns.
required_columns = {"file", "course_title", "domain", "skill_name", "skill_level"}
assert required_columns.issubset(df_extracted.columns), "Extracted CSV missing required columns."

# Every non-null skill level has to be one of the normalized BIA levels.
allowed_levels = {"Basic", "Intermediate", "Advanced"}
bad_levels = set(df_extracted["skill_level"].dropna()).difference(allowed_levels)
assert not bad_levels, f"Unexpected skill levels detected: {bad_levels}"

# Every non-blank domain has to belong to the allowed canonical set.
bad_domains = []
for domain_value in df_extracted["domain"].dropna():
    if domain_value and domain_value not in allowed_canon:
        bad_domains.append(domain_value)
assert not bad_domains, f"Domains not in allowed_canon: {set(bad_domains)}"

print("✅ Basic assertions passed.")

✅ Basic assertions passed.


In [43]:
# Summarize every artifact produced by the run as (label, path) pairs, in the
# order the pipeline created them.
print("Artifacts:")
artifact_entries = [
    (" - Extracted JSONL:", jsonl_path),
    (" - Extracted CSV:", csv_path),
    (" - Cleaned CSV:", clean_csv_path),
]
# The relabelled CSV is listed only when the relabel step produced its own file.
if relabelled_path != clean_csv_path:
    artifact_entries.append((" - Relabelled CSV:", Path(relabelled_path)))
artifact_entries.append((" - Triplet-labeled CSV:", triplet_out))
artifact_entries.append((" - Triplet cache:", cache_path))

for artifact_label, artifact_path in artifact_entries:
    print(artifact_label, artifact_path.resolve())

Artifacts:
 - Extracted JSONL: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.jsonl
 - Extracted CSV: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027.csv
 - Cleaned CSV: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027_cleaned.csv
 - Relabelled CSV: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027_cleaned_relabelled.csv
 - Triplet-labeled CSV: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/trainings_extracted_one_skill_20251027_cleaned_relabelled_triplets.csv
 - Triplet cache: /home/ec2-user/SageMaker/GDSC/processed_data/outputs/nb_test/triplet_skill_cache.json


In [9]:
from pathlib import Path
from src.modules.training_info_extraction.pipeline import (
    export_trainings_json_from_tagged_csv,
    backfill_training_fields,
)

# 1) Export final JSON from your tagged/cleaned CSV.
#    Anchored on ROOT (like the other inputs in this notebook) rather than a
#    "../" path that only works when the kernel's cwd is the notebooks folder.
csv_in = ROOT / "processed_data" / "outputs" / "courses_tagged.csv"  # or your cleaned/relabelled CSV
if not csv_in.exists():
    raise FileNotFoundError(f"Tagged CSV not found: {csv_in}")
json_primary = export_trainings_json_from_tagged_csv(input_csv=csv_in)

# 2) Backfill remaining fields (level/language/delivery/city/duration) from Markdown.
#    Reuse the trainings directory and model from the config cell so this step
#    cannot drift from the rest of the pipeline.
md_dir = TRAININGS_DIR
# NOTE(review): relative path, so it resolves against the current working
# directory; confirm this is the intended location for submissions.
json_out = Path("submissions/extracted_trainings_backfilled_test.json")

backfill_training_fields(
    training_md_dir=md_dir,
    primary_json_path=json_primary,
    output_json_path=json_out,
    model=MODEL,
    cache_period=20,
)

✅ Wrote 497 trainings -> ../processed_data/outputs/extracted_trainings_2025-10-28.json
Processing 497 files (0 already cached).


KeyboardInterrupt: 