In [39]:
import random
import numpy as np
import json
import pandas as pd
from dotenv import load_dotenv
import os
import asyncio
from openai import OpenAI
from tqdm import tqdm
import time

## EURES

In [2]:
def load_job_ads(file_path):
    """Load job ads from JSON file"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
def prepare_test_job_ads(job_ads, n_samples=5, seed=42):
    """
    Select random job ads with fixed seed for reproducibility
    Args:
        job_ads: List of job advertisements
        n_samples: Number of samples to select
        seed: Random seed for reproducibility
    """
    random.seed(seed)
    np.random.seed(seed)
    
    test_ads = random.sample(job_ads, n_samples)
    
    formatted_ads = []
    for ad in test_ads:
        formatted_ad = {
            'job_title': ad['title'],
            'esco_id': ad['esco_id'],  
            'description': ad['short_texts'],
            'correct_esco_job': ad['esco_job']
        }
        formatted_ads.append(formatted_ad)
    
    return formatted_ads

In [4]:
# file_path = '../00_data/EURES/eures_testads_final_short.json'
# job_ads = load_job_ads(file_path)
# test_ads = prepare_test_job_ads(job_ads, n_samples=5, seed=42)

## ESCO

In [54]:
def load_isco_groups(file_path):
    """Load ISCO groups from CSV file"""
    df = pd.read_csv(file_path, dtype={"code": str})
    return df

In [59]:
def create_isco_hierarchy(df):
    """Create a structured ISCO hierarchy from the CSV file"""
    hierarchy = {}

    # Level 1: Major groups (1-digit)
    major_groups = df[df['code'].str.len() == 1]
    for _, major in major_groups.iterrows():
        code = major['code']
        hierarchy[code] = {
            'label': major['preferredLabel'],
            'subgroups': {}
        }

        # Level 2: Sub-major groups (prefix match and length == 2)
        submajor = df[(df['code'].str.startswith(code)) & (df['code'].str.len() == 2)]
        for _, sub in submajor.iterrows():
            sub_code = sub['code']
            hierarchy[code]['subgroups'][sub_code] = {
                'label': sub['preferredLabel'],
                'occupations': []
            }

    return hierarchy

## LLMs

### Prompt

In [61]:
def create_major_group_prompt(job_ad, hierarchy):
    """Create prompt for identifying major group"""
    prompt = f"""As a job classification expert, identify the most relevant ISCO major groups for this job posting.

Job Title: {job_ad['job_title']}
Job Description: {job_ad['description']}

Available Major Groups:
{json.dumps({code: data['label'] for code, data in hierarchy.items()}, indent=2, ensure_ascii=False)}

Return the TWO most relevant major group codes with confidence scores as JSON:
[
    {{"code": "2", "confidence": 0.9}},
    {{"code": "3", "confidence": 0.7}}
]"""
    return prompt

In [62]:
def create_subgroup_prompt(job_ad, major_code, hierarchy):
    """Create prompt for identifying subgroups within major group"""
    subgroups = hierarchy[major_code]['subgroups']
    
    prompt = f"""Within the major group "{hierarchy[major_code]['label']}", identify the most relevant sub-major groups for this job.

Job Title: {job_ad['job_title']}
Job Description: {job_ad['description']}

Available Sub-major Groups:
{json.dumps({code: data['label'] for code, data in subgroups.items()}, indent=2, ensure_ascii=False)}

Return the most relevant sub-group codes with confidence scores as JSON:
[
    {{"code": "21", "confidence": 0.95}},
    {{"code": "23", "confidence": 0.6}}
]"""
    return prompt

In [63]:
def create_final_matching_prompt(job_ad, esco_jobs):
    """Create prompt for final ESCO job matching"""
    prompt = f"""Match this job posting to the most relevant ESCO jobs.

Job Title: {job_ad['job_title']}
Job Description: {job_ad['description']}

Available ESCO Jobs:
{json.dumps(esco_jobs, indent=2, ensure_ascii=False)}

Return the top 100 matches as JSON:
[
    {{"esco_id": "string", "confidence": float}},
    ...
]"""
    return prompt

In [64]:
# Example prompt using the formatted job ads:
def create_test_prompt(formatted_ads, isco_hierarchy):
    prompt = """As a job matching expert, analyze these job postings and match them to the most appropriate ESCO job classifications.

For each job posting, provide the most likely ESCO job matches in descending order of relevance.

Job Postings to Analyze:
"""
    
    for i, ad in enumerate(formatted_ads, 1):
        prompt += f"\nJob Posting {i}:\nTitle: {ad['job_title']}\nDescription: {ad['description']}\n"
    
    prompt += """\nFor each job posting, return a JSON array in this format:
{
    "job_posting_1": [
        {"esco_id": "1234.5", "confidence": 0.95},
        {"esco_id": "5678.9", "confidence": 0.85},
        ...
    ],
    ...
}
"""
    
    return prompt

### API Call

In [11]:
def get_llm_predictions(test_ads, client):
    """Get predictions from LLM for test ads"""
    predictions = []
    
    for ad in tqdm(test_ads):
        prompt = f"""Given this job posting, identify the most relevant ESCO job classification with ID.

Job Title: {ad['job_title']}
Job Description: {ad['description']}

Return your answer as a JSON array with your top 100 ESCO job IDs and confidence scores:
[
    {{"esco_id": "string", "confidence": float}},
    ...
]

Note: Return ONLY the JSON array."""

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0
            )
            
            result = {
                'job_title': ad['job_title'],
                'correct_esco_id': ad['esco_id'],
                'predictions': json.loads(response.choices[0].message.content)
            }
            predictions.append(result)
            
            time.sleep(1)  # Rate limiting
            
        except Exception as e:
            print(f"Error processing {ad['job_title']}: {e}")
    
    return predictions

In [12]:
# Save predictions
def save_predictions(predictions, filename='llm_predictions.json'):
    with open(filename, 'w') as f:
        json.dump(predictions, f, indent=2)

In [13]:
# Evaluate separately
def evaluate_predictions(predictions):
    results = []
    
    for pred in predictions:
        # Find rank of correct ID
        rank = None
        for idx, p in enumerate(pred['predictions'], 1):
            if p['esco_id'] == pred['correct_esco_id']:
                rank = idx
                break
        
        mrr = 1/rank if rank and rank <= 100 else 0
        
        results.append({
            'job_title': pred['job_title'],
            'correct_esco_id': pred['correct_esco_id'],
            'rank': rank,
            'mrr': mrr
        })
    
    return results

## Experiment

In [34]:
# Load data
job_ads = load_job_ads('../00_data/EURES/eures_testads_final_short.json')
isco_groups = load_isco_groups('../00_data/ESCO/ESCO_isco_groups.csv')
print(isco_groups.head())

isco_hierarchy = create_isco_hierarchy(isco_groups)
    
# Prepare test ads
test_ads = prepare_test_job_ads(job_ads, n_samples=5, seed=42)

  conceptType                             conceptUri  code  \
0   ISCOGroup     http://data.europa.eu/esco/isco/C0     0   
1   ISCOGroup    http://data.europa.eu/esco/isco/C01     1   
2   ISCOGroup   http://data.europa.eu/esco/isco/C011    11   
3   ISCOGroup  http://data.europa.eu/esco/isco/C0110   110   
4   ISCOGroup    http://data.europa.eu/esco/isco/C02     2   

                              preferredLabel    status  altLabels  \
0      Angehörige der regulären Streitkräfte  released        NaN   
1       Offiziere in regulären Streitkräften  released        NaN   
2       Offiziere in regulären Streitkräften  released        NaN   
3       Offiziere in regulären Streitkräften  released        NaN   
4  Unteroffiziere in regulären Streitkräften  released        NaN   

                                            inScheme  description  
0  http://data.europa.eu/esco/concept-scheme/occu...          NaN  
1  http://data.europa.eu/esco/concept-scheme/isco...          NaN  
2  http:

In [35]:
print(isco_hierarchy)

{'0': {'label': 'Angehörige der regulären Streitkräfte', 'subgroups': {}}, '1': {'label': 'Führungskräfte', 'subgroups': {'11': {'label': 'Geschäftsführer, Vorstände, leitende Verwaltungsbedienstete und Angehörige gesetzgebender Körperschaften', 'occupations': []}, '12': {'label': 'Führungskräfte im kaufmännischen Bereich', 'occupations': []}, '13': {'label': 'Führungskräfte in der Produktion und bei speziellen Dienstleistungen', 'occupations': []}, '14': {'label': 'Führungskräfte in Hotels und Restaurants, im Handel und in der Erbringung sonstiger Dienstleistungen', 'occupations': []}}}, '2': {'label': 'Akademische Berufe', 'subgroups': {'21': {'label': 'Naturwissenschaftler, Mathematiker und Ingenieure', 'occupations': []}, '22': {'label': 'Akademische und verwandte Gesundheitsberufe', 'occupations': []}, '23': {'label': 'Lehrkräfte', 'occupations': []}, '24': {'label': 'Betriebswirte und vergleichbare akademische Berufe', 'occupations': []}, '25': {'label': 'Akademische und vergleic

In [40]:
# Load environment variables from .env
load_dotenv()

# Get predictions
client = OpenAI()
predictions = get_llm_predictions(test_ads, client)

# Save predictions for later
save_predictions(predictions)

# Evaluate
results = evaluate_predictions(predictions)

# Print results
avg_mrr = sum(r['mrr'] for r in results) / len(results)
print(f"\nAverage MRR@100: {avg_mrr:.3f}")

print("\nDetailed Results:")
for r in results:
    print(f"\nJob: {r['job_title']}")
    print(f"Correct ESCO ID: {r['correct_esco_id']}")
    print(f"Rank: {r['rank']}")
    print(f"MRR: {r['mrr']:.3f}")

100%|███████████████████████████████████████████████████████████████| 5/5 [01:33<00:00, 18.73s/it]


Average MRR@100: 0.000

Detailed Results:

Job:  Fachinformatiker für Systemintegration (m/w/d) in Vollzeit (Systeminformatiker/in) 
Correct ESCO ID: 3114.1.2
Rank: None
MRR: 0.000

Job:  Projektingenieur_in Automotive Electronics 
Correct ESCO ID: 2149.2.1
Rank: None
MRR: 0.000

Job:  CNC - Dreher (m/w/d) (CNC-Dreher/in) 
Correct ESCO ID: 7223.4.4
Rank: None
MRR: 0.000

Job:  GLASER_IN 
Correct ESCO ID: 7125.1
Rank: None
MRR: 0.000

Job:  Damenkleidermacher_in als Vorarbeiter_in 
Correct ESCO ID: 7531.2
Rank: None
MRR: 0.000



