<a href="https://colab.research.google.com/github/ldsbalu/Balu-Portfolio/blob/main/Using_Transformers_to_Extract_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import torch
from transformers import pipeline
from sklearn.metrics import accuracy_score
from datasets import load_dataset

In [4]:
# Load the JobSet dataset; the two prints bracket the call so progress is visible.
dataset_id = "Crisp-Unimib/JobSet"
print("loading Dataset")
dataset = load_dataset(dataset_id)
print("loaded Dataset Sucessfully")

loading Dataset


README.md: 0.00B [00:00, ?B/s]

JobSet.csv:   0%|          | 0.00/28.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15469 [00:00<?, ? examples/s]

loaded Dataset Sucessfully


In [11]:
# Show the summary (feature names and row count) of the train split.
train_split = dataset["train"]
print(train_split)


Dataset({
    features: ['job_ad', 'esco_id', 'esco_label', 'esco_skills'],
    num_rows: 15469
})


In [37]:
# Field names to extract from each job ad. Each string is sent verbatim as the
# question to the QA pipeline and is also used as the key in the result dicts.
questions = [
    "Job Title",
    "ESCO Label",
    "ESCO Skill",
]

In [39]:
# First extractive question-answering pipeline under comparison.
qa_model = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
)

Device set to use cuda:0


In [63]:
def extract_fields(qa, ads, fields):
    """Query a question-answering callable once per field for every job ad.

    Args:
        qa: Callable invoked as ``qa(question=..., context=...)`` that returns
            a mapping with ``'answer'`` and ``'score'`` entries.
        ads: Iterable of job-ad texts used as the QA context.
        fields: Iterable of field names; each is sent as the question and used
            as the key under which the answer is stored.

    Returns:
        A list with one dict per ad holding ``"context"``, and for each field
        the predicted answer under ``field`` and its confidence under
        ``field + "_score"``.
    """
    records = []
    for ad_text in ads:
        record = {"context": ad_text}
        for field in fields:
            prediction = qa(question=field, context=ad_text)
            record[field] = prediction['answer']
            record[field + "_score"] = prediction['score']
        records.append(record)
    return records


# Analyze first 5 job ads for brevity; rows live under the "train" split.
results = extract_fields(
    qa_model,
    [dataset["train"][i]["job_ad"] for i in range(5)],
    questions,
)


In [64]:
# Tabulate the first model's extractions and average the per-field confidences.
results_df = pd.DataFrame(results)
score_columns = [f"{q}_score" for q in questions]
avg_scores = results_df[score_columns].mean()
print("Average QA Confidence Scores:")
print(avg_scores)

Average QA Confidence Scores:
Job Title_score     0.029750
ESCO Label_score    0.019425
ESCO Skill_score    0.019224
dtype: float64


In [65]:
# Second extractive question-answering pipeline, used as the comparison model.
qa_model_alt = pipeline(
    task="question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

Device set to use cuda:0


In [66]:
# Repeat the extraction with the alternative pipeline on the same first 5 ads.
results_alt = []
for idx in range(5):
    # Rows are stored under the "train" split.
    ad_text = dataset["train"][idx]["job_ad"]
    record = {"context": ad_text}
    for field in questions:
        prediction = qa_model_alt(question=field, context=ad_text)
        record[field] = prediction['answer']
        record[field + "_score"] = prediction['score']
    results_alt.append(record)

In [67]:
# Tabulate the alternative model's extractions and average the confidences.
# NOTE: this rebinds results_df and avg_scores, replacing the values computed
# from `results` in the earlier summary cell.
results_df = pd.DataFrame(results_alt)
score_columns = [f"{q}_score" for q in questions]
avg_scores = results_df[score_columns].mean()
print("Average QA Confidence Scores:")
print(avg_scores)

Average QA Confidence Scores:
Job Title_score     0.001343
ESCO Label_score    0.001353
ESCO Skill_score    0.000904
dtype: float64


In [68]:
# Hand-written reference answers, one dict per analysed job ad (same order as
# the extraction results). All five ads share the same ESCO label.
shared_esco_label = "Managing directors and chief executives"
title_and_skill = [
    ("Zoological Operations Manager", "digital marketing techniques"),
    ("Wildlife Operations Director", "manage budgets"),
    ("Animal Facility Manager", "apply strategic thinking"),
    ("Animal Facility Manager", "digital marketing techniques"),
    ("Animal Facility Manager", "establish collaborative relations"),
]
ground_truth = [
    {"Job Title": title, "ESCO Label": shared_esco_label, "ESCO Skill": skill}
    for title, skill in title_and_skill
]


In [69]:
def calculate_accuracy(extracted, truth, method_name):
    """Score extracted fields against reference values using loose substring matching.

    A field counts as correct when, after lower-casing and stripping, the
    prediction contains the reference or the reference contains the
    prediction. Empty predictions or references never count as correct.

    Args:
        extracted: Sequence of result dicts. The "context" key and keys ending
            in "_score" are skipped; every other key is a field whose value is
            the predicted text.
        truth: Sequence of reference dicts aligned by position with
            ``extracted``; each must contain every scored field of its row.
        method_name: Label printed in the report header.

    Returns:
        The fraction of scored fields counted as correct, or 0 when no field
        was scored.

    Raises:
        IndexError: If ``truth`` has fewer rows than ``extracted``.
        KeyError: If a scored field is missing from the matching truth row.
    """
    correct_count = 0
    total_count = 0
    print(f"\n--- Analysis for {method_name} ---")
    for i, row in enumerate(extracted):
        true_row = truth[i]
        for key in row:
            if key.endswith("_score") or key == "context":
                continue
            pred = str(row[key]).lower().strip()
            target = str(true_row[key]).lower().strip()
            # Loose matching to handle format differences. An empty string is a
            # substring of every string, so require both sides to be non-empty;
            # otherwise a blank prediction would be scored as a hit.
            match = bool(pred) and bool(target) and (
                (pred in target) or (target in pred)
            )
            total_count += 1
            if match:
                correct_count += 1
            else:
                print(f"Miss: Field '{key}' | Pred: '{pred}' | Truth: '{target}'")
    acc = correct_count / total_count if total_count > 0 else 0
    print(f"Accuracy: {acc * 100:.2f}%")
    return acc




In [70]:
# Pass a readable model id as the report label; passing the pipeline object
# itself prints its default repr (class path and memory address) in the header.
qa_acc = calculate_accuracy(results, ground_truth, "deepset/roberta-base-squad2")


--- Analysis for <transformers.pipelines.question_answering.QuestionAnsweringPipeline object at 0x7e0c277e28a0> ---
Miss: Field 'Job Title' | Pred: 'full-time' | Truth: 'zoological operations manager'
Miss: Field 'ESCO Label' | Pred: 'results-driven' | Truth: 'managing directors and chief executives'
Miss: Field 'ESCO Skill' | Pred: 'results-driven' | Truth: 'digital marketing techniques'
Miss: Field 'Job Title' | Pred: 'key responsibilities' | Truth: 'wildlife operations director'
Miss: Field 'ESCO Label' | Pred: 'key responsibilities' | Truth: 'managing directors and chief executives'
Miss: Field 'ESCO Skill' | Pred: 'key responsibilities' | Truth: 'manage budgets'
Miss: Field 'Job Title' | Pred: 'public face of our zoo' | Truth: 'animal facility manager'
Miss: Field 'ESCO Label' | Pred: 'public face of our zoo' | Truth: 'managing directors and chief executives'
Miss: Field 'ESCO Skill' | Pred: 'leadership skills' | Truth: 'apply strategic thinking'
Miss: Field 'ESCO Label' | Pred: 

In [71]:
# Pass a readable model id as the report label; passing the pipeline object
# itself prints its default repr (class path and memory address) in the header.
qa_acc2 = calculate_accuracy(
    results_alt,
    ground_truth,
    "distilbert/distilbert-base-cased-distilled-squad",
)


--- Analysis for <transformers.pipelines.question_answering.QuestionAnsweringPipeline object at 0x7e0c397ce030> ---
Miss: Field 'Job Title' | Pred: 'salary and benefits package' | Truth: 'zoological operations manager'
Miss: Field 'ESCO Label' | Pred: 'efficiency' | Truth: 'managing directors and chief executives'
Miss: Field 'ESCO Skill' | Pred: 'efficiency' | Truth: 'digital marketing techniques'
Miss: Field 'ESCO Label' | Pred: 'wildlife operations director' | Truth: 'managing directors and chief executives'
Miss: Field 'ESCO Skill' | Pred: 'wildlife operations director' | Truth: 'manage budgets'
Miss: Field 'ESCO Label' | Pred: 'we are seeking an exceptional animal facility manager' | Truth: 'managing directors and chief executives'
Miss: Field 'ESCO Skill' | Pred: 'we are seeking an exceptional animal facility manager' | Truth: 'apply strategic thinking'
Miss: Field 'Job Title' | Pred: 'permanent

job reference: [insert job reference]

summary' | Truth: 'animal facility manag