In [2]:
import time
import warnings
from pathlib import Path

import datasets
import evaluate
import numpy as np
import pandas as pd
import transformers
from evaluate import evaluator
from openvino import Core
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

from optimum.intel import OVConfig, OVModelForTokenClassification, OVQuantizationConfig, OVQuantizer

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

In [12]:
MODEL_ID = "manishiitg/resume-ner"
DATASET_NAME = ""

base_model_path = Path(f"models/{MODEL_ID}")
fp32_model_path = base_model_path.with_name(base_model_path.name + "_FP32")
int8_ptq_model_path = base_model_path.with_name(base_model_path.name + "_INT8_PTQ")

In [None]:
model = OVModelForTokenClassification.from_pretrained(MODEL_ID, export=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# See how the tokenizer for the given model converts input text to model input values
print(tokenizer("hello world!"))

In [None]:
from datasets import load_dataset

ds = load_dataset("Sachinkelenjaguri/Resume_dataset", revision= "main", data_files= "UpdatedResumeDataSet.csv")

In [9]:
def preprocess_fn(examples, tokenizer):
    """convert the text from the dataset into tokens in the format that the model expects"""
    return tokenizer(
        examples["Resume"],
        padding=True,
        truncation=True,
        max_length= 512,
        return_tensors="pt"
    )

In [6]:
dataset = ds
ds.column_names

{'train': ['Category', 'Resume']}

In [None]:
filtered_examples = dataset["train"].filter(lambda x: x["Category"] == "Data Science")
train_dataset = filtered_examples.map(lambda x: preprocess_fn(x, tokenizer), batched=True)

In [None]:
warnings.simplefilter("ignore")

# Quantize the model
quantizer = OVQuantizer.from_pretrained(model)
ov_config = OVConfig(quantization_config=OVQuantizationConfig())
quantizer.quantize(calibration_dataset=train_dataset, ov_config=ov_config, save_directory=int8_ptq_model_path)

In [None]:
quantized_model_ptq = OVModelForTokenClassification.from_pretrained(int8_ptq_model_path)
ov_qa_pipeline_ptq = pipeline("token-classification", model=quantized_model_ptq, tokenizer=tokenizer)


In [23]:
result = ov_qa_pipeline_ptq("Education Details May 2013 to May 2017 B.E UIT-RGPV Data Scientist Data Scientist - Matelabs Skill Details Python- Exprience - Less than 1 year months Statsmodels- Exprience")
for item in result:
    print(item)

{'entity': 'DATE', 'score': np.float32(0.9824913), 'index': 3, 'word': 'may', 'start': 18, 'end': 21}
{'entity': 'DATE', 'score': np.float32(0.98756987), 'index': 4, 'word': '2013', 'start': 22, 'end': 26}
{'entity': 'DATE', 'score': np.float32(0.54650015), 'index': 5, 'word': 'to', 'start': 27, 'end': 29}
{'entity': 'DATE', 'score': np.float32(0.9854757), 'index': 6, 'word': 'may', 'start': 30, 'end': 33}
{'entity': 'DATE', 'score': np.float32(0.98744106), 'index': 7, 'word': '2017', 'start': 34, 'end': 38}
{'entity': 'EducationDegree', 'score': np.float32(0.9468082), 'index': 8, 'word': 'b', 'start': 39, 'end': 40}
{'entity': 'ORG', 'score': np.float32(0.5314321), 'index': 11, 'word': 'ui', 'start': 43, 'end': 45}
{'entity': 'ORG', 'score': np.float32(0.6193985), 'index': 12, 'word': '##t', 'start': 45, 'end': 46}
{'entity': 'Designation', 'score': np.float32(0.74931425), 'index': 17, 'word': 'data', 'start': 52, 'end': 56}
{'entity': 'Designation', 'score': np.float32(0.7668792), 'i

In [24]:
def structure_resume_entities(entity_list):
    """
    Converts a flat list of entities from the HF pipeline into a structured dictionary.
    It also attempts to merge consecutive tokens of the same entity type.
    """
    if not entity_list:
        return {}

    # Sort by start index to process in order, pipeline output is usually sorted by appearance
    # but explicit sort by 'index' or 'start' is safer if there's any doubt.
    # The pipeline output you showed is already implicitly in order.
    # entity_list.sort(key=lambda x: x['start']) # Or x['index']

    structured_output = {}
    merged_entities = []
    current_entity_text = ""
    current_entity_label = None
    current_entity_score_sum = 0
    current_entity_token_count = 0
    last_end_index = -1

    for entity in entity_list:
        word = entity['word']
        label = entity['entity']
        score = float(entity['score']) # Ensure it's a Python float
        start_index = entity['start']

        # Remove "##" from subword tokens for cleaner text,
        # though pipeline with aggregation usually handles this.
        if word.startswith("##"):
            word = word[2:]
            # For direct concatenation, we might not want a space if it's a subword
            # However, the pipeline's 'word' for aggregated entities should be clean.
            # If 'word' can be a subword token, a more robust re-tokenization or
            # space-joining logic might be needed based on original text.
            # For now, assume 'word' is a complete word or subword that can be joined with space.

        if current_entity_label == label and (start_index == last_end_index or start_index == last_end_index + 1 or word.startswith("##") or not current_entity_text):
            # Continue current entity if same label and tokens are adjacent or overlapping
            # or if it's a subword token.
            # The `word.startswith("##")` is a simple heuristic for subwords.
            # A more robust check would be if the current `start_index`
            # immediately follows `last_end_index` without spaces IF original text was available.
            if current_entity_text and not word.startswith("##") and not current_entity_text.endswith("-"): # Add space if not a subword and not ending with hyphen
                current_entity_text += " "
            current_entity_text += word
            current_entity_score_sum += score
            current_entity_token_count += 1
        else:
            # New entity or different type, finalize previous one
            if current_entity_label and current_entity_text:
                avg_score = current_entity_score_sum / current_entity_token_count if current_entity_token_count > 0 else 0
                merged_entities.append({
                    "text": current_entity_text.strip(),
                    "label": current_entity_label,
                    "score": round(avg_score, 4)
                })

            # Start new entity
            current_entity_text = word
            current_entity_label = label
            current_entity_score_sum = score
            current_entity_token_count = 1

        last_end_index = entity['end']

    # Add the last processed entity
    if current_entity_label and current_entity_text:
        avg_score = current_entity_score_sum / current_entity_token_count if current_entity_token_count > 0 else 0
        merged_entities.append({
            "text": current_entity_text.strip(),
            "label": current_entity_label,
            "score": round(avg_score, 4)
        })

    # Populate the structured_output dictionary
    for item in merged_entities:
        label = item['label']
        text = item['text']
        # score = item['score'] # Optionally include score if needed in final JSON

        # Basic deduplication within each label category
        if label not in structured_output:
            structured_output[label] = []
        if text not in structured_output[label]:
            structured_output[label].append(text)
            
    # --- Attempt to create more complex structures (OPTIONAL & HEURISTIC) ---
    # This part is more complex and error-prone without more context or rules.
    # For now, let's keep the primary output flat as derived from merged_entities.
    # We can add specific post-processing for 'DATE' and 'ExperianceYears'
    
    # Post-process DATE for better readability
    if "DATE" in structured_output:
        # This is a very simple join. A more robust date parser would be better.
        # Example: "May", "2013", "to", "May", "2017" -> "May 2013 to May 2017"
        # This simple logic assumes dates appear contiguously in the `merged_entities`
        # which may not always be true if other entities interleave.
        # A better approach would be to process `merged_entities` directly for date ranges.
        
        # For now, if multiple date parts are separate, we just list them.
        # True date range combination requires looking at original text and proximity.
        pass # The current merging logic already groups adjacent dates

    # Post-process ExperianceYears
    if "ExperianceYears" in structured_output:
        # Try to form more meaningful phrases like "Less than 1 year"
        # This is highly heuristic.
        # The current merging logic groups them if they are consecutive.
        pass

    return structured_output

In [25]:
structured_json_output = structure_resume_entities(result)

import json
print(json.dumps(structured_json_output, indent=2))

{
  "DATE": [
    "may 2013 to may 2017"
  ],
  "EducationDegree": [
    "b"
  ],
  "ORG": [
    "ui t"
  ],
  "Designation": [
    "data scientist data scientist"
  ],
  "ExperianceYears": [
    "1 year months"
  ]
}
