In [2]:
import pandas as pd

In [3]:
path='/Users/juanignaciosolerno/Downloads/julius/datasets/title_discovery_no_parsed_jt (3).csv'
df = pd.read_csv(path)
df.sample(30)


Unnamed: 0,doc_id,doc_title,doc_company,doc_location,parsed_jt,source,Job Title Present
8760,5627353,Deliver with Uber Eats,Uber,"Kansas City, KS",,linkedin,No
14165,2351896,Deliver with Uber Eats,Uber,"Bonham, TX",,linkedin,No
1695,88577b662cce2cc9,San Angelo 02 eStores - eStore Curbie - Part-Time,HEB,"San Angelo, TX 76901",,indeed,Yes
7121,7415441,Deliver with Uber Eats,Uber,"Midland, AR",,linkedin,No
9829,5573195,Deliver with Uber Eats,Uber,"Littleton, CO",,linkedin,No
11582,6400595,Deliver with Uber Eats,Uber,"Iowa City, IA",,linkedin,No
15314,119478900,Deliver with Uber Eats,Uber,"Mishawaka, IN",,linkedin,No
16201,209006767,"Coord Sr, Distribution Construction",Southern Company,"Valdosta, GA",,linkedin,No
3726,105a528ecee08be7,Pleasanton Produce - Perishables Rep - Part-Time,HEB,"Pleasanton, TX 78064",,indeed,Yes
14406,2351823,Deliver with Uber Eats,Uber,"Iona, ID",,linkedin,No


In [4]:
df.shape

(16826, 7)

In [8]:
titles_to_parse = list(set(df[df['Job Title Present']=='Yes']['doc_title']))

In [10]:
titles_to_parse[:10]

['Tomball eStore Curbie - Part-Time',
 'Kerrville 02 Market - Perishables Rep - Part-Time',
 'Stephenville Market - Perishables Rep - Part-Time',
 'Summer College Intern-Point Beach Nuclear Plant',
 'Odessa 02 Seafood - Perishables Rep - Part-Time',
 'Austin 14 eStores - eStore Curbie - Part-Time',
 'FuSa Consulting Lead - 358645*',
 'Supv-Operations-3',
 'Cust Serv Rep Pt-1664',
 'Temporary Journeyman Lineman']

In [11]:
len(titles_to_parse)

1289

In [12]:
from openai import OpenAI
openai_client = OpenAI()

In [13]:
from pydantic import BaseModel
class JobTitleExtraction(BaseModel):
    job_title: list[str]
    experience_level: list[str]
    specialization: list[str]
    location: list[str]

In [36]:
def llm_ner_job_title(job_title, client):
    if not client:
        print('No client for LLM usage')
        return None
    
    def prepare_job_title(job_title):
        if not isinstance(job_title, str):
            job_title = str(job_title)
        delimiters = ['-', '/', '_', ',', '.']
        for delimiter in delimiters:
            job_title = job_title.replace(delimiter, f' {delimiter} ')  # Separate special characters to facilitate labelling
        return job_title


    # Separe the special characters from the alphabetic characters
    job_title_prepared = prepare_job_title(job_title)
    #print(job_title_prepared)


    #chat_completion = client.chat.completions.create(
    chat_completion = client.beta.chat.completions.parse(
        model='gpt-4o-mini',
        temperature=0.0,
        seed=42,
        messages=[
            {
            "role": "system",
            "content": """
            You are an expert in Renewable Energies with specialized skills in performing Named Entity Recognition (NER) on job titles. Your task is to extract specific entities from the provided job titles, without altering the original text.

            The entities to extract are:
            - 'job_title': Words directly related to the core occupation (example: 'Wind Turbine Technician')
            - 'experience_level': Words indicating the worker's seniority or experience level (example: 'Senior', 'III'). 
            - 'location': Any words that refer to geographical locations (example: 'California')
            - 'specialization': Additional words that give specific context or specialization to the job title (example: 'offshore wind farms')

            Guidelines:
            - Your output must be a JSON object, where each key is an entity, and the values are lists of the words forming each entity.
            - Do not modify the original job title. Only include words present in the input.
            - Be careful to avoid problematic characters in the output.

            IMPORTANT
            Things or entities you shoud not include:
            - salary ('USD 1000')
            - shift, working hours or schedule ('From 8:00 to 12:00')
            - employment type ('Part - Time')
            - program type ("Summer College" in "Summer College Intern")

            Examples:
            - For the job title 'Financial Analyst II, emergent markets', return: {"job_title": ["Financial", "Analyst"], "experience_level": ["II"], "specialization": ["emergent", "markets"]}
            - For 'RS Avionics Electrical Technician Lead', return: {"job_title": ["Electrical", "Technician"], "experience_level": ["Lead"], "specialization": ["RS", "Avionics"]}
            - For 'Principal Renewables Energy Land Planner (Mid-Senior Level)', return: {"job_title": ["Land", "Planner"], "experience_level": ["Principal"], "specialization": ["Renewables", "Energy"]}

            IMPORTANT:
            - Perform NER only on the job title itself.
            - Do not include any explanations or additional text in the output—just the extracted entities in JSON format.
            """
            },
            {
                "role": "user",
                "content": f"Recognize the specified named entities on this job title: {job_title_prepared}"
            }
        ],
        response_format=JobTitleExtraction,


    )

    response_content = chat_completion.choices[0].message.content
    response_json = eval(response_content)

    return response_json

In [27]:
test  = 'Supv,Customer Experience-Billing'
response = llm_ner_job_title(job_title=test, client=openai_client)
response

{'job_title': ['Customer', 'Experience'],
 'experience_level': ['Supv'],
 'specialization': ['Billing'],
 'location': []}

In [28]:
def generate_iob_tags(job_title, ner_output):
    """
    Generate IOB tags for the given job title based on NER output.

    :param job_title: str, the raw job title string
    :param ner_output: dict, the NER output with keys 'job_title', 'experience_level', 'specialization'
    :return: list, IOB tags corresponding to the job title words
    """

    def prepare_job_title(job_title):
        if not isinstance(job_title, str):
            job_title = str(job_title)
        delimiters = ['-', '/', '_', ',', '.']
        for delimiter in delimiters:
            job_title = job_title.replace(delimiter, f' {delimiter} ')  # Separate special characters to facilitate labelling
        return job_title
    
    # Separe the special characters from the alphabetic characters
    job_title_prepared = prepare_job_title(job_title)
    
    # Split the job title into words
    job_title_tokens = job_title_prepared.split()

    # Create a list to store IOB tags
    iob_tags = ['O'] * len(job_title_tokens)

    # Helper function to mark the IOB tags
    def mark_iob(tags, tokens, label):
        for idx, token in enumerate(tokens):
            token_index = job_title_tokens.index(token)
            if idx == 0:
                tags[token_index] = f'B-{label}'  # Beginning tag
            else:
                tags[token_index] = f'I-{label}'  # Inside tag

    # Process each entity and assign corresponding IOB tags
    if 'experience_level' in ner_output:
        mark_iob(iob_tags, ner_output['experience_level'], 'exp')
    
    if 'job_title' in ner_output:
        mark_iob(iob_tags, ner_output['job_title'], 'job')
    
    if 'specialization' in ner_output:
        mark_iob(iob_tags, ner_output['specialization'], 'spe')
    
    if 'location' in ner_output:
        mark_iob(iob_tags, ner_output['location'], 'loc')

    return (job_title_prepared,iob_tags)

In [30]:
# Example usage
job_title = "Supv, Customer Experience-Billing"


ner_output = {
    "job_title": ["Customer", "Experience"],
    "experience_level": ["Supv"],
    "specialization": ["Billing"]
}

training_tuple = generate_iob_tags(job_title, ner_output)
training_tuple

('Supv ,  Customer Experience - Billing',
 ['B-exp', 'O', 'B-job', 'I-job', 'O', 'B-spe'])

In [39]:
training_tuples = []
for job_title in titles_to_parse[:10]:
    
    ner_output = llm_ner_job_title(job_title,client=openai_client)

    training_tuple = generate_iob_tags(job_title, ner_output)
    
    training_tuples.append(training_tuple)
    

In [40]:
training_tuples

[('Tomball eStore Curbie  -  Part - Time',
  ['B-loc', 'O', 'B-job', 'O', 'O', 'O', 'O']),
 ('Kerrville 02 Market  -  Perishables Rep  -  Part - Time',
  ['B-loc', 'I-loc', 'I-loc', 'O', 'B-job', 'I-job', 'O', 'O', 'O', 'O']),
 ('Stephenville Market  -  Perishables Rep  -  Part - Time',
  ['B-loc', 'O', 'O', 'B-job', 'I-job', 'O', 'O', 'O', 'O']),
 ('Summer College Intern - Point Beach Nuclear Plant',
  ['B-exp', 'I-exp', 'B-job', 'O', 'B-spe', 'I-spe', 'I-spe', 'I-spe']),
 ('Odessa 02 Seafood  -  Perishables Rep  -  Part - Time',
  ['B-loc', 'O', 'I-loc', 'O', 'B-job', 'I-job', 'O', 'O', 'O', 'O']),
 ('Austin 14 eStores  -  eStore Curbie  -  Part - Time',
  ['B-loc', 'O', 'O', 'O', 'B-job', 'I-job', 'O', 'O', 'O', 'O']),
 ('FuSa Consulting Lead  -  358645*', ['B-spe', 'B-job', 'I-job', 'O', 'O']),
 ('Supv - Operations - 3', ['B-exp', 'O', 'B-job', 'O', 'I-exp']),
 ('Cust Serv Rep Pt - 1664', ['B-job', 'I-job', 'I-job', 'B-exp', 'O', 'O']),
 ('Temporary Journeyman Lineman', ['B-exp', '

In [42]:
training_dataset = pd.DataFrame(columns=['job_title_splitted','iob_tags'],data=training_tuples)
training_dataset

Unnamed: 0,job_title_splitted,iob_tags
0,Tomball eStore Curbie - Part - Time,"[B-loc, O, B-job, O, O, O, O]"
1,Kerrville 02 Market - Perishables Rep - Pa...,"[B-loc, I-loc, I-loc, O, B-job, I-job, O, O, O..."
2,Stephenville Market - Perishables Rep - Pa...,"[B-loc, O, O, B-job, I-job, O, O, O, O]"
3,Summer College Intern - Point Beach Nuclear Plant,"[B-exp, I-exp, B-job, O, B-spe, I-spe, I-spe, ..."
4,Odessa 02 Seafood - Perishables Rep - Part...,"[B-loc, O, I-loc, O, B-job, I-job, O, O, O, O]"
5,Austin 14 eStores - eStore Curbie - Part -...,"[B-loc, O, O, O, B-job, I-job, O, O, O, O]"
6,FuSa Consulting Lead - 358645*,"[B-spe, B-job, I-job, O, O]"
7,Supv - Operations - 3,"[B-exp, O, B-job, O, I-exp]"
8,Cust Serv Rep Pt - 1664,"[B-job, I-job, I-job, B-exp, O, O]"
9,Temporary Journeyman Lineman,"[B-exp, B-job, I-job]"
