## NER dataset annotations on job description data - October 27, 2023

### Load superset of Python libraries

In [1]:
!python3 -m pip install openai==0.27.8

Collecting openai==0.27.8
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
[0m

In [2]:
!pip3 install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0
[0m

In [3]:
import os
import re
import ast
import openai
import glob
import pandas as pd
import time
from dotenv import load_dotenv, find_dotenv
from IPython.display import display, Markdown, HTML
import warnings
warnings.filterwarnings("ignore")

In [4]:
data_df= pd.read_csv("training_data.csv")
data_df.head(5)

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [5]:
!pwd

/notebooks/gpt_exps


In [6]:
keypath= "/notebooks/gpt_exps/key.env"

print(f"openai-private key loaded: {load_dotenv(keypath)}")

openai-private key loaded: True


In [7]:
# openai_api_key = os.getenv('OPENAI_API_KEY_FOR_LAB')
openai.api_key=os.getenv('OPENAI_API_KEY_FOR_LAB')

* Test prompt & Completion

In [8]:
# Available models
# gpt-4, gpt-4-0613, gpt-4-32k, gpt-4-32k-0613, gpt-3.5-turbo, gpt-3.5-turbo-0613, gpt-3.5-turbo-16k, 
# gpt-3.5-turbo-16k-0613
model= "gpt-4-0613"
timeout=25
temperature= 0.0
prompt="What is capital of Arizona? Also tell me which GPT model/version you used to answer this prompt"
prompt, model, timeout, temperature

('What is capital of Arizona? Also tell me which GPT model/version you used to answer this prompt',
 'gpt-4-0613',
 25,
 0.0)

In [9]:
def find_word_indices(text, tags):
    # Constructing a pattern to match any of the specified tags
    tag_pattern = "|".join(tags)
    pattern = re.compile(f"<({tag_pattern})>(.*?)</\\1>")
    
#     original_indices = []
    adjusted_index_results = []
    offset = 0
    for match in pattern.finditer(text):
        ent_name = match.group(1)
#         name = match.group(2)# 'match.group(2)' contains the matched word/phrase.
#         'match.start(2)' and 'match.end(2)' give the start and end indices of the matched word/phrase.
        i, j= match.start(2), match.end(2)
        offset+=len(ent_name)+2# Counts for opening anchor
#         print(i, j, offset, text[i:j], txt[i-offset:j-offset])
#         original_indices.append((match.start(2), match.end(2), match.group(2), match.group(1)))
        adjusted_index_results.append((i-offset, j-offset, ent_name))# name can be included for debugging entity exact names
        offset+=len(ent_name)+3# Counts for closing anchor
    
#     Removing all types of tags from the text
    cleaned_text = re.sub(r"<[^>]+>", "", text)
#     cleaned_text = re.sub(r"<(/?\w+)>", "", text)# Just HTML type tags

    return cleaned_text, adjusted_index_results

def get_completion(prompt, model ='gpt-3.5-turbo-16k', timeout=10, temperature= 0.0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        request_timeout=timeout,
        temperature=temperature, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

In [10]:
output= get_completion(prompt, model= "gpt-4", timeout=30)
output

"The capital of Arizona is Phoenix. I used OpenAI's GPT-3 model to answer this prompt."

In [12]:
all_jobtitles=data_df["position_title"].tolist()#unique()
# all_jobtitles_string= ", ".join(all_jobtitles)
print(f"Count of all jobtitles - {len(all_jobtitles)} | {all_jobtitles[:3]}")

Count of all jobtitles - 853 | ['Sales Specialist', 'Apple Solutions Consultant', 'Licensing Coordinator - Consumer Products']


In [13]:
select_job_titles= ['Performance Marketing Specialist, Paid Media', 'Software Engineer - Reno, NV', 'Software Engineer', 
                    'Entry Level Software Engineer', 'Associate Software Engineering', 'Software Engineer', 'Software Engineer',
                    'Software Engineer (JavaScript Backend)', 'Senior Software Engineer', 'Sales Specialist', 'Apple Solutions Consultant',
                    'Web Designer', 'Web Developer', 'Frontend Web Developer', 'Remote Website Designer', 'Web Designer', 'SR. Web Designer',
                    'Web Developer', 'Senior UI Designer', 'Wordpress Web Developer', 'UI Web Designer', 'Full Stack Web Developer', 
                    'Web Developer', 'Web Developer (Front-End)', 'Front-End Engineer/UI Developer', 'Software Engineer (Web)', 
                    'Web Developer', 'Interns (Web Developers Mobile Developers Game Developers)', 'Supply Chain Analyst', 'Supply Chain Analyst IV',
                    'Supply Chain Analyst (ED)', 'Analyst II, Supply Chain - Corporate - US (Open)', 'Supply Chain Analyst - Senior', 'Supply Chain Analyst',
                    'Business Analyst- Supply Chain', 'Software Developer - New Graduate', 'Software Engineer (US-Remote)', 'Software Developer', 
                    'Software Developer', 'Senior Software Developer- Full Stack (Remote)', 'Full Stack Software Engineer', 'Software Developer', 
                    'Software Engineer (Remote)', 'Senior Manager, Test', 'Testing Monitor (DGE) - Part Time (NYC)', 'Firmware Test Engineer', 
                    'Test Center Monitor', 'Software Engineer, Test, Advertising', 'Bulk Product Testing Coordinator', 'Salesforce.com QA Tester',
                    'Test Developer', 'Sr analyst compliance', 'Fraud Resolution Analyst', 'Fraud Systems Analyst (REMOTE OPPORTUNITY)', 'Software Engineer (Remote)',
                    'Remote Software Developer (Remote)', 'Software Engineer I ( 100% Remote)', 'Software Engineer (remote)', 'Senior / Lead Software Engineer (Remote)',
                    'Senior Software Engineer -  Virtualization Group - (Remote)', 'Senior Full Stack Software Engineer- Remote', 'Java Software Engineer - Mid Level (REMOTE)',
                    'Information Technology System Administrator', 'Systems Administrator, IT (Brooklyn, NY)', 'Systems Administrator, IT (Brooklyn, NY)',
                    'Network and Systems Administrator (On-Site ~ NOT REMOTE)', 'Network and Systems Administrator', 'It Systems Administrator Ii', 'Network Systems Admin', 
                    'Systems Administrator', 'Data Analyst/Manager', 'Manager, Data Analytics', 'Data Analyst Manager', 'Senior Data Analyst', 'Analytics Manager', 
                    'Marketing Analytics Manager', 'Data Management Analyst for Netsmart', 'Associate Project Manager, IVD Product Development (San Diego, CA)', 
                    'Sr. Product Marketing Manager - Growth, Music Industry', 'Marketing Program Manager, B2B', 'Systems Analyst - Appserver Admin', 'Senior Embedded C++ Developer',
                    'C++ Embedded Engineer', 'Specialist, Embedded C++ Software Engineer', 'Application Development -L6', 'Project Manager', 'Data Analytics Senior Mangager', 
                    'Senior Analyst, ECommerce, Search Marketing', 'Dir Software Solutions Sales Senior', 'Business Analyst, ACS Publications Sales', 'Software Developer- Entry Level',
                    'Business Analyst (Entry Level)', 'Software test Engineer ( Entry Level )', 'System Software Development Engineer - Entry Level', 
                    'System Administrator', 'IT Systems Administrator', 'Search optimization', 'Senior Systems Administrator']

print(f"Count of select jobtitles - '{len(select_job_titles)}' vs. '{len(all_jobtitles)}' | {select_job_titles[:3]}")
all_jobtitles=select_job_titles#data_df["position_title"].tolist()#unique()
all_jobtitles_string= ", ".join(all_jobtitles)


Count of select jobtitles - '98' vs. '853' | ['Performance Marketing Specialist, Paid Media', 'Software Engineer - Reno, NV', 'Software Engineer']


In [14]:
data_df= data_df.reset_index()# Add index columns
data_technical_roles_df = data_df[data_df["position_title"].isin(all_jobtitles)]#.shape
data_other_roles_df = data_df[~data_df["position_title"].isin(all_jobtitles)]

part_data_for_combined_roles_df = pd.concat([data_technical_roles_df, data_other_roles_df.iloc[:200]], axis=0)
part_data_for_combined_roles_df.reset_index(drop=True, inplace=True)

print("JD counts for All roles(original data) - ", len(data_df), "| JD counts for Tech roles - ", 
      len(data_technical_roles_df), "| JD counts for other roles - ", len(data_other_roles_df),  "| JD counts for combined roles(final pdf) - ", len(part_data_for_combined_roles_df))
part_data_for_combined_roles_df.head(2)

JD counts for All roles(original data) -  853 | JD counts for Tech roles -  113 | JD counts for other roles -  740 | JD counts for combined roles(final pdf) -  313


Unnamed: 0,index,company_name,job_description,position_title,description_length,model_response
0,0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."


In [15]:
all_tags= ["key-skills", "generic-skills", "degree-level", "degree-name", "university", "years-of-experience",
           "visa-sponsorship", "qualification", "job-profile", "location", 
           "payscale", "publication-requirement", "workmode"]
print("Number of tags in NER problem - ", len(all_tags))

Number of tags in NER problem -  13


In [16]:
jd_text = data_df["job_description"][0]
jd_text= re.sub(r'\n', ' ', jd_text)
print("Sample Job description from dataset - \n\n", jd_text)

Sample Job description from dataset - 

 minimum qualifications bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles preferred qualifications  years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills about the job as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our pr

In [17]:
query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""
print("Possible token count in each query - ", len(query.split(" ")))
print("\nSample prompt used to annotate each Job description from dataset - \n\n", query)


Possible token count in each query -  1187

Sample prompt used to annotate each Job description from dataset - 

 You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "Performance Marketing Specialist, Paid Media, Software Engineer - Reno, NV, Software Engineer, Entry Level Software Engineer, Associate Software Engineering, Software Engineer, Software Engineer, Software Engineer (JavaScript Backend), Senior Software Engineer, Sales Specialist, Apple Solutions Consultant, Web Designer, Web Developer, Frontend Web Developer, Remote Website Designer, Web Designer, SR. Web Designer, Web Developer, Senior UI Designer, Wordpress Web Developer, UI Web Designer, Full Stack Web Developer, Web Developer, Web Developer (Front-End), Front-End Engineer/UI Developer, Software Engineer (Web), Web Developer, Interns (Web Developers Mobile Developers Game Developers), Supply Cha

In [18]:
output= get_completion(query, model= "gpt-4", timeout=60)
print("Sample output completion from ChatGPT API with annotation tags for each Job description from dataset - \n\n", output)

Sample output completion from ChatGPT API with annotation tags for each Job description from dataset - 

  minimum qualifications <qualification>bachelors degree</qualification> or equivalent practical experience <years-of-experience>years of experience</years-of-experience> in <key-skills>saas</key-skills> or <key-skills>productivity tools business</key-skills>experience managing <job-profile>enterprise accounts</job-profile> with <key-skills>sales cycles</key-skills> preferred qualifications  <years-of-experience>years of experience</years-of-experience> building strategic business partnerships with <job-profile>enterprise customers</job-profile>ability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent <gene

* Now iterating over each value in column job_description and pass it along with above sample prompt to get completion

In [19]:
part_data_for_combined_roles_df.head(2)

Unnamed: 0,index,company_name,job_description,position_title,description_length,model_response
0,0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."


* On an average there are 5900-7500 tokens in every query passed to chat-gpt

In [None]:
# token_counts= []
# for row in data_df.iterrows():
#     print(f"Processing input prompt for row: {row[0]}")
# #     try:
#     jd_text = row[1]["job_description"]
#     jd_text= re.sub(r'\n', ' ', jd_text)
#     query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""
#     token_counts.append(len(nlp(query)))
# #     print(query.split())


# final_output=[]
# final_entities= []
# issues_list= []
# t1= time.time()
# for row in data_df.iterrows():
#     if row[0]>0:
#         print(f"Processing input prompt for row: {row[0]}")
#         try:
#             jd_text = row[1]["job_description"]
#             jd_text= re.sub(r'\n', ' ', jd_text)
#             query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""

#             output= get_completion(query, model= "gpt-4", timeout=120)
#             output= ast.literal_eval(output)# clean the completion output from GPT
#             final_output.append(output)
#             cleaned_text, entities = find_word_indices(output, all_tags)
#             final_entities.append((cleaned_text, {"entities":entities}))# With formatted text 'output' removing <tags>
#             print("Process completed!")
#         except Exception as e:
#             print(f"\n__________\nError processing input text for row: {row[0]} with error : {e}")
#             issues_list.append(row[1])
#         time.sleep(50)# 10 ms
#         print("Here")
#         if row[0]==4:
#             break
# print("\n\n_____\n Total execution time : ", time.time()-t1)

In [20]:
data_to_process = part_data_for_combined_roles_df.copy()
print("total number of job descriptions- ", len(list(data_to_process.iterrows())))
print("Estimated Time (Hours) in processing entities for all datapoints: ", round((853*(50+45))/3600))

print("Possible token count in each query - ", len(query.split(" ")))

# data_to_process= data_to_process.iloc[:2].copy()
print("Complete dataset shape - ", data_to_process.shape)
data_to_process.head(2)

total number of job descriptions-  313
Estimated Time (Hours) in processing entities for all datapoints:  23
Possible token count in each query -  1187
Complete dataset shape -  (313, 6)


Unnamed: 0,index,company_name,job_description,position_title,description_length,model_response
0,0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."


In [21]:
final_output=[]
final_entities= []
issues_list= []
t1= time.time()
logs_str= "" 
with open("data_preparation_logs.txt", "w") as file:
    for row in data_to_process.iterrows():
        print(f"Processing input prompt for row: {row[0]}")
        log_line=f"____________\nProcessing input prompt for row: {row[0]}\n"
        logs_str+= log_line
        try:
            jd_text = row[1]["job_description"]
            jd_text= re.sub(r'\n', ' ', jd_text)
            query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""
            log_line=f"\n Formatted the querry! \n"
            logs_str+= log_line

            output= get_completion(query, model= "gpt-4", timeout=120)
            log_line=f"\n ChatGPT Completion generated for given querry! \n"
            logs_str+= log_line
            final_output.append((query, output))

            output= ast.literal_eval(output)# clean the completion output from GPT

            cleaned_text, entities = find_word_indices(output, all_tags)
            log_line=f"\n All Entities indexed for generated completion of given querry. \n"
            logs_str+= log_line
            final_entities.append((cleaned_text, {"entities":entities}))# With formatted text 'output' removing <tags>
        except Exception as e:
            print(f"\n__________\nError processing input text for row: {row[0]} with error : {e}")
            log_line=f"\n__________\nError processing input text for row: {row[0]} with error : {e} \n"
            logs_str+= log_line
            issues_list.append(row[1])
        time.sleep(50)# Wait 50 seconds after each call
    log_line= f"_____\nTotal execution time- {time.time()-t1}"
    logs_str+= log_line
    file.write(logs_str)

print("\n\n_____\n Total execution time : ", time.time()-t1)
file.close()

final_entities_df =pd.DataFrame(final_entities, columns=["text", "entity_values"])
final_entities_df.to_csv("job_description_ner_dataset[final].csv", index=False)# Save the annotated dataset
print("\nSaving final dataframe as 'job_description_ner_dataset[final].csv' ")

prompt_and_gpt_output_df =pd.DataFrame(final_output, columns=["prompt", "gpt_completion"])
prompt_and_gpt_output_df.to_csv("job_description_ner_dataset[gpt_results].csv", index=False)# Save the annotated dataset
print("\nSaving final dataframe as 'job_description_ner_dataset[gpt_results].csv' ")

if issues_list:
    issue_entities_df =pd.DataFrame(issues_list)
    issue_entities_df.to_csv("job_description_ner_dataset[issues].csv", index=False)# Save the annotated dataset
    print("\nSaving final dataframe as 'job_description_ner_dataset[issues].csv' ")

Processing input prompt for row: 0
Processing input prompt for row: 1
Processing input prompt for row: 2
Processing input prompt for row: 3
Processing input prompt for row: 4
Processing input prompt for row: 5
Processing input prompt for row: 6
Processing input prompt for row: 7
Processing input prompt for row: 8
Processing input prompt for row: 9
Processing input prompt for row: 10
Processing input prompt for row: 11
Processing input prompt for row: 12
Processing input prompt for row: 13
Processing input prompt for row: 14
Processing input prompt for row: 15
Processing input prompt for row: 16

__________
Error processing input text for row: 16 with error : unexpected EOF while parsing (<unknown>, line 1)
Processing input prompt for row: 17
Processing input prompt for row: 18
Processing input prompt for row: 19
Processing input prompt for row: 20
Processing input prompt for row: 21

__________
Error processing input text for row: 21 with error : invalid syntax (<unknown>, line 1)
Proc

In [22]:
final_entities_df.head(5)

Unnamed: 0,text,entity_values
0,minimum qualifications bachelors degree or equ...,"{'entities': [(23, 39, 'qualification'), (75, ..."
1,description as an asc you will be highly influ...,"{'entities': [(18, 21, 'job-profile'), (89, 10..."
2,description web designers looking to expand y...,"{'entities': [(13, 26, 'job-profile'), (80, 11..."
3,designups is a nashville based design and inte...,"{'entities': [(139, 145, 'key-skills'), (181, ..."
4,about the position the web designer is respon...,"{'entities': [(24, 36, 'job-profile'), (146, 1..."


In [23]:
final_entities_df.shape

(240, 2)

* Re-run gpt for remaining job descriptions

In [24]:
remaining_jd_df = pd.read_csv("job_description_ner_dataset[issues].csv")
print("Shapes for remaining JD df - ", remaining_jd_df.shape)
remaining_jd_df.head(3)

Shapes for remaining JD df -  (73, 6)


Unnamed: 0,index,company_name,job_description,position_title,description_length,model_response
0,4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."
1,30,Epic,tldr\n\nhighimpact software development jobs f...,Software Engineer,3436,"{\n ""Core Responsibilities"": ""Write software..."
2,35,Blockchains,job description\nour vision\n\nin the new and ...,Software Engineer (JavaScript Backend),6958,"{\n ""Core Responsibilities"": ""Defining and i..."


In [25]:
data_to_process = remaining_jd_df.copy()
print("total number of job descriptions- ", len(list(data_to_process.iterrows())))

# data_to_process= data_to_process.iloc[:2].copy()
print("Complete dataset shape - ", data_to_process.shape)
data_to_process.head(2)

total number of job descriptions-  73
Complete dataset shape -  (73, 6)


Unnamed: 0,index,company_name,job_description,position_title,description_length,model_response
0,4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."
1,30,Epic,tldr\n\nhighimpact software development jobs f...,Software Engineer,3436,"{\n ""Core Responsibilities"": ""Write software..."


In [None]:
final_output=[]
final_entities= []
issues_list= []
t1= time.time()
logs_str= "" 
with open("data_preparation_logs.txt", "w") as file:
    for row in data_to_process.iterrows():
        print(f"Processing input prompt for row: {row[0]}")
        log_line=f"____________\nProcessing input prompt for row: {row[0]}\n"
        logs_str+= log_line
        try:
            jd_text = row[1]["job_description"]
            jd_text= re.sub(r'\n', ' ', jd_text)
            query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""
            log_line=f"\n Formatted the querry! \n"
            logs_str+= log_line

            output= get_completion(query, model= "gpt-4", timeout=120)
            log_line=f"\n ChatGPT Completion generated for given querry! \n"
            logs_str+= log_line
            final_output.append((query, output))

            output= ast.literal_eval(output)# clean the completion output from GPT

            cleaned_text, entities = find_word_indices(output, all_tags)
            log_line=f"\n All Entities indexed for generated completion of given querry. \n"
            logs_str+= log_line
            final_entities.append((cleaned_text, {"entities":entities}))# With formatted text 'output' removing <tags>
        except Exception as e:
            print(f"\n__________\nError processing input text for row: {row[0]} with error : {e}")
            log_line=f"\n__________\nError processing input text for row: {row[0]} with error : {e} \n"
            logs_str+= log_line
            issues_list.append(row[1])
        time.sleep(50)# Wait 50 seconds after each call
    log_line= f"_____\nTotal execution time- {time.time()-t1}"
    logs_str+= log_line
    file.write(logs_str)

print("\n\n_____\n Total execution time : ", time.time()-t1)
file.close()

final_entities_df =pd.DataFrame(final_entities, columns=["text", "entity_values"])
final_entities_df.to_csv("job_description_ner_dataset[final].csv", index=False)# Save the annotated dataset
print("\nSaving final dataframe as 'job_description_ner_dataset[final].csv' ")

prompt_and_gpt_output_df =pd.DataFrame(final_output, columns=["prompt", "gpt_completion"])
prompt_and_gpt_output_df.to_csv("job_description_ner_dataset[gpt_results].csv", index=False)# Save the annotated dataset
print("\nSaving final dataframe as 'job_description_ner_dataset[gpt_results].csv' ")

if issues_list:
    issue_entities_df =pd.DataFrame(issues_list)
    issue_entities_df.to_csv("job_description_ner_dataset[issues].csv", index=False)# Save the annotated dataset
    print("\nSaving final dataframe as 'job_description_ner_dataset[issues].csv' ")

Processing input prompt for row: 0
Processing input prompt for row: 1

__________
Error processing input text for row: 1 with error : unexpected EOF while parsing (<unknown>, line 1)
Processing input prompt for row: 2


In [None]:
# final_output=[]
# final_entities= []
# issues_list= []
# t1= time.time()
# logs_str= "" 
# with open("data_preparation_logs.txt", "w") as file:
#     for row in data_to_process.iterrows():
#         print(f"Processing input prompt for row: {row[0]}")
#         log_line=f"____________\nProcessing input prompt for row: {row[0]}\n"
#         logs_str+= log_line
#         try:
#             jd_text = row[1]["job_description"]
#             jd_text= re.sub(r'\n', ' ', jd_text)
#             query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""
#             log_line=f"\n Formatted the querry! \n"
#             logs_str+= log_line

#             output= get_completion(query, model= "gpt-4", timeout=120)
#             log_line=f"\n ChatGPT Completion generated for given querry! \n"
#             logs_str+= log_line
#             output= ast.literal_eval(output)# clean the completion output from GPT
#             final_output.append(output)
#             cleaned_text, entities = find_word_indices(output, all_tags)
#             log_line=f"\n All Entities indexed for generated completion of given querry. \n"
#             logs_str+= log_line
#             final_entities.append((cleaned_text, {"entities":entities}))# With formatted text 'output' removing <tags>
#         except Exception as e:
#             print(f"\n__________\nError processing input text for row: {row[0]} with error : {e}")
#             log_line=f"\n__________\nError processing input text for row: {row[0]} with error : {e} \n"
#             logs_str+= log_line
#             issues_list.append(row[1])
#         time.sleep(50)# Wait 50 seconds after each call
#     log_line= f"_____\nTotal execution time- {time.time()-t1}"
#     logs_str+= log_line
#     file.write(logs_str)

# print("\n\n_____\n Total execution time : ", time.time()-t1)
# file.close()

# final_entities_df =pd.DataFrame(final_entities, columns=["text", "entity_values"])
# final_entities_df.to_csv("job_description_ner_dataset[final].csv", index=False)# Save the annotated dataset
# print("\nSaving final dataframe as 'job_description_ner_dataset[final].csv' ")

# issue_entities_df =pd.DataFrame(issues_list)
# final_entities_df.to_csv("job_description_ner_dataset[issues].csv", index=False)# Save the annotated dataset
# print("\nSaving final dataframe as 'job_description_ner_dataset.csv' ")

Processing input prompt for row: 0
Processing input prompt for row: 1
Processing input prompt for row: 2

__________
Error processing input text for row: 3 with error : invalid syntax (<unknown>, line 1)
Processing input prompt for row: 4
Processing input prompt for row: 5
Processing input prompt for row: 6

__________
Error processing input text for row: 6 with error : EOL while scanning string literal (<unknown>, line 1)
Processing input prompt for row: 7
Processing input prompt for row: 8

__________
Error processing input text for row: 8 with error : Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=120)
Processing input prompt for row: 9
Processing input prompt for row: 10

__________
Error processing input text for row: 10 with error : invalid syntax (<unknown>, line 1)
Processing input prompt for row: 11

__________
Error processing input text for row: 11 with error : invalid syntax (<unknown>, line 1)
Processing input prompt 

In [18]:
# final_entities_df =pd.DataFrame(final_entities, columns=["text", "entity_values"])
# final_entities_df.to_csv("job_description_ner_dataset.csv", index=False)# Save the annotated dataset

final_entities_df.head(5)

Unnamed: 0,text,entity_values
0,minimum qualifications bachelors degree or equ...,"{'entities': [(23, 39, 'qualification'), (75, ..."
1,design develop and test high quality software ...,"{'entities': [(119, 182, 'key-skills'), (183, ..."
2,hey just wanted you to check out this job let ...,{'entities': []}
3,inventory clerk grocery part time inventory cl...,{'entities': []}


In [None]:
# final_entities_df= pd.read_csv("job_description_ner_dataset.csv")# Read saved NER dataset

final_entities = [tuple(row) for row in final_entities_df.itertuples(index=False)]
# Extract the list of tuples back in Original Spacy NER format

* Build a chracteristic table of entities for each job description.

_____________________________________

_____

In [465]:
final_entities[0][0][98:102], final_entities[0][0][661:688]

('saas', 'writtenverbal communication')

In [None]:
# Sample Entities Training example- 
#  [
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]}),
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]})
# ]

# # From Spacy's. documentation - "https://spacy.io/api/data-formats"

# # Training data for an entity recognizer (option 1)
# doc = nlp("Laura flew to Silicon Valley.")
# gold_dict = {"entities": ["U-PERS", "O", "O", "B-LOC", "L-LOC"]}
# example = Example.from_dict(doc, gold_dict)

# # Training data for an entity recognizer (option 2)
# doc = nlp("Laura flew to Silicon Valley.")
# gold_dict = {"entities": [(0, 5, "PERSON"), (14, 28, "LOC")]}
# example = Example.from_dict(doc, gold_dict)

In [None]:
# # text = "<obj>Apple</obj> is a fruit. <per>John</per> and <per>Mary</per> went to the <plc>park</plc>."
# text = "Apple is a fruit. John and Mary went to the park."
# # # Specifying tags in a list
# # tags = ["obj", "per", "plc", "loc"]

# # find_word_indices(output, all_tags)
# txt, ent = find_word_indices(text, tags)
# txt, ent

In [331]:
# # find_word_indices(output, all_tags)
# txt, ent = find_word_indices(text, tags)
# text, txt, ent

('<obj>Apple</obj> is a fruit. <per>John</per> and <per>Mary</per> went to the <plc>park</plc>.',
 'Apple is a fruit. John and Mary went to the park.',
 [(5, 10, 'obj'), (23, 27, 'per'), (32, 36, 'per'), (49, 53, 'plc')])

In [354]:
# i, j=32, 36
# p, q= 54,58
# np, nq= p-27, q-27
# txt[i:j], text[p:q], txt[np:nq]

('went', 'Mary', 'Mary')

In [377]:
# i, j=23, 27
# p, q= 34,38
# np, nq= 34-16, 38-16
# txt[i:j], text[p:q], txt[np:nq]

('and ', 'John', 'John')

In [366]:
# p, q, txt[p:q]

(18, 22, 'John')

In [337]:
# txt[5:10], text[5:10]

(' is a', 'Apple')

In [338]:
# tags
# tag_pattern = "|".join(tags)
# pattern = re.compile(f"<({tag_pattern})>(.*?)</\\1>")
# tag_pattern, pattern

('obj|per|plc|loc', re.compile(r'<(obj|per|plc|loc)>(.*?)</\1>', re.UNICODE))

In [356]:
# text, txt

('<obj>Apple</obj> is a fruit. <per>John</per> and <per>Mary</per> went to the <plc>park</plc>.',
 'Apple is a fruit. John and Mary went to the park.')

In [378]:
# original_indices = []
# offset= 0
# for match in pattern.finditer(text):
#     ent_name = match.group(1)
#     i, j= match.start(2), match.end(2)
#     name = match.group(2)
#     offset+=len(ent_name)+2
#     print(i, j, offset, text[i:j], txt[i-offset:j-offset])
#     original_indices.append((i, j, name, ent_name))
#     offset+=len(ent_name)+3
# original_indices

5 10 5 Apple Apple
34 38 16 John John
54 58 27 Mary Mary
82 86 38 park park


[(5, 10, 'Apple', 'obj'),
 (34, 38, 'John', 'per'),
 (54, 58, 'Mary', 'per'),
 (82, 86, 'park', 'plc')]

In [386]:
# text= sentence
# text

'<obj>Book</obj>s spread knowledge. <per>Emily</per> adores the <obj>library</obj>. She met <per>Dylan</per> there. They discovered a hidden <plc>room</plc> and found an ancient <obj>map</obj>, leading to a secret <loc>site</loc>.'

In [388]:
# txt, ent = find_word_indices(text, tags)
# txt

'Books spread knowledge. Emily adores the library. She met Dylan there. They discovered a hidden room and found an ancient map, leading to a secret site.'

In [394]:
# original_indices = []
# offset= 0
# for match in pattern.finditer(text):
#     ent_name = match.group(1)
#     i, j= match.start(2), match.end(2)
#     name = match.group(2)
#     offset+=len(ent_name)+2
#     print(i, j, offset, text[i:j], txt[i-offset:j-offset])
#     original_indices.append((i-offset, j-offset, name, ent_name))
#     offset+=len(ent_name)+3
# original_indices

5 9 5 Book Book
40 45 16 Emily Emily
68 75 27 library library
96 101 38 Dylan Dylan
145 149 49 room room
182 185 60 map map
218 222 71 site site


[(0, 4, 'Book', 'obj'),
 (24, 29, 'Emily', 'per'),
 (41, 48, 'library', 'obj'),
 (58, 63, 'Dylan', 'per'),
 (96, 100, 'room', 'plc'),
 (122, 125, 'map', 'obj'),
 (147, 151, 'site', 'loc')]

In [381]:
# # Tags are specified
# tags = ["obj", "per", "plc", "loc"]

# # Constructing a sentence with the specified tags and a word limit
# sentence = f"<{tags[0]}>Book</{tags[0]}>s spread knowledge. <{tags[1]}>Emily</{tags[1]}> adores the <{tags[0]}>library</{tags[0]}>. She met <{tags[1]}>Dylan</{tags[1]}> there. They discovered a hidden <{tags[2]}>room</{tags[2]}> and found an ancient <{tags[0]}>map</{tags[0]}>, leading to a secret <{tags[3]}>site</{tags[3]}>."

# print(sentence)


<obj>Book</obj>s spread knowledge. <per>Emily</per> adores the <obj>library</obj>. She met <per>Dylan</per> there. They discovered a hidden <plc>room</plc> and found an ancient <obj>map</obj>, leading to a secret <loc>site</loc>.


In [368]:
# original_indices = []
# offset= 0
# for match in pattern.finditer(text):
#     ent_name = match.group(1)
# #     offset+=len(ent_name)+2
# #     print(offset, offset)
#     original_indices.append((match.start(2), match.end(2), match.group(2), ent_name))
# original_indices

[(5, 10, 'Apple', 'obj'),
 (34, 38, 'John', 'per'),
 (54, 58, 'Mary', 'per'),
 (82, 86, 'park', 'plc')]

In [None]:
# [
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]}),
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]})
# ]

In [None]:
# jd_text = data_df["job_description"][0]
# jd_text= re.sub(r'\n', ' ', jd_text)

# query= f"""You are an Experienced Talent Acquisition Specialist with 20 years of experience in recruiting across various Technical and Non-Technical industries for different roles such as - "{", ".join(all_jobtitles)}", Now understand this job description well considering your experience for listed jobroles- "{jd_text}", and tag the words or expressions (group of words) within the given JD in one or all of the following tags - "{", ".join(all_tags)}", whichever are applicable, and return the entire given Job description with relevant tags as shown in the example format for a short Job Description (Note: PLEASE DO NOT TRUNCATE THE JOB DESCRIPTION. DO NOT DROP ANY WORDS FROM JD IN THE OUTPUT, JUST ADD THE RELEVANT APPLICABLE TAGS as shown in the example here). Example output for a short JD - 'The candidate must have <qualification>completed Bachelors</qualification>, Currently in their <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, with <years-of-experience>3+ years of related work experience</years-of-experience>, or <years-of-experience>equivalent work experience</years-of-experience> in <key-skills>C++</key-skills>, <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>, Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>, <key-skills>large-scale recommendation system</key-skills>. The candidate should also have <years-of-experience>4 years of programming experience</years-of-experience> in total. The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>. Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>'. where "key-skills" looks for and tags the core required skills(programming language, platform, software) for that particular job role which are mandatorily to have for that job, "generic-skills" looks for and tags generic non-technical skills(leadership, collaboration, communication) losely related to the job which are considered good to have, "degree-level" tags targeted degree requirements, "degree-name" tags names of eligible degrees/courseworks, "university" tags names of university preferred or targeted, "years-of-experience" looks for and tags the number of years of experience(mentioned in numeric or alphanumeric format ex- 4 years, five years, equivalent experience etc.) preferred, "visa-sponsorship" looks for and tags whether the candidate can be sponsored by company, "qualification" looks for and tags minimum required degree/qualification, "job-profile" tags the name of existing jobrole or desired job profiles from previous work experience, "location" tags the location of job, "payscale" tags the range of salary/pay, "publication-requirement" tags the preferred research publication/conferences, "workmode" tags the mode of work at job(in-person, remote or hybrid)."""


# # output= get_completion(query, model= "gpt-4", timeout=40)

# output= ast.literal_eval(output)

# entities, ent_names = find_word_indices(output, all_tags)


In [215]:
# output= ast.literal_eval(output)
# output

'minimum qualifications <qualification>bachelors degree</qualification> or equivalent practical experience <years-of-experience>years of experience</years-of-experience> in <key-skills>saas</key-skills> or <key-skills>productivity tools business</key-skills>experience managing <job-profile>enterprise accounts</job-profile> with sales cycles preferred qualifications  <years-of-experience>years of experience</years-of-experience> building strategic business partnerships with <job-profile>enterprise customers</job-profile>ability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent <generic-skills>account management</generic-skills> <generic-skills>writtenverbal communication</generic-skills> <generic-skills>strateg

In [217]:
# entities, ent_names = find_word_indices(output, all_tags)

([(38, 54, 'qualification'),
  (127, 146, 'years-of-experience'),
  (184, 188, 'key-skills'),
  (217, 244, 'key-skills'),
  (290, 309, 'job-profile'),
  (389, 408, 'years-of-experience'),
  (490, 510, 'job-profile'),
  (879, 897, 'generic-skills'),
  (931, 958, 'generic-skills'),
  (992, 1031, 'generic-skills')],
 [(38, 54, 'bachelors degree'),
  (127, 146, 'years of experience'),
  (184, 188, 'saas'),
  (217, 244, 'productivity tools business'),
  (290, 309, 'enterprise accounts'),
  (389, 408, 'years of experience'),
  (490, 510, 'enterprise customers'),
  (879, 897, 'account management'),
  (931, 958, 'writtenverbal communication'),
  (992, 1031, 'strategic and analyticalthinking skills')])

In [87]:
# [
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]}),
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]})
# ]



# find_word_indices(output, all_tags)

([(32, 48, 'degree'),
  (114, 133, 'years-of-experience'),
  (171, 175, 'key-skills'),
  (204, 231, 'key-skills'),
  (349, 368, 'years-of-experience'),
  (812, 830, 'generic-skills'),
  (865, 892, 'generic-skills'),
  (927, 966, 'generic-skills')],
 [(32, 48, 'bachelors degree'),
  (114, 133, 'years of experience'),
  (171, 175, 'saas'),
  (204, 231, 'productivity tools business'),
  (349, 368, 'years of experience'),
  (812, 830, 'account management'),
  (865, 892, 'writtenverbal communication'),
  (927, 966, 'strategic and analyticalthinking skills')])

In [None]:
# The candidate must have <qualification>completed Bachelors</qualification>, Currently in their 
# <degree-level>Masters</degree-level> or <degree-level>Phd</degree-level> degree 
# in <degree-name>computer science</degree-name>, <degree-name>data science</degree-name> or a related major, 
# with <years-of-experience>3+ years of related work experience</years-of-experience> in <key-skills>C++</key-skills>, 
# <key-skills>Python</key-skills>, Strong <generic-skills>communication, writing and teamwork skills</generic-skills>,
# Must have <key-skills>applied machine learning</key-skills>, <key-skills>machine learning infrastructure</key-skills>,
# <key-skills>large-scale recommendation system</key-skills>, The job is based in <location>San Jose, CA</location>. Its a <workmode>hybrid</workmode> role, with some flexibility with 
# <workmode>in-person</workmode> and <workmode>remote attendence</workmode>, prefernce will be given to <university>University of California</university> 
# candidates. The salary for this job ranges from <payscale>$126,000/yr - $221,760/yr</payscale>, <visa-sponsorship>Sponsorship for this role is case-by-case basis</visa-sponsorship>.
# Candidates with influential open-source projects or publications at <publication-requirement>top-tier AI conferences (e.g., NeurIPS, CVPR, ICML, ICLR, ICCV, and ACL) are preferred</publication-requirement>

In [None]:
# Qualifications

#  - Bachelor degree in computer science or a related major, with 3+ years of related work experience;
# - Strong software development experience with C++, Python or other programming language;
# - Experience in one or more of the following areas: applied machine learning, machine learning infrastructure, large-scale recommendation system, market-facing machine learning product;
# - Strong communication and teamwork skills;
# - Passion about techniques and solving challenging problems

In [14]:
# data_df[data_df["position_title"]=="Software Engineer"]["job_description"].iloc[1]

'description hughes private capital in business since  and one of the fastest growing companies in northern nevada is on a yearoveryear growth trajectory that requires increased staffing in all departments to keep pace with this growth and theres no letup in sight we are proud of our roots in the real estate investment industry we provide sound healthy and stable returns to our investors through the acquisition rehab and longterm hold of rental homes\njob summary\n\nthe software engineer is a critical role within the software engineering team that participates in the development of new technical systems enhances the existing application fixes bugs performs rd and is involved in communicating and coordinating with senior and junior engineers qa team members and other members of the software engineering team this individual may conduct code reviews as needed must act as an individual contributor within regularly scheduled sprint cycles and is expected to participate in software rollouts 

In [39]:
df["position_title"].value_counts()

Sales Associate                                     9
Retail Sales Associate                              9
Project Manager                                     7
Inventory Clerk                                     6
Systems Administrator                               6
                                                   ..
Senior Sales Marketing Specialist (Programmatic)    1
Sales and Marketing Coordinator                     1
Growth Marketing Specialist                         1
Marketing Coordinator                               1
Remote Inbound Customer Service Representative      1
Name: position_title, Length: 725, dtype: int64

In [90]:
# df["job_description"].values[0]

In [None]:
# [(“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,{“entities”:[(0,25,”degree”),(27,56,”school_name”),(58,66,”location”),(68,72,”date”)]}),
# (“Bachelor of Business, University of Western Sydney (2005) “,{“entities”:[(0,20,”degree”),(22,43,”school_name”),(44,50,”location”),(52,56,”date”)]}),
 
 

In [None]:
# [
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]}),
# (“Higher School Certificate, Parramatta Marist High School, Westmead (1998)”,
#  {“entities”:[(0,25,”degree”), (27,56,”school_name”),(58,66,”location”),
#               (68,72,”date”)]})
# ]



In [None]:
# # From Spacy's. documentation - "https://spacy.io/api/data-formats"

# # Training data for an entity recognizer (option 1)
# doc = nlp("Laura flew to Silicon Valley.")
# gold_dict = {"entities": ["U-PERS", "O", "O", "B-LOC", "L-LOC"]}
# example = Example.from_dict(doc, gold_dict)

# # Training data for an entity recognizer (option 2)
# doc = nlp("Laura flew to Silicon Valley.")
# gold_dict = {"entities": [(0, 5, "PERSON"), (14, 28, "LOC")]}
# example = Example.from_dict(doc, gold_dict)

In [56]:
# all_tags

['key-skills',
 'generic-skills',
 'degree',
 'university',
 'years-of-experience',
 'visa-sponsorship',
 'qualification',
 'job-profile',
 'location',
 'payscale',
 'publication-requirement',
 'workmode']

In [63]:
# output[32:48]

'bachelors degree'

In [68]:
# output

'"minimum qualifications\n<degree>bachelors degree</degree> or equivalent practical experience <years-of-experience>years of experience</years-of-experience> in <key-skills>saas</key-skills> or <key-skills>productivity tools business</key-skills>experience managing enterprise accounts with sales cycles\npreferred qualifications\n <years-of-experience>years of experience</years-of-experience> building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent <generic-skills>account management</generic-skills>, <generic-skills>writtenverbal communication</generic-skills>, <generic-skills>strategic and analyticalthinking skills</generic-skills>\nabout the j

In [77]:
# jd_text[20:150]

'ns\nbachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managi'

In [78]:
# output[171:175], output[204:231]

('saas', 'productivity tools business')

In [72]:
# output[114:133], output[349:368]

('years of experience', 'years of experience')

In [69]:
# find_word_indices(output, all_tags)

([(32, 48, 'degree'),
  (114, 133, 'years-of-experience'),
  (171, 175, 'key-skills'),
  (204, 231, 'key-skills'),
  (349, 368, 'years-of-experience'),
  (812, 830, 'generic-skills'),
  (865, 892, 'generic-skills'),
  (927, 966, 'generic-skills')],
 [(32, 48, 'bachelors degree'),
  (114, 133, 'years of experience'),
  (171, 175, 'saas'),
  (204, 231, 'productivity tools business'),
  (349, 368, 'years of experience'),
  (812, 830, 'account management'),
  (865, 892, 'writtenverbal communication'),
  (927, 966, 'strategic and analyticalthinking skills')])

In [58]:
# import re

# def find_word_indices(text, tags):
#     # Constructing a pattern to match any of the specified tags
#     tag_pattern = "|".join(tags)
# #     tag_pattern = "|".join([tag[1:-1] for tag in tags])  # Removing "<>" 
#     pattern = re.compile(f"<({tag_pattern})>(.*?)</\\1>")
    
#     results = []
#     exact_results=[]
#     for match in pattern.finditer(text):
#         # 'match.group(2)' contains the matched word/phrase.
#         # 'match.start(2)' and 'match.end(2)' give the start and end indices of the matched word/phrase.
#         exact_results.append((match.start(2), match.end(2), match.group(2)))# appends the word itself in last index
#         results.append((match.start(2), match.end(2), match.group(1)))# appends the word tag in last index
        
#     return results, exact_results

In [44]:
# abc= "<per>Laura</per> flew to <obj>Silicon</obj> <plc>Valley</plc>."#"Extract <obj>key</obj> statements of fact"

# print(find_word_indices(abc, ['obj', 'per', 'plc']))# Output with word tags

[(5, 10, 'per'), (30, 37, 'obj'), (49, 55, 'plc')]


In [36]:
# abc= "<per>Laura</per> flew to <obj>Silicon</obj> <plc>Valley</plc>."#"Extract <obj>key</obj> statements of fact"

# print(find_word_indices(abc, ['obj', 'per', 'plc']))# Output with words

# print(abc[5:10], abc[30:37], abc[49:55])

[(5, 10, 'Laura'), (30, 37, 'Silicon'), (49, 55, 'Valley')]
Laura Silicon Valley


In [None]:
# query = f'''Extract key statements of fact from the article and
#                  arrange them into a Python array of strings.  Each statement of fact
#                  needs to standalone and as unambiguous as possible.  Statements should be
#                  English pronoun-free.  Instead, explicitly use the subject(s) of every
#                  pronoun in each statement.  Ignore headings and titles.  
#                  Make sure each key statement of fact is unique in the final Python list.
#                 ### Article
#                 {text}
#                 ###
#                 '''
#         t = self.get_completion(query)
#         try:
#             l = ast.literal_eval(t)