In [5]:
import json

In [6]:
import re

In [7]:
# https://huggingface.co/premrawat/en_model_ner_skills

# !pip install pdfminer.six
# !pip install docx2txt
# !pip install spacy==3.2.4

from spacy.matcher import Matcher
from collections import defaultdict
import docx2txt, spacy

# input_file = 'Resume.docx'

def extract_text_from_pdf(pdf_path):
    """Return the text extracted from the PDF located at ``pdf_path``.

    The path is handed straight to ``extract_text`` and its result is
    returned unchanged.
    """
    # NOTE(review): ``extract_text`` is not imported in the visible cells;
    # presumably it is pdfminer.high_level.extract_text (see the pip hint
    # above) -- confirm and add the import, otherwise this raises NameError.
    pdf_text = extract_text(pdf_path)
    return pdf_text

def extract_text_from_docx(docx_path):
    """Return the text of the .docx file at ``docx_path`` with tabs replaced by spaces.

    Returns ``None`` when ``docx2txt.process`` yields nothing (a falsy value).
    """
    raw_text = docx2txt.process(docx_path)
    if not raw_text:
        return None
    # Tabs are flattened to single spaces; every other character is kept.
    return raw_text.replace('\t', ' ')

def preprocessMatchOutput(indexes):
    """Collapse overlapping ``(start, end)`` pairs into their widest spans.

    Two reductions are applied in sequence:

    1. for every distinct ``start`` keep only the largest ``end``;
    2. among the surviving pairs, for every distinct ``end`` keep only the
       smallest ``start``.

    Args:
        indexes: iterable of ``(start, end)`` pairs.

    Returns:
        A list of ``(start, end)`` tuples, ordered by the first appearance of
        each ``end`` after step 1.
    """
    # Step 1: widest end for each start (insertion order = first time a start is seen).
    furthest_end = {}
    for start, end in indexes:
        if start not in furthest_end or end > furthest_end[start]:
            furthest_end[start] = end

    # Step 2: earliest start for each end (insertion order = first time an end is seen).
    earliest_start = {}
    for start, end in furthest_end.items():
        if end not in earliest_start or start < earliest_start[end]:
            earliest_start[end] = start

    return [(start, end) for end, start in earliest_start.items()]

# Pipeline used for skill extraction (the model referenced at the top of this cell).
nlp = spacy.load("en_model_ner_skills")
# One token pattern: a run of one or more tokens whose entity type is SKILL.
patterns = [
    [{'ENT_TYPE': 'SKILL', 'OP': '+'}]
]
    
# Matcher built on the pipeline's vocabulary, with the pattern registered as "skills".
matcher = Matcher(nlp.vocab)
matcher.add("skills", patterns)

def extract_skills(output):
    """Return the skill phrases found in the text ``output`` as a list of strings.

    The text is run through ``nlp``, ``matcher`` is applied to the resulting
    doc, and the match offsets are merged by ``preprocessMatchOutput``. The
    text of each merged span is returned; repeated phrases are not
    de-duplicated, so the list can be used for frequency counts.
    """
    doc = nlp(output)

    # Each match is (match_id, start, end); only the token offsets are needed.
    raw_offsets = [hit[1:] for hit in matcher(doc)]
    merged_offsets = preprocessMatchOutput(raw_offsets)

    return [doc[start:end].text for (start, end) in merged_offsets]

In [8]:
def get_int(x):
    """Return every digit found in the string ``x``, concatenated in order.

    The result is a string (empty when ``x`` contains no digits), not an int.
    """
    digit_runs = (found.group() for found in re.finditer(r'\d+', x))
    return ''.join(digit_runs)
         

In [9]:
# Sentence tokenization with spaCy

In [10]:
import spacy

# doc = nlp("This is a sentence. This is another sentence.")
# print(doc.has_annotation("SENT_START"))

# for sent in doc.sents:
#     print(sent.text)
# Second pipeline; H1B_flag iterates over the sentences (doc.sents) it produces.
nlp_1 = spacy.load("en_core_web_sm")

def H1B_flag(x):
    """Return 0 if the text ``x`` says sponsorship is not offered, otherwise 1.

    The text is split into sentences with ``nlp_1``. A sentence counts as a
    denial when, after upper-casing, it contains the substring "SPONSOR"
    (which also covers SPONSORS / SPONSORSHIP) together with the word NOT or
    CANNOT.

    The negation is matched on word boundaries: a plain substring test for
    "NOT" also fires on unrelated words such as NOTIFY, NOTE or ANOTHER and
    would wrongly clear the flag.

    NOTE(review): other negations ("unable to sponsor", "no sponsorship",
    "won't sponsor") are not detected, as before -- confirm whether they
    should be.

    Args:
        x: job-description text.

    Returns:
        int: 0 when a denial sentence is found, else 1.
    """
    negation = re.compile(r"\b(?:CAN)?NOT\b")
    for sent in nlp_1(x).sents:
        sentence = sent.text.upper()
        if "SPONSOR" in sentence and negation.search(sentence):
            # One denial is enough; later sentences cannot change the result.
            return 0
    return 1
    
    

In [11]:
def _parse_education(jd):
    """Return the degree labels mentioned in the lower-cased description ``jd``.

    Labels are returned in a fixed order (Bachelor, Master, PhD) so the output
    is reproducible; all three are returned when none is mentioned.
    """
    degree_markers = (
        ("Bachelor Degree", ("bachelor's", "bachelor’s", "bachelors’")),
        ("Master Degree", ("master's", "master’s", "masters’")),
        ("PhD Degree", ("phd",)),
    )
    # ``in`` also catches a keyword at position 0, which ``find(...) > 0`` missed.
    found = [label for label, markers in degree_markers
             if any(marker in jd for marker in markers)]
    return found or [label for label, _ in degree_markers]


def _parse_experience_years(jd):
    """Return ``(min, max)`` over every "<N>+ years" figure in ``jd``; ``(1, 1)`` if none."""
    # ``\d+`` keeps multi-digit figures intact ("10+ years" is 10, not 0).
    years = [int(number) for number in re.findall(r"(\d+)\+ years", jd)]
    if not years:
        return 1, 1
    return min(years), max(years)


def _parse_employee_count(overview):
    """Return the raw text between the first "-time" and the first "+ employees".

    Falls back to the string '10000' when either marker is absent.
    """
    time_marker = re.search(r"-time", overview)
    employees_marker = re.search(r"\+ employees", overview)
    if time_marker is None or employees_marker is None:
        return '10000'
    return overview[time_marker.end():employees_marker.start()]


def _parse_company_type(overview):
    """Return the letters-and-spaces text between "+ employees" and the next "<digit>? school".

    Falls back to 'BFSI' when either marker is absent.
    """
    employees_marker = re.search(r"\+ employees", overview)
    if employees_marker is None:
        return 'BFSI'
    remainder = overview[employees_marker.end():]
    school_marker = re.search(r"\d. school", remainder)
    if school_marker is None:
        return 'BFSI'
    return re.sub('[^a-zA-Z ]+', '', remainder[:school_marker.start()]).strip()


def get_details(job_data):
    """Build a flat record of job attributes from one scraped job posting.

    Args:
        job_data: mapping with the keys 'Company', 'Job_family',
            'job_location', 'Link', 'Title', 'company_overview', 'job_key'
            and 'JD' (the job-description text).

    Returns:
        dict with the keys Company_name, Job_family, job_location, City_name,
        State, Link, Title, Job_type, Unique_key, skills, Education, Work_max,
        Work_min, Employee_Count, company_type, H1B_Flg and Job_Description,
        inserted in that order.

    Raises:
        KeyError: if one of the required keys is missing from ``job_data``.
    """
    Company_name = job_data['Company']
    Job_family = job_data['Job_family']
    job_location = job_data['job_location']

    # "City, ST" -> ("City", "ST"). A location without a comma yields an empty
    # state instead of raising, and surrounding whitespace is dropped.
    location_parts = [part.strip() for part in job_location.split(",")]
    City_name = location_parts[0]
    State = location_parts[1] if len(location_parts) > 1 else ''

    Link = job_data['Link']
    Title = job_data['Title']
    overview = job_data['company_overview']
    # Anything not explicitly marked "Full-time" is reported as part time.
    Job_type = "Full Time" if "Full-time" in overview else "Part Time"
    Unique_key = job_data['job_key']

    description = job_data['JD']
    skills = extract_skills(description)
    jd = description.lower()

    Education = _parse_education(jd)
    Work_min, Work_max = _parse_experience_years(jd)
    Employee_Count = _parse_employee_count(overview)
    company_type = _parse_company_type(overview)

    return {
        'Company_name': Company_name,
        'Job_family': Job_family,
        'job_location': job_location,
        'City_name': City_name,
        'State': State,
        'Link': Link,
        'Title': Title,
        'Job_type': Job_type,
        'Unique_key': Unique_key,
        'skills': skills,
        'Education': Education,
        'Work_max': Work_max,
        'Work_min': Work_min,
        'Employee_Count': Employee_Count,
        'company_type': company_type,
        'H1B_Flg': H1B_flag(description),
        'Job_Description': description,
    }
    
    

In [12]:
# Each line of the input is parsed on its own with json.loads further down,
# i.e. the file is treated as JSON Lines. JSON text is UTF-8, so decode it
# explicitly rather than with the platform's default encoding.
with open("consolidated_jds.json", 'r', encoding="utf-8") as fin:
    lines = fin.readlines()

In [13]:
# job_level_list = []
# count = 0
# #file = open('result1_skill_updated.jsonl', 'w')

# for i in lines[:]:
    
#     job_data = json.loads(i)
#     data = H1B_flag(job_data['JD'])
#     job_level_list.append(data)
   

In [14]:
job_level_list = []  # successfully parsed job records
count = 0            # input lines that could not be parsed or written

# The context manager flushes and closes the output even if the loop is
# interrupted. UTF-8 is explicit because ensure_ascii=False writes non-ASCII
# characters verbatim.
with open('result1_skill_updated.jsonl', 'w', encoding='utf-8') as file:
    for i in lines:
        if not i.strip():
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        try:
            # Parsing sits inside the try so one malformed line is counted
            # instead of aborting the whole run.
            job_data = json.loads(i)
            data = get_details(job_data)
            job_level_list.append(data)
            line = json.dumps(data, ensure_ascii=False) + "\n"
            file.write(line)
        except Exception:
            # Best-effort processing: skip the record but keep a tally.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
            count += 1
