In [33]:
import pandas as pd
import spacy
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS

from flashtext import KeywordProcessor
from spacy.matcher import PhraseMatcher
import sys

#progress bar packages
from tqdm import tqdm
#ngram package
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
import nltk
nlp = spacy.load("en_core_web_lg")

In [34]:
df = pd.read_csv("indeed_jd.csv")
df.head()

Unnamed: 0,Job Type,Job_Title,Description
0,Data engineer,Data Engineer,"Responsibilities\nBuild, maintain the data pip..."
1,Data engineer,Data Engineer,What to Expect\nTesla Electronics Test and Aut...
2,Data engineer,Production Support Engineer - Data Engineering,Bonhill Partners Singapore\nPosted 5 days ago ...
3,Data engineer,Data Engineer,Beyond Limits is a pioneering Artificial Intel...
4,Data engineer,Data Engineer,Trusted carbon credits. Real impact.\nClimate ...


In [36]:
df.dtypes

Job Type       object
Job_Title      object
Description    object
dtype: object

In [3]:
#preprocessing
def preprocess_jd(text):
    
    # Lowercase the text
    text = text.lower()
    #remove stop words
    text = text = ' '.join([word for word in text.split() if word.lower() not in STOP_WORDS])
    # Check if any of the patterns .\n, \n\n, \n is present and replace accordingly
    if '.\n' in text:
        text = text.replace('.\n', '. ')
    elif '\n\n' in text:
        text = text.replace('\n\n', ' ')
    elif '\n' in text:
        text = text.replace('\n', '. ')
    # Process the text using SpaCy
    doc = nlp(text)

    # Extract sentences from the processed text and remove trailing full stops
    sentences = [sent.text.strip('. ') for sent in doc.sents]
    # remove special chracters except those that useually used in hard skill
    special_char = ['#', '+']
    sentences = [re.sub(r'[^a-zA-Z0-9{}]+'.format(re.escape(' '.join(special_char))), ' ', sent) for sent in sentences]
    #remove double spacing
    sentences = [re.sub(r'\s+', ' ', sent) for sent in sentences]
    
    return sentences

In [4]:
df["clean_jd"] = df["Description"].apply(preprocess_jd)
df

Unnamed: 0,Job Type,Job_Title,Description,clean_jd
0,Data engineer,Data Engineer,"Responsibilities\nBuild, maintain the data pip...",[responsibilities build maintain data pipeline...
1,Data engineer,Data Engineer,What to Expect\nTesla Electronics Test and Aut...,[expect tesla electronics test automation grou...
2,Data engineer,Production Support Engineer - Data Engineering,Bonhill Partners Singapore\nPosted 5 days ago ...,[bonhill partners singapore posted 5 days ago ...
3,Data engineer,Data Engineer,Beyond Limits is a pioneering Artificial Intel...,[limits pioneering artificial intelligence eng...
4,Data engineer,Data Engineer,Trusted carbon credits. Real impact.\nClimate ...,"[trusted carbon credits, real impact, climate ..."
...,...,...,...,...
1023,Machine learning,Software Engineer Intern,About us\nFairmart is an early-stage community...,[fairmart early stage community commerce compa...
1024,Machine learning,AI Developer,Responsibilities:\nWe are working on exciting ...,[responsibilities working exciting new ai powe...
1025,Machine learning,"Biostatistician, Population Health",The Singapore Eye Research Institute (SERI) se...,[singapore eye research institute seri serves ...
1026,Machine learning,Senior/ Research Scientist (High Performance C...,What the role is\n\nThe Centre for Climate Res...,[role centre climate research singapore ccrs d...


In [5]:
df1 = pd.DataFrame({
    'jd': df['Description'].repeat(df['clean_jd'].apply(len)),
    'sentence': [sent for sublist in df['clean_jd'] for sent in sublist]
})
df1

Unnamed: 0,jd,sentence
0,"Responsibilities\nBuild, maintain the data pip...",responsibilities build maintain data pipelines...
0,"Responsibilities\nBuild, maintain the data pip...",data warehouse models sync data source changes...
0,"Responsibilities\nBuild, maintain the data pip...",existing data setup keeping abreast changes ad...
0,"Responsibilities\nBuild, maintain the data pip...",identify root cause data inconsistencies proce...
0,"Responsibilities\nBuild, maintain the data pip...",care deployments code repository maintenance n...
...,...,...
1027,Description\nDialogue summarization is the tas...,ra position aims evaluate recent efforts takin...
1027,Description\nDialogue summarization is the tas...,preferred bachelor s master s degree computer ...
1027,Description\nDialogue summarization is the tas...,ijcai 2022 liu chen controllable neural dialog...
1027,Description\nDialogue summarization is the tas...,emnlp ijcnlp 2019 xiyan fu yating zhang tianyi...


In [6]:
emsi = pd.read_csv("all_emsi_skills.csv")
emsi.head()

Unnamed: 0,type,id,name
0,Certification,KS120P86XDXZJT3B7KVJ,(American Society For Quality) ASQ Certified
1,Hard Skill,KS126XS6CQCFGC3NG79X,.NET Assemblies
2,Hard Skill,KS1200B62W5ZF38RJ7TD,.NET Framework
3,Hard Skill,KS126XW78QJCF4TRV2X7,.NET Framework 1
4,Hard Skill,KS126XY68BNKXSBSLPYS,.NET Framework 3


In [7]:
emsi_hard = emsi.loc[emsi['type'] == "Hard Skill"]
emsi_hard

Unnamed: 0,type,id,name
1,Hard Skill,KS126XS6CQCFGC3NG79X,.NET Assemblies
2,Hard Skill,KS1200B62W5ZF38RJ7TD,.NET Framework
3,Hard Skill,KS126XW78QJCF4TRV2X7,.NET Framework 1
4,Hard Skill,KS126XY68BNKXSBSLPYS,.NET Framework 3
5,Hard Skill,KS126XR63RKYVCKYDNBN,.NET Framework 4
...,...,...,...
29072,Hard Skill,KS125T770BQ1N7T35WW0,cURL
29073,Hard Skill,ESB66C176361E21016FE,eClinicalWorks (ECW)
29074,Hard Skill,KS123KB73N41KLG2QH44,eNodeB (LTE Technology)
29075,Hard Skill,KSEPVMUZUEDHVWQBO8UN,iBeacon Protocol


In [8]:
def extract_and_remove_bracketed_words(text_list):
    cleaned_list = []
    bracketed_words = set() 

    for text in text_list:
        # Define a regular expression pattern to match words in brackets
        pattern = r'[\(\[]([^()\[\]]+)[\)\]]'

        # Find all bracketed words in the text
        found_words = re.findall(pattern, text)

        # Remove the bracketed words from the original text
        cleaned_text = re.sub(pattern, '', text)

        # Append the cleaned text and extracted bracketed words to lists
        cleaned_list.append(cleaned_text.strip())

        # Append non-empty bracketed words to the list
        if found_words:
            bracketed_words.update(found_words)

    return cleaned_list+list(bracketed_words)

skill_list = extract_and_remove_bracketed_words(emsi_hard["name"].values)
skill_list

['.NET Assemblies',
 '.NET Framework',
 '.NET Framework 1',
 '.NET Framework 3',
 '.NET Framework 4',
 '.NET Reflector',
 '.NET Remoting',
 '.nettiers',
 '10 Gigabit Ethernet',
 '1010data',
 '10BASE-F',
 '10BASE-FL',
 '10BASE2',
 '10BASE5',
 '10G-PON',
 '123RF',
 '128bit',
 '12factor',
 '2020 Design Software',
 '2D Animation',
 '2D Computer Graphics',
 '2D Gel Analysis Software',
 '2checkout',
 '3.5G',
 '35 Mm Films',
 '389 Directory Server',
 '3CX Phone Systems',
 '3D Art',
 '3D Camcorder',
 '3D Computer Graphics',
 '3D Computer Graphics Software',
 '3D Conformal Radiotherapy',
 '3D Graphic Design',
 '3D Modeling',
 '3D Modeling Softwares',
 '3D Printing',
 '3D Projection',
 '3D Reconstruction',
 '3D Rendering',
 '3D Scanning',
 '3D Visualization',
 '3DML',
 '3DMark',
 '3DSlicer',
 '3Delight',
 '3Dvia Composer',
 '3GP',
 '3GPP',
 '3GPP2',
 '3d Engine',
 '3d Secure',
 '3d Solid And Surface Modeling',
 '4D Modelling',
 '4DOS',
 '4Sight',
 '4d Database',
 '50% Tissue Culture Infective Do

In [9]:
def preprocess_skills(value_list):
    # Define the pattern to match special characters
    special_char = ['#', '+']
    cleaned_values = [re.sub(r'[^a-zA-Z0-9{}]+'.format(re.escape(' '.join(special_char))), ' ', value) for value in value_list]
    return cleaned_values

skill_list = preprocess_skills(skill_list)


In [10]:
#Creating all the possible grams for each sentence
allgrams = []

#This tokenizer immediately removes punctuation and special characters from the sentence
tokenizer = RegexpTokenizer(r'\w+')

#,sentence,sentence_lemmatized,sentence_no_stopwords

for sentence in tqdm(df1['sentence']):
    tokenizedsentence = tokenizer.tokenize(str(sentence))
    
    #Getting all possible n grams of the sentence
    #for n in range(1,len(tokenizedsentence)+1):
    
    #getting up to four grams for each sentence
    for n in range(1,5):
        grams = ngrams(tokenizedsentence,n)
        for gram in grams:
            allgrams.append(str(gram))

allgrams = pd.DataFrame(allgrams)
allgrams = allgrams.rename(columns={0:'allgrams'})
print(allgrams.shape[0])

100%|██████████████████████████████████████████████████████████████████████████| 17602/17602 [00:03<00:00, 5079.47it/s]


1041064


In [11]:
#Initializing the keyword processor
keyword_processor = KeywordProcessor(case_sensitive=False)

#Adding all the skills to the processor
for skill in tqdm(skill_list):
    keyword_processor.add_keyword(skill)

100%|█████████████████████████████████████████████████████████████████████████| 28441/28441 [00:00<00:00, 29190.39it/s]


In [12]:
def searcher(row):
    #check if the words are in the row and return a True or False instead of the actual word
    boolean = bool(keyword_processor.extract_keywords(row))
    return boolean

tqdm.pandas()
allgrams['contains_skill'] = allgrams['allgrams'].progress_apply(searcher)

#Only selecting the ngrams which contain a skill
allgrams = allgrams[allgrams.contains_skill == True]

100%|█████████████████████████████████████████████████████████████████████| 1041064/1041064 [00:33<00:00, 30673.28it/s]


In [13]:
allgrams = allgrams[allgrams.contains_skill == True]
allgrams = allgrams.drop(columns=['contains_skill'])
allgrams.dropna(subset=['allgrams'],inplace=True)

In [14]:
allgrams['allgrams'] = allgrams['allgrams'].astype(str)
allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
allgrams['allgrams'] = allgrams['allgrams'].str.strip()
allgrams

  allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')


Unnamed: 0,allgrams
12,source
31,export source
32,source systems
49,import export source
50,export source systems
...,...
1040798,nlp good programming skills
1040799,good programming skills python
1040800,programming skills python references
1040801,skills python references xiachong


In [15]:
matcher1 = PhraseMatcher(nlp.vocab)
#Adding all the trigrams to the matcher
for gram in tqdm(allgrams['allgrams']):
    matcher1.add("NGRAM", [nlp(gram)])

100%|██████████████████████████████████████████████████████████████████████████| 169575/169575 [29:02<00:00, 97.32it/s]


In [16]:
leftcontextstorage = []
recoveredgramstorage = []
rightcontextstorage = []

for sentence in tqdm(df1['sentence']):

    #Intiatilizing the sentence and searching for matches
    doc = nlp(sentence)
    matches = matcher1(doc)

    for match_id, start, end in matches:
        
        #Getting up to 5 contextwords next to the grams
        for contextnumbers in range(1,6,1):

            #Defining the number of context words
            leftcontextnumbers = contextnumbers
            rightcontextnumbers = contextnumbers

            #Checking if there is any context to the left or right of the gram, if not skipping the gram
            if start == 0 or end == len(doc):
                continue

            #Checking if there is enough context to the left of the gram
            if start < leftcontextnumbers:
                leftcontextnumbers = start
            #Checking if there is enough context to the right of the gram
            elif end+rightcontextnumbers > len(doc):
                rightcontextnumbers = len(doc)-end

            # Get the left context of the gram
            leftcontext = doc[start-leftcontextnumbers:start]
            # Get the recovered gram by slicing the Doc 
            recoveredgram = doc[start:end]
            # Get the right context of the gram
            rightcontext = doc[end:(end+rightcontextnumbers)]


            leftcontextstorage.append(leftcontext.text)
            recoveredgramstorage.append(recoveredgram.text)
            rightcontextstorage.append(rightcontext.text)

100%|████████████████████████████████████████████████████████████████████████████| 17602/17602 [03:33<00:00, 82.36it/s]


In [17]:
df2 = pd.DataFrame(zip(leftcontextstorage,recoveredgramstorage,rightcontextstorage),
                 columns=['left_context', 'candidate_skill', 'right_context'])
df2


Unnamed: 0,left_context,candidate_skill,right_context
0,data,flow import export source,systems
1,end data,flow import export source,systems data
2,end end data,flow import export source,systems data warehouse
3,facilitate end end data,flow import export source,systems data warehouse reports
4,pipelines facilitate end end data,flow import export source,systems data warehouse reports models
...,...,...,...
751330,skills,python references xiachong feng,xiaocheng
751331,programming skills,python references xiachong feng,xiaocheng feng
751332,good programming skills,python references xiachong feng,xiaocheng feng bing
751333,nlp good programming skills,python references xiachong feng,xiaocheng feng bing qin


In [18]:
df2['concatenated'] = df2['left_context'] + ' | ' + df2['candidate_skill'] + ' | ' + df2['right_context']
df2['label'] = ""
df2

Unnamed: 0,left_context,candidate_skill,right_context,concatenated,label
0,data,flow import export source,systems,data | flow import export source | systems,
1,end data,flow import export source,systems data,end data | flow import export source | systems...,
2,end end data,flow import export source,systems data warehouse,end end data | flow import export source | sys...,
3,facilitate end end data,flow import export source,systems data warehouse reports,facilitate end end data | flow import export s...,
4,pipelines facilitate end end data,flow import export source,systems data warehouse reports models,pipelines facilitate end end data | flow impor...,
...,...,...,...,...,...
751330,skills,python references xiachong feng,xiaocheng,skills | python references xiachong feng | xia...,
751331,programming skills,python references xiachong feng,xiaocheng feng,programming skills | python references xiachon...,
751332,good programming skills,python references xiachong feng,xiaocheng feng bing,good programming skills | python references xi...,
751333,nlp good programming skills,python references xiachong feng,xiaocheng feng bing qin,nlp good programming skills | python reference...,


In [19]:
final_df = df2.sample(n=2000,random_state=200)
final_df

Unnamed: 0,left_context,candidate_skill,right_context,concatenated,label
168421,answer specific,business questions identifying,calling data,answer specific | business questions identifyi...,
106851,docker nginx,sql,nosql databases,docker nginx | sql | nosql databases,
108999,etl software informatica oracle pl,sql scripting,sql scripting unix scripting,etl software informatica oracle pl | sql scrip...,
744355,cv,standard chartered we re,international,cv | standard chartered we re | international,
423826,data science,coe leverage huge,volumes structured,data science | coe leverage huge | volumes str...,
...,...,...,...,...,...
581231,data ai,analytics initiatives,collaboration divisions,data ai | analytics initiatives | collaboratio...,
584628,data architects biostatisticians advance,quality information derived existing,clinical databases,data architects biostatisticians advance | qua...,
292184,analyze data knowledge create apply,accurate algorithms,datasets ability work effectively teams,analyze data knowledge create apply | accurate...,
275398,designation,finance data analyst,responsibilities extract clean data,designation | finance data analyst | responsib...,


In [20]:
final_df.to_csv("final2000.csv",
          index=False)
df2.to_csv("final_full.csv",
          index=False)