In [2]:
import pandas as pd
from flashtext import KeywordProcessor
import spacy
from spacy.matcher import PhraseMatcher
import sys
import re

#progress bar packages
from tqdm import tqdm
#ngram package
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
import nltk

pd.set_option('display.max_columns', None)

#en is the large model
nlp = spacy.load("en_core_web_lg")

In [4]:
df_sentences = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/sentences/sentences_emscad.csv")
df_softskills = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Skills/softskills.csv")
df_hardskills = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Skills/emsi_skills.csv")

df_hardskills = df_hardskills[df_hardskills['type']== 'Hard Skill']
df_hardskills = pd.DataFrame(df_hardskills['name'])

#soft_skill,soft_skill_lemmatized,soft_skill_no_stopwords
columnname = 'soft_skill_lemmatized'
df_softskills = df_softskills[[columnname]]

#Renameing the skill column names and merging afterwards
df_softskills = df_softskills.rename(columns={columnname: 'skill'})
df_hardskills = df_hardskills.rename(columns={'name': 'skill'})
df_skills = pd.concat([df_softskills, df_hardskills])
df_skills = df_skills.drop_duplicates()
df_skills

#Creating all the possible grams for each sentence
allgrams = []

#This tokenizer immediately removes punctuation and special characters from the sentence
tokenizer = RegexpTokenizer(r'\w+')

#,sentence,sentence_lemmatized,sentence_no_stopwords

for sentence in tqdm(df_sentences['sentence']):
    tokenizedsentence = tokenizer.tokenize(str(sentence))
    
    #getting up to four grams for each sentence
    for n in range(1,5):
        grams = ngrams(tokenizedsentence,n)
        for gram in grams:
            allgrams.append(str(gram))

allgrams = pd.DataFrame(allgrams)
allgrams = allgrams.rename(columns={0:'allgrams'})
print(allgrams.shape[0])

#Initializing the keyword processor
keyword_processor = KeywordProcessor(case_sensitive=False)

#Adding all the skills to the processor
for skill in tqdm(df_skills):
    keyword_processor.add_keyword(skill)

def searcher(row):
    #check if the words are in the row and return a True or False instead of the actual word
    boolean = bool(keyword_processor.extract_keywords(row))
    return boolean

tqdm.pandas()
allgrams['contains_skill'] = allgrams['allgrams'].progress_apply(searcher)

#Only selecting the ngrams which contain a skill
allgrams = allgrams[allgrams.contains_skill == True]
#cleaning up
allgrams['allgrams'] = allgrams['allgrams'].astype(str)
allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
allgrams['allgrams'] = allgrams['allgrams'].str.strip()
allgrams = allgrams.drop(columns=['contains_skill'])

#Some softskills match multiple times, therefore removing the duplicates
allgrams = allgrams.drop_duplicates().reset_index(drop=True)

#removing any nans
allgrams.dropna(subset=['allgrams'],inplace=True)

import csv
allgrams.to_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/n-grams.csv", quoting=csv.QUOTE_NONNUMERIC, index=False)

100%|██████████| 302284/302284 [00:25<00:00, 11793.78it/s]
100%|██████████| 2/2 [00:00<00:00, 8924.05it/s]
  0%|          | 0/16426866 [00:00<?, ?it/s]

16426866


100%|██████████| 16426866/16426866 [02:12<00:00, 124003.60it/s]
  allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
