In [1]:
import pandas as pd
from flashtext import KeywordProcessor
import spacy
import sys
from tqdm import tqdm
import csv
import re
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', None)

nlp = spacy.load("en_core_web_lg")

In [None]:
#Function to retrieve the dependencies of the nouns
def dep_tagger(text):
    doc = nlp(str(text))
    text = ' '.join([token.dep_ for token in doc])
    return text

#Function to retrieve the part of speech
def pos_tagger(text):
    doc = nlp(str(text))
    text = ' '.join([token.pos_ for token in doc])
    return text

#Function to remove stopwords
def stop_word_remover(text):
    doc = nlp(str(text))
    text = ' '.join([token.text for token in doc if token.is_stop == False])
    return text

#Function to lemmatize
def lemmatizer(text):
    doc = nlp(str(text))
    text = ' '.join([token.lemma_ if token.lemma_ != '-PRON-' else token.text for token in doc])
    return text

In [None]:
df = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Job vacancies/emscad_v1 2.csv")
print(df.shape[0])

#Only selecting real job vacancies
df = df[df.fraudulent=='f']
print(df.shape[0])

#Description and requirements are the columns that im interested in
df = df[['description', 'requirements']]

In [None]:
#Removing the html tags in the columns.
rows = []
for t in tqdm(df['description']):
    soup = BeautifulSoup(t,"lxml")
    rows.append(soup.get_text())
df['description'] = rows

rows = []
#Some columns contain nan therefore appending no text
for t in tqdm(df['requirements']):
    if str(t) == 'nan':
        rows.append('')
    else:
        soup = BeautifulSoup(t,"lxml")
        rows.append(soup.get_text())
df['requirements'] = rows

In [None]:
#Concatting these into one column to create a more realistic situation because the skills are already extracted
#in the requirements column
df['job_description'] = df['description'] + ' ' + df['requirements']

In [None]:
#Removing the anonymized replacements in the text, for example (#URL_86fd830a95a64e2b30ceed829e63fd384c289e4f01e3c93608b42a84f6e662dd)
#See the emscad paper for more details
df['job_description'] = df['job_description'].str.replace('#[\w-]+#', ' ',regex=True)

In [None]:
# Lowercasing, removing newline symbols, stripping leading and ending whitespaces
df['job_description_clean'] = df['job_description'].str.lower()
df['job_description_clean'] = df['job_description_clean'].str.replace(r'\n\n', ' ')
df['job_description_clean'] = df['job_description_clean'].str.replace(r'\n', '. ')
df['job_description_clean'] = df['job_description_clean'].str.strip()


In [None]:
#Splitting the jobvacancies into sentences
jobpostingcomplete = []
sentences = []
    
for index in tqdm(range(0,len(df))):
    doc = nlp(str(df['job_description_clean'].iloc[index]))
    for sentence in doc.sents:
        jobpostingcomplete.append(' '.join([token.text for token in doc]))
        sentences.append(sentence.string.strip())

#creating the df
sentencesdf = pd.DataFrame(zip(jobpostingcomplete,sentences), 
                  columns=['jobposting', 'sentence'])
len(sentencesdf)

In [16]:
#Removing special characters from the sentences, removing double whitespace, and trailing whitespaces
sentencesdf['sentence'] = sentencesdf['sentence'].str.replace(r'[^A-Za-z0-9 ]+', ' ')
sentencesdf['sentence'] = sentencesdf['sentence'].str.replace(r'  ', ' ')
sentencesdf['sentence'] = sentencesdf['sentence'].str.strip()


  sentencesdf['sentence'] = sentencesdf['sentence'].str.replace(r'[^A-Za-z0-9 ]+', ' ')


In [21]:
#Dropping any nan sentences
print('Before removing NaN ', sentencesdf.shape[0])
sentencesdf = sentencesdf.dropna()
print('After removing NaN ', sentencesdf.shape[0])

#Dropping any sentence which contain empty string value
print('Before removing empty strings ', sentencesdf.shape[0])
sentencesdf = sentencesdf[sentencesdf['sentence'] != '']
print('After removing empty strings ', sentencesdf.shape[0])



Before removing NaN  302284
After removing NaN  302284


In [12]:
#tqdm.pandas() is used for a progress bar on the pandas apply function
tqdm.pandas()

sentencesdf['sentence_lemmatized'] = sentencesdf['sentence'].progress_apply(lemmatizer)
sentencesdf['sentence_no_stopwords'] = sentencesdf['sentence'].progress_apply(stop_word_remover)

100%|██████████| 302284/302284 [36:48<00:00, 136.88it/s]
100%|██████████| 302284/302284 [35:52<00:00, 140.46it/s]


In [None]:
sentencesdf = sentencesdf.astype(str)
#Quoting is necessary because the number of rows changes after writing
sentencesdf.to_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/sentences/sentences_emscad.csv",
                   index=False, 
                   quoting=csv.QUOTE_NONNUMERIC
                  )