## 1. Import packages

In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string
import plotly
import plotly.graph_objects as go
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import os
# List every file found under the data directory (recursively).
# NOTE(review): this walks 'dissertation/data' while the CSV below is read
# from '../project_data' — confirm both paths are intended.
for folder, _subdirs, files in os.walk('dissertation/data'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Load the job postings into a DataFrame.
df = pd.read_csv("../project_data/data-jobs-20221123.csv")

# Preview the first rows.
df.head()

Unnamed: 0,job_title,role,description
0,Identity and Access Management Administrator,administrator,Job Summary\nThe Identity and Access Managemen...
1,Systems Administrator (Hybrid Remote),administrator,Systems Administrator (Hybrid Remote)\n\nChris...
2,IT Administrator/Helpdesk,administrator,IT Administrator/Helpdesk\nIf you are a IT Adm...
3,IT Administrator/Helpdesk L2/L3,administrator,IT Administrator/Helpdesk L2/L3\nIf you are a ...
4,Identity Access Management AD Systems Administ...,administrator,Deloitte Global is the engine of the Deloitte ...


## 2. Clean data

In [6]:
# Tokens dropped during cleaning: NLTK's English stop words, every ASCII
# punctuation character, and a few extra words listed explicitly.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    'mantech', 'gsk', 'experience', 'work',
]

# function to tokenize data and remove stopwords
def process_narrative(narrative):
    """Tokenize a text and return its cleaned, lower-cased tokens.

    Args:
        narrative: The text to tokenize (passed to ``nltk.word_tokenize``).

    Returns:
        A list of lower-cased tokens, in their original order, excluding
        any token found in ``stopwords_list`` and any token that is not
        purely alphabetic (so numbers and punctuation are dropped).
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(narrative):
        lowered = raw_token.lower()
        # Skip stop words (compared in lower case).
        if lowered in stopwords_list:
            continue
        # Skip anything containing digits or punctuation.
        if not lowered.isalpha():
            continue
        kept_tokens.append(lowered)
    return kept_tokens


# function to concat words (used in function below)
def concat_words(list_of_words):
    """Join words into a single string using ' ' as the separator.

    Args:
        list_of_words: Iterable of strings to join, in order.

    Returns:
        The words joined with ' ', with leading and trailing whitespace
        stripped. An empty iterable yields ''.

    Raises:
        TypeError: If any element is not a string.
    """
    # str.join builds the result in one pass instead of repeated `+=`;
    # strip() keeps the existing contract of trimming both ends.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each description into a single space-separated string

# Single shared lemmatizer instance reused for every call below.
lemm = WordNetLemmatizer()


def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one string.

    Args:
        list_of_words: Iterable of word tokens. Floating-point NaN entries
            are skipped.

    Returns:
        The lemmatized words joined by ``concat_words`` into a single
        space-separated string.
    """
    # Test NaN by value, not identity: `x is not np.nan` only matches the
    # np.nan singleton and lets other NaN objects (float('nan'), NumPy
    # scalars) through to the lemmatizer.
    lemmatized_list = [
        lemm.lemmatize(word)
        for word in list_of_words
        if not (isinstance(word, (float, np.floating)) and np.isnan(word))
    ]

    # make the list into a single string with the words separated by ' '
    return concat_words(lemmatized_list)


In [11]:
# Clean every job description: tokenize, drop stop words, lemmatize and
# re-join into one string per row.
cleaned_descriptions = []
for row_number, raw_description in enumerate(df['description']):
    tokens = process_narrative(raw_description)
    cleaned_descriptions.append(make_lemma_and_concat(tokens))
    # Progress report every 50 rows.
    if row_number % 50 == 0:
        print(f'Finished line number {row_number}')

# Assign the whole column at once. The previous chained form
# `df['description'].loc[i] = ...` raises SettingWithCopyWarning and does
# not update `df` under pandas Copy-on-Write; iterating by position also
# removes the assumption that the index is 0..n-1.
df['description'] = cleaned_descriptions
df.head()

Finished line number 0
Finished line number 50
Finished line number 100
Finished line number 150
Finished line number 200
Finished line number 250
Finished line number 300
Finished line number 350
Finished line number 400
Finished line number 450


Unnamed: 0,job_title,role,description
0,Identity and Access Management Administrator,administrator,job summary identity access management adminis...
1,Systems Administrator (Hybrid Remote),administrator,system administrator hybrid remote christar in...
2,IT Administrator/Helpdesk,administrator,administrator least two year please read based...
3,IT Administrator/Helpdesk L2/L3,administrator,administrator least two year please read based...
4,Identity Access Management AD Systems Administ...,administrator,deloitte global engine deloitte network profes...


In [12]:
# Save the cleaned descriptions for later steps.
# NOTE(review): the index is written as an extra column here, and df already
# carries an 'Unnamed: 0' column from the input CSV — confirm whether
# index=False is wanted before changing the output layout.
df.to_csv('../project_data/descriptions_processed.csv')