## 1. Import Data

In [218]:
import os
import nltk
import re
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import Word
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from nltk.stem import WordNetLemmatizer

with open('../project_data/specialist_text.txt') as f:
    data = f.read()


## 2. Clean Data

In [219]:
tokens = nltk.word_tokenize(data)
wordnet_lemmatizer = WordNetLemmatizer()

print ("Uncleaned words = ", len(tokens))

## Remove Stop Words
stop = stopwords.words('english')

new_stop = ['and','experience','show','veteran','less','origin','sexual', 'orientation', 'dental', 'insurance', 'hour', 'shift','religion','sex','receive','consideration','pay', 'per','employment','opportunity','consideration', 'employment','job','description','start', 'job','click','gender','benefits', 'k','monday', 'friday','age', 'disability','please', 'visit','salary', 'range','characteristic', 'protected','minimum', 'qualifications','join','reasonable', 'accommodation', 'parental', 'leave', 'medical', 'vision', 'duties', 'responsibilities','business', 'needs','essential', 'functions','color','type', 'fulltime','verbal', 'communication','apply','work','national', 'status','closely','flexible', 'spending','’','ability', 'work','location', 'remote','capital', 'one','marital', 'status','team', 'members','work', 'location','applicants', 'without','paid', 'time','color','regard', 'race','apply','equal', 'employer','without', 'regard','united', 'states',"'s",'race', 'color','best', 'practices','physical', 'mental','health', 'savings','responsible','affirmative', 'action','iam','federal','state','local','required','ideal', 'candidate','individuals', 'disabilities','northrop', 'grumman','applicable', 'law','every', 'day','across', 'organization''travel', 'required','track', 'record''including', 'limited','employee', 'assistance','new', 'york','compensation', 'package','financial', 'services','travel', 'requirements','create', 'maintain','pre', 'posttax', 'options','discriminate','diversity', 'equity', 'inclusion','matrix', 'committed', 'providing','qualified', 'applicant','vista', 'portfolio', 'company','related', 'field', 'equivalent','bae','nike' ,'inc','familiar','children', 'hospital', 'philadelphia','weill', 'cornell', 'medicine', 'salt', 'city','eagle','title', 'vaccinated', 'covid19','ix','education','mantech','western','401','match','529','american,','express','colleague','baker','qualify', 'religious', 'exemption','regard', 'criminal', 'background','inquiry','us', 'puerto', 'rico','must', 'able', 'deliver','temperature', 'excursion','obtain', 'public','trust','arrest', 'conviction', 'record','including', 'pregnancy', 'childbirth','general', 'holdings', 'corp','genetic', 'registered', 'domestic']

technical_stop = ['identity','engineer','management','information','manager','security','user','administrator','hardware', 'software','analyst','architect','systems','engineering']

stop.extend(new_stop + technical_stop)

words=[]
for w in tokens:
    words.append(wordnet_lemmatizer.lemmatize(w))  

tokens_list=[]
for word in words:
    tokens_list.append(word.lower())

token_list1 = [ ]
for word in tokens_list:
    if word not in stop:
        token_list1.append(word)

print("Removed Stop Words = ",len(token_list1))

## Remove numbers and punctuation
punctuation = re.compile(r'[-.?!,":;()&@#%^*·`$[]')
token_list2 = [ ]
for token in token_list1:
    word = punctuation.sub("", token)
    if len(word)>0:
        token_list2.append(word)
print("Removed numbers and punctuation = ",len(token_list2))

#print(token_list2)

Uncleaned words =  9035
Removed Stop Words =  5091
Removed numbers and punctuation =  3819


## 3. Filtering non-nouns and Frequency Analysis

In [220]:

tokens_pos_tag = nltk.pos_tag(token_list2)
pos_df = pd.DataFrame(tokens_pos_tag, columns = ('word','POS'))

pos_sum = pos_df.groupby('POS', as_index=False).count() # group by POS tags
pos_sum.sort_values(['word'], ascending=[False]) # in descending order of number of words per tag

Unnamed: 0,POS,word
9,NN,2066
5,JJ,655
19,VBG,257
11,NNS,128
1,CD,116
21,VBP,111
14,RB,91
18,VBD,88
17,VB,76
20,VBN,57


In [221]:
filtered_pos = [ ]
for one in tokens_pos_tag:
    if one[1] == 'NN' or one[1] == 'NNS' or one[1] == 'NNP' or one[1] == 'NNPS':
        filtered_pos.append(one)
print ("Filtered words = ",len(filtered_pos))

fdist_pos = nltk.FreqDist(filtered_pos)
top_100_words = fdist_pos.most_common(100)
print(top_100_words)

Filtered words =  2201
[(('system', 'NN'), 47), (('access', 'NN'), 46), (('support', 'NN'), 40), (('year', 'NN'), 33), (('process', 'NN'), 28), (('service', 'NN'), 27), (('application', 'NN'), 25), (('problem', 'NN'), 25), (('knowledge', 'NN'), 23), (('position', 'NN'), 23), (('program', 'NN'), 22), (('customer', 'NN'), 21), (('specialist', 'NN'), 20), (('technology', 'NN'), 20), (('material', 'NN'), 19), (('role', 'NN'), 17), (('data', 'NNS'), 16), (('issue', 'NN'), 16), (('project', 'NN'), 15), (('database', 'NN'), 14), (('skill', 'NN'), 14), (('requirement', 'NN'), 13), (('development', 'NN'), 13), (('quality', 'NN'), 13), (('solution', 'NN'), 12), (('master', 'NN'), 12), (('environment', 'NN'), 12), (('benefit', 'NN'), 11), (('certification', 'NN'), 11), (('maintenance', 'NN'), 11), (('plan', 'NN'), 11), (('request', 'NN'), 11), (('control', 'NN'), 10), (('manage', 'NN'), 10), (('network', 'NN'), 10), (('level', 'NN'), 10), (('policy', 'NN'), 10), (('implementation', 'NN'), 10), ((

In [222]:
top_words_df = pd.DataFrame(top_100_words, columns = ('pos','count'))
top_words_df.head()

Unnamed: 0,pos,count
0,"(system, NN)",47
1,"(access, NN)",46
2,"(support, NN)",40
3,"(year, NN)",33
4,"(process, NN)",28


In [223]:
top_words_df['Word'] = top_words_df['pos'].apply(lambda x: x[0]) # split the tuple of POS
top_words_df = top_words_df.drop('pos', 1) # drop the previous column
top_words_df.head()

  top_words_df = top_words_df.drop('pos', 1) # drop the previous column


Unnamed: 0,count,Word
0,47,system
1,46,access
2,40,support
3,33,year
4,28,process


## 4. Word Cloud

In [224]:
subset_pos = top_words_df[['Word', 'count']]
tuples_pos = [tuple(x) for x in subset_pos.values]

print(tuples_pos)

wordcloud = WordCloud()
#wordcloud.generate_from_frequencies(tuples_pos)

plt.show()

[('system', 47), ('access', 46), ('support', 40), ('year', 33), ('process', 28), ('service', 27), ('application', 25), ('problem', 25), ('knowledge', 23), ('position', 23), ('program', 22), ('customer', 21), ('specialist', 20), ('technology', 20), ('material', 19), ('role', 17), ('data', 16), ('issue', 16), ('project', 15), ('database', 14), ('skill', 14), ('requirement', 13), ('development', 13), ('quality', 13), ('solution', 12), ('master', 12), ('environment', 12), ('benefit', 11), ('certification', 11), ('maintenance', 11), ('plan', 11), ('request', 11), ('control', 10), ('manage', 10), ('network', 10), ('level', 10), ('policy', 10), ('implementation', 10), ('contract', 9), ('duty', 9), ('activity', 9), ('product', 8), ('task', 8), ('government', 8), ('assurance', 8), ('resolution', 8), ('payroll', 8), ('compliance', 8), ('degree', 7), ('industry', 7), ('performance', 7), ('clearance', 7), ('life', 7), ('offering', 7), ('creation', 7), ('career', 7), ('help', 7), ('peraton', 7), ('

In [225]:
plt.figure(figsize=(20,15))
#plt.imshow(wordcloud, interpolation="bilinear")

<Figure size 2000x1500 with 0 Axes>

<Figure size 2000x1500 with 0 Axes>

## 5. BiGrams

In [226]:
bgs = nltk.bigrams(token_list2)
fdist2 = nltk.FreqDist(bgs) # selecting bigrams from tokens
bgs_100 = fdist2.most_common(100) # top-100 bigrams
bgs_df = pd.DataFrame(bgs_100, columns = ('bigram','count'))
bgs_df.head(7)

Unnamed: 0,bigram,count
0,"(material, master)",10
1,"(quality, assurance)",8
2,"(3, year)",7
3,"(year, preferred)",6
4,"(bachelor, degree)",4
5,"(high, school)",4
6,"(production, contingency)",4


In [227]:
tgs = nltk.trigrams(token_list2) 
fdist3 = nltk.FreqDist(tgs) # selecting trigrams from tokens
tgs_100 = fdist3.most_common(100) # top-100 trigrams
tgs_df = pd.DataFrame(tgs_100, columns = ('trigram','count'))
tgs_df.head(7)

Unnamed: 0,trigram,count
0,"(production, contingency, plan)",3
1,"(excellent, oral, written)",2
2,"(provide, technical, support)",2
3,"(testing, collaborating, global)",2
4,"(collaborating, global, ey)",2
5,"(disaster, recovery, disaster)",2
6,"(recovery, disaster, recovery)",2


In [228]:
tgs_df['phrase'] = tgs_df['trigram'].apply(lambda x: x[0]+" "+x[1]+" "+x[2]) # merging the tuple into a string
tgs_df['filter_tgs'] = tgs_df['phrase'].str.contains(punctuation) # finding strings with numbers and punctuation
tgs_df.head()

Unnamed: 0,trigram,count,phrase,filter_tgs
0,"(production, contingency, plan)",3,production contingency plan,False
1,"(excellent, oral, written)",2,excellent oral written,False
2,"(provide, technical, support)",2,provide technical support,False
3,"(testing, collaborating, global)",2,testing collaborating global,False
4,"(collaborating, global, ey)",2,collaborating global ey,False


In [229]:
tgs_df = tgs_df[tgs_df.filter_tgs == False] # removing strings with numbers and punctuation
tgs_df = tgs_df.drop('trigram', 1)
tgs_df = tgs_df.drop('filter_tgs', 1) # removing the excess columns
tgs_df.reset_index()
tgs_df.head(10) #Final trigrams

  tgs_df = tgs_df.drop('trigram', 1)
  tgs_df = tgs_df.drop('filter_tgs', 1) # removing the excess columns


Unnamed: 0,count,phrase
0,3,production contingency plan
1,2,excellent oral written
2,2,provide technical support
3,2,testing collaborating global
4,2,collaborating global ey
5,2,disaster recovery disaster
6,2,recovery disaster recovery
7,2,implementing project following
8,2,project following azure
9,2,following azure aws
