## 1. Import Data

In [13]:
import os
import nltk
import re
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import Word
from nltk.tokenize import RegexpTokenizer
from nltk import ngrams
from nltk.stem import WordNetLemmatizer

with open('../project_data/administrator_text.txt') as f:
    data = f.read()


## 2. Clean Data

In [14]:
tokens = nltk.word_tokenize(data)
wordnet_lemmatizer = WordNetLemmatizer()

print ("Uncleaned words = ", len(tokens))

## Remove Stop Words
stop = stopwords.words('english')

new_stop = ['and','experience','show','veteran','less','origin','sexual', 'orientation', 'dental', 'insurance', 'hour', 'shift','religion','sex','receive','consideration','pay', 'per','employment','opportunity','consideration', 'employment','job','description','start', 'job','click','gender','benefits', 'k','monday', 'friday','age', 'disability','please', 'visit','salary', 'range','characteristic', 'protected','minimum', 'qualifications','join','reasonable', 'accommodation', 'parental', 'leave', 'medical', 'vision', 'duties', 'responsibilities','business', 'needs','essential', 'functions','color','type', 'fulltime','verbal', 'communication','apply','work','national', 'status','closely','flexible', 'spending','’','ability', 'work','location', 'remote','capital', 'one','marital', 'status','team', 'members','work', 'location','applicants', 'without','paid', 'time','color','regard', 'race','apply','equal', 'employer','without', 'regard','united', 'states',"'s",'race', 'color','best', 'practices','physical', 'mental','health', 'savings','responsible','affirmative', 'action','iam','federal','state','local','required','ideal', 'candidate','individuals', 'disabilities','northrop', 'grumman','applicable', 'law','every', 'day','across', 'organization''travel', 'required','track', 'record''including', 'limited','employee', 'assistance','new', 'york','compensation', 'package','financial', 'services','travel', 'requirements','create', 'maintain','pre', 'posttax', 'options','discriminate','diversity', 'equity', 'inclusion','matrix', 'committed', 'providing','qualified', 'applicant','vista', 'portfolio', 'company','related', 'field', 'equivalent','bae','nike' ,'inc','familiar','children', 'hospital', 'philadelphia','weill', 'cornell', 'medicine', 'salt', 'city','eagle','title', 'vaccinated', 'covid19','ix','education','mantech','western','401','match','529','american,','express','colleague','baker','qualify', 'religious', 'exemption','regard', 'criminal', 'background','inquiry','us', 'puerto', 'rico','must', 'able', 'deliver','temperature', 'excursion','obtain', 'public','trust','arrest', 'conviction', 'record','including', 'pregnancy', 'childbirth','general', 'holdings', 'corp','genetic', 'registered', 'domestic']

technical_stop = ['identity','engineer','management','information','manager','security','user','administrator','hardware', 'software','analyst','architect','systems','engineering']

stop.extend(new_stop + technical_stop)

words=[]
for w in tokens:
    words.append(wordnet_lemmatizer.lemmatize(w))  

tokens_list=[]
for word in words:
    tokens_list.append(word.lower())

token_list1 = [ ]
for word in tokens_list:
    if word not in stop:
        token_list1.append(word)

print("Removed Stop Words = ",len(token_list1))

## Remove numbers and punctuation
punctuation = re.compile(r'[-.?!,":;()&@#%^*·`$[]')
token_list2 = [ ]
for token in token_list1:
    word = punctuation.sub("", token)
    if len(word)>0:
        token_list2.append(word)
print("Removed numbers and punctuation = ",len(token_list2))

#print(token_list2)

Uncleaned words =  12441
Removed Stop Words =  7193
Removed numbers and punctuation =  5459


## 3. Filtering non-nouns and Frequency Analysis

In [15]:

tokens_pos_tag = nltk.pos_tag(token_list2)
pos_df = pd.DataFrame(tokens_pos_tag, columns = ('word','POS'))

pos_sum = pos_df.groupby('POS', as_index=False).count() # group by POS tags
pos_sum.sort_values(['word'], ascending=[False]) # in descending order of number of words per tag

Unnamed: 0,POS,word
11,NN,2904
7,JJ,964
19,VBG,320
13,NNS,194
21,VBP,183
15,RB,154
3,CD,149
18,VBD,126
17,VB,111
22,VBZ,85


In [16]:
filtered_pos = [ ]
for one in tokens_pos_tag:
    if one[1] == 'NN' or one[1] == 'NNS' or one[1] == 'NNP' or one[1] == 'NNPS':
        filtered_pos.append(one)
print ("Filtered words = ",len(filtered_pos))

fdist_pos = nltk.FreqDist(filtered_pos)
top_100_words = fdist_pos.most_common(100)
print(top_100_words)

Filtered words =  3119
[(('access', 'NN'), 80), (('system', 'NN'), 79), (('application', 'NN'), 46), (('support', 'NN'), 38), (('service', 'NN'), 36), (('environment', 'NN'), 34), (('technology', 'NN'), 32), (('oracle', 'NN'), 32), (('year', 'NN'), 29), (('solution', 'NN'), 28), (('knowledge', 'NN'), 28), (('data', 'NNS'), 27), (('database', 'NN'), 26), (('computer', 'NN'), 25), (('directory', 'NN'), 23), (('position', 'NN'), 22), (('issue', 'NN'), 21), (('development', 'NN'), 21), (('administration', 'NN'), 20), (('design', 'NN'), 19), (('role', 'NN'), 18), (('process', 'NN'), 18), (('organization', 'NN'), 18), (('performance', 'NN'), 18), (('office', 'NN'), 17), (('documentation', 'NN'), 17), (('staff', 'NN'), 17), (('skill', 'NN'), 17), (('customer', 'NN'), 16), (('benefit', 'NN'), 15), (('problem', 'NN'), 15), (('account', 'NN'), 15), (('platform', 'NN'), 14), (('value', 'NN'), 14), (('policy', 'NN'), 14), (('implementation', 'NN'), 13), (('request', 'NN'), 13), (('program', 'NN'),

In [17]:
top_words_df = pd.DataFrame(top_100_words, columns = ('pos','count'))
top_words_df.head()

Unnamed: 0,pos,count
0,"(access, NN)",80
1,"(system, NN)",79
2,"(application, NN)",46
3,"(support, NN)",38
4,"(service, NN)",36


In [18]:
top_words_df['Word'] = top_words_df['pos'].apply(lambda x: x[0]) # split the tuple of POS
top_words_df = top_words_df.drop('pos', 1) # drop the previous column
top_words_df.head()

  top_words_df = top_words_df.drop('pos', 1) # drop the previous column


Unnamed: 0,count,Word
0,80,access
1,79,system
2,46,application
3,38,support
4,36,service


## 4. Word Cloud

In [19]:
subset_pos = top_words_df[['Word', 'count']]
tuples_pos = [tuple(x) for x in subset_pos.values]

print(tuples_pos)

wordcloud = WordCloud()
#wordcloud.generate_from_frequencies(tuples_pos)

plt.show()

[('access', 80), ('system', 79), ('application', 46), ('support', 38), ('service', 36), ('environment', 34), ('technology', 32), ('oracle', 32), ('year', 29), ('solution', 28), ('knowledge', 28), ('data', 27), ('database', 26), ('computer', 25), ('directory', 23), ('position', 22), ('issue', 21), ('development', 21), ('administration', 20), ('design', 19), ('role', 18), ('process', 18), ('organization', 18), ('performance', 18), ('office', 17), ('documentation', 17), ('staff', 17), ('skill', 17), ('customer', 16), ('benefit', 15), ('problem', 15), ('account', 15), ('platform', 14), ('value', 14), ('policy', 14), ('implementation', 13), ('request', 13), ('program', 13), ('community', 13), ('server', 13), ('university', 12), ('azure', 12), ('product', 11), ('task', 11), ('project', 11), ('culture', 11), ('berkeley', 11), ('practice', 11), ('area', 10), ('department', 10), ('integration', 10), ('procedure', 10), ('client', 10), ('people', 10), ('configuration', 10), ('integrity', 10), ('i

In [20]:
plt.figure(figsize=(20,15))
#plt.imshow(wordcloud, interpolation="bilinear")

<Figure size 2000x1500 with 0 Axes>

<Figure size 2000x1500 with 0 Axes>

## 5. BiGrams

In [21]:
bgs = nltk.bigrams(token_list2)
fdist2 = nltk.FreqDist(bgs) # selecting bigrams from tokens
bgs_100 = fdist2.most_common(100) # top-100 bigrams
bgs_df = pd.DataFrame(bgs_100, columns = ('bigram','count'))
bgs_df.head(7)

Unnamed: 0,bigram,count
0,"(active, directory)",14
1,"(2, year)",11
2,"(computer, system)",10
3,"(oracle, sso)",10
4,"(bachelor, degree)",9
5,"(sql, server)",8
6,"(account, access)",6


In [22]:
tgs = nltk.trigrams(token_list2) 
fdist3 = nltk.FreqDist(tgs) # selecting trigrams from tokens
tgs_100 = fdist3.most_common(100) # top-100 trigrams
tgs_df = pd.DataFrame(tgs_100, columns = ('trigram','count'))
tgs_df.head(7)

Unnamed: 0,trigram,count
0,"(least, 2, year)",3
1,"(written, interpersonal, skill)",3
2,"(version, 11g, 1112x)",3
3,"(11g, 1112x, 12c)",3
4,"(1112x, 12c, 1221x)",3
5,"(demonstrate, potential, perform)",2
6,"(potential, perform, function)",2


In [23]:
tgs_df['phrase'] = tgs_df['trigram'].apply(lambda x: x[0]+" "+x[1]+" "+x[2]) # merging the tuple into a string
tgs_df['filter_tgs'] = tgs_df['phrase'].str.contains(punctuation) # finding strings with numbers and punctuation
tgs_df.head()

Unnamed: 0,trigram,count,phrase,filter_tgs
0,"(least, 2, year)",3,least 2 year,False
1,"(written, interpersonal, skill)",3,written interpersonal skill,False
2,"(version, 11g, 1112x)",3,version 11g 1112x,False
3,"(11g, 1112x, 12c)",3,11g 1112x 12c,False
4,"(1112x, 12c, 1221x)",3,1112x 12c 1221x,False


In [24]:
tgs_df = tgs_df[tgs_df.filter_tgs == False] # removing strings with numbers and punctuation
tgs_df = tgs_df.drop('trigram', 1)
tgs_df = tgs_df.drop('filter_tgs', 1) # removing the excess columns
tgs_df.reset_index()
tgs_df.head(10) #Final trigrams

  tgs_df = tgs_df.drop('trigram', 1)
  tgs_df = tgs_df.drop('filter_tgs', 1) # removing the excess columns


Unnamed: 0,count,phrase
0,3,least 2 year
1,3,written interpersonal skill
2,3,version 11g 1112x
3,3,11g 1112x 12c
4,3,1112x 12c 1221x
5,2,demonstrate potential perform
6,2,potential perform function
7,2,perform function outlined
8,2,function outlined position
9,2,operating system application
