## 1. Import Data

In [1]:
import os
import nltk
import re
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords

with open('iam-engineer.txt') as f:
    data = f.read()

FileNotFoundError: [Errno 2] No such file or directory: 'iam-engineer.txt'

## 2. Clean Data

In [71]:
data = data.lower()
tokens = nltk.word_tokenize(data)

print ("Uncleaned words = ", len(tokens))

## Remove Stop Words
stop = stopwords.words('english')

new_stop = ['and','experience']

stop.extend(new_stop)

token_list1 = [ ]
for token in tokens:
    if token not in stop:
        token_list1.append(token)

print("Removed Stop Words = ",len(token_list1))

## Remove numbers and punctuation
punctuation = re.compile(r'[-.?!,":;()&@#%^*·`$|0-9]')
token_list2 = [ ]
for token in token_list1:
    word = punctuation.sub("", token)
    if len(word)>0:
        token_list2.append(word)
print("Removed numbers and punctuation = ",len(token_list2))

#print(token_list2)

Uncleaned words =  106389
Removed Stop Words =  76523
Removed numbers and punctuation =  61537


## 3. Filtering non-nouns and Frequency Analysis

In [72]:

tokens_pos_tag = nltk.pos_tag(token_list2)
pos_df = pd.DataFrame(tokens_pos_tag, columns = ('word','POS'))

pos_sum = pos_df.groupby('POS', as_index=False).count() # group by POS tags
pos_sum.sort_values(['word'], ascending=[False]) # in descending order of number of words per tag

Unnamed: 0,POS,word
10,NN,25750
6,JJ,10393
13,NNS,9707
25,VBP,3671
23,VBG,3261
17,RB,1333
22,VBD,1169
26,VBZ,1118
24,VBN,1065
21,VB,1057


In [73]:
filtered_pos = [ ]
for one in tokens_pos_tag:
    if one[1] == 'NN' or one[1] == 'NNS' or one[1] == 'NNP' or one[1] == 'NNPS':
        filtered_pos.append(one)
print ("Filtered words = ",len(filtered_pos))

fdist_pos = nltk.FreqDist(filtered_pos)
top_100_words = fdist_pos.most_common(100)
print(top_100_words)

Filtered words =  35834
[(('management', 'NN'), 752), (('access', 'NN'), 692), (('identity', 'NN'), 677), (('security', 'NN'), 447), (('solutions', 'NNS'), 380), (('work', 'NN'), 362), (('years', 'NNS'), 360), (('team', 'NN'), 296), (('information', 'NN'), 296), (('systems', 'NNS'), 292), (('support', 'NN'), 273), (('data', 'NNS'), 268), (('development', 'NN'), 267), (('business', 'NN'), 267), (('skills', 'NNS'), 263), (('services', 'NNS'), 253), (('ability', 'NN'), 234), (('job', 'NN'), 226), (('technology', 'NN'), 218), (('knowledge', 'NN'), 216), (('position', 'NN'), 209), (('iam', 'NN'), 207), (('design', 'NN'), 207), (('software', 'NN'), 205), (('requirements', 'NNS'), 199), (('engineering', 'NN'), 193), (('environment', 'NN'), 187), (('technologies', 'NNS'), 183), (('engineer', 'NN'), 182), (('status', 'NN'), 182), (('application', 'NN'), 179), (('’', 'NNP'), 175), (('role', 'NN'), 166), (('directory', 'NN'), 152), (('benefits', 'NNS'), 152), (('teams', 'NNS'), 145), (('employees

In [74]:
top_words_df = pd.DataFrame(top_100_words, columns = ('pos','count'))
top_words_df.head()

Unnamed: 0,pos,count
0,"(management, NN)",752
1,"(access, NN)",692
2,"(identity, NN)",677
3,"(security, NN)",447
4,"(solutions, NNS)",380


In [75]:
top_words_df['Word'] = top_words_df['pos'].apply(lambda x: x[0]) # split the tuple of POS
top_words_df = top_words_df.drop('pos', 1) # drop the previous column
top_words_df.head()

  top_words_df = top_words_df.drop('pos', 1) # drop the previous column


Unnamed: 0,count,Word
0,752,management
1,692,access
2,677,identity
3,447,security
4,380,solutions


## 4. Word Cloud

In [None]:
subset_pos = top_words_df[['Word', 'count']]
tuples_pos = [tuple(x) for x in subset_pos.values]

print(tuples_pos)

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(tuples_pos)

plt.show()

In [None]:
plt.figure(figsize=(20,15))
plt.imshow(wordcloud, interpolation="bilinear")

## 5. BiGrams

In [76]:
bgs = nltk.bigrams(token_list2)
fdist2 = nltk.FreqDist(bgs) # selecting bigrams from tokens
bgs_100 = fdist2.most_common(100) # top-100 bigrams
bgs_df = pd.DataFrame(bgs_100, columns = ('bigram','count'))
bgs_df.head()

Unnamed: 0,bigram,count
0,"(access, management)",378
1,"(identity, access)",311
2,"(+, years)",143
3,"(show, less)",141
4,"(active, directory)",114


In [77]:
bgs_df['phrase'] = bgs_df['bigram'].apply(lambda x: x[0]+" "+x[1]) # merging the tuple into a string
bgs_df['filter_bgs'] = bgs_df['phrase'].str.contains(punctuation) # finding strings with numbers and punctuation
bgs_df = bgs_df[bgs_df.filter_bgs == False] # removing strings with numbers and punctuation
bgs_df = bgs_df.drop('bigram', 1)
bgs_df = bgs_df.drop('filter_bgs', 1) # removing the excess columns
bgs_df.reset_index()
bgs_df.head(10) #Final bigrams

  bgs_df = bgs_df.drop('bigram', 1)
  bgs_df = bgs_df.drop('filter_bgs', 1) # removing the excess columns


Unnamed: 0,count,phrase
0,378,access management
1,311,identity access
2,143,+ years
3,141,show less
4,114,active directory
5,106,less ''
6,82,national origin
7,80,sexual orientation
8,74,gender identity
9,67,veteran status


In [78]:
tgs = nltk.trigrams(tokens) 
fdist3 = nltk.FreqDist(tgs) # selecting trigrams from tokens
tgs_100 = fdist3.most_common(100) # top-100 trigrams
tgs_df = pd.DataFrame(tgs_100, columns = ('trigram','count'))
tgs_df.head()

Unnamed: 0,trigram,count
0,"(identity, and, access)",185
1,"(and, access, management)",184
2,"(less, ``, '')",106
3,"(show, less, ``)",104
4,"(years, of, experience)",102


In [79]:
tgs_df['phrase'] = tgs_df['trigram'].apply(lambda x: x[0]+" "+x[1]+" "+x[2]) # merging the tuple into a string
tgs_df['filter_tgs'] = tgs_df['phrase'].str.contains(punctuation) # finding strings with numbers and punctuation
tgs_df.head()

Unnamed: 0,trigram,count,phrase,filter_tgs
0,"(identity, and, access)",185,identity and access,False
1,"(and, access, management)",184,and access management,False
2,"(less, ``, '')",106,less `` '',True
3,"(show, less, ``)",104,show less ``,True
4,"(years, of, experience)",102,years of experience,False


In [80]:
tgs_df = tgs_df[tgs_df.filter_tgs == False] # removing strings with numbers and punctuation
tgs_df = tgs_df.drop('trigram', 1)
tgs_df = tgs_df.drop('filter_tgs', 1) # removing the excess columns
tgs_df.reset_index()
tgs_df.head(20) #Final trigrams

  tgs_df = tgs_df.drop('trigram', 1)
  tgs_df = tgs_df.drop('filter_tgs', 1) # removing the excess columns


Unnamed: 0,count,phrase
0,185,identity and access
1,184,and access management
4,102,years of experience
15,64,click apply now
16,64,apply now show
18,63,now show less
19,62,as well as
30,49,the ability to
33,48,without regard to
34,46,of experience in
