# Identify Common Work Roles

## 1. Import Packages

In [1]:
import nltk
import requests
import numpy as np
from pathlib import Path
from nltk.corpus import stopwords
from nltk import ngrams
import re

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/kenmye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 2. Load the Data set

In [4]:
data = Path('../project_data/data-job-title.txt').read_text()

print(data)

Identity and Access Management Senior Security Engineer.
Azure DevOps Engineer.
Senior Data Analyst.
IAM Engineer.
Engineer, Application Security (Identity & Access Management platform SME).
Data Analyst (Remote).
Identity and Access Management engineer.
REMOTE - Cybersecurity Senior Manager - Cloud Identity and Access Management.
VP, Product Management.
Oracle Access/Identity Management Application Developer.
Data & Analytics Product Owner (Remote - Home Based Worker).
REMOTE Golang Developer.
REMOTE C++ Developer (Multithreading).
Identity and Access Management - Manager.
Sr SAP Security and GRC Lead Remote.
Data Analyst.
Manager, Identity Engineering.
Identity Access Management (IAM) Engineer.
Identity and Access Management Engineer.
Director Information Security, Identity Access Management.
Identity and Access Management Manager- Remote.
Associate Director, Product Owner for Identity and Access Management.
Front End Developer - Identity & Access Management.
Identity & Access Manage

## 3. Tokenize words

In [None]:
#Tokenize words.
wordtext=data.replace('\n','')
token=re.findall('\w+', wordtext)

## 4. Work Role Analysis - Clean dataset

In [None]:
#ETL words into all lowercase.
words=[]
for word in token:
    words.append(word.lower())

#print(words)

#Create stopword list.
stopword = stopwords.words('english')
newStopWords =  ['identity', 'access','management','senior','security','azure','data','devops','iam','application','platform','sme','remote','cybersecurity','cloud','product', 'oracle','analytics','home', 'based', 'worker', 'golang','c','multithreading','sr', 'sap', 'grc','engineering','information','front', 'end','technical', 'business','principle', 'global','icrm', 'aml', 'compliance', 'governance', 'integrity','hybrid', 'team','ii','pm','conditional', 'team','privileged','backend', 'software','administration', 'sentinel', 'gbsd', 'staff','3036', '1','cyber', 'services', 'governance', 'resource', 'program','owner', 'storage', 'backup','service', 'operations', 'software','systems','mgmt','mid','iii','idam','ai','customer','site','care','date','ciam','admin','sso','air', 'vehicle', 'configuration', 'design', 'integration','sailpoint', 'iiq', 'applications', 'support','helpdesk', 'l2', 'l3', 'solution','web', 'app', 'manual', 'qa', '100', 'hris','december', '2022', 'may', '2023', 'graduates', 'control','assoc', 'dod', 'level', 'assurance', 'scrum', 'maestro', 'detection', 'sales', 'ml', 'open', 'protection', 'atlantic', 'lms', 'privacy','accesss', 'iga', 'saml', 'center', 'mgt', 'ad', 'role', 'icam', 'tester', 'engagement', 'system', 'secrets', 'needs', 'dlp', 'isd', 'arity', 'eligible', 'co', 'grad', 'strategy', 'digital', 'siteminder', 'trust', 'contractor', 'contract', 'soc', 'ui', 'issues', 'location', 'log', 'transformation', 'prevention', 'learning', 'austin', 'relations', 'college', 'related', 'arlington', 'summer', 'remotely', 'loss', 'na', 'ic', 'ny', 'assistant', 'science', 'health', 'credential', 'reliability', 'logistics', 'audit', 'consumer', 'saviynt', 'op', 'usds', 'option', 'federation', 'part', 'new', 'industry', 'ping', 'temp', 'innovation', 'epic', 'bi', 'vice', 'bis', 'enterprise', 'corporation', 'response', 'h', 'implementation', 'opportunity', 'hr', 'biometric', 'party', 'device', 'endpoint', 'technologies', 'compensation', 'change', 'payroll','basic', 'pasadena', 'third', 'beacon', 'machine', 'environmental', 'authentication', 'discovery', 'midlevel', 'database', 'enablement', 'wa', 'desk', 'infrastructure', 'technology', 'water', 'privilege', 'york', 'provisioning', 'delivery', 'salesforce', 'okta', 'group', 'wastewater', 'subsidiary', 'markets', 'benefits', 'jr', 'volunteer', 'firmware', 'schedule', 'solutions', 'development', 'products', 'marketing','foundational', 'available', 'oversight', 'aws', 'power', 'forgerock', 'virtual', 'duke', 'help', '2', 'risk', 'reporting', 'full', 'pam', 'platforms', 'c13', 'phc', 'exam', 'work', 'bellevue', 'us', 'tech', 'nj', 'cyberark', 'junior', 'back', 'experian', 'experience','fully', 'architecture', 'fraud', 'java', 'berkeley', 'working', 'kubernetes', 'linux', 'intelligence', 'scientist', 'monetization', '70', 'zero', '4', 'c360', '41278', 'executive_selling', '1235', 'workforce', '898168', 'verizon', 'time', 'mts', 'project', 'äîidentity', 'quality', 'coordinator', 'chief','0520u', 'o365','sql','hardware','master','flexible','3']

stopword.extend(newStopWords)

#Remove stopwords from tokenized words.
words_ne=[]
for word in words:
    if word not in stopword:
        words_ne.append(word)

## 5. Create x-grams and frequency analysis

In [None]:
# Set n-grams value
n = 1
xgrams = ngrams(words_ne, n)

frequency = nltk.FreqDist(xgrams)

#Print key value list of xgrams
for key,value in frequency.items():
    if value > 0:
        print(key, value)

values = frequency.values()
total = sum(values)
print("The Total Number of Values =", total)

('engineer',) 143
('analyst',) 75
('manager',) 51
('vp',) 8
('developer',) 34
('lead',) 19
('director',) 33
('associate',) 11
('specialist',) 10
('head',) 5
('architect',) 20
('principal',) 7
('officer',) 2
('leader',) 3
('administrator',) 16
('consultant',) 13
('supervisor',) 1
('strategist',) 2
('president',) 1
('expert',) 1
The Total Number of Values = 455


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame(frequency.items())

print(df)

plt.bar(df['0'], df['1'])
plt.title('Unique Work Roles', fontsize=14)
plt.xlabel('Work Role', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.grid(True)
plt.show()

                   0    1
0        (engineer,)  143
1         (analyst,)   75
2         (manager,)   51
3              (vp,)    8
4       (developer,)   34
5            (lead,)   19
6        (director,)   33
7       (associate,)   11
8      (specialist,)   10
9            (head,)    5
10      (architect,)   20
11      (principal,)    7
12        (officer,)    2
13         (leader,)    3
14  (administrator,)   16
15     (consultant,)   13
16     (supervisor,)    1
17     (strategist,)    2
18      (president,)    1
19         (expert,)    1


KeyError: '0'