# Inspect Data

## 1. Import packages and data

In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist, sent_tokenize
import string
import os
for dirname, _, filenames in os.walk('dissertation/data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df = pd.read_csv("../project_data/data-jobs-20221123.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_title    452 non-null    object
 1   role         452 non-null    object
 2   description  452 non-null    object
dtypes: object(3)
memory usage: 10.7+ KB


## 2. Process data

In [10]:
print(df.iloc[0])
text = df['description'][0]
text

job_title      Business Intelligence Engineer, Identity and A...
role                                                    engineer
description    · 3+ years in relevant experience as Business ...
Name: 0, dtype: object


"· 3+ years in relevant experience as Business Intelligence Engineer, Data Engineer, Business Analyst, Data Scientist, or equivalent. · Proficiency with data querying or modeling technique with SQL · Experience with data visualization using QuickSight, Tableau, or similar tools · Proficiency with a scripting language (Java, Python, or R) · Bachelor's degree in Engineering, Statistics, Computer Science, Operations Research, Business Analytics, Information Systems or related field\n\nJob summary\nWe are looking for Business Intelligence Engineers to apply their talents to create new BI solutions, providing cutting edge insights and analytics that help tell the story of how our Stores Security programs are performing.\n\nOur Stores Security teams span over ten countries worldwide, and our focus areas include security intelligence, application security, incident response, security operations, risk and compliance, acquisitions and subsidiaries, and external partner security.\n\nWhat makes t

In [21]:
def process_description(description):
    tokens = nltk.word_tokenize(description)
    #tokens = nltk.sent_tokenize(description)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    stopwords_punc_and_numbers_removed = [word for word in stopwords_removed if word.isalpha()]
    return stopwords_removed  

#Tokenize words
tokens = nltk.word_tokenize(text)

new_stop = ['and','experience','show','veteran','less','origin','sexual', 'orientation', 'dental', 'insurance', 'hour', 'shift','religion','sex','receive','consideration','pay', 'per','employment','opportunity','consideration', 'employment','job','description','start', 'job','click','gender','benefits', 'k','monday', 'friday','age', 'disability','please', 'visit','salary', 'range','characteristic', 'protected','minimum', 'qualifications','join','reasonable', 'accommodation', 'parental', 'leave', 'medical', 'vision', 'duties', 'responsibilities','business', 'needs','essential', 'functions','color','type', 'fulltime','verbal', 'communication','apply','work','national', 'status','closely','flexible', 'spending','’','ability', 'work','location', 'remote','capital', 'one','marital', 'status','team', 'members','work', 'location','applicants', 'without','paid', 'time','color','regard', 'race','apply','equal', 'employer','without', 'regard','united', 'states',"'s",'race', 'color','best', 'practices','physical', 'mental','health', 'savings','responsible','affirmative', 'action','iam','federal','state','local','required','ideal', 'candidate','individuals', 'disabilities','northrop', 'grumman','applicable', 'law','every', 'day','across', 'organization''travel', 'required','track', 'record''including', 'limited','employee', 'assistance','new', 'york','compensation', 'package','financial', 'services','travel', 'requirements']

#Crate and remove stop words
stopwords_list = stopwords.words('english') + list(string.punctuation) + new_stop


## 4. Combine roles into new dataframes and concatenate descriptions

In [22]:
df['role'].value_counts()

engineer         147
analyst           88
leadership        63
manager           59
developer         42
architect         20
administrator     19
specialist        14
Name: role, dtype: int64

In [23]:

engineer_df = df[df['role'] == 'engineer']
analyst_df = df[df['role'] == 'analyst']
manager_df = df[df['role'] == 'manager']
developer_df = df[df['role'] == 'developer']
leadership_df = df[df['role'] == 'leadership']
administrator_df = df[df['role'] == 'administrator']
architect_df = df[df['role'] == 'architect']
specialist_df = df[df['role'] == 'specialist']

architect_df.head()

Unnamed: 0,job_title,role,description
69,Principal Foundational Architect‚ÄîIdentity & ...,architect,"Become a Part of the NIKE, Inc. Team\r\n\r\nNI..."
81,Workforce Identity and Access Management Archi...,architect,Cloud Security Services is currently looking f...
84,HR Solution Architect,architect,Comcast brings together the best in media and ...
90,IAM and Security Architect,architect,Company\r\nFederal Reserve Bank of Richmond\r\...
101,Enterprise Cloud Architect Remote,architect,Company Overview\nMotorola Solutions is there ...


In [24]:
def concat_description(df):
    # concat description
    desc = ''
    for i in range(len(df)):
        desc += df.iloc[i]['description']
    print('Finished Concatenation')
    return desc

engineer_text = concat_description(engineer_df)
engineer_text_processed = process_description(engineer_text)

analyst_text = concat_description(analyst_df)
analyst_text_processed = process_description(analyst_text)

manager_text = concat_description(manager_df)
manager_text_processed = process_description(manager_text)

developer_text = concat_description(developer_df)
developer_text_processed = process_description(developer_text)

leadership_text = concat_description(leadership_df)
leadership_text_processed = process_description(leadership_text)

administrator_text = concat_description(administrator_df)
administrator_text_processed = process_description(administrator_text)

architect_text = concat_description(architect_df)
architect_text_processed = process_description(architect_text)

specialist_text = concat_description(specialist_df)
specialist_text_processed = process_description(specialist_text)

Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation


## 5. Save and output processed files

In [25]:
# Saving files

text_file = open("../project_data/engineer_text.txt", "w")
text_file.write(engineer_text)
text_file.close()
temp = pd.DataFrame(engineer_text_processed)
temp.to_csv('../project_data/engineer_text_processed.csv')

text_file = open("../project_data/analyst_text.txt", "w")
text_file.write(analyst_text)
text_file.close()
temp = pd.DataFrame(analyst_text_processed)
temp.to_csv('../project_data/analyst_text_processed.csv')


text_file = open("../project_data/manager_text.txt", "w")
text_file.write(manager_text)
text_file.close()
temp = pd.DataFrame(manager_text_processed)
temp.to_csv('../project_data/manager_text_processed.csv')

text_file = open("../project_data/developer_text.txt", "w")
text_file.write(developer_text)
text_file.close()
temp = pd.DataFrame(developer_text_processed)
temp.to_csv('../project_data/developer_text_processed.csv')

text_file = open("../project_data/leadership_text.txt", "w")
text_file.write(leadership_text)
text_file.close()
temp = pd.DataFrame(leadership_text_processed)
temp.to_csv('../project_data/leadership_text_processed.csv')

text_file = open("../project_data/administrator_text.txt", "w")
text_file.write(administrator_text)
text_file.close()
temp = pd.DataFrame(administrator_text_processed)
temp.to_csv('../project_data/administrator_text_processed.csv')

text_file = open("../project_data/architect_text.txt", "w")
text_file.write(architect_text)
text_file.close()
temp = pd.DataFrame(architect_text_processed)
temp.to_csv('../project_data/architect_text_processed.csv')

text_file = open("../project_data/specialist_text.txt", "w")
text_file.write(specialist_text)
text_file.close()
temp = pd.DataFrame(specialist_text_processed)
temp.to_csv('../project_data/specialist_text_processed.csv')


## 6. Word frequency

In [26]:
FreqDist(engineer_text_processed).most_common(30)

[('management', 752),
 ('access', 692),
 ('identity', 690),
 ('security', 446),
 ('solutions', 380),
 ('years', 358),
 ('support', 320),
 ('data', 297),
 ('information', 296),
 ('systems', 292),
 ('skills', 270),
 ('technical', 269),
 ('development', 267),
 ('knowledge', 267),
 ('design', 241),
 ('engineer', 240),
 ('technology', 218),
 ('including', 217),
 ('position', 206),
 ('software', 205),
 ('working', 203),
 ('provide', 198),
 ('engineering', 193),
 ('application', 188),
 ('environment', 187),
 ('technologies', 183),
 ('directory', 172),
 ('role', 166),
 ('strong', 165),
 ('related', 159)]

In [27]:
FreqDist(analyst_text_processed).most_common(30)

[('management', 429),
 ('access', 343),
 ('data', 341),
 ('security', 291),
 ('identity', 276),
 ('information', 233),
 ('years', 208),
 ('support', 202),
 ('skills', 168),
 ('including', 165),
 ('systems', 160),
 ('knowledge', 151),
 ('analyst', 149),
 ('solutions', 145),
 ('preferred', 145),
 ('technical', 129),
 ('analysis', 125),
 ('position', 123),
 ('process', 121),
 ('application', 117),
 ('working', 116),
 ('technology', 113),
 ('related', 113),
 ('processes', 107),
 ('system', 102),
 ('provide', 101),
 ('user', 101),
 ('applications', 99),
 ('strong', 98),
 ('degree', 96)]

In [28]:
FreqDist(manager_text_processed).most_common(30)

[('management', 338),
 ('product', 248),
 ('access', 225),
 ('security', 186),
 ('identity', 185),
 ('solutions', 175),
 ('data', 165),
 ('project', 164),
 ('information', 149),
 ('years', 136),
 ('technical', 123),
 ('including', 115),
 ('teams', 110),
 ('role', 110),
 ('position', 107),
 ('skills', 106),
 ('support', 106),
 ('technology', 105),
 ('company', 104),
 ('development', 99),
 ('working', 96),
 ('engineering', 91),
 ('manager', 91),
 ('design', 90),
 ('provide', 86),
 ('employees', 86),
 ('products', 85),
 ('strong', 83),
 ('·', 78),
 ('ensure', 77)]

In [29]:
FreqDist(developer_text_processed).most_common(30)

[('development', 151),
 ('management', 137),
 ('identity', 127),
 ('data', 110),
 ('access', 107),
 ('skills', 104),
 ('information', 90),
 ('application', 86),
 ('solutions', 83),
 ('years', 80),
 ('applications', 79),
 ('strong', 77),
 ('working', 76),
 ('systems', 74),
 ('design', 71),
 ('knowledge', 68),
 ('software', 67),
 ('developer', 67),
 ('technical', 66),
 ('support', 64),
 ('including', 62),
 ('technology', 59),
 ('using', 56),
 ('position', 56),
 ('web', 55),
 ('may', 54),
 ('security', 51),
 ('sql', 47),
 ('company', 47),
 ('environment', 46)]