# Inspect Data

## 1. Import packages and data

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string
import os
for dirname, _, filenames in os.walk('dissertation/data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df = pd.read_csv("../project_data/data-jobs-20221123.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_title    453 non-null    object
 1   role         453 non-null    object
 2   description  453 non-null    object
dtypes: object(3)
memory usage: 10.7+ KB


## 2. Process data

In [3]:
print(df.iloc[0])
text = df['description'][0]
text

job_title           Identity and Access Management Administrator
role                                               administrator
description    Job Summary\nThe Identity and Access Managemen...
Name: 0, dtype: object




In [4]:
def process_description(description):
    tokens = nltk.word_tokenize(description)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    stopwords_punc_and_numbers_removed = [word for word in stopwords_removed if word.isalpha()]
    return stopwords_removed  

#Tokenize words
tokens = nltk.word_tokenize(text)

#Crate and remove stop words
stopwords_list = stopwords.words('english') + list(string.punctuation) + ['and','’']


## 4. Combine roles into new dataframes and concatenate descriptions

In [5]:
df['role'].value_counts()

engineer         147
analyst           88
manager           59
developer         42
director          41
leadership        22
administrator     20
architect         20
specialist        14
Name: role, dtype: int64

In [10]:

engineer_df = df[df['role'] == 'engineer']
analyst_df = df[df['role'] == 'analyst']
manager_df = df[df['role'] == 'manager']
developer_df = df[df['role'] == 'developer']
director_df = df[df['role'] == 'director']
leadership_df = df[df['role'] == 'leadership']
administrator_df = df[df['role'] == 'administrator']
architect_df = df[df['role'] == 'architect']
specialist_df = df[df['role'] == 'specialist']

director_df.head()

Unnamed: 0,job_title,role,description
170,"Director, Identity and Access Management",director,Abbott is a global healthcare leader that help...
171,"Director, Business Operations",director,Our Company\n\nChanging the world through digi...
172,"Director, Business Strategy",director,Our Company\n\nChanging the world through digi...
173,Director of Global Information Security - Iden...,director,Job Description Summary:\n\nWe currently have ...
174,Director Information Security – Technology Exa...,director,You Lead the Way. We’ve Got Your Back.\n\nWith...


In [11]:
def concat_description(df):
    # concat description
    desc = ''
    for i in range(len(df)):
        desc += df.iloc[i]['description']
    print('Finished Concatenation')
    return desc

engineer_text = concat_description(engineer_df)
engineer_text_processed = process_description(engineer_text)

analyst_text = concat_description(analyst_df)
analyst_text_processed = process_description(analyst_text)

manager_text = concat_description(manager_df)
manager_text_processed = process_description(manager_text)

developer_text = concat_description(developer_df)
developer_text_processed = process_description(developer_text)

director_text = concat_description(director_df)
director_text_processed = process_description(director_text)

leadership_text = concat_description(leadership_df)
leadership_text_processed = process_description(leadership_text)

administrator_text = concat_description(administrator_df)
administrator_text_processed = process_description(administrator_text)

architect_text = concat_description(architect_df)
architect_text_processed = process_description(architect_text)

specialist_text = concat_description(specialist_df)
specialist_text_processed = process_description(specialist_text)

Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation


## 5. Save and output processed files

In [12]:
# Saving files

text_file = open("../data/project_data/engineer_text.txt", "w")
text_file.write(engineer_text)
text_file.close()
temp = pd.DataFrame(engineer_text_processed)
temp.to_csv('../data/project_data/engineer_text_processed.csv')

text_file = open("../data/project_data/analyst_text.txt", "w")
text_file.write(analyst_text)
text_file.close()
temp = pd.DataFrame(analyst_text_processed)
temp.to_csv('../data/project_data/analyst_text_processed.csv')


text_file = open("../data/project_data/manager_text.txt", "w")
text_file.write(manager_text)
text_file.close()
temp = pd.DataFrame(manager_text_processed)
temp.to_csv('../data/project_data/manager_text_processed.csv')

text_file = open("../data/project_data/developer_text.txt", "w")
text_file.write(developer_text)
text_file.close()
temp = pd.DataFrame(developer_text_processed)
temp.to_csv('../data/project_data/developer_text_processed.csv')

text_file = open("../data/project_data/leadership_text.txt", "w")
text_file.write(leadership_text)
text_file.close()
temp = pd.DataFrame(leadership_text_processed)
temp.to_csv('../data/project_data/leadership_text_processed.csv')

text_file = open("../data/project_data/director_text.txt", "w")
text_file.write(director_text)
text_file.close()
temp = pd.DataFrame(director_text_processed)
temp.to_csv('../data/project_data/director_text_processed.csv')

text_file = open("../data/project_data/administrator_text.txt", "w")
text_file.write(administrator_text)
text_file.close()
temp = pd.DataFrame(administrator_text_processed)
temp.to_csv('../data/project_data/administrator_text_processed.csv')

text_file = open("../data/project_data/architect_text.txt", "w")
text_file.write(architect_text)
text_file.close()
temp = pd.DataFrame(architect_text_processed)
temp.to_csv('../data/project_data/architect_text_processed.csv')

text_file = open("../data/project_data/specialist_text.txt", "w")
text_file.write(specialist_text)
text_file.close()
temp = pd.DataFrame(specialist_text_processed)
temp.to_csv('../data/project_data/specialist_text_processed.csv')


## 6. Word frequency

In [13]:
FreqDist(engineer_text_processed).most_common(30)

[('experience', 1036),
 ('management', 752),
 ('access', 692),
 ('identity', 689),
 ('work', 472),
 ('iam', 448),
 ('security', 446),
 ('solutions', 380),
 ('years', 358),
 ('team', 344),
 ('support', 320),
 ('data', 297),
 ('information', 296),
 ('systems', 292),
 ('skills', 270),
 ('technical', 269),
 ('knowledge', 267),
 ('development', 267),
 ('business', 267),
 ('services', 253),
 ('design', 241),
 ('engineer', 240),
 ('ability', 234),
 ('job', 221),
 ('technology', 218),
 ('including', 217),
 ('position', 205),
 ('software', 204),
 ('working', 203),
 ('requirements', 199)]

In [14]:
FreqDist(analyst_text_processed).most_common(30)

[('experience', 515),
 ('management', 429),
 ('access', 343),
 ('data', 340),
 ('business', 336),
 ('work', 304),
 ('security', 291),
 ('identity', 275),
 ('information', 233),
 ('iam', 208),
 ('years', 208),
 ('requirements', 206),
 ('support', 202),
 ('team', 196),
 ('skills', 168),
 ('including', 165),
 ('systems', 160),
 ('required', 159),
 ('job', 155),
 ('knowledge', 151),
 ('analyst', 149),
 ('solutions', 145),
 ('preferred', 145),
 ('technical', 129),
 ("'s", 125),
 ('analysis', 125),
 ('position', 123),
 ('process', 121),
 ('ability', 119),
 ('application', 117)]

In [15]:
FreqDist(manager_text_processed).most_common(30)

[('experience', 387),
 ('management', 338),
 ('product', 248),
 ('team', 246),
 ('access', 225),
 ('work', 219),
 ('business', 202),
 ('security', 186),
 ('identity', 184),
 ('iam', 178),
 ('solutions', 175),
 ('data', 165),
 ('project', 163),
 ('information', 149),
 ('years', 136),
 ('technical', 123),
 ('status', 122),
 ('services', 119),
 ('including', 115),
 ('new', 111),
 ('role', 110),
 ('teams', 110),
 ('position', 107),
 ('support', 106),
 ('skills', 106),
 ('technology', 105),
 ('company', 104),
 ('across', 103),
 ('ability', 101),
 ('one', 100)]

In [16]:
FreqDist(developer_text_processed).most_common(30)

[('experience', 266),
 ('work', 166),
 ('development', 151),
 ('management', 137),
 ('identity', 127),
 ('business', 118),
 ('team', 117),
 ('data', 110),
 ('access', 107),
 ('skills', 104),
 ('information', 90),
 ('application', 86),
 ('required', 85),
 ('solutions', 83),
 ('requirements', 82),
 ('ability', 81),
 ('years', 80),
 ('applications', 79),
 ('job', 78),
 ('strong', 77),
 ('working', 76),
 ('systems', 74),
 ('design', 71),
 ('services', 71),
 ('status', 70),
 ('knowledge', 68),
 ('software', 67),
 ('developer', 67),
 ('technical', 66),
 ('support', 64)]