# Inspect Data

## 1. Import packages and data

In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist, sent_tokenize
import string
import os
for dirname, _, filenames in os.walk('dissertation/data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

df = pd.read_csv("../project_data/data-jobs-20221123.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job_title        453 non-null    object 
 1   role             453 non-null    object 
 2   description      453 non-null    object 
 3   knowledge        0 non-null      float64
 4   skills           0 non-null      float64
 5   tasks            0 non-null      float64
 6   education_level  0 non-null      float64
 7   experience       0 non-null      float64
 8   certifications   0 non-null      float64
dtypes: float64(6), object(3)
memory usage: 32.0+ KB


## 2. Process data

In [3]:
print(df.iloc[0])
text = df['description'][0]
text

job_title           Identity and Access Management Administrator
role                                               administrator
description    Job Summary\nThe Identity and Access Managemen...
Name: 0, dtype: object




In [17]:
def process_description(description):
    tokens = nltk.word_tokenize(description)
    #tokens = nltk.sent_tokenize(description)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    stopwords_punc_and_numbers_removed = [word for word in stopwords_removed if word.isalpha()]
    return stopwords_removed  

#Tokenize words
tokens = nltk.word_tokenize(text)

#Crate and remove stop words
stopwords_list = stopwords.words('english') + list(string.punctuation) + ['and','’']


## 4. Combine roles into new dataframes and concatenate descriptions

In [18]:
df['role'].value_counts()

engineer         147
analyst           88
manager           59
developer         42
director          41
leadership        22
administrator     20
architect         20
specialist        14
Name: role, dtype: int64

In [7]:

engineer_df = df[df['role'] == 'engineer']
analyst_df = df[df['role'] == 'analyst']
manager_df = df[df['role'] == 'manager']
developer_df = df[df['role'] == 'developer']
leadership_df = df[df['role'] == 'leadership']
administrator_df = df[df['role'] == 'administrator']
architect_df = df[df['role'] == 'architect']
specialist_df = df[df['role'] == 'specialist']

director_df.head()

Unnamed: 0,job_title,role,description
170,"Director, Identity and Access Management",director,Abbott is a global healthcare leader that help...
171,"Director, Business Operations",director,Our Company\n\nChanging the world through digi...
172,"Director, Business Strategy",director,Our Company\n\nChanging the world through digi...
173,Director of Global Information Security - Iden...,director,Job Description Summary:\n\nWe currently have ...
174,Director Information Security – Technology Exa...,director,You Lead the Way. We’ve Got Your Back.\n\nWith...


In [8]:
def concat_description(df):
    # concat description
    desc = ''
    for i in range(len(df)):
        desc += df.iloc[i]['description']
    print('Finished Concatenation')
    return desc

engineer_text = concat_description(engineer_df)
engineer_text_processed = process_description(engineer_text)

analyst_text = concat_description(analyst_df)
analyst_text_processed = process_description(analyst_text)

manager_text = concat_description(manager_df)
manager_text_processed = process_description(manager_text)

developer_text = concat_description(developer_df)
developer_text_processed = process_description(developer_text)

leadership_text = concat_description(leadership_df)
leadership_text_processed = process_description(leadership_text)

administrator_text = concat_description(administrator_df)
administrator_text_processed = process_description(administrator_text)

architect_text = concat_description(architect_df)
architect_text_processed = process_description(architect_text)

specialist_text = concat_description(specialist_df)
specialist_text_processed = process_description(specialist_text)

Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation
Finished Concatenation


## 5. Save and output processed files

In [12]:
# Saving files

text_file = open("../project_data/engineer_text.txt", "w")
text_file.write(engineer_text)
text_file.close()
temp = pd.DataFrame(engineer_text_processed)
temp.to_csv('../project_data/engineer_text_processed.csv')

text_file = open("../project_data/analyst_text.txt", "w")
text_file.write(analyst_text)
text_file.close()
temp = pd.DataFrame(analyst_text_processed)
temp.to_csv('../project_data/analyst_text_processed.csv')


text_file = open("../project_data/manager_text.txt", "w")
text_file.write(manager_text)
text_file.close()
temp = pd.DataFrame(manager_text_processed)
temp.to_csv('../project_data/manager_text_processed.csv')

text_file = open("../project_data/developer_text.txt", "w")
text_file.write(developer_text)
text_file.close()
temp = pd.DataFrame(developer_text_processed)
temp.to_csv('../project_data/developer_text_processed.csv')

text_file = open("../project_data/leadership_text.txt", "w")
text_file.write(leadership_text)
text_file.close()
temp = pd.DataFrame(leadership_text_processed)
temp.to_csv('../project_data/leadership_text_processed.csv')

text_file = open("../project_data/administrator_text.txt", "w")
text_file.write(administrator_text)
text_file.close()
temp = pd.DataFrame(administrator_text_processed)
temp.to_csv('../project_data/administrator_text_processed.csv')

text_file = open("../project_data/architect_text.txt", "w")
text_file.write(architect_text)
text_file.close()
temp = pd.DataFrame(architect_text_processed)
temp.to_csv('../project_data/architect_text_processed.csv')

text_file = open("../project_data/specialist_text.txt", "w")
text_file.write(specialist_text)
text_file.close()
temp = pd.DataFrame(specialist_text_processed)
temp.to_csv('../project_data/specialist_text_processed.csv')


## 6. Word frequency

In [13]:
FreqDist(engineer_text_processed).most_common(30)

[('at the home depot, cybersecurity consists of architecture, governance, identity & access management, internal threat operations, issue and compliance management, risk assessment/advisory, security consulting, security operations, service optimization and strategic planning.',
  8),
 ('on rare occasions there may be a need to move or lift light articles.', 8),
 ('any unpleasant conditions would be infrequent and not objectionable.', 8),
 ('must be legally permitted to work in the united states.', 8),
 ('manager\nthis position has 0 direct reports\ntravel requirements:\nno travel required.',
  5),
 ('physical requirements:\nmost of the time is spent sitting in a comfortable position and there is frequent opportunity to move about.',
  5),
 ('working conditions:\nlocated in a comfortable indoor area.', 5),
 ('minimum qualifications:\nmust be eighteen years of age or older.', 5),
 ('amazon is an equal opportunity employer and does not discriminate on the basis of race, national origin, 

In [14]:
FreqDist(analyst_text_processed).most_common(30)

[('advanced knowledge of microsoft applications, including excel and access preferred.',
  7),
 ('all qualified applicants will receive consideration for employment without regard to sex, gender identity, sexual orientation, race, color, religion, national origin, disability, protected veteran status, age, or any other characteristic protected by law.',
  5),
 ('centene is transforming the health of our communities, one person at a time.',
  4),
 ('project management experience preferred.', 4),
 ('0-2 years of business process analysis (i.e.', 4),
 ('our comprehensive benefits package: flexible work solutions including remote options, hybrid work schedules and dress flexibility, competitive pay, paid time off including holidays, health insurance coverage for you and your dependents, 401(k) and stock purchase plans, tuition reimbursement and best-in-class training and development.',
  4),
 ('centene is an equal opportunity employer that is committed to diversity, and values the ways in 

In [15]:
FreqDist(manager_text_processed).most_common(30)

[('we collaborate internationally to deliver the services and solutions that help everyone to be more productive and enable innovation.',
  6),
 ('act as the primary contact between the scrum teams and stakeholders.', 6),
 ('the minimum and maximum full-time annual salaries for this role are listed below, by location.',
  4),
 ('please note that this salary information is solely for candidates hired to perform work within one of these locations, and refers to the amount capital one is willing to pay at the time of this posting.',
  4),
 ('salaries for part-time roles will be prorated based upon the agreed upon number of hours to be regularly worked.',
  4),
 ('no agencies please.', 4),
 ('capital one is an equal opportunity employer committed to diversity and inclusion in the workplace.',
  4),
 ('all qualified applicants will receive consideration for employment without regard to sex, race, color, age, national origin, religion, physical and mental disability, genetic information, mar

In [16]:
FreqDist(developer_text_processed).most_common(30)

[('applicants must be authorized to work in the u.s.\n\n\ncybercoders, inc is proud to be an equal opportunity employer\n\nall qualified applicants will receive consideration for employment without regard to race, color, religion, sex, national origin, disability, protected veteran status, or any other characteristic protected by law.',
  3),
 ('your right to work – in compliance with federal law, all persons hired will be required to verify identity and eligibility to work in the united states and to complete the required employment eligibility verification document form upon hire.',
  3),
 ('we are a cutting edge company that creates data management and storage solutions to address the issue of long-term digital preservation for businesses facing exponential data growth.',
  2),
 ('the adoption of its solutions by leaders in numerous industries around the world shows our product and customer focus, which has been dedicated completely to storage innovation for 40 years.',
  2),
 ('by 