<h1>Job Market Trends</h1>
<h2>Extract, Transform, and Load Data</h2>

Data Analyst vs Data Scientist job

In [None]:
import os
import codecs
from bs4 import BeautifulSoup
import csv

<h2>Part 1: Access data files within a Directory</h2>

The job postings are stored as files within a directory, so we will create a function to iterate through files in a directory to be able to open each one.

In [None]:
# first check that we are in the correct directory
print(os.getcwd())

In [None]:
# print a list of the files in the working directory
!ls

In [None]:
directory = "Data Analyst Feb 16"
fileList = []

# Iterate through each file in directory and make a list of each filename
for file in os.listdir(directory):
        if file.endswith(".txt"):
            # add each filename to list
            fileList.append(file)
            print(fileList)

In [None]:
# Check that fileList was populated. Sort list.
fileList_sorted = sorted(fileList)
print(fileList_sorted)

In [None]:
def get_raw_data(directory):
    '''Open file containing html of job description and prepare soup object.'''
    fileList = []
    soupList = []
    # Iterate through each file in directory
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            # add each filename to list
            fileList.append(file)
            print(fileList)
            # open and load html
            with codecs.open(directory + "/"+ file, 'r', "utf-8") as f:
                job_html = f.read()
                job_soup = BeautifulSoup(job_html, "html.parser")
                soupList.append(job_soup)
    return soupList

In [None]:
soupList = get_raw_data("test_folder")
#print(soupList)

In [None]:
# Check to make sure all items are in list
len(soupList)

In [None]:
#soupList[1] prints the second item in the soupList list

Great. We are able to open each of the .txt files that are in our directory of interest.

<h2>Part 2 : Opening and extracting information from files</h2>

First, we will use two test files to test to make sure we can pull out the information we want. This is because some companies have ratings available and some do not. This changes the html code slightly and caused some problems. Below is the result from one of the two test files.

"Data Analyst Feb 16/Untitled 14-22-33.txt"

"Data Analyst Feb 16/Untitled 14-36-26.txt" -- TROX

14-41-46 -- KILLI

14-45-32 -- Citi worked

14-25-49 -- CIBC worked

14-19-29 -- TalentSphere, all worked incl salary

In [92]:
with codecs.open("Data Analyst Feb 16/Untitled 14-19-29.txt", 'r', "utf-8") as f:
    job_html = f.read()
job_soup = BeautifulSoup(job_html, "html.parser")

#print(job_soup)

In [93]:
job_title = job_soup.find("h1").text.strip()
print(job_title)

Financial Data Analyst


In [94]:
#company = job_soup.find("div", class_="jobsearch-CompanyReview--heading").text.strip()
#company = job_soup.find("div", class_="jobsearch-DesktopStickyContainer").next_element.text
company = job_soup.find("div", class_="jobsearch-InlineCompanyRating").next_element.text
print(company)

AttributeError: 'NavigableString' object has no attribute 'text'

The above code was good for only some of the job listings (many of which seem to have a hyperlink).
Try to find another way to extract company information from the job descriptions where NaN appeared.

In [95]:
# Sometimes includes the number of reviews
company = job_soup.find("div", class_="jobsearch-InlineCompanyRating").text
print(company)



TalentSphere Staffing Solutions


14 reviews


In [96]:
try:
    company = job_soup.find("div", class_="jobsearch-InlineCompanyRating").next_element.text.strip()
    print('try 1: ', company)
except:
    pass

try:
    company = job_soup.find("div", class_="jobsearch-CompanyReview--heading").text.strip()
    print('try 2: ', company)
except:
    pass

try:
    company = job_soup.find("div", class_="jobsearch-InlineCompanyRating").next_element.next_element.text.strip()
    print('try 3: ', company)
except:
    pass

# some times contains number of reviews
try:
    job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").text
    print('try 4: ', company)
except:
    pass


try 2:  TalentSphere Staffing Solutions
try 3:  TalentSphere Staffing Solutions
try 4:  TalentSphere Staffing Solutions


In [97]:
# using .next_element give AttributeError: 'NavigableString' object has no attribute 'text'
# job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_element.text

for sibling in job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_siblings:
    print(repr(sibling))

<div>Toronto, ON</div>


In [98]:
job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_sibling.next_sibling.text.strip()
print(job_location)

AttributeError: 'NoneType' object has no attribute 'text'

In [100]:
# Use try-except blocks differently

try:
    job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_sibling.text.strip()
    print('location try')
    print(job_location)
except:
    try:
        job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_sibling.next_sibling.text.strip()
        print('location except -- ', job_location)
    except:
        job_location = 'NaN'

location try
Toronto, ON


In [None]:
try:
    job_salary = job_soup.find("span", class_="icl-u-xs-mr--xs").text.strip()
except AttributeError:
    job_salary = "NaN"
print(job_salary)

In [None]:
job_summary = job_soup.find("div", class_="jobsearch-jobDescriptionText").text.strip()
print(job_summary)

In [None]:
job_record = (job_title, company, job_location, job_salary, job_summary)
print(job_record)

<h2>Part 3 : Put it all together</h2>

Put all the steps together so that we can easily extract job information from each text file and keep a record of which files we have opened.

In [101]:
# Works!
import os
import codecs
from bs4 import BeautifulSoup
import csv

def get_raw_data(directory):
    '''Open file containing html of job description and prepare soup object.'''
    fileList = []
    soupList = []
    # Iterate through each file in directory
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            # add each filename to list
            fileList.append(file)
            #print(fileList)
            # open and load html
            with codecs.open(directory + "/"+ file, 'r', "utf-8") as f:
                job_html = f.read()
                job_soup = BeautifulSoup(job_html, "html.parser")
                soupList.append(job_soup)
    print("soup_list is done.")
    return soupList

# From the loaded text, extract job information using beautiful soup
def get_job_record(job_soup):
    '''Create a record of information for one job.'''
    # Title
    try:
        job_title = job_soup.find('h1').text.strip()
    except:
        job_title = "NaN"
    
    # Company
    try:
        company = job_soup.find("div", class_="jobsearch-InlineCompanyRating").next_element.next_element.text.strip()
    except:    
        try:
            company = job_soup.find("div", class_="jobsearch-InlineCompanyRating").text.strip()
        except:
            company = "NaN"

    # Location
    try:
        job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_sibling.text.strip()
    except:
        try:
            job_location = job_soup.find("div", class_ = "jobsearch-InlineCompanyRating").next_sibling.next_sibling.text.strip()
        except:
            job_location = 'NaN'

    # Job Summary
    try:
        job_summary = job_soup.find("div", class_="jobsearch-jobDescriptionText").text.strip()
    except:
        job_summary = "NaN"

    # Not all postings have a salary available
    try:
        job_salary = job_soup.find("span", class_="icl-u-xs-mr--xs").text.strip()
    except AttributeError:
        job_salary = "NaN"
    
    job_record = (job_title, company, job_location, job_salary, job_summary)
    return job_record

def main_etl(directory):
    '''This function loads text data, extracts pertinent job information, and saves data in a csv file.'''
    #while True:
    soupList = get_raw_data(directory)
        
        # add each job record to a list
    job_records = []
    for job_soup in soupList:
        job_record = get_job_record(job_soup)
        job_records.append(job_record)
        print("Added to job_records list. Length of job_records is: ", len(job_records))

    # add job records to csv by row
    with open('results.csv', 'w', newline = '', encoding = 'utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Job Title', 'Company', 'Location', 'Salary', 'Job Description'])
        writer.writerows(job_records)

Let's test out the functionality on another folder containing files with job description in html format.

In [None]:
print(os.getcwd())

In [None]:
!ls

In [103]:
dataAnalyst = main_etl("Data Analyst Feb 16")

soup_list is done.
Added to job_records list. Length of job_records is:  1
Added to job_records list. Length of job_records is:  2
Added to job_records list. Length of job_records is:  3
Added to job_records list. Length of job_records is:  4
Added to job_records list. Length of job_records is:  5
Added to job_records list. Length of job_records is:  6
Added to job_records list. Length of job_records is:  7
Added to job_records list. Length of job_records is:  8
Added to job_records list. Length of job_records is:  9
Added to job_records list. Length of job_records is:  10
Added to job_records list. Length of job_records is:  11
Added to job_records list. Length of job_records is:  12
Added to job_records list. Length of job_records is:  13
Added to job_records list. Length of job_records is:  14
Added to job_records list. Length of job_records is:  15
Added to job_records list. Length of job_records is:  16
Added to job_records list. Length of job_records is:  17
Added to job_records 

In [None]:
!ls