<h1>Job Market Trends</h1>
<h2>Extract, Transform, and Load Data</h2>

Add Data Scientist job postings to a SQLite database.

<h2>Part 1: Access data files within a Directory</h2>

<h2>Part 2: Open and extract information from files</h2>

<h2>Part 3: Put it all together</h2>

Put all the steps together so that we can easily extract job information from each text file and keep a record of which files we have opened.

In [1]:
import os
import codecs
from bs4 import BeautifulSoup
import csv

from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, Text, insert, select, delete

def get_raw_data(directory):
    '''Parse every ``.txt`` file in ``directory`` into a BeautifulSoup object.

    Args:
        directory: Path of the folder to scan. Only entries whose name ends
            with ``.txt`` are read; each is decoded as UTF-8 and parsed with
            the ``html.parser`` backend.

    Returns:
        A list of BeautifulSoup objects, one per ``.txt`` file, ordered by
        file name. The list is empty when the folder has no ``.txt`` files.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    '''
    soupList = []
    # os.listdir() returns names in arbitrary order; sorting makes the order
    # of the resulting records reproducible from run to run.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".txt"):
            continue
        # open and load html
        with codecs.open(os.path.join(directory, file_name), 'r', "utf-8") as f:
            job_html = f.read()
        # Parse after the file is closed so the handle is held only for the read.
        soupList.append(BeautifulSoup(job_html, "html.parser"))
    print("soup_list is done.")
    return soupList

# From the loaded text, extract job information using beautiful soup

# Placeholder stored for any field that cannot be found in a posting.
_MISSING = "NaN"


def _first_text(*getters):
    '''Return the stripped ``.text`` of the first getter that resolves, else "NaN".

    Each getter is a zero-argument callable that navigates to a node. A
    missing node shows up as an AttributeError (an attribute read on None, or
    on an object without that attribute), in which case the next getter is
    tried. Any other exception is allowed to propagate.
    '''
    for getter in getters:
        try:
            return getter().text.strip()
        except AttributeError:
            continue
    return _MISSING


def get_job_record(job_soup):
    '''Create a record of information for one job.

    Args:
        job_soup: Parsed posting; must provide ``find(name, class_=...)``
            in the style of a BeautifulSoup object.

    Returns:
        A dict with the keys ``jobtitle``, ``company``, ``location``,
        ``salary``, ``jobdescription`` and ``label``. Text fields that cannot
        be located are set to the string "NaN"; ``label`` is always 1.
    '''
    def rating():
        # Both company and location are located relative to this element.
        return job_soup.find("div", class_="jobsearch-InlineCompanyRating")

    # Title
    job_title = _first_text(lambda: job_soup.find('h1'))

    # Company: prefer the element two steps after the rating div, otherwise
    # fall back to the text of the rating div itself.
    company = _first_text(
        lambda: rating().next_element.next_element,
        rating,
    )

    # Location: the rating div's next sibling, or the sibling after that when
    # the first one has no usable text attribute.
    job_location = _first_text(
        lambda: rating().next_sibling,
        lambda: rating().next_sibling.next_sibling,
    )

    # Job Description
    job_description = _first_text(
        lambda: job_soup.find("div", class_="jobsearch-jobDescriptionText")
    )

    # Not all postings have a salary available
    job_salary = _first_text(
        lambda: job_soup.find("span", class_="icl-u-xs-mr--xs")
    )

    return {
        'jobtitle': job_title,
        'company': company,
        'location': job_location,
        'salary': job_salary,
        'jobdescription': job_description,
        'label': 1,
    }

def main_etl(directory):
    '''Load job postings from ``directory`` and store them in a SQLite table.

    Steps: parse every posting with ``get_raw_data``, turn each one into a
    record with ``get_job_record``, make sure the ``data`` table exists in
    ``joblist.sqlite`` (relative to the current working directory), and
    insert all records in one transaction.

    Args:
        directory: Folder containing the job-posting ``.txt`` files.

    Returns:
        None. Progress and the number of inserted rows are printed.
    '''
    # Imported locally so this function does not depend on the file-level
    # import list; aliased to avoid confusion with the stdlib ``inspect``.
    from sqlalchemy import inspect as sa_inspect

    soupList = get_raw_data(directory)

    # add each job record to a list
    # this will create a list of dictionaries, making it easy to insert into a sql table
    job_records = []
    for job_soup in soupList:
        job_records.append(get_job_record(job_soup))
        print("Added to job_records list. Length of job_records is: ", len(job_records))

    # add job records to sqlite db
    engine = create_engine('sqlite:///joblist.sqlite')
    metadata = MetaData()

    # Define a new table
    data = Table('data', metadata,
                 Column('jobtitle', String(100)),
                 Column('company', String(100)),
                 Column('location', String(25)),
                 # Salary is scraped as free text (or the "NaN" placeholder),
                 # so it is declared as a string rather than an integer.
                 Column('salary', String(100)),
                 Column('jobdescription', Text()),
                 Column('label', Integer())
                )

    # Create table (no-op when it already exists)
    metadata.create_all(engine)

    # Print table details; Engine.table_names() is gone in SQLAlchemy 2.0,
    # the inspector works on both old and new versions.
    print(sa_inspect(engine).get_table_names())

    if not job_records:
        # Nothing to insert; executing an INSERT with an empty parameter list
        # is not a meaningful bulk insert, so skip it.
        print("The number of rows added is: ", 0)
        return

    # engine.begin() commits on success, rolls back on error, and always
    # releases the connection.
    with engine.begin() as connection:
        results = connection.execute(insert(data), job_records)
        # Read rowcount while the transaction/cursor is still open.
        rows_added = results.rowcount

    # Print result rowcount
    print("The number of rows added is: ", rows_added)

In [2]:
# Show the current working directory; the data folder and the SQLite file
# are both addressed with relative paths in this notebook.
print(os.getcwd())
!ls

/Users/jennifer/nlp-jobmarket
[1m[36mData Analyst[m[m
[1m[36mData Scientist[m[m
README.md
default-a326eba2-8d86-4f82-add7-16670adff2d4.ipynb
joblist.sqlite
[31mmain_etl_analyst.ipynb[m[m
[31mmain_etl_analyst_sql.ipynb[m[m
[31mmain_etl_scientist_sql.ipynb[m[m
main_etl_scientist_sql.py
[31mmain_jobdesc_eda.ipynb[m[m
[31mmain_jobdesc_preproc copy.ipynb[m[m
[31mmain_jobdesc_preproc.ipynb[m[m
results.csv
[1m[36mtest_folder[m[m
[30m[43mtest_folder2[m[m


In [3]:
# Run the full ETL on the "Data Scientist" folder. main_etl() has no return
# statement, so dataScientist is bound to None; the call is made for its
# side effects (printed progress and rows written to joblist.sqlite).
dataScientist = main_etl("Data Scientist")

soup_list is done.


NameError: name 'job_description' is not defined

In [None]:
!ls