# Job Market Opportunities in Toronto
## For Data Analyst and Data Scientist Positions

__By:__ Jennifer Ma

__Github:__ jma199

Data is a useful asset to a company and has been demonstrated to be key in driving companies' profits. With the ubiquity of the internet and social media, data is now easily available to companies, large and small. As a result, there has also been an increasing demand in positions for people to provide data-driven insights.

As part of my own job search, I wanted to know what the differences are between data analyst and data scientist positions, and what kinds of companies are hiring in the Toronto area.

## Data Collection

Information about available job openings can be found online at job-posting sites such as Indeed.
HTML for each job posting was saved as a text file. 
To parse html data in text files, BeautifulSoup was used.

In [2]:
import os
import codecs
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, Text, insert, select, delete

# The load_data function is used by the main_etl function to load data from a directory
def load_data(directory):
    '''Given a directory, open each text file containing html of a job description and prepare a list of soups.

    Parameters:
        directory: path of the folder holding the saved postings. Only entries
            whose name ends with ".txt" are read; everything else is skipped.

    Returns:
        A list with one BeautifulSoup object per ".txt" file, in sorted
        file-name order. The list is empty when no ".txt" file is found.
    '''
    soupList = []
    # os.listdir returns names in arbitrary order; sorting keeps the order of
    # the resulting records reproducible from one run to the next.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".txt"):
            continue
        # os.path.join builds the path without hard-coding the "/" separator
        with codecs.open(os.path.join(directory, file_name), 'r', "utf-8") as f:
            job_html = f.read()
        # the html is fully read, so parsing can happen after the file is closed
        soupList.append(BeautifulSoup(job_html, "html.parser"))
    print("Data is loaded.")
    return soupList

# The get_job_record is used by the main_etl function to extract job information
# From the loaded text, extract job information using beautiful soup
def get_job_record(job_soup, position_type):
    '''Create a dictionary containing a record of information for one job.

    Each record contains title, company, location, job description, and
    salary (if available). Any field that cannot be located in the html is
    stored as the string "NaN".

    Parameters:
        job_soup: parsed html of one posting; only its find() method is used.
        position_type: "analyst" or "scientist".

    Returns:
        dict with the keys 'jobtitle', 'company', 'location', 'salary',
        'jobdescription' and 'label' (0 for analyst, 1 for scientist).

    Raises:
        ValueError: if position_type is not one of the accepted values.
    '''

    position_types = ['analyst', 'scientist']
    if position_type not in position_types:
        raise ValueError("Invalid position_type. Expected one of: {}.".format(position_types))

    # Title
    # find() returns None when the tag is absent, so the attribute access
    # raises AttributeError; that is the only failure treated as "missing".
    try:
        job_title = job_soup.find('h1').text.strip()
    except AttributeError:
        job_title = "NaN"

    # Company and location are both read relative to the same div,
    # so it is looked up once and reused.
    rating_div = job_soup.find("div", class_="jobsearch-InlineCompanyRating")

    # Company
    try:
        company = rating_div.next_element.next_element.text.strip()
    except AttributeError:
        try:
            company = rating_div.text.strip()
        except AttributeError:
            company = "NaN"

    # Location
    try:
        job_location = rating_div.next_sibling.text.strip()
    except AttributeError:
        try:
            job_location = rating_div.next_sibling.next_sibling.text.strip()
        except AttributeError:
            job_location = 'NaN'

    # Job Description
    try:
        job_description = job_soup.find("div", class_="jobsearch-jobDescriptionText").text.strip().replace('\n', ' ')
    except AttributeError:
        job_description = "NaN"

    # Not all postings have a salary available
    try:
        job_salary = job_soup.find("span", class_="icl-u-xs-mr--xs").text.strip()
    except AttributeError:
        job_salary = "NaN"

    # Compare by value: `is` tests object identity and can be False for an
    # equal string that was built at runtime (e.g. read from a file).
    label = 0 if position_type == "analyst" else 1   # 1 -> datascientist

    job_record = {'jobtitle': job_title,
                  'company': company,
                  'location': job_location,
                  'salary': job_salary,
                  'jobdescription': job_description,
                  'label': label
                 }

    return job_record

def main_etl(directory, position_type):
    '''Takes two arguments: directory and position type ("analyst" or "scientist").
    Load text data, extract pertinent job information, and save data in a sql database.

    The records are inserted into the 'data2' table of the sqlite file
    'joblist.sqlite'; the table is created first if it does not exist yet.
    Progress is reported with print(); nothing is returned.

    Raises:
        ValueError: propagated from get_job_record when position_type is invalid.
    '''
    # Local import: the inspector replaces the deprecated engine.table_names()
    from sqlalchemy import inspect

    soupList = load_data(directory)
    # a list of dictionaries, which can be passed to a single insert as-is
    job_records = [get_job_record(job_soup, position_type) for job_soup in soupList]

    # add job records to sqlite db
    # Create engine: engine
    engine = create_engine('sqlite:///joblist.sqlite')
    metadata = MetaData()

    # Define a new table
    # NOTE(review): 'salary' is declared Integer, yet get_job_record supplies
    # strings (e.g. "NaN"). Left unchanged to keep the existing schema.
    data = Table('data2', metadata,
                 Column('jobtitle', String(100)),
                 Column('company', String(100)),
                 Column('location', String(25)),
                 Column('salary', Integer()),
                 Column('jobdescription', Text()),
                 Column('label', Integer())
                )

    # Create table
    metadata.create_all(engine)
    print(inspect(engine).get_table_names())

    # Nothing to insert: skip the statement rather than executing an INSERT
    # with an empty parameter list.
    if not job_records:
        print("The number of rows added is: ", 0)
        return

    # Insert results into table. engine.begin() commits on success, rolls
    # back on error, and always releases the connection.
    with engine.begin() as connection:
        results = connection.execute(insert(data), job_records)
        # Print resulting rowcount
        print("The number of rows added is: ", results.rowcount)

# add more job records after indeed made change to its code
# used by update_db function
def get_job_record_update(job_soup, position_type):
    '''Create a record of information for one job.

    Same record layout as get_job_record, but the fields are located through
    the "vjs-*" element ids. Any field that cannot be located is stored as
    the string "NaN".

    Parameters:
        job_soup: parsed html of one posting; only its find() method is used.
        position_type: "analyst" or "scientist".

    Returns:
        dict with the keys 'jobtitle', 'company', 'location', 'salary',
        'jobdescription' and 'label' (0 for analyst, 1 for scientist).

    Raises:
        ValueError: if position_type is not one of the accepted values.
    '''

    position_types = ['analyst', 'scientist']
    if position_type not in position_types:
        raise ValueError("Invalid position_type. Expected one of: {}.".format(position_types))

    # Title
    # find() returns None when the element is absent, so the attribute access
    # raises AttributeError; that is the only failure treated as "missing".
    try:
        job_title = job_soup.find("div", id="vjs-jobtitle").text.strip()
    except AttributeError:
        try:
            job_title = job_soup.find("h1", id="vjs-jobtitle").text.strip()
        except AttributeError:
            job_title = "NaN"

    # Company
    try:
        company = job_soup.find("span", id="vjs-cn").text.strip()
    except AttributeError:
        company = "NaN"

    # Location
    try:
        job_location = job_soup.find("span", id="vjs-loc").text.strip().replace("- ", "")
    except AttributeError:
        job_location = "NaN"

    # Job Salary
    # takes the first span that carries none of these three attributes
    try:
        job_salary = job_soup.find("span", attrs = {"id": None, "class": None, "aria-hidden": None}).text.strip()
    except AttributeError:
        job_salary = "NaN"

    # Job Description
    # NOTE(review): "vsj-desc" looks like a typo for "vjs-desc" (every other
    # id here starts with "vjs-"); confirm against the saved html before changing.
    try:
        job_description = job_soup.find("div", id="vsj-desc").text.strip().replace("\n", " ")
    except AttributeError:
        try:
            job_description = job_soup.find("div", id="vjs-content").text.strip().replace("\n", " ")
        except AttributeError:
            # must set job_description itself, otherwise building the record
            # below fails for postings without a description
            job_description = "NaN"

    # Compare by value: `is` tests object identity and can be False for an
    # equal string that was built at runtime (e.g. read from a file).
    label = 0 if position_type == "analyst" else 1   # 1 -> datascientist

    job_record = {'jobtitle': job_title,
                  'company': company,
                  'location': job_location,
                  'salary': job_salary,
                  'jobdescription': job_description,
                  'label': label
                 }
    return job_record


# update database with more entries
def update_db(directory, position_type, db_name):
    '''This function loads text data, extracts pertinent job information, and adds data to an existing sql database.

    Parameters:
        directory: folder holding the saved postings (".txt" files).
        position_type: "analyst" or "scientist".
        db_name: database URL handed to create_engine,
            e.g. 'sqlite:///joblist.sqlite'.

    The records are appended to the existing 'data2' table, whose columns are
    reflected from the database. Progress is reported with print(); nothing
    is returned.

    Raises:
        ValueError: propagated from get_job_record_update when position_type
            is invalid.
    '''

    soupList = load_data(directory)

    # a list of dictionaries, which can be passed to a single insert as-is
    job_records = [get_job_record_update(job_soup, position_type) for job_soup in soupList]

    print("Finished extracting information for job_records list.")

    # add job records to existing sqlite db
    # Create engine: engine
    engine = create_engine(db_name)
    metadata = MetaData()

    # Reflect table. Passing autoload_with is enough to trigger reflection;
    # the separate autoload=True flag is deprecated.
    data = Table('data2', metadata, autoload_with=engine)

    # Nothing to insert: skip the statement rather than executing an INSERT
    # with an empty parameter list.
    if not job_records:
        print("The number of rows added is: ", 0)
        return

    # Execute the insert inside engine.begin(), which commits on success,
    # rolls back on error, and always releases the connection.
    with engine.begin() as connection:
        results = connection.execute(insert(data), job_records)
        # Print result rowcount
        print("The number of rows added is: ", results.rowcount)

  if position_type is "analyst":
  if position_type is "analyst":


Run the `main_etl` function for both data analyst and data scientist postings.

In [8]:
# add data from folder "Data Analyst"
# Every posting in this folder is recorded with label 0 (analyst).
# NOTE: main_etl has no return statement, so dataAnalyst is bound to None;
# the call is made for its side effect of filling the database.
dataAnalyst = main_etl("Data Analyst", "analyst")

Data is loaded.
['data', 'data2']
The number of rows added is:  450


In [9]:
# add data from folder "Data Scientist"
# Every posting in this folder is recorded with label 1 (scientist).
# NOTE: main_etl has no return statement, so dataScientist is bound to None;
# the call is made for its side effect of filling the database.
dataScientist = main_etl("Data Scientist", "scientist")

Data is loaded.
['data', 'data2']
The number of rows added is:  175


In [10]:
# update database with more data scientist postings
# These files are parsed with get_job_record_update (the "vjs-*" element ids)
# and appended to the existing 'data2' table.
# NOTE: update_db has no return statement, so dataScientist is rebound to None.
dataScientist = update_db('Data Scientist update', 'scientist', db_name = 'sqlite:///joblist.sqlite')

Data is loaded.
Finished extracting information for job_records list.
The number of rows added is:  75


In [3]:
# Error check: 'engineer' is not an accepted position_type, so the call is
# expected to stop with a ValueError once the files have been loaded.
dataAnalyst_errorcheck = main_etl("Data/Data Analyst", 'engineer')

Data is loaded.


ValueError: Invalid position_type. Expected one of: ['analyst', 'scientist'].