In [1]:
from bs4 import BeautifulSoup #for HTML parsing
from selenium import webdriver #to scrape javascript code of page, 'requests' fails to scrap required data

In [2]:
# Accumulators filled by the scraping loop:
#   skill_list    - tagged skills found in job postings
#   job_type_list - job type of the posting (Hourly or Fixed-Price)
#   job_tier_list - job tier of the posting (Entry-Level, Intermediate or Expert)
skill_list, job_type_list, job_tier_list = [], [], []
# Number of postings that carried at least one tagged skill
jobs_with_tagged_skills = 0

In [3]:
# Scrape the first 80 result pages, where each page has 10 postings.
# For every tagged skill of a posting, one entry is appended to each of skill_list,
# job_type_list and job_tier_list, so the three lists stay index-aligned.
# NOTE: this cell appends to lists created in an earlier cell; re-run that cell first
# when re-running this one, otherwise postings are collected twice.
for i in range(1,81):
    upwork_url = "https://www.upwork.com/o/jobs/browse/c/data-science-analytics/?page=" + str(i) #website url
    browser = webdriver.Safari()  # object for Safari browser (one fresh session per page)
    try:
        browser.get(upwork_url)       # navigate to the page
        inhtml = browser.execute_script("return document.body.innerHTML") # extract inner HTML of the page as str
    finally:
        # Always close the browser session, even when loading the page raised;
        # the HTML is already a plain str at this point, so parsing does not need the browser.
        browser.quit()
    soup = BeautifulSoup(inhtml, 'html.parser') # parse with BeautifulSoup

    # Each job listing is posted under HTML section with class 'job-tile'.
    job_tile_soup = soup.select("div[id='jobs-list'] section[class='job-tile']")

    # After selecting all HTML sections (class='job-tile'), traverse through each 'section' extracting tagged
    # skills, job type and job tier. Ignore 'section' under which no skills are tagged.
    for job_tile in job_tile_soup:
        # scrape, clean and store 'tagged skills'
        temp_skill_list = [x.text.strip() for x in job_tile.select("span[class='js-skills skills'] a")]
        if not temp_skill_list:
            continue
        skill_list.extend(temp_skill_list)
        jobs_with_tagged_skills += 1

        # scrape, clean and store 'job type'.
        # Exactly one value per skill is stored: the first match, or None when the posting
        # has no job-type element. (Multiplying the raw match list would add 0 or several
        # entries per skill and break the alignment of the three lists.)
        temp_job_type_list = [x.text.strip() for x in job_tile.select("strong[class='js-type']")]
        job_type = temp_job_type_list[0] if temp_job_type_list else None
        job_type_list.extend([job_type] * len(temp_skill_list))

        # scrape, clean and store 'job tier' (first word of the tier text).
        # A missing element or an element with empty text yields None instead of raising IndexError.
        temp_job_tier_words = [x.text.split() for x in job_tile.select("span[class='js-contractor-tier']")]
        if temp_job_tier_words and temp_job_tier_words[0]:
            job_tier = temp_job_tier_words[0][0]
        else:
            job_tier = None
        job_tier_list.extend([job_tier] * len(temp_skill_list))

In [4]:
# Preview the first (up to) 10 scraped rows as tab-separated skill / job type / job tier.
# Slicing plus zip stops early when fewer than 10 rows were scraped, instead of
# raising IndexError as positional indexing over range(10) would.
for skill, job_type, job_tier in zip(skill_list[:10], job_type_list[:10], job_tier_list[:10]):
    print("{}\t{}\t{}".format(skill, job_type, job_tier))

Data Scraping	Fixed-Price	Intermediate
Web Scraping	Fixed-Price	Intermediate
Machine Learning	Fixed-Price	Expert
Appforfinance Financial Statement Analysis	Fixed-Price	Intermediate
Business Analysis	Fixed-Price	Intermediate
Business Intelligence	Fixed-Price	Intermediate
Data Analytics	Fixed-Price	Intermediate
Excel VBA	Fixed-Price	Intermediate
Microsoft Excel	Fixed-Price	Intermediate
Microsoft Power BI	Fixed-Price	Intermediate


In [5]:
# Sizes of the three scraped lists, in the order (skills, job types, job tiers)
tuple(map(len, (skill_list, job_type_list, job_tier_list)))

(2187, 2187, 2187)

In [6]:
# Number of postings that had at least one tagged skill (postings without tagged skills are skipped by the scraping loop)
jobs_with_tagged_skills

611

In [8]:
import numpy as np
import pandas as pd

In [9]:
# Create a dataframe of scraped data (one row per tagged skill).
# `columns` pins the column order explicitly: for a dict-built frame the default order
# depends on the pandas/Python version (sorted keys vs. insertion order), which would
# change the layout of the saved CSV between environments.
skill_frame = pd.DataFrame(
    {'Skill': skill_list, 'Job Type': job_type_list, 'Job Tier': job_tier_list},
    columns=['Job Tier', 'Job Type', 'Skill']
)

In [10]:
# Preview the first 10 rows of the scraped data
skill_frame.head(10)

Unnamed: 0,Job Tier,Job Type,Skill
0,Intermediate,Fixed-Price,Data Scraping
1,Intermediate,Fixed-Price,Web Scraping
2,Expert,Fixed-Price,Machine Learning
3,Intermediate,Fixed-Price,Appforfinance Financial Statement Analysis
4,Intermediate,Fixed-Price,Business Analysis
5,Intermediate,Fixed-Price,Business Intelligence
6,Intermediate,Fixed-Price,Data Analytics
7,Intermediate,Fixed-Price,Excel VBA
8,Intermediate,Fixed-Price,Microsoft Excel
9,Intermediate,Fixed-Price,Microsoft Power BI


In [11]:
# Save the dataframe into a CSV file (relative path, so it lands in the current working directory).
# No `index` argument is passed, so pandas' default applies and the row index is written as the first column.
skill_frame.to_csv('job_postings_data.csv')

In [12]:
# Reload the saved CSV; index_col=0 uses the first column of the file as the row index
df = pd.read_csv('job_postings_data.csv', index_col=0)

In [13]:
# Show the first rows of the reloaded data to check the CSV round trip
df.head()

Unnamed: 0,Job Tier,Job Type,Skill
0,Intermediate,Fixed-Price,Data Scraping
1,Intermediate,Fixed-Price,Web Scraping
2,Expert,Fixed-Price,Machine Learning
3,Intermediate,Fixed-Price,Appforfinance Financial Statement Analysis
4,Intermediate,Fixed-Price,Business Analysis
