In [28]:
'''
This program scrapes the information from the freelancers section of the WORKANA website.
This program utilizes the BeautifulSoup 4 library for web scraping.

Author: Yu-Chang (Andy) Ho
Date: 2018/11/27
'''

'\nThis program scraps the information from the freelancers section of the WORKANA website.\nThis program utilizes the BeautifulSoup 4 library for web scraping.\n\nAuthor: Yu-Chang (Andy) Ho\nDate: 2018/11/27\n'

In [29]:
# import the required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

print( 'This section import the required libaries.' )

This section import the required libaries.


In [30]:
# parameters -------------------------------------------------------
# Element names and CSS classes used to locate each field inside a worker frame.

target_class = 'js-worker'      # class of the <div> frame containing one worker's info.

country_element = 'span'        # the element containing nationality
country_class = 'country-name'
rating_element = 'span'         # the element containing rating (read from its 'title' attribute)
rating_class = 'stars-bg'
pro_element = 'span'            # the element containing pro tag (its presence marks a 'pro' worker)
pro_class = 'pro-label'
hrrate_element = 'span'         # the element containing hr_rate (read from its 'data-amount' attribute)
hrrate_class = 'monetary-amount'
experience_element = 'p'        # the element containing experience info. (completed projects / hours worked)
experience_class = 'hidden-xs'
skill_element = 'a'             # the element containing skills (one element per skill)
skill_class = 'skill'

# csv header (NOTE: reassigned as a list of column names when the data storage is prepared)
header = 'name,country,rating,is_pro,hourly_rate,completed_jobs,hours_worked,skills'
# the website url; the page number is appended to it
base_url = 'https://www.workana.com/en/freelancers?page='
# maximum number of result pages to request; scraping stops earlier at the first page without workers
limit = 3

# ------------------------------------------------------- parameters

print( 'Here are the parameters that able to be modified.' )

Here are the parameters that able to be modified.


In [31]:
# self-defined functions -------------------------------------------
def invalid_val():
    '''Return the placeholder string used for missing or unparsable values.'''
    return 'N/A'
# strip line breaks and tabs from a value
def clear_str( text ):
    '''
    Convert `text` to a string and delete every newline, carriage return
    and tab character from it.

    text: any value; it is passed through str() first.
    Returns the cleaned string.
    '''
    cleaned = str(text)
    for special in ( '\n', '\r', '\t' ):
        cleaned = cleaned.replace( special, '' )
    return cleaned
# swap every ',' for '-' so a value cannot break the comma-separated output
def clear_comma( text ):
    '''
    Convert `text` to a string and replace each comma with a hyphen.

    text: any value; it is passed through str() first.
    Returns the string with ',' replaced by '-'.
    '''
    pieces = str(text).split( ',' )
    return '-'.join( pieces )
# make sure the value is numeric, otherwise return invalid_val()
def numeric( val, type='float' ):
    '''
    Normalise `val` to the string form of a number.

    val:  any value; it is passed through str() first, so None is handled
          like any other non-numeric text.
    type: 'int' to parse with int(); any other value parses with float().
          (The name shadows the builtin but is kept because callers pass
          it as a keyword argument.)
    Returns str() of the parsed number, or invalid_val() when the text
    cannot be parsed.
    '''
    val = str(val) # make sure it is not 'NoneType'
    try:
        converted = int(val) if type == 'int' else float(val)
    # only a failed parse means "not numeric"; anything else (e.g. a
    # KeyboardInterrupt) must not be silently turned into invalid_val()
    except ValueError:
        return invalid_val()
    return str(converted)
# ------------------------------------------- self-defined functions

print( 'Here are some function to clear the data value.' )

Here are some function to clear the data value.


In [32]:
###  prepare the temporary storage

# column names of the result table, in output order
header = [
    'name', 'country', 'rating', 'is_pro',
    'hourly_rate', 'completed_jobs', 'hours_worked', 'skills',
]
# empty frame carrying those columns; it holds the scraped rows
df = pd.DataFrame( columns=header )

print( 'Preparing the data storage.' )

Preparing the data storage.


In [33]:
### start scraping

# seconds to wait for the server before giving up on a page request
request_timeout = 30

def _parse_worker( worker ):
    '''
    Build one result row from a single worker frame.

    worker: the bs4 Tag of one `target_class` <div>.
    Returns a list in the column order name, country, rating, is_pro,
    hourly_rate, completed_jobs, hours_worked, skills.
    A field that is missing or unparsable becomes invalid_val(), except
    name (empty string), is_pro (0) and skills (empty string).
    '''
    # get the name of the applicant: the <span> inside <h3><a>...</a></h3>
    # (if several spans match, the last one wins)
    name = ''
    for span in worker.find_all( 'span' ):
        if( span.parent.name == 'a' and span.parent.parent.name == 'h3' ):
            name = clear_str( span.text )

    # get the nationality of the applicant: text of the link inside the country element
    country = invalid_val()
    country_tag = worker.find( country_element, class_=country_class )
    country_link = country_tag.find( 'a' ) if country_tag is not None else None
    if( country_link is not None ): country = clear_str( country_link.text )

    # rating: the 'title' attribute reads like '<score> of 5.00'
    rating = invalid_val()
    rating_tag = worker.find( rating_element, class_=rating_class )
    if( rating_tag is not None ):
        score = clear_str( str( rating_tag.get( 'title', '' ) ).replace( ' of 5.00', '' ) )
        rating = numeric( score, type='float' )

    # if this applicant is tagged as 'pro'
    is_pro = 1 if worker.find( pro_element, class_=pro_class ) is not None else 0

    # find hourly rate; .get() yields None for a missing attribute, which numeric() maps to invalid_val()
    hourly_rate = invalid_val()
    rate_tag = worker.find( hrrate_element, class_=hrrate_class )
    if( rate_tag is not None ): hourly_rate = numeric( rate_tag.get( 'data-amount' ), type='float' )

    # get the completed_jobs & hours_worked
    projects = hours = invalid_val()
    experience_tag = worker.find( experience_element, class_=experience_class )
    experience_spans = experience_tag.find_all( 'span' ) if experience_tag is not None else []
    for span in experience_spans:
        text = str( span.text )
        # remove the text heading; a span carrying neither label is ignored
        # so it cannot overwrite a value that was already found
        if( 'Completed' in text ):
            projects = numeric( text.replace( 'Completed projects: ', '' ), type='int' )
        elif( 'Hours worked' in text ):
            hours = numeric( text.replace( 'Hours worked in hourly projects: ', '' ), type='int' )

    # get all the skills the applicant has
    skills = []
    for skill_tag in worker.find_all( skill_element, class_=skill_class ):
        # drop non-ASCII characters, then decode back to text: passing the
        # raw bytes on would store the literal "b'...'" representation
        skill_text = skill_tag.text.encode( 'ascii', 'ignore' ).decode( 'ascii' )
        skills.append( clear_comma( skill_text ) )

    return [ name, country, rating, is_pro, hourly_rate, projects, hours, "|".join( skills ) ]

# rows are collected in a plain list and turned into a DataFrame once;
# growing the frame row by row is slow, and DataFrame.append no longer
# exists in pandas 2.x
rows = []

# there are pages to show the result
for i in range( 1, (limit + 1) ):
    url = base_url + str(i)
    # get web code
    r = requests.get( url, timeout=request_timeout )

    # Beautiful Soup parser
    soup = BeautifulSoup( r.text, 'html.parser' )

    workers = soup.find_all( 'div', class_=target_class )
    # if the result is empty
    if( len( workers ) == 0 ): break

    rows.extend( _parse_worker( worker ) for worker in workers )

# commit the result; the frame is rebuilt from scratch, so re-running
# this cell does not duplicate rows
df = pd.DataFrame( rows, columns=df.columns )

# print the result
print( df )

                           name        country rating is_pro hourly_rate  \
0                   Veronica S.      Argentina    5.0      1       580.0   
1                 Julio Henrick         Brazil   4.89      1        60.0   
2                         Sammy         Brazil   4.92      1        60.0   
3                    luisana C.      Venezuela   4.95      1         5.0   
4                     Hector D.      Venezuela   4.91      0         5.0   
5                  Miguel Jaime       Colombia    4.8      1     45000.0   
6          Lucas Gabriel Coelho         Brazil   4.77      1        20.0   
7            Twixt Technologies          India   4.84      1         5.0   
8                     Karina M.         Brazil   4.86      1        20.0   
9                   Ana Martins         Brazil   4.73      1        42.0   
10            Michelle O. C. S.  United States   4.74      1        15.0   
11                  Liz Andreia         Brazil   4.86      1         N/A   
12          