# WORKANA Freelaners Scraping

## Lib. usage: Selenium

### Target website: [https://www.workana.com/freelancers](https://www.workana.com/freelancers)

This program scraps the information from the freelancers section of the WORKANA website.
This program utilizes the Selenium library for web scraping, which enables the program to change conditions or
filters the website provided to show different result

* Author: Yu-Chang (Andy) Ho
* Date: 2018/11/27
* Latest Update: 2018/12/04

### This section import the required libaries.

In [16]:
# import the required libraries
from selenium import webdriver
# the library to control program exection at run time
import time
import pandas as pd
import numpy as np

### Here are the parameters that able to be modified.

In [17]:
# parameter settings --------------------------------------------------

# the keyword which to perform a search
# as an example, we are scraping the freelancers who is familiar with SQL language
KEY_search = 'SQL'

# the target URL
URL_target = 'https://www.workana.com/freelancers'

# time to wait for the page to refresh
# to make sure the changes applied to the website and allow the page to reload
TIME_pending = 2

# the search results will be seperated into multiple subpages
# this is a large number to cover all the page number
limit = 3

# -------------------------------------------------- parameter settings

### Here are some function to clear the data value.

In [18]:
# self-defined functions -------------------------------------------

# return the string 'N/A'
def invalid_val(): return 'N/A'

# make sure there is no special char in a value
def clear_str( text ):
    text = str(text).replace( '\n', '' ).replace( '\r', '' ).replace( '\t', '' )
    return text

# replace ',' in the value
def clear_comma( text ):
    return str(text).replace( ',', '-' )

# make sure the value is numeric, otherwise return invalid_val()
def numeric( val, type='float' ):
    val = str(val) # make sure it is not 'None'
    try:
        # make sure the value is numeric
        if( type == 'int' ): val = int(val)
        else: val = float(val)
        return str(val)
    except: return invalid_val()

# dealing with special char
def parse_str( text ):
    text = text.encode( 'utf-8' ).decode( 'latin1' )
    return text
# ------------------------------------------- self-defined functions

### Loading the chrome driver executable file
This will open up a blank Chrome browser window.

In [19]:
### load the chrome driver executable

# path to the driver executable

### Windows
PATH_chrome_driver = './chrome-driver/chromedriver.exe'
### macOS
#PATH_chrome_driver = './chrome-driver/chromedriver'

# load the driver
driver = webdriver.Chrome( PATH_chrome_driver )

### Preparing the data storage

In [20]:
###  prepare the temporary storage

# header
header = ['name', 'country', 'rating', 'is_pro', 'hourly_rate', 'completed_jobs', 'hours_worked', 'skills']
# dataframe to contain the data
df = pd.DataFrame( columns=header )

### Scraping

In [21]:
### function to scrap the content from the page
def scrap():
    global df
    res = None
    try:
        # test if the list of freelancers exist
        # the element "#workers > div.js-worker" contains the list of the freelancers
        res = driver.find_elements_by_css_selector( '#workers > div.js-worker' )
    except: return # the page is empty

    # for each subpage, there are 10 results shown
    looker = 0  # location indicator (no. of row)
    for e in res:
        row = []
        # get the name of the applicant
        name = e.find_element_by_css_selector( 'div.row > div.col-sm-7 > div.row > div.worker-details > h3 > a' ).text
        name = clear_comma( clear_str( name ) )
        name = parse_str( name )
        row.append( name )

        # get the nationality information
        country = e.find_element_by_css_selector( 'div.row > div.col-sm-7 > div.row > div.worker-details > div.row > div > span.country > span > a' ).text
        country = clear_comma( clear_str( country ) )
        row.append( country )

        # if a applicant is tagged as 'Pro'
        is_pro = 0
        try:
            e.find_element_by_css_selector( 'div.row > div.col-sm-7 > div.row > div.worker-details > h3 > span.pro-label' )
            is_pro = 1
        except: pass
        is_pro = str(is_pro)
        row.append( is_pro )

        # the applicant rating
        rating = e.find_element_by_css_selector( 'div.row > div.col-sm-7 > div.row > div.worker-details > label > span.profile-stars > span.stars-bg' ).get_attribute( 'title' )
        rating = str(rating).replace( ' of 5.00', '' )
        rating = str(numeric( clear_comma( rating ) ))
        row.append( rating )

        # get the set of skills
        skills = e.find_elements_by_css_selector( 'div.row > div.col-sm-7 > div.row.hidden-xs > div.col-sm-12 > div.skills > div.expander > a' )
        collection = []
        # collect the skills the current freelancer has
        for s in skills: collection.append( clear_comma( s.text ) )
        skills = '|'.join( collection )

        # hourly rate information
        hourly_rate = 'N/A'  # set the default value
        try:
            hourly_rate = e.find_element_by_css_selector( 'div.row > div.col-sm-5 > div.row > div.worker-details > h4 > span > span' )
            # make sure the value is numeric
            hourly_rate = str(numeric( clear_comma( clear_str( str(hourly_rate.text) ) ) ))
        except: pass
        row.append( hourly_rate )

        # completed_projects and hour_worked
        exp = e.find_elements_by_css_selector( 'div.row > div.col-sm-5 > div.row > div.worker-details > p > span' )
        c = 0
        projects = hours = invalid_val()
        for e in exp:
            if( 'Completed' in e.text ): projects = str(numeric( clear_comma( clear_str( str(e.text).replace( 'Completed projects: ', '' ) ) ), 'int' ))
            else: hours = str(numeric( clear_comma( clear_str( str(e.text).replace( 'Hours worked in hourly projects: ', '' ) ) ), 'int' ))
            c += 1
            if( c == 2 ): break

        row.append( projects )
        row.append( hours ) 
        row.append( skills )
        # commit the result
        df = df.append( pd.Series( row, index=df.columns ), ignore_index=True )

### function to operate the web UI element on WORKANA webpage

# workana.com using input boxes and inputting text.
def keyword_search():
    driver.get( URL_target )
    time.sleep( TIME_pending )

    box_input = driver.find_element_by_id( 'Query' )
    box_input.send_keys( KEY_search )
    
    # Using brower's css selector
    btn_search = driver.find_element_by_css_selector( '#search-form > div > div.col-sm-8.col-md-9.col-full-left > button' )
    # perform the search
    btn_search.click()

    # wait for the query to be done
    time.sleep( TIME_pending )

### the main function
try:
    # open the webpage
    keyword_search()
    time.sleep( TIME_pending )

    # start scraping
    # the first page is already loaded, scrap first
    scrap()
    # get the modified url and append parameter
    cur = driver.current_url
    cur += '?page='
    for i in range( 2, limit ):
        driver.get( cur + str(i) )
        time.sleep( TIME_pending )
        scrap()
    
    # show the result
    print( df )
except KeyboardInterrupt: driver.close()

                  name    country rating is_pro hourly_rate completed_jobs  \
0        AndrÃ©s AdÃ¡n     Mexico      1   4.96         N/A            144   
1     ICETEC Solutions     Brazil      1    5.0       100.0             18   
2   JosÃ© Maria Amorim     Brazil      1    5.0        85.0              5   
3     HÃ©ctor F. V. T.     Brazil      1   4.97       150.0             85   
4          Hildegar M.  Venezuela      0   4.94         8.0             15   
5           Juninho M.     Brazil      1   4.94        39.0            148   
6         John Vercosa     Brazil      1    5.0        21.0              8   
7           Houri Tech     Brazil      0    5.0        12.0              5   
8         Miguel A. H.  Guatemala      1   4.77        20.0             45   
9         Rubens Matos     Brazil      0    5.0        70.0             11   
10             BMKero.  Venezuela      1   4.71        20.0              6   
11       Valter Junior     Brazil      0    5.0        80.0     