## Kalibrr Web Scraping

#### As of May 23, 2020 12:40 AM

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
import time

import pandas as pd
import numpy as np

##### Search for the keyword 'data science' and extract the URL of each search result.

In [None]:
from selenium.webdriver import Chrome


def get_serp_urls(lastpage=1):
    """Build the search-results page URLs for data science jobs in PH.

    Keyword argument:
    lastpage -- number of the last results page to include

    Returns a list of URL strings, one for each page from 1 through
    lastpage (an empty list when lastpage is below 1).
    """
    prefix = 'https://www.kalibrr.com/job-board/co/Philippines/te/data-science/'
    page_urls = []
    for page_number in range(1, lastpage + 1):
        page_urls.append(f'{prefix}{page_number}')
    return page_urls


# Location of the chromedriver executable that get_listings() hands to
# Selenium's Chrome(executable_path=...).
# change path if you're on Windows
path = '/opt/WebDriver/bin/chromedriver'

def get_listings(lastpage=1):
    """Get URLs of job listings in Kalibrr.

    Keyword argument:
    lastpage -- last page of search results

    Returns a list holding the ``content`` attribute of every
    ``<meta itemprop="url">`` element found on the visited results pages,
    in page order.
    """
    serp_urls = get_serp_urls(lastpage)
    if not serp_urls:
        # Nothing to visit, so don't start a browser at all.
        return []

    listings = []
    # One browser session serves every results page; starting a fresh
    # Chrome for each page only added start-up time to every iteration.
    with Chrome(executable_path=path) as driver:
        for url in serp_urls:
            driver.get(url)
            # get all meta elements with itemprop="url"
            metas = driver.find_elements_by_xpath("//meta[@itemprop='url']")
            for meta in metas:
                content = meta.get_attribute('content')
                # A meta tag without a content attribute yields None, which
                # cannot be written to the listings file later on.
                if content:
                    listings.append(content)
    return listings

In [None]:
# As of May 23, 2020 12:40 AM, the search results end at page 72.
# The author reports this cell taking about 14 minutes on their laptop.

listings = get_listings(lastpage=72)

# Save the listing URLs to listings.txt, one per line.
with open('listings.txt', 'w') as txt:
    txt.writelines(listing + '\n' for listing in listings)

#### Scrape all information from each job listing URL.

In [None]:
# Scraping done by batches: the scraping cell uses `loop` both to choose
# which slice of listings.txt to read and to name its output file.
# Manually set loop number here
loop = 5

In [None]:
# Scrape one batch of job listing pages with Firefox and save the batch to
# job_post_<loop>.xlsx, one row per listing.

# Output columns, in spreadsheet order.
COLUMNS = [
    'Job Title', 'Company', 'Location', 'Minimum Salary', 'Maximum Salary',
    'Currency', 'Salary Period', 'Employment Type', 'Date Posted',
    'Valid Through', 'Job Description', 'Minimum Qualification', 'Benefits',
    'Required Skills', 'Preferred Courses', 'Job Level', 'Job Category',
    'Educational Requirement', 'Recruiter Response', 'Office Address',
    'Industry', 'Vacancy', 'Website', 'Company Description', 'Page URL',
]

# <dt> label shown in the page's summary lists -> output column.
SUMMARY_FIELDS = {
    'Job level': 'Job Level',
    'Job category': 'Job Category',
    'Educational requirement': 'Educational Requirement',
    'Recruiter response to application': 'Recruiter Response',
    'Office Address': 'Office Address',
    'Industry': 'Industry',
    'Vacancy': 'Vacancy',
    'Website': 'Website',
}

# <h2> headings whose <li> items are collected; the heading text is also
# the output column name.
LIST_SECTIONS = ('Required Skills', 'Preferred Courses')

BATCH_SIZE = 100
# NOTE(review): the original slice started 90 URLs into the batch, so only
# the last 10 URLs of each batch are scraped. Presumably a leftover from
# resuming an interrupted run; set to 0 to scrape the whole batch.
BATCH_OFFSET = 90


def _text_or_nan(element):
    """Return the element's text, or NaN when the element was not found."""
    return element.get_text() if element is not None else np.nan


def _content_or_nan(element):
    """Return the element's ``content`` attribute, or NaN when unavailable."""
    return element.get('content', np.nan) if element is not None else np.nan


def _parse_job_post(soup, url):
    """Extract one listing's fields from its parsed page.

    soup -- BeautifulSoup document of the listing page
    url  -- address the page was loaded from

    Returns a dict keyed by output column name. Fields that are not on the
    page are NaN or left out (left-out keys become empty cells).
    """
    record = {
        'Job Title': _text_or_nan(soup.find(class_='job-post-title')),
        'Company': _text_or_nan(soup.find(class_='job-post-company-name')),
        'Location': _text_or_nan(soup.find('span', itemscope="itemscope")),
        'Minimum Salary': _text_or_nan(soup.find('span', itemprop="minValue")),
        'Maximum Salary': _text_or_nan(soup.find('span', itemprop="maxValue")),
        'Currency': _text_or_nan(soup.find('span', itemprop="currency")),
        'Salary Period': _text_or_nan(soup.find('span', itemprop="unitText")),
        'Employment Type': [
            typ.get_text()
            for typ in soup.select('.job-post-quick-details .link-unstyled')
        ],
        'Date Posted': _content_or_nan(soup.find(itemprop='datePosted')),
        'Valid Through': _content_or_nan(soup.find(itemprop='validThrough')),
        'Job Description': _text_or_nan(soup.find(itemprop='description')),
        'Minimum Qualification': _text_or_nan(
            soup.find(itemprop='qualifications')),
        'Benefits': _text_or_nan(soup.find(itemprop='jobBenefits')),
        'Company Description': _text_or_nan(
            soup.find(class_='careers-company-description')),
        'Page URL': url,
    }

    # Skills and courses are <li> items under an <h2> heading in the main
    # column; sections without a heading are skipped.
    layout = soup.find(class_='row row-reverse')
    main_column = layout.find(class_='col-sm-8') if layout is not None else None
    sections = main_column.find_all(class_='') if main_column is not None else []
    for section in sections:
        heading = section.find('h2')
        if heading is None:
            continue
        heading_text = heading.get_text()
        if heading_text in LIST_SECTIONS:
            record[heading_text] = ','.join(
                item.get_text() for item in section.find_all('li'))

    # Each <dl> pairs a <dt> label with a <dd> value.
    for entry in soup.find_all('dl'):
        label = entry.find('dt')
        value = entry.find('dd')
        if label is None or value is None:
            continue
        column = SUMMARY_FIELDS.get(label.get_text())
        if column is not None:
            record[column] = value.get_text()

    return record


# Read the batch first so a missing listings.txt does not leave a browser
# running. Lines are stripped because readlines() keeps the trailing
# newline, which would otherwise end up in every URL.
with open("listings.txt", "r") as f:
    batch_start = (loop - 1) * BATCH_SIZE + BATCH_OFFSET
    lines = [line.strip() for line in f.readlines()[batch_start:loop * BATCH_SIZE]]
lines = [url for url in lines if url]

# Firefox session
driver = webdriver.Firefox()
records = []
try:
    driver.implicitly_wait(30)
    for url in lines:
        driver.get(url)

        # Randomised pause before reading the page source.
        time.sleep(10 + np.random.randint(1, 3))
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The title is the marker that the listing has rendered; wait once
        # more and re-read the page if it is not there yet.
        if soup.find(class_='job-post-title') is None:
            time.sleep(10)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

        # A page that never rendered still produces a row (URL plus empty
        # fields) instead of aborting the whole batch.
        records.append(_parse_job_post(soup, url))
finally:
    # Always close the browser, even when a page raises.
    driver.quit()

job_post = pd.DataFrame(records, columns=COLUMNS)

filename = 'job_post_' + str(loop) + '.xlsx'
job_post.to_excel(filename)

### Data Wrangling

In [None]:
import pandas as pd

# Load the compiled scrape results and drop the '#' column.
df = pd.read_excel('data/Kalibrr_ws_compiled.xlsx')
df = df.drop(columns='#')

# Keep only the leading run of digits and dashes of each timestamp, then
# parse it. expand=False makes extract() return a Series, not a DataFrame.
for date_column in ('Date Posted', 'Valid Through'):
    df[date_column] = pd.to_datetime(
        df[date_column].str.extract(r'([\d-]*)', expand=False))

# Leading number of each vacancy text. '^(\d+)' leaves entries without a
# leading number as NaN (the old '(\d*)' produced '' there, which
# astype(int) cannot convert); those and missing values become 0.
df['Vacancy'] = (
    df['Vacancy'].str.extract(r'^(\d+)', expand=False).fillna(0).astype(int)
)

# Strip the list-repr wrapping ("['" ... "']") from the employment type.
# Presumably the scraped lists were written to Excel as their string form;
# verify against the compiled file.
df['Employment Type'] = df['Employment Type'].str.strip('[\'').str.strip('\']')
df

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

#### Data Scientist

In [None]:
# Postings titled "Data Scientist" or "Data Science". na=False counts a
# missing title as a non-match so the mask stays purely boolean.
data_science = df[
    df['Job Title'].str.contains('Data Scientist', na=False)
    | df['Job Title'].str.contains('Data Science', na=False)
]

NameError: name 'df' is not defined

#### Data Engineer

In [None]:
# Postings whose title has both "Data" and "Engineer" but not "Database".
# na=False counts a missing title as a non-match; without it the NaN in
# the mask breaks both `~` and the boolean indexing.
data_engineer = df[
    df['Job Title'].str.contains('Data', na=False)
    & df['Job Title'].str.contains('Engineer', na=False)
    & ~df['Job Title'].str.contains('Database', na=False)
]

NameError: name 'df' is not defined

#### Data Analyst/BI/BA

In [None]:
# Postings titled "Data ... Analyst", or "Business" together with either
# "Intelligence" or "Analyst". The two halves are parenthesised to make the
# &-before-| precedence explicit; na=False counts a missing title as a
# non-match so the mask stays purely boolean.
analyst = df[
    (
        df['Job Title'].str.contains('Data', na=False)
        & df['Job Title'].str.contains('Analyst', na=False)
    )
    | (
        df['Job Title'].str.contains('Business', na=False)
        & (
            df['Job Title'].str.contains('Intelligence', na=False)
            | df['Job Title'].str.contains('Analyst', na=False)
        )
    )
]

NameError: name 'df' is not defined

#### Data Warehouse/ETL

In [None]:
# Postings titled "Data ... Warehouse", or mentioning "ETL". na=False
# counts a missing title as a non-match so the mask stays purely boolean.
etl = df[
    (
        df['Job Title'].str.contains('Data', na=False)
        & df['Job Title'].str.contains('Warehouse', na=False)
    )
    | df['Job Title'].str.contains('ETL', na=False)
]

NameError: name 'df' is not defined

#### Data Viz

In [None]:
# Postings whose title mentions "Tableau". na=False counts a missing title
# as a non-match so the mask stays purely boolean.
data_viz = df[df['Job Title'].str.contains('Tableau', na=False)]

NameError: name 'df' is not defined

#### Final df for data jobs

In [None]:
# pd.concat expects the frames as ONE sequence argument; passing them as
# separate positional arguments raises a TypeError.
data_jobs = pd.concat([data_science, data_engineer, analyst, etl, data_viz])
# A title can satisfy more than one filter, so the same posting (same index
# label) may appear several times; keep it once so counts and vacancy sums
# are not inflated.
data_jobs = data_jobs[~data_jobs.index.duplicated(keep='first')]

NameError: name 'data_science' is not defined

In [None]:
# Display the combined data-jobs frame.
data_jobs

NameError: name 'data_jobs' is not defined

## New markdown cell

### Data Visualization

In [None]:
# Tag every row of data_jobs with the name of the filter it came from.
# Groups are applied in this order, so a row that belongs to several
# groups ends up with the last one listed.
group_members = [
    ('Data Science', data_science),
    ('Data Engineer', data_engineer),
    ('Analyst', analyst),
    ('ETL/Data Warehouse', etl),
    ('Data Viz/Tableau', data_viz),
]
for group_name, members in group_members:
    for i in members.index:
        data_jobs.loc[i, 'Group'] = group_name

In [None]:
# Total vacancies per job group, largest first, as a two-column frame.
vacancies_by_group = data_jobs.groupby('Group')['Vacancy'].sum()
vacancy = vacancies_by_group.sort_values(ascending=False).reset_index()
vacancy

In [None]:
import matplotlib.pyplot as plt

# Bar chart of total vacancies per job group.
fig, ax = plt.subplots(figsize=(12, 8))

ax.bar(vacancy['Group'], vacancy['Vacancy'], color='#315751')
# Dashed horizontal guide lines in the bar colour.
ax.grid('on', which='major', axis='y', linestyle='--', alpha=0.5, color='#315751')
ax.set_title("Data Job Vacancies", fontsize=14, pad=10)
ax.set_xlabel("Job", fontsize=12, labelpad=10)
ax.set_ylabel("Vacancies", fontsize=12, labelpad=10)
# Fixed y-range of 0-120.
ax.set_ylim(0, 120)
plt.show()

In [None]:
# Number of postings per location, most common first.
location_counts = data_jobs.groupby('Location')['Location'].count()
location = pd.DataFrame(location_counts.sort_values(ascending=False))
location.columns = ['Count']
location = location.reset_index()
# Shorten each location to its first run of word characters.
location['Location'] = location['Location'].str.extract(r'(\w+)')

In [None]:
# Bar chart of posting counts per location.
fig, ax = plt.subplots(figsize=(12, 8))

ax.bar(location['Location'], location['Count'], color='#a63b3b')
# Dashed horizontal guide lines in the bar colour.
ax.grid('on', which='major', axis='y', linestyle='--', alpha=0.5, color='#a63b3b')
ax.set_title("Location", fontsize=14, pad=10)
ax.set_xlabel("City", fontsize=12, labelpad=10)
ax.set_ylabel("Count", fontsize=12, labelpad=10)
# Fixed y-range of 0-25.
ax.set_ylim(0, 25)
plt.show()

In [None]:
# The 12 most common industries, re-sorted ascending by count.
industry_counts = data_jobs.groupby('Industry')['Industry'].count()
top_industries = industry_counts.sort_values(ascending=False)[:12]
industry = pd.DataFrame(top_industries.sort_values())
industry.columns = ['Count']
industry = industry.reset_index()
# Overwrite the label at row 7 with a shorter one. NOTE(review): the row
# position is hard-coded, so this depends on the current data's ordering.
industry.loc[7, 'Industry'] = 'Media/Publishing'
industry

In [None]:
# Horizontal bar chart of posting counts for the top industries.
fig, ax = plt.subplots(figsize=(10, 8))

ax.barh(industry['Industry'], industry['Count'], color='#f59d00')
# Dashed vertical guide lines in the bar colour.
ax.grid('on', which='major', axis='x', linestyle='--', alpha=0.5, color='#f59d00')
ax.set_title("Top Industry", fontsize=14, pad=10)
ax.set_xlabel("Industry", fontsize=12, labelpad=10)
plt.show()

### Analysis

## Wordcloud Job Requirements

In [None]:
# Keep only postings that list required skills. The explicit copy makes
# the filtered frame independent, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
data_jobs = data_jobs[data_jobs['Required Skills'].notnull()].copy()
data_jobs['Required Skills'] = data_jobs['Required Skills'].astype(str)
# Skills text of every remaining posting (already str after the line above).
y = data_jobs['Required Skills']
data_jobs.dtypes

In [None]:
stopwords = ['Name', 'Interpretat', 'Analyz']
querywords = y

# Lower-case BOTH sides of the comparison: the stopwords are capitalised,
# so testing word.lower() against them directly could never match.
stopwords_lower = {word.lower() for word in stopwords}

# Each entry of querywords is one posting's comma-joined skills text, so
# break it into individual words before filtering. The commas are dropped;
# the words themselves are unchanged.
resultwords = [
    word
    for entry in querywords
    for word in str(entry).replace(',', ' ').split()
    if word.lower() not in stopwords_lower
]
result = ' '.join(resultwords)

In [None]:
import re

# Complete truncated tokens and merge "Analyzing" into "Analysis".
# Whole-word matching (\b) leaves already-complete words alone. Plain
# substring replacement turned "Financial" into "Financialal" and
# "Business" into "Businesss" (which needed follow-up patches), and
# mangled words such as "Financing".
www = result
for pattern, replacement in (
    (r'\bFinanci\b', 'Financial'),
    (r'\bBusines\b', 'Business'),
    (r'\bAnalyzing\b', 'Analysis'),
):
    www = re.sub(pattern, replacement, www)

In [None]:
import os

# WordCloud is used below but was not imported anywhere in the notebook.
from wordcloud import WordCloud

wordcloud = WordCloud(background_color="white", max_words=2000, width=1024, height=720)
# Build the cloud from the cleaned skills text. str(y) is only the Series'
# printed summary (index labels, shortened rows, the "Name: ..." footer),
# not the full skills data.
wordcloud.generate(www)
plt.figure(dpi=500, figsize=(20, 8))

plt.title('Required Skills WordCloud')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# savefig does not create missing directories.
os.makedirs('figs', exist_ok=True)
plt.savefig('figs/plot.png', bbox_inches='tight')
plt.show()

## New markdown cell