# WEB SCRAPING USING PYTHON


Extracting Jobs data from [here](https://ca.indeed.com/).  In this case, I wanted to get the data of a particular job when searched for a location in a tabulated format.

In [1]:
#import libraries

import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

## Set the Url Template to Scrape

In this case, I wanted to use this to explore all the jobs and not just one particular job.

In [2]:
url_template = "https://ca.indeed.com/jobs?q={}&l={}"

In [3]:
def get_url(jobtitle, place):
    """Generate a url given the job title and place(place,province)"""
    url_template = "https://ca.indeed.com/jobs?q={}&l={}"
    url = url_template.format(jobtitle, place)
    return url

In [4]:
url = get_url("Data Scientist", "Vancouver, BC")

## Connect to the URL and Extract Raw HTML

In [5]:
response = requests.get(url)

In [6]:
response

<Response [200]>

A response of 200 means that the request went through.

In [7]:
# Checking the reason and 'OK' means it went through
response.reason

'OK'

## Parse HTML and save to BeautifulSoup Object

In [8]:
soup = BeautifulSoup(response.text, "html.parser")

### Creating a Card for the Job positing to extract the data from a single job posting 

In [9]:
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [10]:
len(cards)

15

### Extracting information for a Single Job Record

In [11]:
card = cards[0]

In [12]:
atag = card.h2.a

In [13]:
job_title=atag.get('title')

In [14]:
job_url = 'https://ca.indeed.com/' + atag.get('href')

In [15]:
company=card.find('span', 'company').text.strip()


In [16]:
job_location=card.find('div', 'recJobLoc').get('data-rc-loc')

In [17]:
job_summary=card.find('div', 'summary').text.strip()

In [18]:
post_date=card.find('span','date').text.strip()

In [19]:
today = datetime.today().strftime('%Y-%m-%d')

In [20]:
try:
    job_salary=card.find('span','salaryText').text.strip()
except AttributeError:
    job_salary = ''

## Generalize the model for the Entire Page

In [21]:
def get_record(card):
    """Extract Job data from a single record"""
    atag = card.h2.a
    job_title=atag.get('title')
    job_url = 'https://ca.indeed.com/' + atag.get('href')
    company=card.find('span', 'company').text.strip()
    job_location=card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary=card.find('div', 'summary').text.strip()
    post_date=card.find('span','date').text.strip()
    today = datetime.today().strftime('%Y-%m-%d')
    try:
        job_salary=card.find('span','salaryText').text.strip()
    except AttributeError:
        job_salary = ''
    
    record = (job_title, company, job_location, post_date, today, job_summary, job_salary, job_url)
    
    return record

In [22]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [23]:
records[0]

('Data Scientist, AI@Unity',
 'Unity Technologies',
 'Vancouver, BC',
 '1 day ago',
 '2020-10-15',
 'Experience working with petabytes of data.\nExperience working on data projects from problem identification to production.',
 '',
 'https://ca.indeed.com//rc/clk?jk=7b2b2221435812ac&fccid=880e4714f2ad94a8&vjs=3')

## Applying the model to various pages


In [24]:
while True:
    try:
        url = 'https://ca.indeed.com/' + soup.find('a', {'aria-label': 'Next'}).get('href')
    except AttributeError:
        break
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    
    for card in cards:
        record = get_record(card)
        records.append(record)
    

In [25]:
len(records)

63

## Complete Model and File Generation

In [26]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_url(jobtitle, place):
    """Generate a url given the job title and place(place,province)"""
    url_template = "https://ca.indeed.com/jobs?q={}&l={}"
    url = url_template.format(jobtitle, place)
    return url

def get_record(card):
    """Extract Job data from a single record"""
    atag = card.h2.a
    job_title=atag.get('title')
    job_url = 'https://ca.indeed.com/' + atag.get('href')
    company=card.find('span', 'company').text.strip()
    job_location=card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary=card.find('div', 'summary').text.strip()
    post_date=card.find('span','date').text.strip()
    today = datetime.today().strftime('%Y-%m-%d')
    try:
        job_salary=card.find('span','salaryText').text.strip()
    except AttributeError:
        job_salary = ''
    
    record = (job_title, company, job_location, post_date, today, job_summary, job_salary, job_url)
    
    return record

def main(jobtitle, place):
    """Run the main program routine"""
    records = []
    url = get_url(jobtitle, place)
    
    # job data extraction
    while True:
    
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    
        for card in cards:
            record = get_record(card)
            records.append(record)
        
        try:
            url = 'https://ca.indeed.com/' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
            
    # Saving the job data
    with open('job_data.csv', 'w', newline='', encoding='utf-8')as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'Joburl'])
        writer.writerows(records)
    

In [27]:
# run the main program
main('Data Scientist','Vancouver,BC')