<a href="https://colab.research.google.com/github/hibatallahk/WebScrapping/blob/main/webTest2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Indeed Job Scraper

create a general purpose job scraper

In [1]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [2]:
def get_url(position, location):
    """Generate url from position and location"""
    template = 'https://ma.indeed.com/jobs?q={}&l={}'   
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url

In [3]:
url = get_url('data scientist', 'Maroc')
print(url)

https://ma.indeed.com/jobs?q=data+scientist&l=Maroc


# Extract html data


In [4]:
response = requests.get(url)

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
cards = soup.find_all('div', 'slider_container')

In [7]:
len(cards)

15

# Prototype the model with a single record


In [8]:
#ok
card = cards[0]
card

<div class="slider_container"><div class="slider_list"><div class="slider_item"><div class="job_seen_beacon"><table cellpadding="0" cellspacing="0" class="jobCard_mainContent" role="presentation"><tbody><tr><td class="resultContent"><div class="heading4 color-text-primary singleLineTitle tapItem-gutter"><h2 class="jobTitle jobTitle-color-purple"><span title="Stage en Ingénieur Développement – Data Scientist ( Rabat ) à Rabat">Stage en Ingénieur Développement – Data Scientist ( Rabat )...</span></h2></div><div class="heading6 company_location tapItem-gutter"><pre><div class="companyLocation">Maroc</div></pre></div><div class="heading6 error-text tapItem-gutter"></div></td></tr></tbody></table><table class="jobCardShelfContainer" role="presentation"><tbody><tr class="jobCardShelf"></tr><tr class="underShelfFooter"><td><div class="heading6 tapItem-gutter result-footer"><div class="job-snippet">Poste pour Ingénieur à Rabat – Offre d’Emploi : Stage en Ingénieur Développement – Data Scientis

In [9]:
#ok
job_title = card.h2.span.get('title')
job_title

'Stage en Ingénieur Développement – Data Scientist ( Rabat ) à Rabat'

In [10]:
#not ok
company = card.find('span', 'companyName')
company

In [11]:
#ok
location = card.find('div', 'companyLocation').text
location

'Maroc'

In [12]:
#ok
job_summary = card.find('div', 'job-snippet').text.strip().replace('\n', ' ')
job_summary

'Poste pour Ingénieur à Rabat – Offre d’Emploi : Stage en Ingénieur Développement – Data Scientist ( Rabat ) : Rabat – . Dans le cadre de développement de…'

In [13]:
#ok
post_date = card.find('span', 'date').text
post_date

'Postedil y a 30+ jours'

In [14]:
#ok
today = datetime.today().strftime('%Y-%m-%d')
today

'2021-12-21'

In [15]:
#job_url = 'https://www.indeed.com' + card.h2.a.get('href')

url = card.a.get('href')
job_url = 'https://ma.indeed.com' + url
job_url

'https://ma.indeed.com/Maroc-emplois'

# Generalize the code

In [16]:
def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.span.get('title')
    #company = card.find('span', 'companyName').text.strip()
    location = card.find('div', 'companyLocation').text.strip()
    post_date = card.find('span', 'date').text.strip()
    today = datetime.today().strftime('%Y-%m-%d')
    job_summary = card.find('div', 'job-snippet').text.strip().replace('\n', ' ')
    record = (job_title, location, post_date, today, job_summary)

    return record

In [17]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [18]:
records[2]

('Junior data scientist',
 'Casablanca',
 'Postedil y a 30+ jours',
 '2021-12-21',
 'Mettre en œuvre les processus de structurations de la données (toutes sources confondues internes et externes). Sens de l’organisation et du travail en équipe ;')

#Getting the next page

In [19]:
url = 'https://ma.indeed.com' + soup.find('a',{'aria-label':'Suivant'}).get('href')

In [20]:
url

'https://ma.indeed.com/jobs?q=data+scientist&l=Maroc&start=10'

In [21]:
while True:
    try:
        url = 'https://ma.indeed.com' + soup.find('a',{'aria-label':'Suivant'}).get('href')
    except AttributeError:
        break

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'job_seen_beacon')

    for card in cards:
        record = get_record(card)
        records.append(record)

In [22]:
len(records)

75

# Putting all together

In [23]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(position, location):
    """Generate url from position and location"""
    template = 'https://ma.indeed.com/jobs?q={}&l={}'   
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url


def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.span.get('title')
    #company = card.find('span', 'companyName').text.strip()
    location = card.find('div', 'companyLocation').text.strip()
    job_summary = card.find('div', 'job-snippet').text.strip().replace('\n', ' ')
    post_date = card.find('span', 'date').text.strip()
    today = datetime.today().strftime('%Y-%m-%d')
    job_url = 'https://ma.indeed.com' + card.a.get('href')

    record = (job_title, location, post_date, today, job_summary)

    return record


def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'job_seen_beacon')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://ma.indeed.com' + soup.find('a',{'aria-label':'Suivant'}).get('href')
        except AttributeError:
            break
        
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle','Location', 'PostDate', 'ExtractDate', 'Summary'])
        writer.writerows(records)

In [24]:
main('data scientist', 'Maroc')

In [25]:
import pandas as pd
df = pd.read_csv('results.csv', sep=',')

In [26]:
df

Unnamed: 0,JobTitle,Location,PostDate,ExtractDate,Summary
0,Stage en Ingénieur Développement – Data Scient...,Maroc,Postedil y a 30+ jours,2021-12-21,Poste pour Ingénieur à Rabat – Offre d’Emploi ...
1,Data Scientist-(H/F),Maroc,Postedil y a 30+ jours,2021-12-21,"Développer des use cases de bout en bout, en c..."
2,Junior data scientist,Casablanca,Postedil y a 30+ jours,2021-12-21,Mettre en œuvre les processus de structuration...
3,Data Scientist,Casablanca,Postedil y a 30+ jours,2021-12-21,Le Data Scientist aura comme principale missio...
4,Data Scientist (H/F),Casablanca,Postedil y a 30+ jours,2021-12-21,Exploration des données à l’aide d’outils de D...
...,...,...,...,...,...
70,Offre: Scrum master,Rabat,Postedil y a 13 jours,2021-12-21,"As a data scientist, you are tasked with: Enab..."
71,Offre: Hardware developer,Maroc,Postedil y a 30+ jours,2021-12-21,Nous cherchons pour notre entité AgriEdge de l...
72,Offre: Bscs developer,Casablanca,Postedil y a 30+ jours,2021-12-21,Secteur d'activité : Informatique Fonction : I...
73,Data Scientist,Settat,Postedil y a 16 jours,2021-12-21,Present information using data visualization t...


In [27]:
df.head()

Unnamed: 0,JobTitle,Location,PostDate,ExtractDate,Summary
0,Stage en Ingénieur Développement – Data Scient...,Maroc,Postedil y a 30+ jours,2021-12-21,Poste pour Ingénieur à Rabat – Offre d’Emploi ...
1,Data Scientist-(H/F),Maroc,Postedil y a 30+ jours,2021-12-21,"Développer des use cases de bout en bout, en c..."
2,Junior data scientist,Casablanca,Postedil y a 30+ jours,2021-12-21,Mettre en œuvre les processus de structuration...
3,Data Scientist,Casablanca,Postedil y a 30+ jours,2021-12-21,Le Data Scientist aura comme principale missio...
4,Data Scientist (H/F),Casablanca,Postedil y a 30+ jours,2021-12-21,Exploration des données à l’aide d’outils de D...
