# Wuzzuf Scraping 

## Check robots.txt Allow and Disallow Rules

In [1]:
# Before scraping, review https://wuzzuf.net/robots.txt to confirm the job-search pages are not disallowed for crawlers.

## Import Libraries and Modules 

In [2]:
import csv

import requests
from bs4 import BeautifulSoup

## Check website response 

In [4]:
# Request the first search-results page for "data science".
# requests waits indefinitely by default, so an explicit timeout keeps a
# stalled connection from hanging the kernel.
response = requests.get(
    url='https://wuzzuf.net/search/jobs/?a=navbl%7Cspbl&q=data%20science',
    timeout=30,
)
response

<Response [200]>

## Collect Data 

In [None]:
# response.content holds the raw bytes of the HTTP response body.
# 'html.parser' selects Python's built-in HTML parser explicitly.
data = BeautifulSoup(markup=response.content, features='html.parser')
data

## Extract job title 

### Way 1

In [122]:
# First <h2> with the class used for job-title headings.
data.find('h2', class_='css-193uk2c')

<h2 class="css-193uk2c"><style data-emotion="css o171kl">.css-o171kl{-webkit-text-decoration:none;text-decoration:none;color:inherit;}</style><a class="css-o171kl" href="https://wuzzuf.net/jobs/p/f2mpxj1rzhp1-data-science-instructor-epsilonai-cairo-egypt" rel="noreferrer" target="_blank">Data Science Instructor</a></h2>

In [123]:
# The <a> tag nested inside that heading.
data.find('h2', class_='css-193uk2c').find('a')

<a class="css-o171kl" href="https://wuzzuf.net/jobs/p/f2mpxj1rzhp1-data-science-instructor-epsilonai-cairo-egypt" rel="noreferrer" target="_blank">Data Science Instructor</a>

In [124]:
# Visible text of the title link.
data.find('h2', class_='css-193uk2c').find('a').get_text()

'Data Science Instructor'

In [125]:
# URL the title link points to.
data.find('h2', class_='css-193uk2c').find('a')['href']

'https://wuzzuf.net/jobs/p/f2mpxj1rzhp1-data-science-instructor-epsilonai-cairo-egypt'

### Way 2

In [126]:
# Same title, reached by walking down from the first job card.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .h2.get_text()
)

'Data Science Instructor'

## Extract Company Name

In [26]:
# Raw company link text from the first job card (still carries a trailing ' -').
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.a.get_text()
)

'EpsilonAI -'

#### Cleaning String 

##### By strip() function

In [28]:
# strip(' -') removes any leading/trailing spaces and dashes.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.a.get_text()
        .strip(' -')
)

'EpsilonAI'

##### By Indexing

In [29]:
# Slice off the last two characters (the ' -' suffix).
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.a.get_text()[:-2]
)

'EpsilonAI'

##### By replace() function

In [31]:
# Remove every ' -' occurrence from the company text.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.a.get_text()
        .replace(' -', '')
)

'EpsilonAI'

## Extract City Name 

In [35]:
# Full location string of the first job card, whitespace-trimmed.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.span.get_text()
        .strip()
)

'Nasr City, Cairo, Egypt'

In [38]:
# City is the first comma-separated part of the location string.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.span.get_text()
        .strip()
        .split(',')[0]
)

'Nasr City'

##  Extract Province Name 

In [40]:
# Province is the second comma-separated part of the location string.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-lptxge')
        .div.span.get_text()
        .split(',')[1]
        .strip()
)

'Cairo'

## Extract Experience 

In [65]:
# Text of the first link inside the second <div> of the card's details section.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-1rhj4yg')
        .find_all('div')[1]
        .a.get_text()
)

'Entry Level'

## Extract Years of Experience 

In [97]:
# Text of the first <span> in the same <div>, with surrounding '·' and spaces removed.
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-1rhj4yg')
        .find_all('div')[1]
        .span.get_text()
        .strip('· ')
)

'1+ Yrs of Exp'

## Extract Skills 

In [107]:
# Every link in that <div> except the first one (the first is the experience level).
(
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-1rhj4yg')
        .find_all('div')[1]
        .find_all('a')[1:]
)

[<a class="css-o171kl" href="/a/IT-Software-Development-Jobs-in-Egypt"> <!-- -->· <!-- -->IT/Software Development</a>,
 <a class="css-o171kl" href="/a/Engineering-Telecom-Technology-Jobs-in-Egypt"> <!-- -->· <!-- -->Engineering - Telecom/Technology</a>,
 <a class="css-o171kl" href="/a/Training-Instructor-Jobs-in-Egypt"> <!-- -->· <!-- -->Training/Instructor</a>,
 <a class="css-5x9pm1" href="/a/Data-Science-Jobs-in-Egypt"><span><strong class="highlight">Data</strong><span> </span><strong class="highlight">Science</strong></span></a>,
 <a class="css-5x9pm1" href="/a/Computer-Science-Jobs-in-Egypt"><span><span>Computer </span><strong class="highlight">Science</strong></span></a>,
 <a class="css-5x9pm1" href="/a/Computer-Engineering-Jobs-in-Egypt"> <!-- -->· <!-- -->Computer Engineering</a>,
 <a class="css-5x9pm1" href="/a/Python-Jobs-in-Egypt"> <!-- -->· <!-- -->Python</a>,
 <a class="css-5x9pm1" href="/a/Machine-Learning-Jobs-in-Egypt"> <!-- -->· <!-- -->Machine Learning</a>,
 <a class="

In [110]:
# Collect the skill/category links of the first job card (skipping the first
# link, which is the experience level) and clean their text.
skills_list = (
    data.find('div', class_='css-pkv5jc')
        .find('div', class_='css-1rhj4yg')
        .find_all('div')[1]
        .find_all('a')[1:]
)
# strip(' · ') trims any leading/trailing spaces and '·' separators.
skills = [anchor.get_text().strip(' · ') for anchor in skills_list]
skills

['IT/Software Development',
 'Engineering - Telecom/Technology',
 'Training/Instructor',
 'Data Science',
 'Computer Science',
 'Computer Engineering',
 'Python',
 'Machine Learning',
 'Deep Learning',
 'Artificial Intelligence (AI)']

## Extract One page Jobs

In [132]:
import csv

# Download and parse the first page of search results.
# `start=0` is spelled out (the URL previously ended in a bare `&start`), and
# the timeout keeps a stalled connection from hanging the kernel.
response = requests.get(
    url='https://wuzzuf.net/search/jobs/?a=navbl%7Cspbl&q=data%20science&start=0',
    timeout=30,
)
data = BeautifulSoup(response.content, 'html.parser')

# newline='' lets the csv module control line endings (prevents blank rows
# between records on Windows); utf-8 keeps non-ASCII job or company names from
# failing to encode under a platform-specific default encoding.
with open('Wuzzuf One Page.csv', mode='w', newline='', encoding='utf-8') as file:
    # Column names, in the order they are written.
    writer = csv.DictWriter(
        f=file,
        fieldnames=['job_title', 'company_name', 'city', 'province',
                    'experience', 'years_of_exp', 'skills'],
    )
    writer.writeheader()

    # Each 'css-pkv5jc' div is one job card on the results page.
    for job in data.find_all('div', 'css-pkv5jc'):
        header = job.find('div', 'css-lptxge')
        details = job.find('div', 'css-1rhj4yg').find_all('div')[1]

        # Location text looks like 'Nasr City, Cairo, Egypt'.
        location_parts = header.div.span.text.strip().split(',')

        job_title = header.h2.text
        # The company link text ends with ' -'; trim it.
        company_name = header.div.a.text.strip(' -')
        city = location_parts[0].strip()
        province = location_parts[1].strip()
        experience = details.a.text
        years_of_exp = details.span.text.strip('· ')
        # Read the links from *this* card (not from the first card on the
        # page), skipping the first link, which is the experience level.
        skills_list = details.find_all('a')[1:]
        skills = [skill.text.strip(' · ') for skill in skills_list]

        writer.writerow(
            {
                'job_title': job_title,
                'company_name': company_name,
                'city': city,
                'province': province,
                'experience': experience,
                'years_of_exp': years_of_exp,
                'skills': skills,
            }
        )
        






## Pagination

In [139]:
def parse_job_card(job, missing='Missing'):
    """Turn one job-card <div> into a CSV row dict.

    Each field is extracted independently. When the tag a field relies on is
    absent from the card, that field alone is set to `missing`, so a partial
    card can never inherit values from a previously processed job.

    Parameters
    ----------
    job : bs4 Tag for a single 'css-pkv5jc' job card.
    missing : placeholder stored for any field that cannot be extracted.

    Returns
    -------
    dict with keys job_title, company_name, city, province, experience,
    years_of_exp, skills.
    """
    def safe(getter):
        # A missing tag shows up as attribute access on None (AttributeError)
        # or as a too-short result list (IndexError).
        try:
            return getter()
        except (AttributeError, IndexError, TypeError):
            return None

    header = job.find('div', 'css-lptxge')
    details = safe(lambda: job.find('div', 'css-1rhj4yg').find_all('div')[1])

    # Location text looks like 'Nasr City, Cairo, Egypt'.
    location = safe(lambda: header.div.span.text.strip())
    location_parts = [] if location is None else [part.strip() for part in location.split(',')]

    record = {
        'job_title': safe(lambda: header.h2.text),
        # The company link text ends with ' -'; trim it.
        'company_name': safe(lambda: header.div.a.text.strip(' -')),
        'city': location_parts[0] if location_parts else None,
        'province': location_parts[1] if len(location_parts) > 1 else None,
        'experience': safe(lambda: details.a.text),
        'years_of_exp': safe(lambda: details.span.text.strip('· ')),
        # Links of *this* card, skipping the first one (the experience level).
        'skills': safe(lambda: [link.text.strip(' · ') for link in details.find_all('a')[1:]]),
    }
    return {key: (missing if value is None else value) for key, value in record.items()}


# newline='' lets the csv module control line endings; utf-8 keeps non-ASCII
# job or company names from failing to encode under a platform default.
with open('Wuzzuf Full Pages.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(
        f=file,
        fieldnames=['job_title', 'company_name', 'city', 'province',
                    'experience', 'years_of_exp', 'skills'],
    )
    writer.writeheader()

    # `start` is the zero-based results-page index; keep requesting pages
    # until one comes back without any job cards.
    page_num = 0
    while True:
        response = requests.get(
            url=f'https://wuzzuf.net/search/jobs/?a=navbl%7Cspbl&q=data%20science&start={page_num}',
            timeout=30,
        )
        # Report an HTTP error explicitly instead of mistaking the error page
        # for "no more results".
        if not response.ok:
            print(f'Stopped at page {page_num}: HTTP {response.status_code}.')
            break

        data = BeautifulSoup(response.content, 'html.parser')
        job_cards = data.find_all('div', 'css-pkv5jc')
        if not job_cards:
            print('Done Scraping All Pages.')
            break

        for job in job_cards:
            writer.writerow(parse_job_card(job))

        page_num += 1

Done Scraping All Pages.
