In [1]:
import requests, bs4, time
import pandas as pd
import os.path
from datetime import datetime

In [2]:
# Working directory of the notebook, plus its parent directory and its own
# folder name (the head and tail of the path, respectively).
path = os.getcwd()
parent_folder = os.path.dirname(path)
current_folder = os.path.basename(path)

# Scrape all links to job postings containing the word 'Data'
First, search www.cwjobs.co.uk for all job postings that match the search term 'data'.

In [3]:
def extract_jobtype(soup):
    """Return the text of every <li class="job-type"> element found in ``soup``.

    ``soup`` is the parsed page (the scraping loop passes a BeautifulSoup
    object). The texts are returned unmodified, in document order; the list
    is empty when no such element exists.
    """
    jobtype = []
    for item in soup.find_all('li', {'class': 'job-type'}):
        jobtype.append(item.text)
    return jobtype

def extract_full_desc(soup):
    """Return the text of every <div class="job-description"> element in ``soup``.

    ``soup`` is the parsed page of a single job posting. The texts are
    returned unmodified, in document order; the list is empty when the page
    contains no such element.
    """
    text = []
    for block in soup.find_all('div', {'class': 'job-description'}):
        text.append(block.text)
    return text

def extract_links(soup):
    """Return the ``href`` of every <a> nested inside a <div class="job-title">.

    Links are returned in document order. An anchor without an ``href``
    attribute raises ``KeyError``.
    """
    title_divs = soup.find_all(name='div', attrs={'class': 'job-title'})
    return [
        anchor['href']
        for title_div in title_divs
        for anchor in title_div.find_all('a')
    ]

def extract_company_from_result(soup): 
    """Return the whitespace-stripped text of every <li class="company"> element.

    One entry per matching element, in document order; an empty list when
    nothing matches.
    """
    company_items = soup.find_all(name="li", attrs={"class":"company"})
    return [item.text.strip() for item in company_items]

def extract_date_from_result(soup): 
    """Return the whitespace-stripped text of every <li class="date-posted"> element.

    One entry per matching element, in document order; an empty list when
    nothing matches.
    """
    date_items = soup.find_all('li', {'class': 'date-posted'})
    return [item.text.strip() for item in date_items]

def extract_location_from_result(soup): 
    """Collect the location shown for each job on a results page.

    For every <li class="location"> element, the child nodes of its first
    <a> tag are appended to the result, in document order. When an entry has
    no <a> tag at all, ``None`` is appended as a placeholder, so the scrape is
    not aborted by a ``TypeError`` and the entry still occupies a slot in the
    returned list.

    Returns a list; empty when no location element is present.
    """
    location = []
    for item in soup.find_all('li', {'class':'location'}):
        anchor = item.find('a')
        if anchor is None:
            # find() returns None when there is no match; iterating it would raise.
            location.append(None)
            continue
        # Iterating the anchor yields its child nodes (normally the text node).
        location.extend(anchor)

    return(location)

def extract_salary_from_result(soup): 
    """Return the text of every <li class="salary"> element in ``soup``.

    The text is kept unmodified (no stripping). If reading the text of an
    element fails with an ordinary exception, the placeholder
    ``"Nothing_found"`` is stored for that element, so the result always has
    one entry per matching element.
    """
    salaries = []
    for item in soup.find_all(name="li", attrs={"class":"salary"}):
        try:
            salaries.append(item.text)
        except Exception:
            # Best-effort fallback; unlike a bare ``except:`` this does not
            # swallow KeyboardInterrupt / SystemExit.
            salaries.append("Nothing_found")
    return(salaries)

def extract_job_title_from_result(soup): 
    """Return every <h2> element nested inside a <div class="job-title">.

    Note that the matched elements themselves are returned, not their text,
    in document order.
    """
    title_divs = soup.find_all(name="div", attrs={"class":"job-title"})
    return [
        heading
        for title_div in title_divs
        for heading in title_div.find_all(name="h2")
    ]

In [4]:
#scraping code:

# Search term used to find jobs
searchTerm="data"

# Column layout shared by every per-page frame and by the final result
COLUMNS = ['company','title','salary','location','date','full_description','jobtype']

# Number of result pages to request (page indices 0 .. N_PAGES - 1)
N_PAGES = 123

# One DataFrame per results page. They are concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame page by page
# copies all previously collected rows on every iteration.
page_frames = []

for i in range(0, N_PAGES):
    time.sleep(1) #ensuring at least 1 second between page grabs
    url = 'https://www.cwjobs.co.uk/jobs/'+searchTerm+'?s=header&page='+str(i)
    res = requests.get(url)
    # NOTE(review): no parser is named here, so bs4 chooses one itself;
    # consider pinning a parser if results must match across machines.
    soup = bs4.BeautifulSoup(res.content)

    # Fields that can be read directly from the results page
    df = pd.DataFrame(columns=COLUMNS)
    df['company'] = extract_company_from_result(soup)
    df['title'] = extract_job_title_from_result(soup)
    df['salary'] = extract_salary_from_result(soup)
    df['location'] = extract_location_from_result(soup)
    df['date'] = extract_date_from_result(soup)

    # Fields that require opening each individual job posting
    text_list = []
    type_list = []
    sub_urls = extract_links(soup)
    for j in sub_urls:
        res_sub = requests.get(j)
        soup_sub = bs4.BeautifulSoup(res_sub.content)
        text_list.append(extract_full_desc(soup_sub))
        type_list.append(extract_jobtype(soup_sub))

    df['full_description'] = text_list
    df['jobtype'] = type_list

    page_frames.append(df)

# Combine all pages into one frame with a fresh 0..n-1 index; fall back to an
# empty frame with the expected columns if no page was scraped.
if page_frames:
    ads = pd.concat(page_frames, ignore_index=True)
else:
    ads = pd.DataFrame(columns=COLUMNS)

# Write the result to <parent_folder>/data as a tab-separated file whose name
# carries the time of the scrape.
today = datetime.now().strftime('%Y_%m_%d_%H_%M')
ads.to_csv(parent_folder+'/data/cwjobs_'+today+'.csv', index=True, sep='\t')

In [18]:
# Display the scraped job ads collected in the `ads` DataFrame
ads

Unnamed: 0,company,title,salary,location,date,full_description,jobtype
0,JK Technosoft (UK) Ltd,[Big Data Analytics],Unspecified,Update,Recently,[\n\nRole: Big Data Anlytics ArchitectLocation...,[\nContract\n]
1,Explore Open Source,[Data Engineer],Unspecified,City of London,Posted 3 days ago,[\n\nAn exciting start-up are looking for high...,[\nPermanent\n]
2,McGregor Boyall,[MI Data Analytics Analyst],£40000 - £48000 per annum + Benefits,London,Expires in 1 day,"[\n\nMI Data Analytics AnalystOur client, a fi...",[\nPermanent\n]
3,ARC IT Recruitment Ltd,[Project Manager / Data Governance],Unspecified,Update,Expires today,"[\n\nProject Manager / Data GovernanceLondon, ...",[\nContract\n]
4,ARC IT Recruitment Ltd,[Project Manager / Data Governance],£Competitive + Bonus + Benefits,City of London,Expires today,"[\n\nProject Manager / Data GovernanceLondon, ...",[\nPermanent\n]
...,...,...,...,...,...,...,...
2455,Exponential-e Limited,[Trainee Junior Service Desk Technician – No e...,"£18,000 per annum",Update,Recently,[\n\nTrainee Junior Service Desk Technician – ...,[\nPermanent\n]
2456,Computacenter,[UC Principal Consultant],Unspecified,Update,Recently,[\n\r\n About usComputacenter is a lead...,[\nPermanent\n]
2457,BAE Systems,[Project Management Professional],Unspecified,New Malden,Recently,[\n\nProject Management ProfessionalWould you ...,[\nPermanent\n]
2458,Leonardo,[Senior/Principal RF/Microwave Electronics Eng...,Competitive,Update,Recently,[\n\nLeonardo is an international leader in th...,[\nPermanent\n]


In [11]:
# Count how often each distinct salary string occurs; the count for entries
# such as 'Unspecified' shows how many ads gave no salary figure
ads['salary'].value_counts()

Unspecified                                              360
Competitive                                               32
Market Rate                                               16
£50000 - £60000 per annum                                 15
Competitive Basic + Car Allowance + Benefits              14
                                                        ... 
£451.81 - £496.99 per day                                  1
£45000 - £55000 per annum + Car Allowance, Benefits        1
From £70,000 to £140,000 per annum + bonus + benefits      1
£95,000 per annum dependent on experience                  1
Up to £65000 per annum + +Bonus + Benefits                 1
Name: salary, Length: 1362, dtype: int64