Code for scraping from indeed.de

In [1]:
#import packages

import requests, bs4, time
import pandas as pd
import os.path
from datetime import datetime

In [2]:
# Resolve the working directory once, then derive its parent directory and
# its own folder name (the same pair os.path.split would return).
path = os.getcwd()
parent_folder = os.path.dirname(path)
current_folder = os.path.basename(path)

In [3]:
#define functions for parsing HTML

def extract_job_title_from_result(soup):
    """Return one job title per result row of a search-results page.

    Args:
        soup: Parsed page exposing ``find_all``; every ``div`` with class
            ``row`` is treated as one result row.

    Returns:
        list: For each row, the ``title`` attribute of its first
        ``<a data-tn-element="jobTitle">`` anchor, or ``"Nothing_found"``
        when the row has no such anchor or the anchor has no ``title``.
    """
    jobs = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        anchor = div.find(name="a", attrs={"data-tn-element": "jobTitle"})
        title = anchor.get("title") if anchor is not None else None
        # Always emit exactly one entry per row so this list stays the same
        # length as the other per-row extractors when used as a DataFrame column.
        jobs.append(title if title is not None else "Nothing_found")
    return jobs

def extract_salary_from_result(soup):
    """Return the salary text of every result row.

    Args:
        soup: Parsed page exposing ``find_all``; every ``div`` with class
            ``row`` is treated as one result row.

    Returns:
        list: One entry per row - the text of the row's first
        ``<span class="salaryText">``, or ``"Nothing_found"`` when the row
        has no such span.
    """
    salaries = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        span = div.find(name="span", attrs={"class": "salaryText"})
        # find() yields None for a missing element; test for that explicitly
        # instead of a bare except, which would also hide KeyboardInterrupt
        # and genuine programming errors.
        salaries.append(span.text if span is not None else "Nothing_found")
    return salaries

def extract_location_from_result(soup):
    """Return the location text of every result row.

    Args:
        soup: Parsed page exposing ``find_all``; every ``div`` with class
            ``row`` is treated as one result row.

    Returns:
        list: One entry per row - the text of the row's first ``<span>``
        whose class is ``location accessible-contrast-color-location``, or
        ``"Nothing_found"`` when the row has no such span.
    """
    locations = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        span = div.find("span", attrs={"class": "location accessible-contrast-color-location"})
        # Explicit None check replaces the bare except so that only the
        # "element missing" case falls back to the placeholder.
        locations.append(span.text if span is not None else "Nothing_found")
    return locations

def extract_description_from_result(soup):
    """Return the summary text of every result row.

    Args:
        soup: Parsed page exposing ``find_all``; every ``div`` with class
            ``row`` is treated as one result row.

    Returns:
        list: One entry per row - the text of the row's first
        ``<div class="summary">``, or ``"Nothing_found"`` when the row has
        no such element.
    """
    description = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        summary = div.find("div", attrs={"class": "summary"})
        # Explicit None check replaces the bare except so that only the
        # "element missing" case falls back to the placeholder.
        description.append(summary.text if summary is not None else "Nothing_found")
    return description

def extract_date_from_result(soup):
    """Return the posting-date text of every result row.

    Args:
        soup: Parsed page exposing ``find_all``; every ``div`` with class
            ``row`` is treated as one result row.

    Returns:
        list: One entry per row - the text of the row's first
        ``<span class="date">``, or ``"Nothing_found"`` when the row has no
        such span.
    """
    date = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        span = div.find("span", attrs={"class": "date"})
        # Explicit None check replaces the bare except so that only the
        # "element missing" case falls back to the placeholder.
        date.append(span.text if span is not None else "Nothing_found")
    return date

def extract_company_from_result(soup):
    """Return the company text of every result row.

    Args:
        soup: Parsed page exposing ``find_all``; every ``div`` with class
            ``row`` is treated as one result row.

    Returns:
        list: One entry per row - the text of the row's first
        ``<span class="company">`` (returned as-is, surrounding whitespace
        included), or ``"Nothing_found"`` when the row has no such span.
    """
    company = []
    for div in soup.find_all(name="div", attrs={"class": "row"}):
        span = div.find("span", attrs={"class": "company"})
        # Explicit None check replaces the bare except so that only the
        # "element missing" case falls back to the placeholder.
        company.append(span.text if span is not None else "Nothing_found")
    return company

def extract_links(soup):
    """Build absolute ad URLs from the job-title anchors of a results page.

    Every ``<a>`` with class ``jobtitle turnstileLink`` contributes its
    ``href`` (converted with ``str``) appended to the site prefix.
    """
    prefix = 'https://de.indeed.com'
    anchors = soup.find_all(name='a', attrs={'class':'jobtitle turnstileLink'})
    return [prefix + str(anchor['href']) for anchor in anchors]

def extract_full_desc(soup):
    """Collect the text of every ``<div id="jobDescriptionText">`` on a page.

    Returns a list with one string per matching element (empty when the
    page has none).
    """
    text = []
    for block in soup.find_all(name="div", attrs={"id": "jobDescriptionText"}):
        text.append(block.text)
    return text


def extract_headlines_from_result(soup):
    """Collect a job page's metadata-header labels into a one-row DataFrame.

    The previous body returned an undefined name (``salaries``), shadowed the
    builtin ``list`` and never used the scraped labels, so every call raised
    ``NameError``.

    Args:
        soup: Parsed job page exposing ``find_all``.

    Returns:
        pandas.DataFrame: A single row with the columns ``location``,
        ``type`` and ``salary``. Columns without a corresponding label hold
        ``"Nothing_found"``.
    """
    columns = ["location", "type", "salary"]
    labels = [
        x.text
        for x in soup.find_all(
            name="span",
            attrs={"class": "jobsearch-JobMetadataHeader-iconLabel"},
        )
    ]
    # NOTE(review): labels are mapped to columns by position, i.e. this
    # assumes the page lists location, type, salary in that order - confirm
    # against real pages before relying on the column meanings.
    row = {
        col: labels[idx] if idx < len(labels) else "Nothing_found"
        for idx, col in enumerate(columns)
    }
    headlines = pd.DataFrame([row], columns=columns)
    return headlines


In [6]:
#scraping code:

#decide what search term to use for finding jobs
searchTerm="data"

# column layout shared by every per-page frame and by the combined result
AD_COLUMNS = ['company','title','salary','location','description','date','full_description','url']

# values of the `start` query parameter to request (end is exclusive)
# NOTE(review): `start` is advanced by 1 per request; the saved output shows
# the same ad at more than one index, so consecutive values may return
# overlapping results - confirm whether a larger step was intended.
START_FROM = 701
START_UNTIL = 900

# seconds to wait for any single HTTP response before giving up, so a stalled
# connection cannot hang the cell indefinitely
REQUEST_TIMEOUT = 30

#create empty data frame with column headers
ads=pd.DataFrame(columns=AD_COLUMNS)

# loop for scraping
# Per-page frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
page_frames = []
try:
    for i in range(START_FROM, START_UNTIL):
        time.sleep(1) #ensuring at least 1 second between page grabs
        res = requests.get(
            "https://de.indeed.com/Jobs",
            # passing params lets requests URL-encode the search term
            params={"q": searchTerm, "filter": 0, "start": i},
            timeout=REQUEST_TIMEOUT,
        )
        soup = bs4.BeautifulSoup(res.content)

        # one list per column; pandas raises ValueError if their lengths differ
        df = pd.DataFrame(
            {
                'company': extract_company_from_result(soup),
                'title': extract_job_title_from_result(soup),
                'salary': extract_salary_from_result(soup),
                'location': extract_location_from_result(soup),
                'description': extract_description_from_result(soup),
                'date': extract_date_from_result(soup),
            }
        )

        # follow every ad link on the page to fetch its full description
        sub_urls = extract_links(soup)
        text_list = []
        for j in sub_urls:
            res_sub = requests.get(j, timeout=REQUEST_TIMEOUT)
            soup_sub = bs4.BeautifulSoup(res_sub.content)
            text_list.append(extract_full_desc(soup_sub))

        df['full_description'] = text_list
        df['url'] = sub_urls

        # skip pages without results so concat only sees populated frames
        if not df.empty:
            page_frames.append(df)
finally:
    # Build `ads` even when the loop is interrupted or fails part-way, so the
    # pages scraped so far remain available to later cells.
    if page_frames:
        ads = pd.concat(page_frames, ignore_index=True)

today = datetime.now().strftime('%Y_%m_%d_%H_%M')
out_dir = os.path.join(parent_folder, 'data')
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories
ads.to_csv(os.path.join(out_dir, 'indeed_de_'+today+'.csv'), index=True, sep='\t')

In [5]:
# Display the combined `ads` DataFrame as this cell's output.
ads

Unnamed: 0,company,title,salary,location,description,date,full_description,url
0,\nAnalytics Academy - The Information Lab Deut...,"Analytics Academy - Junior Data Analyst, m/w/d",Nothing_found,Nothing_found,\n Die Analytics Academy bildet die nächste Ge...,Nothing_found,[Die Analytics Academy– dein Karriere-Sprungbr...,https://de.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1,\nfos4X GmbH,Data Architect (m/w/d) (Domain Wind),Nothing_found,Nothing_found,\n Übernahme der Verantwortung für die Skalier...,vor 1 Tag,"[Über uns\nWir sind ein junges, stark wachsend...",https://de.indeed.com/pagead/clk?mo=r&ad=-6NYl...
2,\n\nUniper,Data Steward (m/w/d),Nothing_found,Nothing_found,"\n Front Office, Risk, und Senior Management) ...",Nothing_found,[Wofür wir jemanden suchen\nAls Data Steward b...,https://de.indeed.com/pagead/clk?mo=r&ad=-6NYl...
3,\nSpark Radiance GmbH,Programmierer (m/w/d) Datenmanagement - ETL / ...,Nothing_found,Nothing_found,\n Programmierer (m/w/d) Datenmanagement – ETL...,Nothing_found,[Spark Radiance ist die digitale Innovationssc...,https://de.indeed.com/pagead/clk?mo=r&ad=-6NYl...
4,\n\nUniversitätsklinikum Carl Gustav Carus Dre...,Data Warehouse Entwickler (w/m/d),Nothing_found,Nothing_found,\n Entwicklung von ETL-Prozessen zur Integrati...,Nothing_found,[Jobs mit Aussicht\nWir sorgen für eine zuverl...,https://de.indeed.com/pagead/clk?mo=r&ad=-6NYl...
...,...,...,...,...,...,...,...,...
4118,\nLufthansa Innovation Hub,Partnership Manager Asia (m/f/d),Nothing_found,Berlin,"\n Customer reach, brand, data assets, infrast...",vor 5 Monaten,[About us:\n\nThe Lufthansa Innovation Hub (LI...,https://de.indeed.com/rc/clk?jk=3c4c3a58969461...
4119,\n\nHolidu GmbH,Data Scientist - Rankings (f/m/d),Nothing_found,München,\n In-house Data Warehouse built on ElasticSea...,vor 30+ Tagen,[Holidu is a travel & tech start-up building t...,https://de.indeed.com/company/Holidu/jobs/Data...
4120,\nOn AG,Sales Representative Lifestyle,Nothing_found,Berlin,\n Durchführen von Sales Präsentation mit On M...,vor 30+ Tagen,[Auf einen Blick:\nSneaker Heads aufgepasst! W...,https://de.indeed.com/rc/clk?jk=daaf2220e276a6...
4121,\nfos4X GmbH,Data Architect (m/w/d) (Domain Wind),Nothing_found,Nothing_found,\n Übernahme der Verantwortung für die Skalier...,vor 1 Tag,"[Über uns\nWir sind ein junges, stark wachsend...",https://de.indeed.com/pagead/clk?mo=r&ad=-6NYl...


In [7]:
# Shape of the subset of ads whose salary column holds the "Nothing_found"
# placeholder; the first element is the number of ads without a salary.
ads.loc[ads['salary'] == "Nothing_found"].shape

(157, 7)

In [249]:
#print data to csv
#ads.to_csv(r"data\indeed_scrape_data.csv")

In [14]:
# Inspect the scraping loop's counter `i`, i.e. the last `start` value the
# loop reached in this kernel session.
i

461

In [15]:
# Save the current `ads` frame as a tab-separated file named after the
# current timestamp, inside the `data` folder next to the working directory.
today = datetime.now().strftime('%Y_%m_%d_%H_%M')
out_dir = os.path.join(parent_folder, 'data')
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories
ads.to_csv(os.path.join(out_dir, 'indeed_de_'+today+'.csv'), index=True, sep='\t')