In [2]:
import time
from random import randint

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm


In [2]:
# Build the list of paginated search-result URLs that will be crawled.
#
# `template` is the search URL for the query and `base_url` is a plain copy of it;
# the page number is attached per URL as the query parameter '&p=<page>'.
#
# `range(1, page_numbers)` stops *before* `page_numbers`, so with
# page_numbers = 15 the pages 1..14 are generated (14 URLs in total).
#
# `headers` holds a browser-like User-Agent string for HTTP requests.

page_numbers = 15
template = 'https://www.datacareer.de/jobs/?q=data&l=&locale=de'
base_url = template
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

# One URL per result page, in ascending page order.
search_urls = [f'{base_url}&p={page}' for page in range(1, page_numbers)]

print(F'{len(search_urls)} search urls created')


14 search urls created


In [3]:
def get_job_links(data, headers=None, timeout=30):
    """Collect job-posting links from a sequence of search-result pages.

    Every URL in ``data`` is fetched with ``requests.get`` and parsed with
    BeautifulSoup (``lxml`` parser). For each ``div`` whose class is
    ``media-heading listing-item__title`` the ``href`` of its first
    ``<a href=...>`` element is collected, in page order.

    Parameters
    ----------
    data : iterable of str
        URLs of the search-result pages to fetch.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get`` (for example a
        User-Agent). ``None`` (the default) sends the library defaults.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up, so a
        stalled connection cannot hang the whole crawl.

    Returns
    -------
    list of str
        The extracted ``href`` values. Duplicates are not removed.
    """
    links = []
    for url in tqdm(data):
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'lxml')
        title_divs = soup.find_all('div', class_='media-heading listing-item__title')
        for title_div in title_divs:
            anchor = title_div.find('a', href=True)
            # A title block without a link would make anchor['href'] raise
            # TypeError; skip it instead of aborting the crawl.
            if anchor is not None:
                links.append(anchor['href'])
        # Throttle once per HTTP request. Extracting links from an already
        # downloaded page contacts no server, so no pause is needed per link.
        time.sleep(randint(1, 5))
    return links
# Crawl every search-result page and gather the job-posting links found on them.
# NOTE(review): the `headers` dict defined earlier in the notebook is not passed along here.
links = get_job_links(data=search_urls)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for url in tqdm_notebook(data):


  0%|          | 0/14 [00:00<?, ?it/s]

In [4]:
# Remove duplicate job links before the detail pages are scraped.
#
# dict.fromkeys keeps the first occurrence of every link in its original
# (crawl) order, so the result is the same on every run. Going through a set
# would also de-duplicate, but a set has no defined order, so the list order
# (and the examples printed below) could differ between kernel sessions.
# Re-running this cell is harmless: an already unique list is left unchanged.
links = list(dict.fromkeys(links))
print(F'{len(links)} links found')
print(F'Exemples {links[:4]}')

180 links found
Exemples ['https://www.datacareer.de/job/10351/chief-data-governance-officer-m-w-d-master-data-management/', 'https://www.datacareer.de/job/10267/data-analyst-m-w-d/', 'https://www.datacareer.de/job/10276/business-data-analyst-level-1-m-f-d/', 'https://www.datacareer.de/job/10392/data-analyst-m-f-d/']


In [5]:
def _text_or_none(parent, name, class_name=None):
    """Return the stripped text of the first matching child tag, or None if absent."""
    if class_name is None:
        node = parent.find(name)
    else:
        node = parent.find(name, class_=class_name)
    return node.text.strip() if node is not None else None


def parsing_jobs_html(input, headers=None, delay=20, timeout=30):
    """Download job-posting pages and extract one record per posting.

    Every link in ``input`` is fetched with ``requests.get`` and parsed with
    BeautifulSoup (``lxml`` parser). For each ``div`` with class
    ``listing-results`` on the page a dictionary with the keys ``jobTitel``,
    ``Company``, ``Location``, ``jobDescription`` and ``postDate`` is built
    from the stripped text of the corresponding tags.

    Parameters
    ----------
    input : iterable of str
        URLs of the job-posting pages. The name shadows the built-in
        ``input`` but is kept so existing callers keep working.
    headers : dict, optional
        HTTP headers forwarded to ``requests.get``. ``None`` (the default)
        sends the library defaults.
    delay : float, optional
        Seconds to pause after each request (default 20).
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    list of dict
        One dictionary per ``listing-results`` container found. A field
        whose tag is missing from the page is ``None`` instead of raising
        ``AttributeError``, so a single incomplete posting does not discard
        the records already collected.
    """
    data = []
    for link in tqdm(input):
        response = requests.get(link, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'lxml')
        for job in soup.find_all('div', class_='listing-results'):
            data.append({
                'jobTitel': _text_or_none(job, 'h1'),
                'Company': _text_or_none(job, 'li', 'listing-item__info--item-company'),
                'Location': _text_or_none(job, 'li', 'listing-item__info--item-location'),
                'jobDescription': _text_or_none(job, 'div', 'details-body__content'),
                'postDate': _text_or_none(job, 'li', 'listing-item__info--item-date'),
            })
        # Throttle once per HTTP request, whether or not the page contained
        # a posting, so consecutive requests are always spaced out.
        time.sleep(delay)
    return data
# Scrape every collected job link; the result is a list with one dict per posting.
example_html_parsing = parsing_jobs_html(input=links)

# Turn the list of records into a DataFrame (one row per posting, one column per key).
df_parsing_jobs_html = pd.DataFrame(data=example_html_parsing)

# Preview the first five rows (head() defaults to n=5).
df_parsing_jobs_html.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for link in tqdm_notebook(input):


  0%|          | 0/180 [00:00<?, ?it/s]

Unnamed: 0,jobTitel,Company,Location,jobDescription,postDate
0,(Chief) Data Governance Officer (m/w/d) - Mast...,ROSEN,"Lingen, Germany",Am Standort in Lingen (Ems) oder Osnabrück suc...,23/05/2022
1,Data Analyst (m/w/d),BD,"Kelberg, Germany",Job Description Summary\r\nBD Rowa stands for ...,04/05/2022
2,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,"Berlin, Germany",Job Description\n\n\nemployment type\nfull tim...,06/05/2022
3,Data Analyst (m/f/d),Arvato infoscore GmbH,"Extersche Straße 33415, Bad Salzuflen, Germany",Job Description\n\n\nemployment type\nfull tim...,30/05/2022
4,Business Analyst / Data Scientist (m/w/d),FUNKE MEDIENGRUPPE,"Essen, Germany",Job Description\n\n\nemployment type\nfull tim...,09/05/2022


In [6]:
# Show the first five scraped postings again (head() defaults to n=5).
df_parsing_jobs_html.head()

Unnamed: 0,jobTitel,Company,Location,jobDescription,postDate
0,(Chief) Data Governance Officer (m/w/d) - Mast...,ROSEN,"Lingen, Germany",Am Standort in Lingen (Ems) oder Osnabrück suc...,23/05/2022
1,Data Analyst (m/w/d),BD,"Kelberg, Germany",Job Description Summary\r\nBD Rowa stands for ...,04/05/2022
2,Business Data Analyst – Level 1 (m/f/d),ICE International Copyright Enterprise Germany...,"Berlin, Germany",Job Description\n\n\nemployment type\nfull tim...,06/05/2022
3,Data Analyst (m/f/d),Arvato infoscore GmbH,"Extersche Straße 33415, Bad Salzuflen, Germany",Job Description\n\n\nemployment type\nfull tim...,30/05/2022
4,Business Analyst / Data Scientist (m/w/d),FUNKE MEDIENGRUPPE,"Essen, Germany",Job Description\n\n\nemployment type\nfull tim...,09/05/2022


In [7]:
# Save the scraped postings to a CSV file; the DataFrame index is not written.
# NOTE(review): the target is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
df_parsing_jobs_html.to_csv(
    path_or_buf='C:/Users/Aleksej Aikov/Desktop/Enablement/Master/01_Scraping/Scraped data/website_1/jobs_website_1_raw.csv',
    index=False,
)
