# Importing Libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
from datetime import datetime, timedelta
from random import randint
from tqdm import tqdm_notebook
import json
import re
import numpy as np


In [9]:
# This code opens the HTML file "jobs.html" from the given path and reads it into a BeautifulSoup object.

# The "with open" statement ensures that the file is properly closed after the nested block of code is executed.

# The "encoding" parameter specifies the character encoding of the file, in this case "utf8".

# The "soup" variable stores the BeautifulSoup object containing the parsed HTML from the "jobs.html" file.


with open("Master/01_Scraping/example_json_scraping_it_jobs/jobs.html",encoding="utf8") as file:
    soup = BeautifulSoup(file, 'html.parser')

In [19]:
# This code defines a function called 'extracting_links_from_html' that takes one input, 'soup'.

# This function will create an empty list called 'links'.

# This function will then loop through every 'a' tag in 'soup'. For each 'a' tag, it will take the 'href' attribute and append it to the 'links' list.

# This function will then convert the 'links' list into a set (which removes any duplicates), and then convert it back into a list.

# This function will then return the list of links.

# This code then calls the 'extracting_links_from_html' function with 'soup' as the input, and saves the output to a new variable called 'links'.

# This code then prints the length of the 'links' list and the first two elements in the list.



def extracting_links_from_html(soup):
    links = []
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    links = list(set(links))
    return links   
links = extracting_links_from_html(soup)
print(len(links))
print(links[:2])

151
['https://it-jobs.de/data-analytics-ts-manager-sm-in-frankfurt-am-main_jpr-8509e9c9d44c9fd90729beaf7c41c036/', 'https://it-jobs.de/senior-data-analyst-in-hamburg_jpr-5385db2d04c604e1767a60b590f86171/']


In [21]:

# The code is iterating through the "links" list and if an element in the "links" list is also in the "bad_links" list, it removes that element from the "links" list. 

# The "final_links" variable is then set to the modified "links" list. The code then prints the length of the "final_links" list.


bad_links = ['https://trello.com/b/sMTo6oQp/open-source-licenses','https://it-jobs.de/', 'https://it-jobs.de/achievements/','https://it-jobs.de/join-the-fundrace/','https://it-jobs.de/arbeitgeber/','None','https://it-jobs.de/stellenangebote/unternehmen/']

for element in links:
    if element in bad_links:
        links.remove(element)
final_links = links
print(len(final_links))

145


In [17]:
# The code is importing the BeautifulSoup and tqdm modules, which are both libraries that help make web scraping easier. It is also importing the requests, json, re, datetime, and time modules. 
#
# The requests module allows you to send HTTP requests using Python, the json module allows you to work with JSON data, the re module allows you to use regular expressions, the datetime module allows you to work with dates and times, 
# and the time module allows you to work with time-related functions.

# The code is defining a function called extracting_job_postings. This function takes in an input (which in this case is a list of links to job postings) and returns a list of dictionaries. Each dictionary in the list represents a job posting, 
# and each key in the dictionary represents a piece of information about the job posting (e.g. job title, company, location, etc.).

# The code is creating an empty list called data. This list will be populated with dictionaries (one dictionary for each job posting).

# The code is looping through each link in the input list. For each link, the code is making an HTTP request to the link using the requests module. It is then using the BeautifulSoup module to parse the HTML of the job posting page.

# The code is trying to extract the job title, company, location, job description, and post date from the job posting page. If it is unable to extract this information (e.g. because the job posting page is not in the correct format), it will set the value to NaN.

# The code is creating a dictionary for each job posting with the extracted information. It is then adding this dictionary to the data list.

# The code is returning the data list.




def extracting_job_postings(input):
    
    data = []

    for element in tqdm_notebook(input):
        r=requests.get(element)
        soup = BeautifulSoup(r.text, 'html.parser')
        try:
            parsed_data = json.loads(soup.find('script',type='application/ld+json').string)
        except:
            parsed_data = np.nan
        regex_tags_remover = re.compile('<.*?>')
        try:
            jobTitle = parsed_data['title']
        except:
            jobTitle = np.nan
        try:
            Company = parsed_data['hiringOrganization']['name']
        except:
            Company = np.nan
        try:
            Location = parsed_data['jobLocation'][0]['address']['addressLocality']
        except:
            Location = np.nan
        try:
            jobDescription = re.sub(r'[\n\t]*','',regex_tags_remover.sub('',parsed_data['description']))
        except:
            jobDescription = np.nan
        try:
            postDate = datetime.strptime(parsed_data['datePosted'],'%Y-%m-%d').strftime('%d/%m/%Y')
        except:
            postDate = np.nan

        data_dictionary = {
            'jobTitle': jobTitle,
            'Company': Company,
            'Location': Location,
            'jobDescription': jobDescription,
            'postDate': postDate
            }
        data.append(data_dictionary)
        time.sleep(randint(1,5))
    return data

jobs_from_it_jobs = extracting_job_postings(final_links)
jobs_from_it_jobs_df = pd.DataFrame(jobs_from_it_jobs)
jobs_from_it_jobs_df.head(5)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for element in tqdm_notebook(final_links):


  0%|          | 0/373 [00:00<?, ?it/s]

In [None]:
# Saving the dataframe to a csv file.

jobs_from_it_jobs_df.to_csv('C:/Users/Aleksej Aikov/Desktop/Enablement/Master/01_Scraping/Scraped data/website_2/jobs_website_2.csv', index=False)