In [1]:
import os
from tqdm import tqdm
# Report the directory the kernel runs in; the relative path used for the CSV export resolves against it.
workingDir = os.getcwd()
print("Current Working Directory ", workingDir)

Current Working Directory  /Users/leducanh/Documents/pythonProject


In [2]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd


class Scraper:
    """Crawl a job board in four stages and collect one record per job offer.

    The stages must be called in this order, because each one consumes what the
    previous one stored in ``self.links``:

    1. ``getUrlByCat`` - category URLs found on the main page
    2. ``getPages``    - every listing page of every category (follows "Next")
    3. ``getJobLinks`` - URLs of the individual job offers
    4. ``getData``     - the fields of each offer, stored column-wise in ``self.data``

    The CSS selectors and class names are tied to the markup of the scraped site
    (the in-code example URLs point at nofluffjobs.com); when the markup does not
    match, the affected field is stored as ``None`` instead of raising.

    Attributes:
        links: dict with 'categories' (list of URLs), 'pages' (category name ->
            list of listing-page URLs) and 'jobOffers' (list of offer URLs).
        data: dict of equally long lists, one list per output column.
        mainUrl: base URL that every relative href is appended to.
        categList: category names (last path segment of each category URL).
    """

    # Seconds to pause between consecutive requests in each crawl stage.
    PAGE_DELAY = 0.2
    LISTING_DELAY = 0.1
    OFFER_DELAY = 0.2
    # Seconds before an unresponsive request is abandoned; without an explicit
    # timeout `requests` waits indefinitely.
    REQUEST_TIMEOUT = 30

    # Anchors of the job offers on a listing page.
    _OFFER_LINK_SELECTOR = 'body > nfj-root > nfj-layout > nfj-main-content > div > nfj-postings-search > div > common-main-loader > div > nfj-search-results > nfj-postings-list > div.list-container.ng-star-inserted>a'
    # n-th item of the details list on an offer page
    # (1 = category + programming language, 2 = seniority, 3 = working language).
    _DETAIL_ITEM_SELECTOR = 'body > nfj-root > nfj-layout > nfj-main-content > div > nfj-posting-details > common-main-loader > main > article > div.col.mobile-no-padding > common-posting-content-wrapper > div.border > section> ul > li:nth-child({position})'

    def __init__(self, mainUrl):
        """Create an empty scraper for the site rooted at ``mainUrl`` (no request is made)."""
        self.links = {
            'categories': [],
            'pages': {},
            'jobOffers': []
        }
        self.data = {
            'jobTitles': [],
            'company': [],
            'salaryRange': [],
            'Bonus': [],
            'category+progLang': [],
            'seniority': [],
            'workLanguage': [],
            'location': [],
            'remote': []
        }
        self.mainUrl = mainUrl
        self.categList = []

    def _getSoup(self, url):
        """Download ``url`` and return its parsed document."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'lxml')

    @staticmethod
    def _textOf(node, child=None):
        """Return the text of ``node`` (or of ``node.<child>``), or None if either is missing."""
        if node is not None and child is not None:
            node = getattr(node, child, None)
        return None if node is None else node.text

    @staticmethod
    def _firstSelectedText(soup, selector):
        """Return the text of the first element matching ``selector``, or None if nothing matches."""
        matches = soup.select(selector)
        return matches[0].text if matches else None

    def getUrlByCat(self):
        """Fill ``links['categories']`` and ``categList`` from the main page."""
        soup = self._getSoup(self.mainUrl)
        linksByCat = soup.find_all('div',
                                   class_="d-flex justify-content-between align-items-center list-title-wrapper ng-star-inserted")

        # The first matching block is skipped (as before); blocks without a
        # usable anchor are ignored instead of raising.
        urlList = [self.mainUrl + block.a['href']
                   for block in linksByCat[1:]
                   if block.a is not None and block.a.get('href')]

        # removing redundant links in urlList
        self.links['categories'] = [link for link in urlList if '?criteria' not in link]

        # category names are used as the keys of links['pages']
        self.categList = [link.rsplit('/', 1)[-1] for link in self.links['categories']]
        print('categorical urls success')

    def getPages(self):
        """Fill ``links['pages']`` with every listing page of every category.

        Each category starts with its first page, e.g.
        ``{'backend': ['https://nofluffjobs.com/cz/it-prace/backend'], ...}``;
        the href of the "Next" link is then followed until a page has no such
        link, or the link leads to a page that was already collected.
        """
        self.links['pages'] = {k: [v] for v, k in zip(self.links['categories'], self.categList)}

        for pageUrls in self.links['pages'].values():
            while True:
                # One download per page: the newest URL is fetched only to
                # look for its "Next" link.
                soup = self._getSoup(pageUrls[-1])
                nextAnchor = soup.find('a', {'aria-label': 'Next'})
                nextHref = nextAnchor.get('href') if nextAnchor is not None else None
                if not nextHref:
                    break
                pageLink = self.mainUrl + nextHref
                if pageLink in pageUrls:
                    # Pagination points back at a known page; stop instead of looping forever.
                    break
                pageUrls.append(pageLink)
                time.sleep(self.PAGE_DELAY)
        print('page urls success')

    def getJobLinks(self):
        """Fill ``links['jobOffers']`` with the offer URLs found on every listing page.

        The list is rebuilt from scratch, so calling the method again does not
        duplicate entries.
        """
        self.links['jobOffers'] = []
        for pageUrls in self.links['pages'].values():
            for link in pageUrls:
                time.sleep(self.LISTING_DELAY)
                soup = self._getSoup(link)
                for anchor in soup.select(self._OFFER_LINK_SELECTOR):
                    href = anchor.get('href')
                    if href:  # an anchor without href cannot be followed
                        self.links['jobOffers'].append(self.mainUrl + href)
        print('job sites url success')

    def _parseOffer(self, soup):
        """Extract every output column from one parsed offer page.

        Returns a dict holding all keys of ``self.data``; a field whose element
        is not present on the page is None.
        """
        header = soup.find('div',
                           class_="posting-details-description d-flex align-items-center align-items-lg-start flex-column justify-content-center justify-content-lg-start")
        categoryProgLang, seniority, workLang = (
            self._firstSelectedText(soup, self._DETAIL_ITEM_SELECTOR.format(position=position))
            for position in (1, 2, 3)
        )
        return {
            'jobTitles': self._textOf(header, 'h1'),
            'company': self._textOf(header, 'a'),
            'salaryRange': self._textOf(soup.find('div', class_="salary"), 'h4'),
            'Bonus': self._textOf(soup.find('common-postings-bonus'), 'a'),
            'category+progLang': categoryProgLang,
            'seniority': seniority,
            'workLanguage': workLang,
            'location': self._textOf(soup.find('common-posting-locations')),
            'remote': self._textOf(soup.find('li', class_="remote")),
        }

    def getData(self):
        """Download every offer in ``links['jobOffers']`` and store its fields in ``self.data``.

        Exactly one value (possibly None) is appended to every column per offer,
        so the columns always have equal length. The columns are emptied first,
        so calling the method again does not duplicate rows.
        """
        for column in self.data.values():
            column.clear()
        for link in tqdm(self.links['jobOffers']):
            time.sleep(self.OFFER_DELAY)
            record = self._parseOffer(self._getSoup(link))
            for name, column in self.data.items():
                column.append(record[name])
        print('data extraction success')

In [3]:
mainUrl = 'https://nofluffjobs.com'
site = Scraper(mainUrl)

# Every stage reads what the previous one stored on `site`, so the order is fixed:
# category URLs -> listing pages -> offer links -> offer details.
for crawlStage in (site.getUrlByCat, site.getPages, site.getJobLinks, site.getData):
    crawlStage()




categorical urls success
page urls success
job sites url success


100%|██████████| 63/63 [01:19<00:00,  1.26s/it]

data extraction success





In [4]:
# site.data maps column name -> list of values (one entry per scraped offer).
df = pd.DataFrame.from_dict(site.data)


In [5]:
# Preview the first five scraped offers.
df.head(n=5)

Unnamed: 0,jobTitles,company,salaryRange,Bonus,category+progLang,seniority,workLanguage,location,remote
0,Backend Engineer - SaaS Platform,SentinelOne,100k - 200k CZK,,"Kategorie: Backend , Golang","Mid, Senior",Pracovní jazyk:Angličtina,"Prague, Brno • Na dálku •Prague, Karolinská 7...",Na dálku
1,SW Engineer-Scala/Go: Realtime Detection,SentinelOne,100k - 200k CZK,,"Kategorie: Backend , scala","Mid, Senior",Pracovní jazyk:Angličtina,"Prague, Brno • Na dálku •Prague, Karolinská 7...",Na dálku
2,Senior Java Developer (Big Data),SentinelOne,100k - 200k CZK,,"Kategorie: Backend , java","Mid, Senior",Pracovní jazyk:Angličtina,"Prague, Brno • Na dálku •Prague, Karolinská 7...",Na dálku
3,Kotlin Developer,"Onlio, a.s.",46.2k - 92.4k CZK,,"Kategorie: Backend , java",Mid,Pracovní jazyk:Čeština,"Hybrid +1 •Praha 7, U garáží 1611/1 40% na m...",
4,Java Developer,Neovia,140k - 200k CZK,,"Kategorie: Backend , java","Mid, Senior",Pracovní jazyk:Čeština,Hybrid +1 •Praha,


In [6]:
# Relative path: the file is written into the current working directory.
outputCsv = 'jobOffers-22.08.csv'
df.to_csv(outputCsv)