In [1]:
import os
from tqdm import tqdm
# Report the directory the kernel runs in; relative paths used later in the
# notebook (e.g. the CSV export) resolve against it.
print(" ".join(("Current Working Directory ", os.getcwd())))

Current Working Directory  /Users/leducanh/Documents/pythonProject


In [2]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd


class Scraper:
    '''Scrape job offers from a job board reachable under ``mainUrl``.

    The scrape is a four-stage pipeline; each stage reads what the previous one stored:

        1. getUrlByCat -> links['categories'] and categList
        2. getPages    -> links['pages']
        3. getJobLinks -> links['jobOffers']
        4. getData     -> data

    ``data`` is a dict of lists with one entry per scraped offer in every list, so it can be
    handed directly to ``pandas.DataFrame``.
    '''

    # Seconds to wait for an HTTP response before ``requests`` raises a timeout error.
    REQUEST_TIMEOUT = 30

    # Pauses (in seconds) between consecutive requests, kept per stage so they can be tuned
    # (or set to 0 in a subclass) without touching the crawling logic.
    PAGE_DELAY = 0.2
    JOB_LINK_DELAY = 0.1
    OFFER_DELAY = 0.2

    # CSS class of the <div> wrapping each category link on the main page.
    CATEGORY_DIV_CLASS = "d-flex justify-content-between align-items-center list-title-wrapper ng-star-inserted"

    # Selector for the <a> tags of the individual job offers on a listing page.
    JOB_LINK_SELECTOR = 'body > nfj-root > nfj-layout > nfj-main-content > div > nfj-postings-search > div > common-main-loader > div > nfj-search-results > nfj-postings-list > div.list-container.ng-star-inserted>a'

    # CSS class of the <div> holding the job title (<h1>) and the company (<a>) of an offer.
    TITLE_DIV_CLASS = "posting-details-description d-flex align-items-center align-items-lg-start flex-column justify-content-center justify-content-lg-start"

    # Selector template for the n-th <li> of an offer's basic-info list:
    # 1 = category + programming language, 2 = seniority, 3 = working language.
    INFO_ITEM_SELECTOR = 'body > nfj-root > nfj-layout > nfj-main-content > div > nfj-posting-details > common-main-loader > main > article > div.col.mobile-no-padding > common-posting-content-wrapper > div.border > section> ul > li:nth-child({})'

    def __init__(self, mainUrl):
        '''Create an empty scraper.

        mainUrl: base URL of the site; every relative href found while scraping is appended to it.
        '''
        self.links = {
            'categories': [],
            'pages': {},
            'jobOffers': []
        }
        self.data = {
            'jobTitles': [],
            'company': [],
            'salaryRange': [],
            'Bonus': [],
            'category+progLang': [],
            'seniority': [],
            'workLanguage': [],
            'location': [],
            'remote':[]
        }
        self.mainUrl = mainUrl
        self.categList = []

    def _fetchSoup(self, url):
        '''Download ``url`` and return its HTML parsed with the lxml parser.'''
        # Without a timeout a stalled connection would block the whole scrape indefinitely.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'lxml')

    @staticmethod
    def _nodeText(node):
        '''Return ``node.text``, or None when the element was not found.'''
        return None if node is None else node.text

    @classmethod
    def _childText(cls, node, childName):
        '''Return the text of the first ``childName`` tag inside ``node``, or None if either is missing.'''
        if node is None:
            return None
        return cls._nodeText(getattr(node, childName, None))

    def _infoItemText(self, soup, position):
        '''Return the text of the ``position``-th basic-info <li>, or None when the offer lacks it.'''
        matches = soup.select(self.INFO_ITEM_SELECTOR.format(position))
        return matches[0].text if matches else None

    def _parseOffer(self, soup):
        '''Extract one value per ``self.data`` column from a parsed offer page (None where absent).'''
        header = soup.find('div', class_=self.TITLE_DIV_CLASS)
        salary = soup.find('div', class_="salary")
        bonus = soup.find('common-postings-bonus')
        location = soup.find('common-posting-locations')
        remote = soup.find('li', class_="remote")
        return {
            'jobTitles': self._childText(header, 'h1'),
            'company': self._childText(header, 'a'),
            'salaryRange': self._childText(salary, 'h4'),
            'Bonus': self._childText(bonus, 'a'),
            'category+progLang': self._infoItemText(soup, 1),
            'seniority': self._infoItemText(soup, 2),
            'workLanguage': self._infoItemText(soup, 3),
            'location': self._nodeText(location),
            'remote': self._nodeText(remote),
        }

    # getting the links for job categories
    def getUrlByCat(self):
        '''Fill ``links['categories']`` with the category URLs and ``categList`` with their names.'''
        soup = self._fetchSoup(self.mainUrl)
        linksByCat = soup.find_all('div', class_=self.CATEGORY_DIV_CLASS)

        # the first matching <div> is skipped, as in the original selection
        urlList = []
        for wrapper in linksByCat[1:]:
            anchor = wrapper.a
            href = anchor.get('href') if anchor is not None else None
            # a wrapper without a usable link is skipped instead of aborting the scrape
            if href:
                urlList.append(self.mainUrl + href)

        # removing redundant links in urlList
        self.links['categories'] = [link for link in urlList if '?criteria' not in link]

        # the last path segment of every category url is the category name; these names are
        # the keys of links['pages']
        self.categList = [link.rsplit('/', 1)[-1] for link in self.links['categories']]
        print('categorical urls success')

    def getPages(self):
        '''Fill ``links['pages']``: category name -> list of all listing-page URLs of that category.

        Every list starts with the category URL itself; the following pages are discovered by
        following the "Next" link of the last known page until there is none,
        e.g. links['pages'] = {'backend': ['https://nofluffjobs.com/cz/it-prace/backend', ...], ...}
        '''
        self.links['pages'] = {k: [v] for v, k in zip(self.links['categories'], self.categList)}

        for pageUrls in self.links['pages'].values():
            while True:
                soup = self._fetchSoup(pageUrls[-1])
                nextAnchor = soup.find('a', {'aria-label': 'Next'})
                href = nextAnchor.get('href') if nextAnchor is not None else None
                # no "Next" link (or one without a target) means the last page was reached
                if not href:
                    break
                pageLink = self.mainUrl + href
                # a "Next" link pointing to an already collected page would loop forever
                if pageLink in pageUrls:
                    break
                pageUrls.append(pageLink)
                # The new page is downloaded at the top of the next iteration, so it is
                # requested exactly once.
                time.sleep(self.PAGE_DELAY)
        print('page urls success')

    def getJobLinks(self):
        '''Append the URL of every job offer found on the pages in ``links['pages']`` to ``links['jobOffers']``.'''
        for pageUrls in self.links['pages'].values():
            for link in pageUrls:
                time.sleep(self.JOB_LINK_DELAY)
                soup = self._fetchSoup(link)
                # every matching 'a' tag on the listing page points to one job offer
                for anchor in soup.select(self.JOB_LINK_SELECTOR):
                    href = anchor.get('href')
                    if href:
                        self.links['jobOffers'].append(self.mainUrl + href)
        print('job sites url success')

    def getData(self):
        '''Download every offer in ``links['jobOffers']`` and append its fields to ``data``.

        Exactly one value is appended to every column per offer; a field missing from the page
        is stored as None, so all column lists keep the same length.
        '''
        for link in tqdm(self.links['jobOffers']):
            time.sleep(self.OFFER_DELAY)
            record = self._parseOffer(self._fetchSoup(link))
            for column, value in record.items():
                self.data[column].append(value)
        print('data extraction success')

In [3]:
# Base URL of the job board; the scraper prefixes it to every relative link it finds.
mainUrl = 'https://nofluffjobs.com'
site = Scraper(mainUrl)

# The stages run strictly in this order because each one consumes what the previous one
# stored on `site`: category urls -> listing pages -> offer urls -> offer data.
for runStage in (site.getUrlByCat, site.getPages, site.getJobLinks, site.getData):
    runStage()




categorical urls success
page urls success


  0%|          | 0/71 [00:00<?, ?it/s]

job sites url success


100%|██████████| 71/71 [01:37<00:00,  1.37s/it]

data extraction success





In [4]:
# Turn the scraped columns (dict of lists, one entry per offer) into a table.
df2 = pd.DataFrame(data=site.data)


In [5]:
# Preview the first five scraped offers.
df2.head(n=5)

Unnamed: 0,jobTitles,company,salaryRange,Bonus,category+progLang,seniority,workLanguage,location,remote
0,Django backend Developer,RES-Q Global,55k - 80k CZK,,"Kategorie: Backend , python","Mid, Senior","Pracovní jazyk:Angličtina, Čeština","Brno, Božetěchova 62 •Brno, Božetěchova 62",
1,Team Lead Backend Development (m/f/d),PlanRadar,125.9k - 188.9k CZK,,"Kategorie: Backend , ruby on rails",Expert,Pracovní jazyk:Angličtina,"Hybrid +1 •Vienna, Kärntner Ring 5-7",
2,NodeJS Backend Developer 🔥,Dayswaps,67.2k - 117.6k CZK,10k CZK,"Kategorie: Backend , node",Mid,"Pracovní jazyk:Čeština, Slovenština","Hybrid +1 •Prague, Na Zderaze 5",
3,Backend Developer,"PECOSTA, a.s.",60k - 90k CZK,,"Kategorie: Backend , .net","Mid, Senior",Pracovní jazyk:Čeština,"Hybrid +1 •Ostrava, Nemocniční 987/12",
4,Backend Developer,wpj s.r.o.,40k - 70k CZK,,"Kategorie: Backend , php","Mid, Senior",Pracovní jazyk:Čeština,"Hybrid +1 •Vrchlabí, Lánovská 1475",


In [44]:
# Second frame built from the same `site.data` dict as `df2`, followed by a five-row preview.
# NOTE(review): this cell's execution count (44) is out of sequence with its neighbours and
# the rows shown below differ from df2.head(), so the saved output presumably comes from an
# earlier scrape - re-run or remove to keep the notebook reproducible.
data = pd.DataFrame(data=site.data)
data.head(n=5)

Unnamed: 0,jobTitles,company,salaryRange,Bonus,category+progLang,seniority,workLanguage,location,remote
0,Java Software Developer,Intempt Technologies,23.2k - 34.9k CZK,,"Backend, java",Junior,Work language:angličtina,,Na dálku
1,C++ Developer,Sourcein,70k - 140k CZK,,"Backend, C#","Mid, Senior",Work language:česky,"Prague, Altajská 1568/4 (Po pandemi)• Prague,...",Covid-time na dálku
2,C# Developer,Sourcein,70k - 140k CZK,,"Backend, C#","Mid, Senior",Work language:česky,"Prague, Altajská 1568/4 (Po pandemi)• Prague,...",Covid-time na dálku
3,Java developer u Medvědů,Coding Bear,60k - 120k CZK,,"Backend, java","Mid, Senior",Work language:česky,"Praha, Vítkova 241/10 • Praha, Vítkova 241/10",
4,Senior Backend Developer,DX Heroes,60k - 100k CZK,,"Backend, node","Mid, Senior","Work language:česky, angličtina","Praha, Rohanské nábř. 19 • Remote• Praha, Roh...",Na dálku


In [7]:
# Persist the scraped offers next to the notebook (path is relative to the working directory).
df2.to_csv(path_or_buf='jobOffers2.csv')