# Exercise: Web scrape [kariera.gr](https://www.kariera.gr/en) using Selenium
Retrieve all job ads for Data Analyst, Data Scientist and Data Engineer, and store features such as company, job title, content, location and job occupation in a dataframe.

In [None]:
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def _optional_text(parent, css_selector):
    """Return the text of the first match for *css_selector* under *parent*.

    Some ad fields (company, minimum experience, remote policy) are not
    present on every ad page; ``None`` is returned when the element is absent.
    Only the "element not found" case is handled, so other WebDriver errors
    and keyboard interrupts still propagate.
    """
    try:
        return parent.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return None


options = Options()
# Either point at the ELF binary:
options.binary_location = (
    "/snap/firefox/current/usr/lib/firefox/firefox"
)
# —or— point at the launcher stub:
# options.binary_location = "/snap/firefox/current/firefox.launcher"

url = "https://www.kariera.gr/en"
# One dict per scraped ad; turned into a dataframe in a later cell.
results = []
# Ad URLs already scraped: an ad that shows up again is skipped instead of
# being opened a second time.
link_set = set()
driver = webdriver.Firefox(options=options)
try:
    driver.get(url)
    wait = WebDriverWait(driver, 30)

    # Accept all cookies so the consent dialog no longer covers the page.
    cookie_allow_btn = wait.until(
        EC.element_to_be_clickable(
            (
                By.CSS_SELECTOR,
                "#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
            )
        )
    )
    cookie_allow_btn.click()

    # NOTE(review): absolute XPath, presumably the control that leads to the
    # job search page — brittle against layout changes; confirm on the site.
    search_page = wait.until(
        EC.element_to_be_clickable(
            (
                By.XPATH,
                "/html/body/div[2]/div/div[2]/div/main/section[1]/div[1]/div[1]/div[3]",
            )
        )
    )
    search_page.click()

    for job_role in ("Data Analyst", "Data Scientist", "Data Engineer"):
        search_box = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="rc_select_2"]'))
        )
        search_box.clear()
        search_box.send_keys(job_role + Keys.RETURN)

        # Walk the result pages until the "next" button is disabled.
        while True:
            job_list = wait.until(
                EC.visibility_of_element_located(
                    (By.CLASS_NAME, "Jobs_resultsContainer__xwjB_")
                )
            )
            job_listings = job_list.find_elements(
                By.CLASS_NAME, "BaseJobCard_jobTitleContainer__gfcyi"
            )
            for job_listing in job_listings:
                ad_link = job_listing.find_element(By.TAG_NAME, "a")
                ad_link_text = ad_link.get_property("href")
                if ad_link_text in link_set:
                    continue
                link_set.add(ad_link_text)

                # Open the ad and move to the most recently opened window
                # (assumes the ad opens in a new tab).
                ad_link.click()
                driver.switch_to.window(driver.window_handles[-1])
                basic_info = wait.until(
                    EC.visibility_of_element_located(
                        (
                            By.XPATH,
                            "/html/body/div[1]/div/div[2]/div/main",
                        )
                    )
                )
                role = basic_info.find_element(
                    By.CSS_SELECTOR, ".h4.JobTitle_title__irhyN"
                ).text
                company = _optional_text(
                    basic_info, ".h6.JobCompanyName_name__V9AaS "
                )
                # Short pause before reading the detail fields.
                sleep(0.5)
                location = basic_info.find_element(
                    By.CSS_SELECTOR,
                    ".JobDetail_value__1yhn_.main-body-text",
                ).text
                date_posted = basic_info.find_element(
                    By.CSS_SELECTOR,
                    "div.JobDetail_detail___Th__:nth-child(2) > div:nth-child(2)",
                ).text
                min_experience = _optional_text(
                    basic_info,
                    "div.JobDetail_detail___Th__:nth-child(3) > a:nth-child(2)",
                )
                employment_type = basic_info.find_element(
                    By.CSS_SELECTOR,
                    "div.JobDetail_detail___Th__:nth-child(4) > a:nth-child(2)",
                ).text
                category = basic_info.find_element(
                    By.CSS_SELECTOR,
                    ".JobDetails_singleDoubleColumn__NwW1V > div:nth-child(1) > a:nth-child(2)",
                ).text
                remote = _optional_text(
                    basic_info,
                    ".JobDetails_singleDoubleColumn__NwW1V > div:nth-child(2) > a:nth-child(2)",
                )

                # Collect the non-empty text of every paragraph, bold run and
                # list item in the ad body. `.text` is a WebDriver round-trip,
                # so it is read once per node.
                details = []
                contents_prt = driver.find_element(
                    By.CLASS_NAME, "HtmlRenderer_renderer__mr82C"
                )
                for contents_chd in contents_prt.find_elements(
                    By.XPATH, ".//p | .//strong | .//li"
                ):
                    text = contents_chd.text.strip()
                    if text:
                        details.append(text)

                # find_elements returns an empty list when nothing matches,
                # so no "not found" handling is needed here.
                tags = [
                    tag.text
                    for tag in basic_info.find_elements(
                        By.CSS_SELECTOR,
                        '[class*="Label_label__Llv6_"]',
                    )
                ]

                # Close the ad window and go back to the results window.
                driver.close()
                driver.switch_to.window(driver.window_handles[-1])
                results.append(
                    {
                        "role": role,
                        "company": company,
                        "location": location,
                        "date_posted": date_posted,
                        "min_experience": min_experience,
                        "employment_type": employment_type,
                        "category": category,
                        "remote": remote,
                        "details": details,
                        "tags": tags,
                        "ad_link": ad_link_text,
                    }
                )

            button = wait.until(
                EC.visibility_of_element_located(
                    (
                        By.CSS_SELECTOR,
                        ".ant-pagination-next > button:nth-child(1)",
                    )
                )
            )
            if button.is_enabled():
                button.click()
            else:
                break
finally:
    # Always shut the browser down, even if the scrape fails part-way; the
    # rows gathered so far remain available in `results`.
    driver.quit()

In [14]:
import pandas as pd
# Build one row per scraped ad; the dict keys become the column names.
df = pd.DataFrame(data=results)

In [99]:
# Summarise the scraped table: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 610 entries, 0 to 609
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   role             610 non-null    object
 1   company          608 non-null    object
 2   location         610 non-null    object
 3   date_posted      610 non-null    object
 4   min_experience   510 non-null    object
 5   employment_type  610 non-null    object
 6   category         610 non-null    object
 7   remote           85 non-null     object
 8   details          610 non-null    object
 9   tags             610 non-null    object
 10  ad_link          610 non-null    object
dtypes: object(11)
memory usage: 52.6+ KB


In [17]:
# Work on a copy without the two list-valued columns (presumably so that the
# per-column unique counts can be computed, since lists are not hashable).
df2 = df.drop(columns=['tags', 'details'])

In [18]:
# Count the distinct values in each remaining column.
df2.nunique()

role               401
company            209
location            77
date_posted         36
min_experience       5
employment_type      6
category            30
remote               2
ad_link            453
dtype: int64

In [19]:
# Persist the full dataframe to disk as a pickle so it can be reloaded later
# without re-running the scrape.
df.to_pickle('data3.pkl')

In [20]:
# Also export the ads as CSV; index=False leaves the row index out of the file.
df.to_csv('kariera_jobs.csv', index=False)

In [21]:
# Show only the ads whose job title contains "data", ignoring letter case.
df.loc[df['role'].str.lower().str.contains('data')]

Unnamed: 0,role,company,location,date_posted,min_experience,employment_type,category,remote,details,tags,ad_link
0,FMS Administrator & Data Analyst (Maternity Co...,UNISON FACILITY & HUMAN SOLUTIONS,Παιανία,πριν 20 μέρες,Με κάποια εμπειρία,Πλήρης απασχόληση,Επιχειρησιακά,,"[The UNISON Group, is looking for a Facility M...",[],https://www.kariera.gr/en/jobs/operations-jobs...
1,Business Intelligence and Data Analyst (Atrium...,Atrium Hotels & Resorts,Ρόδος,πριν 4 μέρες,Με κάποια εμπειρία,Πλήρης απασχόληση,Αναλυτής / BI,,"[Job Description:, Job Description:, Responsib...","[data analyst, rhodes, #bi, business intellige...",https://www.kariera.gr/en/jobs/bi-or-business-...
2,Data Analyst,PHARMASERVE-LILLY,Κηφισιά,πριν 16 μέρες,Με κάποια εμπειρία,Πλήρης απασχόληση,Αναλυτής / BI,,[MULTINATIONAL PHARMACEUTICAL COMPANY is looki...,"[gather and scrutinize data, specialist to gen...",https://www.kariera.gr/en/jobs/bi-or-business-...
8,MSC Service Center Greece: Data Processing Spe...,MSC - MEDITERRANEAN SHIPPING COMPANY,Πειραιάς,πριν 25 μέρες,Entry / Αρχάριος,Πλήρης απασχόληση,Άλλη κατηγορία,,"[Your Role: Join our vibrant team, where your ...",[],https://www.kariera.gr/en/jobs/other-jobs/220709
9,Data Engineer – Cyprus or Greece,XM,Αθήνα,πριν μία μέρα,Με κάποια εμπειρία,Πλήρης απασχόληση,Πληροφορική,Εργασία από απόσταση κάποιες ημέρες,"[Data Engineer – Cyprus or Greece, Data Engine...",[],https://www.kariera.gr/en/jobs/it-jobs/226263?...
10,Lead Data Scientist - Cyprus or Greece,XM,Αθήνα,πριν μία μέρα,Με μεγάλη εμπειρία,Πλήρης απασχόληση,Άλλη κατηγορία,Εργασία από απόσταση κάποιες ημέρες,"[Lead Data Scientist – Cyprus or Greece, The R...",[],https://www.kariera.gr/en/jobs/other-jobs/2262...
12,OPERATIONS SUPPORT ASSOCIATE & DATA ENTRY,NATIONS S.A.,Αθήνα,πριν μία μέρα,Entry / Αρχάριος,Εποχιακός/ή,Γραμματειακή Υποστήριξη / Υπάλληλος Γραφείου,,[Ψάχνουμε για δυναμικά άτομα για να ενταχθούν ...,"[customer service, operation support, υπάλληλο...",https://www.kariera.gr/en/jobs/administrative-...
14,Data Scientist (Finance Department),Etraveli Group,Αθήνα,πριν μία μέρα,Με μεγάλη εμπειρία,Πλήρης απασχόληση,Οικονομικά,,"[Who we are, Who we are, Etraveli Group is the...",[],https://www.kariera.gr/en/jobs/finance-jobs/22...
15,Data Entry Specialist,ΛΑΚΙΩΤΗΣ Α.Ε.,Ασπρόπυργος,πριν 7 μέρες,Entry / Αρχάριος,Πλήρης απασχόληση,Μάρκετινγκ / Διαφήμιση,,"[Η εταιρία Λακιώτης, στο πλαίσιο της δυναμικής...",[],https://www.kariera.gr/en/jobs/marketing-or-ad...
16,Data Warehouse and Business Intelligence Engineer,Netcompany-Intrasoft,Αθήνα,πριν 4 μέρες,,Πλήρης απασχόληση,Πληροφορική,,[Data Warehouse and Business Intelligence Engi...,[],https://www.kariera.gr/en/jobs/it-jobs/225950?...


In [7]:
import pandas as pd

In [None]:
# Reload the previously saved ads from the pickle file.
# NOTE: unpickling executes arbitrary code — only load pickle files you trust.
df = pd.read_pickle('data3.pkl')

In [None]:
# Display the dataframe.
df