install Required Packages

In [8]:
#!pip3 install selenium geopy pandas dash plotly webdriver-manager


Import the Libraries

In [1]:
import time
from datetime import timedelta
import traceback
import pandas as pd
import os
from datetime import datetime
import plotly.express as px
import dash
from dash import dcc, html, Input, Output, dash_table
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from geopy.geocoders import Nominatim
import io
import base64
import re


Configure WebDriver

In [2]:
# Configure WebDriver for Chrome browser (headless)
def configure_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver


 Create Helper Function to Calculate "Days Ago"

In [3]:
# Helper function to calculate days ago
def days_ago(posted_time):
    now = datetime.now()
    if 'hour' in posted_time:
        return 0  # Posted today
    match = re.search(r'(\d+)\s*(days?|weeks?|months?|years?)\s*ago', posted_time)
    if match:
        number = int(match.group(1))
        unit = match.group(2)
        
        if 'day' in unit:
            delta = timedelta(days=number)
        elif 'week' in unit:
            delta = timedelta(weeks=number)
        elif 'month' in unit:
            delta = timedelta(weeks=number * 4)
        elif 'year' in unit:
            delta = timedelta(weeks=number * 52)
        
        posted_date = now - delta
        return (now - posted_date).days  # Return number of days ago
    return 0  # Default if no match


Scraping LinkedIn Jobs

In [4]:
# Scrape LinkedIn for jobs
def scrape_linkedin_jobs(keyword, location):
    print("\n🔍 Scraping LinkedIn Jobs...\n")
    driver = configure_driver()
    if not driver:
        return []
    try:
        search_url = f"https://www.linkedin.com/jobs/search?keywords={keyword.replace(' ', '%20')}&location={location.replace(' ', '%20')}"
        driver.get(search_url)
        time.sleep(3)
        
        for _ in range(3):  # Scroll 3 times
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(2)
        
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "base-card")))
        
        jobs = []
        job_elements = driver.find_elements(By.CLASS_NAME, "base-card")
        
        for job in job_elements[:20]:  # Limit to first 20 jobs
            try:
                title = job.find_element(By.CSS_SELECTOR, "h3").text.strip()
                company = job.find_element(By.CSS_SELECTOR, "h4").text.strip()
                location = job.find_element(By.CLASS_NAME, "job-search-card__location").text.strip()
                link = job.find_element(By.TAG_NAME, "a").get_attribute("href")
                posted_time = job.find_element(By.CLASS_NAME, "job-search-card__listdate").text.strip()
                
                # Get days ago posted
                days_posted = days_ago(posted_time)
                
                jobs.append({"title": title, "company": company, "location": location, "link": link, "days_ago_posted": days_posted})
            except Exception as e:
                print(f"Error scraping job: {e}")
                continue
        return jobs
    except Exception as e:
        print(f"Error during scraping: {e}")
        return []
    finally:
        driver.quit()


Clean Location Data (Geocoding)

In [5]:
# Clean and add geolocation data (latitude/longitude)
def clean_location_data(df):
    geolocator = Nominatim(user_agent="job_scraper")
    df[['City', 'Country']] = df['location'].str.rsplit(',', n=1, expand=True)
    df['Country'] = df['Country'].str.strip()
    df.dropna(inplace=True)
    
    df['Latitude'] = None
    df['Longitude'] = None
    for index, row in df.iterrows():
        try:
            location = geolocator.geocode(row['location'])
            if location:
                df.at[index, 'Latitude'] = location.latitude
                df.at[index, 'Longitude'] = location.longitude
        except:
            continue
    
    return df


Save Jobs to Excel and CSV

In [6]:
# Save jobs to Excel first, then clean and save as CSV
def save_jobs_to_file(jobs, keyword, location):
    if not jobs:
        print("No jobs found.")
        return None
    df = pd.DataFrame(jobs)
    
    today_date = datetime.today().strftime("%Y-%m-%d")
    filename = f"linkedin_jobs_{keyword.replace(' ', '_')}_{location.replace(' ', '_')}_{today_date}"
    excel_file = f"{filename}.xlsx"
    df.to_excel(excel_file, index=False)
    print(f"Saved data to {excel_file}")
    
    # Now clean and save as CSV
    df = clean_location_data(df)
    csv_file = f"{filename}.csv"
    df.to_csv(csv_file, index=False)
    print(f"Cleaned data saved to {csv_file}")
    return csv_file


Create Dash App for Visualization

Execute the Script

In [7]:
# Run the script
keyword = input("Enter job title: ")
location = input("Enter location: ")
jobs = scrape_linkedin_jobs(keyword, location)
csv_file = save_jobs_to_file(jobs, keyword, location)
if csv_file:
    print(f"CSV saved: {csv_file}")




🔍 Scraping LinkedIn Jobs...

Error scraping job: Message: no such element: Unable to locate element: {"method":"css selector","selector":".job-search-card__listdate"}
  (Session info: chrome=134.0.6998.89); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000100e01804 cxxbridge1$str$ptr + 2785964
1   chromedriver                        0x0000000100df9ddc cxxbridge1$str$ptr + 2754692
2   chromedriver                        0x000000010094dea8 cxxbridge1$string$len + 92928
3   chromedriver                        0x00000001009951d0 cxxbridge1$string$len + 384552
4   chromedriver                        0x000000010098a878 cxxbridge1$string$len + 341200
5   chromedriver                        0x00000001009d6678 cxxbridge1$string$len + 651984
6   chromedriver                        0x000000010098935c cxxbridge1$string$len + 335796
7   chrome