In [1]:
# Loading libraries
import json
import os
import time
from datetime import datetime
from urllib.parse import quote

import mysql.connector
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Read the scraper settings from config.json in the working directory
with open("config.json", 'r') as config_file:
    # Parse the whole file content as a single JSON document
    config = json.loads(config_file.read())

In [4]:
# Pull the individual settings out of the parsed configuration:
# portals to scrape, search keywords, search location and MySQL credentials
job_portals, job_title, locations, db_config = (
    config["job_portal"],
    config["job_title"],
    config["location"],
    config["database"],
)

In [5]:
# Open the MySQL session with the credentials from the "database" config section
conn = mysql.connector.connect(
    **{field: db_config[field] for field in ("host", "user", "password", "database")}
)

# Cursor reused by the cells below for every statement
cursor = conn.cursor()

In [6]:
# Confirm that a database named job_listings exists on the server
cursor.execute("SHOW DATABASES")

# One row per database; the first item of each row is the database name
databases = cursor.fetchall()
database_names = [name for name, *_ in databases]

print("Database available" if "job_listings" in database_names else "Database not available")

Database available


In [7]:
# Creating a table for jobs in job listings database.
# job_description is TEXT because a description does not fit in 255 characters,
# and link is VARCHAR(2048) because job URLs can carry long query strings.
# NOTE: IF NOT EXISTS leaves an already-created table untouched, so these wider
# columns only apply to a fresh table; an existing one needs ALTER TABLE.
cursor.execute("""
CREATE TABLE IF NOT EXISTS jobs (
            id INT AUTO_INCREMENT PRIMARY KEY,
            job_title VARCHAR(255),
            job_company VARCHAR(255),
            job_description TEXT,
            job_location VARCHAR(255),
            link VARCHAR(2048),
            post_date VARCHAR(255)
            )
"""
)

In [15]:
# Helpers for pulling individual fields out of one parsed LinkedIn job card
def _card_text(job_card, tag, css_class):
    """Return the stripped text of the first matching element, or None if absent."""
    element = job_card.find(tag, class_=css_class)
    return element.text.strip() if element is not None else None


def _card_attr(job_card, tag, attribute):
    """Return `attribute` of the first `tag` element in the card, or None if absent."""
    element = job_card.find(tag)
    return element.get(attribute) if element is not None else None


# Fetching data from LinkedIn
def fetch_jobs_from_linkedib(job_title,location):
    """Scrape the LinkedIn job-search results page for one query.

    Opens the search URL in a Selenium-driven Chrome session, waits for a job
    card to be present, then parses the rendered HTML with BeautifulSoup.
    Only the page as initially loaded is parsed (no scrolling or pagination).

    NOTE: the function name keeps its original spelling ("linkedib") so that
    existing callers continue to work.

    Args:
        job_title: Search keywords (string), e.g. "data analyst".
        location: Location filter for the search (string).

    Returns:
        A list of ``(title, company_name, job_location, link, post_date)``
        tuples, one per job card found. A field is ``None`` when the card
        does not contain the corresponding element or attribute.

    Raises:
        selenium.common.exceptions.TimeoutException: if no job card is
            present within 10 seconds.
    """
    # linkedin url; both values are percent-encoded, and the `location`
    # argument is used rather than the notebook-level `locations` variable
    base_url = "https://www.linkedin.com/jobs/search"
    url = f"{base_url}?keywords={quote(job_title, safe='')}&location={quote(location, safe='')}"

    # Setting up selenium webdriver; the finally clause guarantees the browser
    # is closed even when the wait below times out
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # waiting for the job listings to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'base-card'))
        )
        page_html = driver.page_source
    finally:
        driver.quit()

    # Parsing the html content using beautifulsoup
    soup = BeautifulSoup(page_html, "html.parser")

    # Finding all the job containers
    job_cards = soup.find_all("div", class_="base-card")

    # TODO: job_description is not scraped yet; per the original notes it lives
    # on a separate per-job page that would have to be visited for each card.
    jobs = []
    for job_card in job_cards:
        jobs.append((
            _card_text(job_card, "h3", "base-search-card__title"),      # job title
            _card_text(job_card, "h4", "base-search-card__subtitle"),   # company name
            _card_text(job_card, "span", "job-search-card__location"),  # job location
            _card_attr(job_card, "a", "href"),                          # link to the job
            _card_attr(job_card, "time", "datetime"),                   # posting date
        ))

    return jobs
        


In [16]:
def save_jobs_to_db(jobs):
    """Insert scraped job rows into the ``jobs`` table and commit.

    Uses the notebook-level ``cursor`` and ``conn`` objects.

    Args:
        jobs: Iterable of 5-item tuples in the order
            ``(job_title, job_company, job_location, link, post_date)``.
            The ``id`` column is not supplied; the table declares it
            AUTO_INCREMENT, so MySQL assigns it.

    Returns:
        None. Nothing is executed or committed when ``jobs`` is empty.
    """
    rows = list(jobs)
    if not rows:
        return

    # Five columns and five placeholders, matching the 5-item job tuples
    cursor.executemany("""
        INSERT INTO jobs (job_title, job_company, job_location, link, post_date)
        VALUES (%s, %s, %s, %s, %s)
    """, rows)
    conn.commit()
