In [9]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait  # Import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC  # Import expected_conditions
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd  # Import pandas

# Configure Chrome and start the single browser session used by the rest of
# this script.
# NOTE: the user-agent value below is a literal placeholder string, sent as-is.
chrome_options = Options()
chrome_options.add_argument("user-agent=YOUR_USER_AGENT_STRING")

# webdriver_manager supplies the chromedriver location handed to the Service.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Job titles to search for, one Indeed query per entry.
job_roles = [
    "Data Analyst",
    "Data Engineer",
    "Machine Learning Engineer",
]

def _parse_job_card(job):
    """Extract the fields of one ``job_seen_beacon`` card into a dict.

    Raises ``TypeError``, ``KeyError`` or ``AttributeError`` when a required
    element (title span, link, company, description) is missing from the card.
    """
    # Ask for a <span> that actually carries a ``title`` attribute instead of
    # whichever <span> happens to come first in the card.
    title = job.find('span', title=True)['title']
    link = "https://www.indeed.com" + job.find('a')['href']
    company = job.find('span', class_='css-1h7lukg eu4oa1w0').text.strip()

    # The job-type element is optional; every other field is required.
    job_type_tag = job.find('div', class_="css-1restlb eu4oa1w0")
    job_type = job_type_tag.text.strip() if job_type_tag else 'N/A'

    description = job.find('div', class_='css-156d248 eu4oa1w0').text.strip()

    return {
        'title': title,
        'link': link,
        'company': company,
        'job_type': job_type,
        'description': description
    }


def scrape_jobs(role, location="Remote", timeout=10):
    """Search Indeed for ``role`` and return the listings on the results page.

    Uses the module-level ``driver`` to submit the search form, then parses
    the rendered page with BeautifulSoup.

    Args:
        role: Text typed into the "what" search field.
        location: Text typed into the "where" search field.
        timeout: Maximum number of seconds to wait for result cards to appear.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``company``,
        ``job_type`` and ``description``. Cards that cannot be parsed are
        reported on stdout and skipped; the list is empty when no cards are
        found.
    """
    # Local import: the surrounding file does not import this name.
    from selenium.common.exceptions import TimeoutException

    driver.get("https://www.indeed.com/")

    # Locate search fields
    search_box = driver.find_element(By.NAME, "q")
    search_box.clear()  # avoid appending to any text already in the field
    search_box.send_keys(role)

    location_box = driver.find_element(By.NAME, "l")
    location_box.clear()
    location_box.send_keys(location)

    search_button = driver.find_element(By.CSS_SELECTOR, "button[type='submit']")
    search_button.click()

    # Wait until at least one result card is present rather than sleeping for
    # a fixed time: returns as soon as results render, and tolerates slow loads.
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "job_seen_beacon"))
        )
    except TimeoutException:
        # No card appeared in time; the empty-result check below reports it.
        pass

    # Parse the page content using Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    jobs = soup.find_all('div', class_='job_seen_beacon')

    # Check if jobs were found
    if not jobs:
        print(f"No jobs found for {role}")
        return []

    job_list = []
    for job in jobs:
        try:
            job_list.append(_parse_job_card(job))
        except (AttributeError, KeyError, TypeError) as e:
            # A required element or attribute is missing from this card.
            print(f"Error processing job listing: {e}")

    return job_list
    
# Connect to PostgreSQL and make sure the table for scraped jobs exists.
import os

import psycopg2

# Connection settings are read from the environment; os.getenv yields None
# for any variable that is not set.
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_pass = os.getenv('DB_PASS')
db_port = os.getenv('DB_PORT')

connection = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_pass,
    port=db_port
)
cursor = connection.cursor()

# IF NOT EXISTS keeps this safe to re-run: a plain CREATE TABLE raises
# DuplicateTable on every run after the first.
create_table_query = '''
CREATE TABLE IF NOT EXISTS jobs (
    id SERIAL PRIMARY KEY,
    title TEXT,
    link TEXT,
    company TEXT,
    job_type TEXT,
    description TEXT
);
'''
cursor.execute(create_table_query)
connection.commit()


# Scrape every role, then persist the combined results to a CSV file and to
# the PostgreSQL ``jobs`` table.

# Create an empty list to store all jobs from all roles
all_jobs = []

try:
    # Loop through each job role and scrape data
    for role in job_roles:
        jobs = scrape_jobs(role)
        all_jobs.extend(jobs)  # Add each role's jobs to the list
finally:
    # Close the web driver even if scraping fails, so no browser is left running
    driver.quit()

# Convert the collected job data into a pandas DataFrame
df = pd.DataFrame(all_jobs)

# Save the DataFrame to a CSV file
df.to_csv('indeed_jobs.csv', index=False)

# Store the same rows in PostgreSQL. Named placeholders let each scraped dict
# be passed straight through, and keep the scraped text out of the SQL string.
insert_query = '''
INSERT INTO jobs (title, link, company, job_type, description)
VALUES (%(title)s, %(link)s, %(company)s, %(job_type)s, %(description)s);
'''
try:
    if all_jobs:
        cursor.executemany(insert_query, all_jobs)
    connection.commit()
except Exception:
    # Undo the partial insert before letting the error propagate.
    connection.rollback()
    raise
finally:
    cursor.close()
    connection.close()

print("Successfully Scraped data,stored in a csv file and stored data in a PostgreSQL ")


Successfully Scraped data,stored in a csv file and stored data in a PostgreSQL 
