In [None]:
import requests
def fetch_page(url, timeout=30):
    """Fetch HTML content from a given URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        Any exception raised by ``requests.get`` (for example on a timeout
        or a connection failure) propagates to the caller unchanged.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # requests applies no timeout unless one is given explicitly.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None

# Download the job listing page. fetch_page issues a plain HTTP GET, so this is
# the markup exactly as the server returns it (no JavaScript is executed), and
# html_content is None when the response status is not 200.
html_content = fetch_page("http://jobs.accel.com/jobs")

# NOTE(review): this writes the entire page (or "None" on failure) to the cell output.
print(html_content)

In [None]:
import google.generativeai as genai

import os

# The API key is read from the environment so that it is never stored in the
# notebook (or in its version-control history).
# NOTE: the key that was previously hard-coded here must be treated as leaked
# and should be revoked/rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; export it in the environment before running this notebook."
    )
genai.configure(api_key=GEMINI_API_KEY)

def ai_extract_jobs(html):
    """Ask the Gemini model to extract job listings from raw HTML.

    Args:
        html: HTML markup of a job listing page. ``None`` or an empty string
            (e.g. the result of a failed download) is rejected up front.

    Returns:
        Whatever ``process_raw_response`` produces from the model's text
        reply, or an empty list when ``html`` is missing.
    """
    # Without this guard the f-string below would embed the literal text
    # "None" in the prompt and spend an API call on it.
    if not html:
        print("Error: html is None or empty.")
        return []

    model = genai.GenerativeModel("gemini-2.0-flash")

    prompt = f"""
    You are an expert web scraper. Analyze the following HTML and **extract job-related data**.
    
    - Look for elements inside `<a>`, `<div>`, `<meta>`, and `<h4>` tags.
    - Identify **job titles** from elements like `<div itemprop="title">` or `<h4>`.
    - Extract **job URLs** from `<a>` tags with attributes like `data-testid="job-title-link"`.
    - Parse **company names** from `<meta itemProp="description">` or parent elements.
    - Identify **job locations** if available.

    **Return output as a structured JSON list:**
    [
      {{"title": "Operations Associate", "company": "Gopuff", "url": "https://jobs.example.com/job1", "location": "Berlin, Germany"}},
      {{"title": "Software Engineer", "company": "Stripe", "url": "https://jobs.example.com/job2", "location": "Remote"}}
    ]

    Here is the HTML content:
    {html}
    """

    response = model.generate_content(prompt)

    # The reply is free-form text; cleaning and JSON parsing are delegated.
    raw_response = response.text
    return process_raw_response(raw_response)

import json
import re

def _parse_job_json(text):
    """Parse ``text`` as a JSON list, tolerating common model formatting slips.

    Candidates are tried in order and the first one that parses wins:
      1. the text as-is, wrapped in ``[...]`` when it does not end with ``]``
         (the original behaviour, so inputs that parsed before still give
         the same result);
      2. the span from the first ``[`` to the last ``]`` (JSON surrounded by
         prose);
      3. the span from the first ``{`` to the last ``}`` wrapped in ``[...]``;
      4. each of the above with commas directly before ``}`` or ``]`` removed.

    Raises:
        json.JSONDecodeError: the error from the first candidate when none
            of the candidates is valid JSON.
    """
    candidates = [text if text.endswith("]") else "[" + text + "]"]

    first, last = text.find("["), text.rfind("]")
    if 0 <= first < last:
        candidates.append(text[first:last + 1])

    first, last = text.find("{"), text.rfind("}")
    if 0 <= first < last:
        candidates.append("[" + text[first:last + 1] + "]")

    # Trailing commas are invalid JSON; retry every candidate without them.
    candidates += [re.sub(r',\s*([}\]])', r'\1', c) for c in candidates]

    first_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err
            continue
        return parsed if isinstance(parsed, list) else [parsed]
    raise first_error


def process_raw_response(raw_response):
    """
    Cleans and parses the raw response from the Gemini model.

    Args:
        raw_response: Text returned by the model; may be wrapped in a
            Markdown ```json fence.

    Returns:
        The parsed list of job entries, or an empty list when the input is
        empty or cannot be parsed (a message is printed in that case).
    """
    if not raw_response:
        print("Error: raw_response is None or empty.")
        return []

    try:
        # Drop Markdown code fences around the payload.
        cleaned_response = re.sub(r'^```json\s*|\s*```$', '', raw_response, flags=re.MULTILINE).strip()
        return _parse_job_json(cleaned_response)
    except json.JSONDecodeError as e:
        print("JSON Parsing Error:", e)
        return []
    except Exception as e:
        print("Unexpected Error:", e)
        return []
    



In [None]:
# Send the HTML fetched in an earlier cell (html_content) to Gemini and keep
# the parsed job entries that ai_extract_jobs returns.
job_listing = ai_extract_jobs(html_content)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

def setup_selenium():
    """Create a Chrome WebDriver configured for headless use.

    The browser is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` flags, and the driver path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    chrome_flags = ("--headless", "--no-sandbox", "--disable-dev-shm-usage")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def extract_text_from_url(url):
    """Open ``url`` in a fresh Selenium driver and return the page's text.

    Args:
        url: Address of the page to load.

    Returns:
        The text extracted from the page source by BeautifulSoup
        (newline-separated, stripped), or the string
        ``"Error fetching {url}: {e}"`` when loading or parsing fails.
    """
    driver = setup_selenium()

    try:
        # Navigation is inside the try block so that a failed load is reported
        # like any other error and the browser is still shut down.
        driver.get(url)

        # Wait up to 10 seconds for the <body> element to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Get the page HTML
        html = driver.page_source

        # Extract readable text using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text(separator="\n", strip=True)

    except Exception as e:
        return f"Error fetching {url}: {e}"

    finally:
        # Runs on success and on failure, so the driver is quit exactly once.
        driver.quit()
    
# job_listing was already computed at the top of this cell; repeating
# ai_extract_jobs(html_content) here only issued a second, identical Gemini
# request and discarded the first result, so the existing value is reused.

def process_job_urls(job_listings):
    """Scrape the page text for every URL found in ``job_listings``.

    Args:
        job_listings: Job entries handed to ``extract_job_urls``.

    Returns:
        A dict mapping each job URL to the value returned by
        ``extract_text_from_url`` for it.
    """
    scraped_pages = {}
    for job_url in extract_job_urls(job_listings):
        print(f"Extracting text from: {job_url}")
        scraped_pages[job_url] = extract_text_from_url(job_url)
    return scraped_pages

def extract_job_urls(job_listings):
    """Collect the usable ``url`` values from a list of job entries.

    Args:
        job_listings: Expected to be a list of dicts that may carry a
            ``"url"`` key.

    Returns:
        The ``"url"`` values, in input order, of every dict entry whose url
        is a non-blank string. Returns an empty list (after printing a
        message) when ``job_listings`` is empty or not a list.
    """
    if not job_listings or not isinstance(job_listings, list):
        print("No valid job listings found.")
        return []

    # Entries that are not dicts, or whose url is missing, null, non-string or
    # blank, are skipped: indexing a non-dict with "url" raises TypeError, and
    # a null/blank url cannot be opened by the scraper.
    return [
        job["url"]
        for job in job_listings
        if isinstance(job, dict)
        and isinstance(job.get("url"), str)
        and job["url"].strip()
    ]


# Scrape every job page referenced in job_listing. job_texts maps each URL to
# the text extracted from it (or to the error string that
# extract_text_from_url returns on failure).
job_texts = process_job_urls(job_listing)

# Print a short preview of each scraped page.
print("\n Extracted Text from Job Pages:")
for url, text in job_texts.items():
    print(f"\n {url}\n{text[:1000]}...")  # Print first 1000 characters for preview
