In [1]:
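# Dependencies (uncomment to install into the kernel's environment).
# `%pip` targets the running kernel, unlike `!pip3`.
#%pip install python-docx
#%pip install openai
#%pip install spacy
# Also imported below: pandas, selenium, undetected-chromedriver
# (writing .xlsx files with pandas additionally needs openpyxl).
#%pip install pandas openpyxl selenium undetected-chromedriver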

In [2]:
import os       # For file path operations
import re       # For regular expressions (finding keywords)
import requests # For making HTTP requests to fetch job description
from docx import Document     # From python-docx for reading/writing Word documents
from docx.shared import Pt    # For setting font sizes, etc.
import time
import datetime
import pandas as pd

import spacy
from datetime import datetime, timedelta
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#### Scrape job listings from LinkedIn and save them as the input file

In [3]:
def configure_driver():
    """Start a headless Chrome session through undetected-chromedriver.

    Returns:
        A running ``uc.Chrome`` driver. The caller owns the browser and is
        responsible for calling ``driver.quit()`` when finished.
    """
    # Build the options with uc's own ChromeOptions class so the options
    # object matches the driver class that consumes it; this is also what
    # fetch_full_job_details does.
    options = uc.ChromeOptions()
    options.add_argument("--headless")  # run without opening a browser window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    return uc.Chrome(options=options)

def scrape_linkedin_jobs(keyword, location, max_jobs=10, max_age_days=14):
    """Scrape the public LinkedIn job-search page for recent postings.

    Args:
        keyword: Job title / search phrase.
        location: Location filter text.
        max_jobs: Number of result cards to inspect (default 10).
        max_age_days: Postings older than this many days are skipped
            (default 14).

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``link``,
        ``source`` and ``posted_days_ago``. ``posted_days_ago`` is an int, or
        the string ``"Unknown"`` when the card exposes no parseable date.
        An empty list is returned when no result cards show up.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.parse import quote

    print("\n🔍 Scraping LinkedIn Jobs...\n")

    # Percent-encode the whole value (not only spaces) so inputs such as
    # "C++ Developer" or "R&D" cannot corrupt the query string.
    search_url = (
        "https://www.linkedin.com/jobs/search"
        f"?keywords={quote(keyword, safe='')}&location={quote(location, safe='')}"
    )

    driver = configure_driver()
    try:
        driver.get(search_url)

        # Scroll a few times so that more result cards are loaded.
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(2)

        wait = WebDriverWait(driver, 15)
        try:
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, "base-card")))
        except Exception:
            print("❌ No LinkedIn jobs found.")
            return []

        jobs = []
        today = datetime.today()
        job_elements = driver.find_elements(By.CLASS_NAME, "base-card")

        for job in job_elements[:max_jobs]:
            try:
                title = job.find_element(By.CSS_SELECTOR, "h3").text.strip()
                company = job.find_element(By.CSS_SELECTOR, "h4").text.strip()
                link = job.find_element(By.TAG_NAME, "a").get_attribute("href")

                # Reset per card: without this, a card whose <time> element has
                # no `datetime` attribute would reuse the previous card's value
                # (or hit an unbound name on the first card).
                days_ago = "Unknown"
                try:
                    date_element = job.find_element(By.CSS_SELECTOR, "time")
                    posted_time = date_element.get_attribute("datetime")
                    if posted_time:
                        posted_date = datetime.strptime(posted_time[:10], "%Y-%m-%d")
                        days_ago = (today - posted_date).days
                except Exception:
                    # Missing element or unparseable date: keep "Unknown".
                    pass

                if days_ago == "Unknown":
                    print(f"⚠️ Could not find post time for: {title}, assuming it's recent.")
                elif days_ago > max_age_days:
                    print(f"⏳ Skipping job: {title} (Posted {days_ago} days ago)")
                    continue

                jobs.append({"title": title, "company": company, "link": link, "source": "LinkedIn", "posted_days_ago": days_ago})
            except Exception as e:
                print(f"⚠️ Skipping a job entry due to error: {e}")
                continue

        return jobs
    finally:
        # Always release the browser, including on early return or on an
        # unexpected error while loading or scrolling the page.
        driver.quit()

if __name__ == "__main__":
    # Ask interactively what to search for.
    keyword = input("Enter job title (e.g., Software Engineer): ")
    location = input("Enter location (e.g., Remote, New York, Berlin): ")

    linkedin_jobs = scrape_linkedin_jobs(keyword, location)

    if not linkedin_jobs:
        print("\n❌ No LinkedIn jobs found.")
        # `input_file` is read by a later cell; None marks "nothing was saved".
        input_file = None
    else:
        df = pd.DataFrame(linkedin_jobs)

        # One workbook per calendar day: linkedin_jobs_YYYY-MM-DD.xlsx
        today_date = f"{datetime.today():%Y-%m-%d}"
        filename = "linkedin_jobs_" + today_date + ".xlsx"

        # NOTE(review): machine-specific absolute path; consider moving it to a
        # configurable constant so the notebook runs on other machines.
        folder_path = "/Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker"
        os.makedirs(folder_path, exist_ok=True)  # create the folder if it is missing
        input_file = os.path.join(folder_path, filename)

        df.to_excel(input_file, index=False)
        print(f"\n✅ Jobs saved to {input_file}")



🔍 Scraping LinkedIn Jobs...

⏳ Skipping job:  (Posted 15 days ago)
⏳ Skipping job: ******* ******* *********** ********* - *-******** (*/*/*) (Posted 115 days ago)

✅ Jobs saved to /Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker/linkedin_jobs_2025-02-27.xlsx


In [4]:
# Check that `input_file` was set correctly by the scraping cell above
# (it is None when no jobs were saved).
print(input_file)

/Users/eimon/Desktop/Code/AI works/Job apply AI agent/Job-apply-AI-agent/CV maker/linkedin_jobs_2025-02-27.xlsx


## Fetching the full job description (`fetch_full_job_details`)

In [5]:
def fetch_full_job_details(job_url: str) -> tuple:
    """
    Opens the LinkedIn job page, fetches the job title, company name, and full job description.

    Scraping is best-effort: if loading the page or locating an element fails,
    the error is printed and every field not yet collected stays an empty
    string. The browser is always closed before returning.

    Args:
        job_url: URL of the job posting page.

    Returns (job_title, company_name, job_description), each stripped of
    surrounding whitespace.
    """
    options = uc.ChromeOptions()
    options.add_argument("--headless")           # or remove this if you want to see the browser
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=options)

    # Default empty values
    job_title = ""
    company_name = ""
    job_description = ""

    try:
        # Navigation happens inside the try so that a failed page load still
        # reaches the `finally` below and does not leak the browser process.
        driver.get(job_url)
        wait = WebDriverWait(driver, 15)

        def inner_text(locator):
            """Wait for the element and return its innerText ('' if absent)."""
            element = wait.until(EC.presence_of_element_located(locator))
            # get_attribute may return None; normalise so .strip() is safe.
            return element.get_attribute("innerText") or ""

        # 1) Job Title (example selector)
        job_title = inner_text((By.CSS_SELECTOR, "h1.topcard__title"))

        # 2) Company Name (example selector)
        company_name = inner_text((By.CSS_SELECTOR, "a.topcard__org-name-link"))

        # 3) Full Job Description (often "description__text" class)
        job_description = inner_text((By.CLASS_NAME, "description__text"))

    except Exception as e:
        print(f"Error scraping {job_url}: {e}")

    finally:
        driver.quit()

    return job_title.strip(), company_name.strip(), job_description.strip()

#### After adding the job descriptions to the Excel sheet

In [16]:
import pandas as pd
import re

# Load the updated Excel file (the job list with a description column added).
# The relative path is resolved against the kernel's working directory.
# Note: this rebinds `input_file` and `df`, replacing the values assigned by
# the LinkedIn scraping cell earlier in the notebook.
input_file = "final_job_descriptions.xlsx"
df = pd.read_excel(input_file)

# Define your existing skills and categories.
# Maps a category label to the skill names searched for in job descriptions.
# A skill may appear under several categories; it is reported only once.
my_skills = {
    "Data Science & Machine Learning": ["Python", "R", "TensorFlow", "NumPy", "Pandas", "Seaborn", "Scikit-learn"],
    "Statistical Modeling & AI": ["ML models", "AI", "Custom-GPT", "Deep Learning"],
    "AI Agent": ["n8n", "Python AI Agent", "Automation"],
    "Business Intelligence & Dashboarding": ["Power BI", "Tableau", "SQL", "Data Visualization"],
    "Database Optimization": ["SQL", "MySQL", "PostgreSQL"],
    "Programming Languages": ["Python", "Java", "C", "JavaScript"],
    "Microsoft Tools": ["Azure", "Microsoft 365", "Dynamics 365"]
}

# Common requirement phrases (matched as lowercase substrings)
requirement_keywords = ["experience in", "knowledge of", "proficiency in", "familiarity with", "required", "preferred", "must have", "ability to"]

def extract_skills_and_requirements(description):
    """
    Extracts relevant skills and job requirements from the job description
    based on predefined skills and requirement keywords.

    Args:
        description: Job description text. Anything that is not a string
            (None, NaN, ...) is treated as "no description".

    Returns:
        A ``(skills, requirements)`` tuple of two lists.
        ``skills`` holds the names from ``my_skills`` found in the text, each
        once, in the order they are declared in ``my_skills``.
        ``requirements`` holds the phrases from ``requirement_keywords`` that
        occur in the text, in declaration order.
    """
    if not isinstance(description, str):
        return [], []

    description = description.lower()  # Convert to lowercase for easier matching

    # Identify matching skills. A dict is used as an ordered set so the result
    # order is stable between runs (a plain set is not, for strings).
    matched_skills = {}
    for skills in my_skills.values():
        for skill in skills:
            if skill in matched_skills:
                continue
            # Lookarounds instead of \b: the skill must not be glued to a word
            # character on either side, nor be followed by '+', '#' or '&'.
            # This keeps "C" from matching "C++"/"C#" and "R" from matching
            # "R&D", which plain \b boundaries would accept.
            pattern = rf"(?<!\w){re.escape(skill.lower())}(?![\w+#&])"
            if re.search(pattern, description):
                matched_skills[skill] = None

    # Extract job requirements based on common keywords
    matched_requirements = [
        keyword for keyword in requirement_keywords if keyword in description
    ]

    return list(matched_skills), matched_requirements

def process_job_descriptions(df, desc_col="description", title_col="title"):
    """
    Extracts skills and requirements from job descriptions and stores them in the DataFrame.

    Args:
        df: DataFrame with one job per row. It is modified in place.
        desc_col: Name of the column holding the description text. If the
            column is absent, every row is treated as having no description.
        title_col: Accepted for backward compatibility; not used.

    Returns:
        The same ``df`` object with two added columns, "Extracted Skills" and
        "Extracted Requirements", each holding a list per row. Rows whose
        description is missing (None/NaN) or blank get empty lists.
    """

    def as_text(value):
        """Return the cell as text, mapping missing values to ''."""
        if isinstance(value, str):
            return value
        # An empty Excel cell is read as NaN; str(NaN) would be the text
        # "nan", which must not be analysed as if it were a description.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value)

    skills_list = []
    requirements_list = []

    # Iterate the description column directly; no other column is needed.
    descriptions = df[desc_col] if desc_col in df.columns else [""] * len(df)

    for raw_description in descriptions:
        description_text = as_text(raw_description)

        if not description_text.strip():
            skills_list.append([])
            requirements_list.append([])
            continue

        matched_skills, matched_requirements = extract_skills_and_requirements(description_text)
        skills_list.append(matched_skills)
        requirements_list.append(matched_requirements)

    df["Extracted Skills"] = skills_list
    df["Extracted Requirements"] = requirements_list
    return df

# Process the job descriptions and display results.
# `df` is rebound to the frame returned by process_job_descriptions, which
# carries the "Extracted Skills" and "Extracted Requirements" columns.
df = process_job_descriptions(df)
print(df)


                                               title  \
0  Working Student - Digital Analytics (all genders)   
1             Working Student Graphic Design (m/w/d)   
2                                        Werkstudent   
3  Working Student Corporate and Business Develop...   
4               Working Student in Product Marketing   

                      company  \
0                      Digitl   
1                    Fanblast   
2  DDC Management Consultants   
3                PRIOjet GmbH   
4                Rabot Energy   

                                                link    source  \
0  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   
1  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   
2  https://de.linkedin.com/jobs/view/werkstudent-...  LinkedIn   
3  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   
4  https://de.linkedin.com/jobs/view/working-stud...  LinkedIn   

   posted_days_ago                                        descripti

##### Getting some keywords