<a href="https://colab.research.google.com/github/iambhati/daily-job-digest/blob/main/daily_job_digest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install requests beautifulsoup4 schedule


Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [27]:
# Debug flag. NOTE(review): nothing in the scraper cell shown in this notebook
# reads DEBUG_MODE (its [DEBUG] prints are unconditional) - confirm whether it
# is still needed.
DEBUG_MODE = True  # Keep this True initially

In [26]:
pip install webdriver-manager



In [25]:
pip install selenium webdriver-manager requests beautifulsoup4

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [31]:
"""
Enhanced Daily Job Digest Script with Better Location Filtering and Remote Jobs
"""

import html
import os
import random
import re
import smtplib
import time
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

# ---------------- USER SETTINGS ----------------
# Search phrases; each one is passed to every scraper in turn.
KEYWORDS = [
    "Entry level Data Analyst",
    "Entry level Business Analyst",
    "Junior Analyst",
    "Fresher Analyst",
    "Associate Data Analyst",
    "Associate Business Analyst",
    "Trainee Analyst",
    "Graduate Analyst",
    "Junior Data Scientist",
    "Business Intelligence Analyst"
]

# Target locations - expanded list (matched case-insensitively as substrings
# of the scraped location text)
TARGET_LOCATIONS = ["Gurgaon", "Noida", "Jaipur", "Delhi", "NCR", "Gurugram", "New Delhi"]
INCLUDE_REMOTE = True

# Remote work keywords - comprehensive list (lower-case; matched as substrings)
REMOTE_KEYWORDS = [
    "remote", "work from home", "wfh", "hybrid", "anywhere", "telecommute",
    "distributed", "virtual", "home office", "flexible location", "remote work",
    "work remotely", "home based", "location independent"
]

RESULTS_PER_SITE = 10  # Increased from 5

# Mail settings are read from the environment so that no secret is stored in
# (or published with) the notebook. Set JOB_DIGEST_APP_PASSWORD to a Gmail app
# password before running; without it the SMTP login fails and is reported by
# send_email.
# SECURITY: an app password was previously hard-coded here and has been
# committed publicly - revoke it in the Google account and issue a new one.
SENDER_EMAIL = os.environ.get("JOB_DIGEST_SENDER_EMAIL", "learnxaiml@gmail.com")
APP_PASSWORD = os.environ.get("JOB_DIGEST_APP_PASSWORD", "")
RECEIVER_EMAIL = os.environ.get("JOB_DIGEST_RECEIVER_EMAIL", SENDER_EMAIL)

# ---------------- ENHANCED HELPERS ----------------
# Request headers passed to every scraper's requests.get call. The User-Agent
# is a desktop Chrome-on-Windows string.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def is_remote_job(location: str, title: str = "", description: str = "") -> bool:
    """Report whether a posting mentions any remote-work keyword.

    The location, title and description are concatenated, lower-cased and
    searched for each entry of REMOTE_KEYWORDS as a plain substring.

    Returns:
        True on the first keyword hit; False when there is no hit or when all
        three inputs are empty.
    """
    if not (location or title or description):
        return False

    haystack = f"{location} {title} {description}".lower()
    for marker in REMOTE_KEYWORDS:
        if marker in haystack:
            return True
    return False

def location_allowed(location: str, title: str = "", description: str = "") -> bool:
    """Decide whether a posting should be kept based on where the job is.

    A posting is accepted when either:
      * INCLUDE_REMOTE is set and is_remote_job finds a remote-work keyword in
        the location, title or description, or
      * the location contains one of TARGET_LOCATIONS (case-insensitive
        substring match).

    Args:
        location: Scraped location text; may be None or empty when the scraper
            found no location element.
        title: Job title, also searched for remote keywords.
        description: Optional description, also searched for remote keywords.

    Returns:
        True if the posting passes the filter, otherwise False.
    """
    # Run the remote check before rejecting an empty location: it also looks
    # at the title/description, so a posting titled "... (Remote)" with no
    # location element must not be discarded. `or ""` keeps a None location
    # from being formatted as the text "None".
    if INCLUDE_REMOTE and is_remote_job(location or "", title, description):
        return True

    if not location:
        return False

    loc_lower = location.lower()
    return any(city.lower() in loc_lower for city in TARGET_LOCATIONS)

def clean_text(text):
    """Collapse whitespace in scraped text.

    Args:
        text: Raw text from an HTML element, or None.

    Returns:
        The text with leading/trailing whitespace removed and every internal
        whitespace run replaced by a single space. Returns the placeholder
        "N/A" when the input is None, empty, or contains only whitespace.
    """
    if not text:
        return "N/A"
    normalized = re.sub(r'\s+', ' ', text.strip())
    # Whitespace-only input strips down to ""; report it with the same
    # placeholder as missing text so callers have a single "no value" marker.
    return normalized or "N/A"

def random_delay():
    """Pause for a uniformly random 1-3 seconds between requests to reduce the chance of being blocked."""
    pause_seconds = random.uniform(1, 3)
    time.sleep(pause_seconds)

# ---------------- ENHANCED SCRAPERS ----------------
def search_naukri(keyword):
    """Scrape the Naukri results page for `keyword`.

    Args:
        keyword: Search phrase; it is lower-cased and its spaces are replaced
            by hyphens to build the ``/<phrase>-jobs`` URL.

    Returns:
        A list of ``(title, company, location, link)`` tuples for at most
        RESULTS_PER_SITE cards that have a title and pass location_allowed.
        A missing company or location is reported as "N/A" and a missing link
        as "#". Any exception (network failure, HTTP error status, parsing
        problem) is printed as a warning and whatever was collected up to that
        point is returned.
    """
    jobs = []
    base_url = "https://www.naukri.com"
    search_term = keyword.replace(' ', '-').lower()
    url = f"{base_url}/{search_term}-jobs"

    # Candidate selectors, tried in order; the first one that matches wins.
    # Defined once here rather than rebuilt for every card.
    card_selectors = [
        "article.jobTuple",
        "div.jobTuple",
        "div[class*='jobTuple']",
        "div.srp-jobtuple-wrapper",
        "div.job-tuple"
    ]
    title_selectors = ["a.title", "h2 a", "h3 a", "a[class*='title']", ".jobTupleHeader a"]
    company_selectors = ["a.subTitle", ".companyInfo a", "a[class*='subTitle']", ".company a"]
    location_selectors = ["li.location", ".locWdth", ".location", "[class*='location']"]

    def first_match(node, selectors):
        """Return the first element under `node` matched by any selector, or None."""
        for sel in selectors:
            elem = node.select_one(sel)
            if elem:
                return elem
        return None

    try:
        random_delay()
        r = requests.get(url, headers=HEADERS, timeout=30)
        # Surface 4xx/5xx responses as a warning instead of parsing an error
        # page and silently reporting zero cards.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        cards = []
        for selector in card_selectors:
            cards = soup.select(selector)
            if cards:
                break

        cards = cards[:RESULTS_PER_SITE]
        print(f"[DEBUG] Found {len(cards)} Naukri cards for '{keyword}'")

        for card in cards:
            title = company = location = link = None

            # Title and link come from the same anchor element.
            title_elem = first_match(card, title_selectors)
            if title_elem:
                title = clean_text(title_elem.get_text())
                href = title_elem.get('href')
                # Only build a link when an href exists; a missing href falls
                # through to the "#" placeholder below rather than becoming
                # "https://www.naukri.com#".
                if href:
                    link = href if href.startswith('http') else f"{base_url}{href}"

            company_elem = first_match(card, company_selectors)
            if company_elem:
                company = clean_text(company_elem.get_text())

            location_elem = first_match(card, location_selectors)
            if location_elem:
                location = clean_text(location_elem.get_text())

            if title and title != "N/A" and location_allowed(location, title):
                jobs.append((title, company or "N/A", location or "N/A", link or "#"))

    except Exception as e:
        print(f"[WARN] Naukri fetch failed for '{keyword}': {e}")

    return jobs

def search_linkedin(keyword):
    """Scrape the LinkedIn job search page for `keyword` (location "India").

    Args:
        keyword: Search phrase; URL-encoded with quote_plus.

    Returns:
        A list of ``(title, company, location, link)`` tuples for at most
        RESULTS_PER_SITE cards that have a title and pass location_allowed.
        A missing company or location is reported as "N/A" and a missing link
        as "#". Any exception (network failure, HTTP error status, parsing
        problem) is printed as a warning and whatever was collected up to that
        point is returned.
    """
    jobs = []
    base_url = "https://www.linkedin.com"
    search_term = quote_plus(keyword)
    url = f"{base_url}/jobs/search?keywords={search_term}&location=India"

    # Candidate selectors, tried in order; the first one that matches wins.
    # Defined once here rather than rebuilt for every card.
    card_selectors = [
        "div.base-card",
        "li.result-card",
        "div[class*='job-search-card']",
        "div.job-card-container"
    ]
    title_selectors = ["h3", "h3 a", ".job-card-list__title", "a .sr-only"]
    company_selectors = ["h4", "h4 a", ".job-card-container__company-name", ".job-result-card__company-name"]
    location_selectors = [".job-search-card__location", ".job-result-card__location", ".job-card-container__metadata-item"]
    link_selectors = ["a", "h3 a"]

    def first_text(node, selectors):
        """Return the cleaned text of the first element matched by any selector, or None."""
        for sel in selectors:
            elem = node.select_one(sel)
            if elem:
                return clean_text(elem.get_text())
        return None

    try:
        random_delay()
        r = requests.get(url, headers=HEADERS, timeout=30)
        # Surface 4xx/5xx responses as a warning instead of parsing an error
        # page and silently reporting zero cards.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        cards = []
        for selector in card_selectors:
            cards = soup.select(selector)
            if cards:
                break

        cards = cards[:RESULTS_PER_SITE]
        print(f"[DEBUG] Found {len(cards)} LinkedIn cards for '{keyword}'")

        for card in cards:
            title = first_text(card, title_selectors)
            company = first_text(card, company_selectors)
            location = first_text(card, location_selectors)

            # Unlike the text fields, a link selector only counts when the
            # matched element actually carries an href.
            link = None
            for sel in link_selectors:
                elem = card.select_one(sel)
                if elem and elem.get('href'):
                    link = elem['href']
                    if not link.startswith('http'):
                        link = f"{base_url}{link}"
                    break

            if title and title != "N/A" and location_allowed(location, title):
                jobs.append((title, company or "N/A", location or "N/A", link or "#"))

    except Exception as e:
        print(f"[WARN] LinkedIn fetch failed for '{keyword}': {e}")

    return jobs

def search_indeed(keyword):
    """Scrape the Indeed India results page for `keyword` (location "India").

    Args:
        keyword: Search phrase; URL-encoded with quote_plus.

    Returns:
        A list of ``(title, company, location, link)`` tuples for at most
        RESULTS_PER_SITE cards that have a title and pass location_allowed.
        Fields whose element is not found are reported as "N/A" (link: "#").
        Any exception (network failure, HTTP error status, parsing problem) is
        printed as a warning and whatever was collected up to that point is
        returned.
    """
    jobs = []
    search_term = quote_plus(keyword)
    url = f"https://in.indeed.com/jobs?q={search_term}&l=India"

    try:
        random_delay()
        r = requests.get(url, headers=HEADERS, timeout=30)
        # Surface 4xx/5xx responses as a warning instead of parsing an error
        # page and silently reporting zero cards.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        cards = soup.select("div.job_seen_beacon, div[data-jk], td.resultContent")[:RESULTS_PER_SITE]
        print(f"[DEBUG] Found {len(cards)} Indeed cards for '{keyword}'")

        for card in cards:
            title_elem = card.select_one("h2 a span, h2 span a, a[data-testid='job-title']")
            company_elem = card.select_one("span.companyName, a .companyName, span[data-testid='company-name']")
            location_elem = card.select_one("div.companyLocation, div[data-testid='job-location']")
            link_elem = card.select_one("h2 a, a[data-testid='job-title']")

            title = clean_text(title_elem.get_text()) if title_elem else "N/A"
            company = clean_text(company_elem.get_text()) if company_elem else "N/A"
            location = clean_text(location_elem.get_text()) if location_elem else "N/A"
            link = link_elem.get('href', '#') if link_elem else "#"

            # Site-relative hrefs are made absolute; the "#" placeholder is left alone.
            if not link.startswith('http') and link != "#":
                link = f"https://in.indeed.com{link}"

            # `title and ...` also rejects an empty title, matching the check
            # used by the Naukri and LinkedIn scrapers.
            if title and title != "N/A" and location_allowed(location, title):
                jobs.append((title, company, location, link))

    except Exception as e:
        print(f"[WARN] Indeed fetch failed for '{keyword}': {e}")

    return jobs

def search_glassdoor(keyword):
    """Scrape the Glassdoor India job listing page for `keyword`.

    Args:
        keyword: Search phrase; URL-encoded with quote_plus.

    Returns:
        A list of ``(title, company, location, link)`` tuples for at most
        RESULTS_PER_SITE cards that have a title and pass location_allowed.
        Fields whose element is not found are reported as "N/A" (link: "#").
        Any exception (network failure, HTTP error status, parsing problem) is
        printed as a warning and whatever was collected up to that point is
        returned.
    """
    jobs = []
    search_term = quote_plus(keyword)
    url = f"https://www.glassdoor.co.in/Job/jobs.htm?sc.keyword={search_term}&locT=N&locId=115"

    try:
        random_delay()
        r = requests.get(url, headers=HEADERS, timeout=30)
        # Surface 4xx/5xx responses as a warning instead of parsing an error
        # page and silently reporting zero cards.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        cards = soup.select("li[data-test='jobListing'], div.react-job-listing")[:RESULTS_PER_SITE]
        print(f"[DEBUG] Found {len(cards)} Glassdoor cards for '{keyword}'")

        for card in cards:
            # The title anchor doubles as the source of the job link.
            title_elem = card.select_one("a[data-test='job-link'], .jobLink")
            company_elem = card.select_one("[data-test='employer-name'], .jobEmpolyerName")
            location_elem = card.select_one("[data-test='job-location'], .jobLocation")

            title = clean_text(title_elem.get_text()) if title_elem else "N/A"
            company = clean_text(company_elem.get_text()) if company_elem else "N/A"
            location = clean_text(location_elem.get_text()) if location_elem else "N/A"
            link = title_elem.get('href', '#') if title_elem else "#"

            # Site-relative hrefs are made absolute; the "#" placeholder is left alone.
            if not link.startswith('http') and link != "#":
                link = f"https://www.glassdoor.co.in{link}"

            # `title and ...` also rejects an empty title, matching the check
            # used by the Naukri and LinkedIn scrapers.
            if title and title != "N/A" and location_allowed(location, title):
                jobs.append((title, company, location, link))

    except Exception as e:
        print(f"[WARN] Glassdoor fetch failed for '{keyword}': {e}")

    return jobs

# ---------------- ENHANCED EMAIL ----------------
def send_email(jobs):
    """Render `jobs` as an HTML digest and send it through Gmail SMTP.

    Jobs are split into a remote section and a location-based section using
    is_remote_job(location, title); a section is omitted when it has no jobs.
    Every scraped field is HTML-escaped before it is placed in the message.

    Args:
        jobs: List of ``(title, company, location, link)`` tuples.

    Returns:
        None. A failure to connect, authenticate or send is caught and printed
        as an error; it is not raised.
    """
    now = datetime.now().strftime("%Y-%m-%d")
    subject = f"🎯 Daily Analyst Job Digest – {now} ({len(jobs)} opportunities)"

    # Separate remote and location-based jobs
    remote_jobs = []
    location_jobs = []
    for job in jobs:
        title, company, location, link = job
        if is_remote_job(location, title):
            remote_jobs.append(job)
        else:
            location_jobs.append(job)

    def render_section(heading, color, section_jobs):
        """Return the HTML for one titled group of job cards in the given accent color."""
        parts = [f"""
        <h3 style="color: {color};">{heading} ({len(section_jobs)} jobs)</h3>
        <div style="margin-left: 20px;">
        """]
        for title, company, location, link in section_jobs:
            # Scraped values come from third-party pages: escape them so stray
            # markup or quotes cannot break or inject into the email HTML.
            safe_title = html.escape(str(title))
            safe_company = html.escape(str(company))
            safe_location = html.escape(str(location))
            safe_link = html.escape(str(link), quote=True)
            parts.append(f"""
            <div style="margin-bottom: 15px; padding: 10px; border-left: 3px solid {color}; background-color: #f8f9fa;">
                <strong style="color: {color};">{safe_title}</strong><br>
                <span style="color: #666;">{safe_company} – {safe_location}</span><br>
                <a href="{safe_link}" style="color: #2c5aa0; text-decoration: none;">Apply Here →</a>
            </div>
            """)
        parts.append("</div>")
        return "".join(parts)

    body_parts = [f"""
    <html>
    <body style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;">
        <h2 style="color: #2c5aa0;">🎯 Daily Analyst Job Opportunities</h2>
        <p>Hello Sam,</p>
        <p>Here are today's top entry-level analyst job opportunities ({len(jobs)} total jobs found):</p>
    """]

    if remote_jobs:
        body_parts.append(render_section("🏠 Remote/WFH Opportunities", "#28a745", remote_jobs))

    if location_jobs:
        body_parts.append(render_section("📍 Location-Based Opportunities", "#dc3545", location_jobs))

    body_parts.append("""
        <hr style="margin: 30px 0;">
        <p style="color: #666; font-size: 14px;">
            💡 <strong>Tips:</strong><br>
            • Apply early for better chances<br>
            • Customize your resume for each role<br>
            • Research the company before applying<br>
            • Follow up after 1-2 weeks if no response
        </p>
        <p>Good luck with your applications! 🚀</p>
    </body>
    </html>
    """)
    body = "".join(body_parts)

    msg = MIMEMultipart()
    msg["From"] = SENDER_EMAIL
    msg["To"] = RECEIVER_EMAIL
    msg["Subject"] = subject
    msg.attach(MIMEText(body, "html"))

    try:
        # The context manager closes the connection even when starttls/login/
        # sendmail raises; the timeout keeps a stalled connection from hanging
        # the run indefinitely.
        with smtplib.SMTP("smtp.gmail.com", 587, timeout=30) as server:
            server.starttls()
            server.login(SENDER_EMAIL, APP_PASSWORD)
            server.sendmail(SENDER_EMAIL, RECEIVER_EMAIL, msg.as_string())
        print(f"[INFO] ✅ Email sent successfully to {RECEIVER_EMAIL}")
    except Exception as e:
        print(f"[ERROR] ❌ Failed to send email: {e}")

# ---------------- ENHANCED MAIN ----------------
def fetch_all_and_send():
    """Run every scraper for every keyword, de-duplicate the hits and email the digest.

    Duplicates are detected on (punctuation-stripped lower-case title,
    lower-case company); the first occurrence is kept. Remaining jobs are
    ordered remote-first, then by company name, before being handed to
    send_email. When nothing matched, no email is sent.
    """
    print(f"[INFO] 🔍 Starting job search at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"[INFO] 📍 Target locations: {', '.join(TARGET_LOCATIONS)}")
    print(f"[INFO] 🏠 Include remote: {INCLUDE_REMOTE}")

    # (display name, scraper) pairs, queried in this order for each keyword
    search_functions = (
        ("Naukri", search_naukri),
        ("LinkedIn", search_linkedin),
        ("Indeed", search_indeed),
        ("Glassdoor", search_glassdoor),
    )

    all_jobs = []
    for kw in KEYWORDS:
        print(f"\n[INFO] 🔎 Searching for: '{kw}'")
        for site_name, search_func in search_functions:
            # One failing site must not abort the remaining sites/keywords.
            try:
                jobs = search_func(kw)
                all_jobs.extend(jobs)
                print(f"[INFO] {site_name}: Found {len(jobs)} jobs")
            except Exception as e:
                print(f"[WARN] {site_name}: Search failed - {e}")

    print(f"\n[INFO] 📊 Total jobs before deduplication: {len(all_jobs)}")

    punctuation = re.compile(r'[^\w\s]')

    # A dict keeps insertion order, so setdefault retains the first job seen
    # for each key in its original position.
    first_by_key = {}
    for candidate in all_jobs:
        job_title, job_company, _, _ = candidate
        dedup_key = (
            punctuation.sub('', job_title.lower()).strip(),
            job_company.lower().strip(),
        )
        first_by_key.setdefault(dedup_key, candidate)
    unique_jobs = list(first_by_key.values())

    print(f"[INFO] 📊 Unique jobs after deduplication: {len(unique_jobs)}")

    if not unique_jobs:
        print("[INFO] ❌ No matching jobs found today.")
        return

    # False sorts before True, so remote jobs (not remote == False) come first.
    unique_jobs.sort(key=lambda x: (not is_remote_job(x[2], x[0]), x[1].lower()))
    send_email(unique_jobs)

    # Summary line
    remote_count = len([job for job in unique_jobs if is_remote_job(job[2], job[0])])
    location_count = len(unique_jobs) - remote_count
    print(f"[INFO] 📧 Email sent with {len(unique_jobs)} jobs ({remote_count} remote, {location_count} location-based)")

# Entry point: run one full search-and-email pass when this code is executed
# directly (the captured output below this cell shows it runs when the cell is
# executed in the notebook).
if __name__ == "__main__":
    fetch_all_and_send()

[INFO] 🔍 Starting job search at 2025-08-25 18:18:10
[INFO] 📍 Target locations: Gurgaon, Noida, Jaipur, Delhi, NCR, Gurugram, New Delhi
[INFO] 🏠 Include remote: True

[INFO] 🔎 Searching for: 'Entry level Data Analyst'
[DEBUG] Found 0 Naukri cards for 'Entry level Data Analyst'
[INFO] Naukri: Found 0 jobs
[DEBUG] Found 10 LinkedIn cards for 'Entry level Data Analyst'
[INFO] LinkedIn: Found 0 jobs
[DEBUG] Found 0 Indeed cards for 'Entry level Data Analyst'
[INFO] Indeed: Found 0 jobs
[DEBUG] Found 0 Glassdoor cards for 'Entry level Data Analyst'
[INFO] Glassdoor: Found 0 jobs

[INFO] 🔎 Searching for: 'Entry level Business Analyst'
[DEBUG] Found 0 Naukri cards for 'Entry level Business Analyst'
[INFO] Naukri: Found 0 jobs
[DEBUG] Found 10 LinkedIn cards for 'Entry level Business Analyst'
[INFO] LinkedIn: Found 2 jobs
[DEBUG] Found 0 Indeed cards for 'Entry level Business Analyst'
[INFO] Indeed: Found 0 jobs
[DEBUG] Found 0 Glassdoor cards for 'Entry level Business Analyst'
[INFO] Glassdoor