In [1]:
# ✅ Install required packages (run once)
!pip install requests beautifulsoup4 tqdm

# ✅ Imports
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import pandas as pd
from tqdm import tqdm

# ✅ Email pattern (Regex)
EMAIL_REGEX = r"[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# ✅ Validate internal links
def is_valid_url(url, base_domain):
    parsed = urlparse(url)
    return parsed.netloc == base_domain and parsed.scheme in ["http", "https"]

# ✅ Extract links from a page
def get_all_links(url, base_domain):
    try:
        resp = requests.get(url, timeout=5)
        soup = BeautifulSoup(resp.text, "html.parser")
        links = set()
        for a_tag in soup.find_all("a", href=True):
            href = urljoin(url, a_tag['href'])
            if is_valid_url(href, base_domain):
                links.add(href.split("#")[0])  # Remove anchor parts
        return links
    except:
        return set()

# ✅ Crawl and collect emails
def crawl_website(start_url, max_pages=20):  # Changed from 50 to 20
    visited = set()
    emails = set()
    queue = [start_url]
    domain = urlparse(start_url).netloc

    print(f"Starting crawl on {start_url}...\n")

    while queue and len(visited) < max_pages:
        current_url = queue.pop(0)
        if current_url in visited:
            continue

        visited.add(current_url)
        print(f"🕸 Crawling: {current_url}")

        try:
            r = requests.get(current_url, timeout=5)
            found_emails = re.findall(EMAIL_REGEX, r.text)
            emails.update(found_emails)

            new_links = get_all_links(current_url, domain)
            queue.extend(new_links - visited)

        except:
            print(f"❌ Failed: {current_url}")

    print(f"\n✅ Crawl complete. {len(emails)} email(s) found.")
    return emails

# ✅ Run the crawler
start_url = "https://www.thapar.edu"  # 🔁 Change this to your target site
emails = crawl_website(start_url, max_pages=20)  # Changed here too

# ✅ Save results
df = pd.DataFrame(sorted(emails), columns=["Email"])
df.to_csv("thapar_emails.csv", index=False)
print("\n📁 Emails saved to thapar_emails.csv")




Starting crawl on https://www.thapar.edu...

🕸 Crawling: https://www.thapar.edu
🕸 Crawling: http://www.thapar.edu/upload/files/Faculty TIET & LMTSM Recruitment Policy.pdf
🕸 Crawling: https://www.thapar.edu/Anti Ragging
🕸 Crawling: http://www.thapar.edu/upload/files/Group Accident Insurance Policy.pdf
🕸 Crawling: http://www.thapar.edu/upload/files/Policy for Post Doc Fellowship
🕸 Crawling: https://www.thapar.edu/aboutus/newspage/alumni-achievement
🕸 Crawling: https://www.thapar.edu/pages/contactus
🕸 Crawling: https://www.thapar.edu/students/pages/webkiosk
🕸 Crawling: https://www.thapar.edu/students
🕸 Crawling: https://www.thapar.edu/upload/files/Accessibility Tools for Divyangjan.pdf
🕸 Crawling: https://www.thapar.edu/aboutus/pages/financial
🕸 Crawling: https://www.thapar.edu/misces/latestpages/opening
🕸 Crawling: http://www.thapar.edu/upload/files/Redressal, Prevention and Prohibition of Sexual Harassment at Workplace Policy.pdf
🕸 Crawling: http://www.thapar.edu/upload/files/ABET final