<a href="https://colab.research.google.com/github/kanika26187/Web-Crawling/blob/main/web_Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


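# Install required packages (only needed once per runtime; pandas is imported below but not installed here -- presumably preinstalled, as on Colab)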
!pip install requests beautifulsoup4 tqdm

# Imports
import re
from collections import deque
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Email pattern: a local part (letters, digits and ._%+-), an "@", a domain of
# letters/digits/dots/hyphens, then a dot followed by at least two ASCII letters.
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Validate internal links
def is_valid_url(url, base_domain):
    """Return True when *url* is an http(s) URL on *base_domain*.

    Args:
        url: Absolute URL string to check.
        base_domain: Network location (``host[:port]``) the URL must match.

    Returns:
        ``True`` if the scheme is ``http`` or ``https`` and the URL's network
        location equals *base_domain*, ignoring letter case; otherwise
        ``False``.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False
    # Host names are case-insensitive, so "WWW.Example.com" and
    # "www.example.com" must be treated as the same site.
    return parsed.netloc.lower() == base_domain.lower()

# Extract links from a page
def get_all_links(url, base_domain):
    """Fetch *url* and return the internal links found in its HTML.

    Every ``<a>`` tag with an ``href`` is resolved against *url*. Links
    accepted by ``is_valid_url(href, base_domain)`` are kept, with any
    ``#fragment`` removed.

    Args:
        url: Address of the page to download and scan.
        base_domain: Network location a link must match to be kept.

    Returns:
        A set of absolute URL strings; empty when the page cannot be
        fetched or parsed.
    """
    try:
        resp = requests.get(url, timeout=5)
        soup = BeautifulSoup(resp.text, "html.parser")
    except Exception:
        # Best-effort crawl: an unreachable or unparsable page contributes no
        # links. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate so a running crawl can still be stopped.
        return set()

    links = set()
    for a_tag in soup.find_all("a", href=True):
        try:
            href = urljoin(url, a_tag["href"])
            if is_valid_url(href, base_domain):
                links.add(href.split("#")[0])  # Remove # anchors
        except ValueError:
            # URL parsing rejects some malformed hrefs (e.g. a broken IPv6
            # literal); skip that one link instead of losing the whole page.
            continue
    return links

# Crawl and collect emails
def crawl_website(start_url, max_pages=50):
    """Crawl one site in FIFO order and collect e-mail addresses.

    Starting from *start_url*, each page is downloaded, its text is scanned
    with ``EMAIL_REGEX``, and the internal links reported by
    ``get_all_links`` are appended to the queue. Progress is printed as the
    crawl runs.

    Args:
        start_url: First page to fetch; its network location decides which
            links count as internal.
        max_pages: Maximum number of URLs to visit. URLs whose download
            fails still count towards this limit.

    Returns:
        A set of the distinct e-mail strings found.
    """
    visited = set()
    emails = set()
    # deque gives O(1) pops from the left; list.pop(0) shifts every element.
    queue = deque([start_url])
    domain = urlparse(start_url).netloc

    print(f"Starting crawl on {start_url}...\n")

    while queue and len(visited) < max_pages:
        current_url = queue.popleft()
        if current_url in visited:
            # The same link can be queued more than once before it is visited.
            continue
        visited.add(current_url)
        print(f" Crawling: {current_url}")

        try:
            r = requests.get(current_url, timeout=5)
            found_emails = re.findall(EMAIL_REGEX, r.text)
            emails.update(found_emails)
            new_links = get_all_links(current_url, domain)
            queue.extend(new_links - visited)
        except Exception:
            # Best-effort: any ordinary failure skips this page. A bare
            # ``except`` would also swallow KeyboardInterrupt, making the
            # crawl impossible to stop with Ctrl-C.
            print(f" Failed: {current_url}")

    print(f"\n Crawl complete. {len(emails)} email(s) found.")
    return emails

# --- Crawl the target site ---
start_url = "https://www.thapar.edu"  # Change to your target site
emails = crawl_website(start_url=start_url, max_pages=50)

# Write the unique addresses, alphabetically sorted, to a one-column CSV
df = pd.DataFrame(data=sorted(emails), columns=["Email"])
df.to_csv(path_or_buf="thapar_emails.csv", index=False)
print(" Emails saved to thapar_emails.csv")

🔍 Starting crawl on https://www.thapar.edu...

🔗 Crawling: https://www.thapar.edu
🔗 Crawling: https://www.thapar.edu/students/pages/student-grievance-redressal
🔗 Crawling: http://www.thapar.edu/upload/files/Group Medical Insurance Policy.pdf
🔗 Crawling: http://www.thapar.edu/upload/files/Specializations - TIET.pdf
🔗 Crawling: http://www.thapar.edu/upload/files/Initial Research Grant
🔗 Crawling: https://www.thapar.edu/upload/files/STARTUP POLICY_201_NISP_Final OCT2021.pdf
🔗 Crawling: http://www.thapar.edu/upload/files/Redressal, Prevention and Prohibition of Sexual Harassment at Workplace Policy.pdf
🔗 Crawling: http://www.thapar.edu/misces/pages/lmtsm
🔗 Crawling: https://www.thapar.edu/misces/pages/education-verification
🔗 Crawling: https://www.thapar.edu/misces/pages/forms
🔗 Crawling: https://www.thapar.edu/Faculty Development Policies
🔗 Crawling: https://www.thapar.edu/students/pages/teqip
🔗 Crawling: https://www.thapar.edu/outreaches/pages
🔗 Crawling: http://www.thapar.edu/misces/pag