# Scrape University Websites

## Setup

In [None]:
# Import required modules
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

import chromedriver_autoinstaller
import tldextract
import random

from random import randint
from time import sleep

pd.set_option("display.max_columns", None)
from tqdm import tqdm

tqdm.pandas()

from skimpy import skim

import pickle

In [None]:
# Selenium headless Chrome driver.
# The module-level `driver` created here is the single browser session used by
# the scraping functions defined in the cells below.

options = Options()
# Run Chrome without a visible window
options.add_argument("--headless=new")
# Send a custom user-agent string with every request
options.add_argument("user-agent=INDIAN_UNIVERSITIES")

# block file downloads
# NOTE(review): 3 is presumably Chrome's "block all downloads" setting — confirm
prefs = {"download_restrictions": 3}
options.add_experimental_option("prefs", prefs)

# Initialize the Selenium WebDriver
driver = webdriver.Chrome(options=options)

# Optional: uncomment to cap each page load at 60 seconds
# driver.set_page_load_timeout(60)  # 60 seconds timeout for page load only if needed

In [None]:
# File extensions that identify downloadable assets rather than web pages.
# Links containing any of these are skipped while crawling.
non_webpage_extensions = [
    # Documents
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    # Images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".svg",
    # Audio & Video
    ".mp3", ".wav", ".ogg", ".mp4", ".avi", ".mkv", ".flv",
    # Archive
    ".zip", ".rar", ".tar", ".gz", ".7z",
    # Executable & Disk Images
    ".exe", ".dmg", ".iso", ".bin",
]

In [None]:
# Extract domain name from URL
def extract_domain(url):
    """
    Returns the domain plus public suffix of a URL, without any sub-domain.

    Args:
    - url: A URL (or bare host name) to analyse.

    Returns:
    - The string "<domain>.<suffix>". When tldextract reports an empty
      suffix (e.g. IP addresses or "localhost") or an empty domain, only the
      non-empty part is returned, so the result never starts or ends with a
      stray dot.
    """
    extracted = tldextract.extract(url)
    # Join only the non-empty parts; a plain f-string would yield "localhost."
    # for hosts that have no public suffix.
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

In [None]:
# Functions to collect all links within a website. Any links the crawl misses need to be checked manually later.


def extract_links_from_page(driver, url, base_url):
    """
    Extracts all internal web-page links from a given webpage using Selenium.

    A link is kept only when it
    - uses the http or https scheme (this drops mailto:, tel:, javascript: ...),
    - has a host whose domain equals the domain of base_url (sub-domains of
      the same site are therefore kept), and
    - has a path that does not end in one of non_webpage_extensions
      (compared case-insensitively).

    Args:
    - driver: A Selenium WebDriver instance.
    - url: The URL of the webpage to extract links from.
    - base_url: The base URL of the website to ensure links are internal.

    Returns:
    - A set of links from the webpage.
    """
    driver.get(url)

    # Loop invariants: compute once instead of once per anchor element
    base_domain = extract_domain(base_url).lower()
    blocked_suffixes = tuple(non_webpage_extensions)

    links_on_page = set()
    for elem in driver.find_elements(By.TAG_NAME, "a"):
        href = elem.get_attribute("href")
        if not href:
            continue

        try:
            parsed = urlparse(href)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): not crawlable
            continue

        if parsed.scheme not in ("http", "https"):
            continue

        # Compare the link's own host with the site's domain. A substring
        # test on the whole href would also accept external URLs that merely
        # mention the site, e.g. social-media share links.
        if extract_domain(parsed.hostname or "").lower() != base_domain:
            continue

        # Check only the path's ending. A substring test on the whole href
        # would reject hosts such as "www.target..." (contains ".tar") and
        # would miss upper-case extensions such as ".PDF".
        if parsed.path.lower().endswith(blocked_suffixes):
            continue

        links_on_page.add(href)
    return links_on_page


def get_all_website_links(base_url, max_depth=None):
    """
    Crawls a website breadth-first and collects every internal link found.

    Uses the module-level Selenium `driver`. Pages that time out or raise any
    other error are reported with print() and skipped, so one bad page does
    not abort the crawl.

    Args:
    - base_url: The URL where the crawl starts; also defines which links
      count as internal.
    - max_depth: Optional maximum link depth to follow. None (the default)
      means unlimited, 0 means only base_url itself is opened.

    Returns:
    - A list of the unique links found across all visited pages.
    """
    from collections import deque  # local import: O(1) pops from the front

    # Every link that has been queued (and therefore will be or was visited).
    # Tracking this at enqueue time stops the same link from being queued
    # once per page that references it.
    seen_links = {base_url}
    links_to_visit = deque([(base_url, 0)])  # Queue of (link, depth) pairs
    all_links = set()

    while links_to_visit:
        current_link, depth = links_to_visit.popleft()
        try:
            links_on_current_page = extract_links_from_page(
                driver, current_link, base_url
            )
        except TimeoutException:
            print(f"Timeout while processing {current_link}")
            continue
        except Exception as e:
            print(f"Error while processing {current_link}. Exception: {e}")
            continue

        all_links.update(links_on_current_page)

        # extract_links_from_page already restricts its result to the domain
        # of base_url, so only the "not seen yet" test is needed here.
        if max_depth is None or depth < max_depth:
            for link in links_on_current_page:
                if link not in seen_links:
                    seen_links.add(link)
                    links_to_visit.append((link, depth + 1))

        print(f"Total links found so far: {len(all_links)}. Current depth: {depth}")

    return list(all_links)

In [None]:
def extract_page_data(url):
    """
    Extracts page title and cleaned text from the given URL.

    Loads the page with the module-level Selenium `driver` and parses the
    rendered HTML with BeautifulSoup.

    Args:
    - url: The URL of the page to scrape.

    Returns:
    - A (title, cleaned_text) tuple. title is "No title" when the page has no
      <title> or an empty one. cleaned_text is the page text with script and
      style content removed and all whitespace runs collapsed to single spaces.
    - (None, None) when loading or parsing the page fails; the error is
      reported with print().
    """
    try:
        # WebDriver load page
        driver.get(url)

        # Create BeautifulSoup object and specify the parser
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # .string is None for an empty <title> or one with child tags, which
        # would bypass the "No title" fallback; get_text() covers both cases.
        title = soup.title.get_text(strip=True) if soup.title else ""
        if not title:
            title = "No title"

        # Drop code and stylesheet content so only page text is extracted
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # A separator keeps text from adjacent elements from being glued
        # together (e.g. "HomeAbout"); extra whitespace is collapsed below.
        raw_text = soup.get_text(separator=" ")
        cleaned_text = " ".join(raw_text.split())

        return title, cleaned_text
    except TimeoutException:
        print(f"Timeout exception for URL: {url}")
        return None, None
    except Exception as e:
        print(f"Error for URL {url}: {repr(e)}")
        return None, None

## Phase 1: Fetch All URLs

In [None]:
# Load the university website URLs, skipping rows without a website
website_list = (
    pd.read_csv("../data/ugc/india_websites_cleaned.csv")["website"].dropna().tolist()
)
len(website_list)

In [None]:
# Master list of the URLs collected from every website
all_urls = []

# Crawl each website in turn and add its links to the master list
for website in tqdm(website_list):
    all_urls += get_all_website_links(website)

len(all_urls)

In [None]:
# Checkpoint: save the list of all URLs to disk before we continue, so that
# Phase 2 can be run later without repeating the crawl.
with open("../data/all_urls_v1.pkl", "wb") as file:
    pickle.dump(all_urls, file)

# To resume from the checkpoint, uncomment the lines below.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
# # read list of urls if needed
# with open("../data/all_urls_v1.pkl", "rb") as file:
#     all_urls = pickle.load(file)

## Phase 2: Fetch Data from URLs

In [None]:
# Set a fixed seed so the shuffled order is the same on every run
random.seed(210420)

# Randomly shuffle all collected URLs (in place) to avoid pinging the same
# website at short intervals
random.shuffle(
    all_urls
)

In [None]:
# Rows of scraped results, one [url, title, cleaned_text] entry per page
data = []

# Scrape every URL; failed pages contribute a row with None title and text
for url in tqdm(all_urls):
    data.append([url, *extract_page_data(url)])

# Build a DataFrame from the collected rows and print a summary of it
df = pd.DataFrame(data, columns=["URL", "Title", "Text"])

skim(df)

In [None]:
# Save the scraped pages as a UTF-8 CSV file, without the DataFrame index column
df.to_csv(
    "../data/scraped_sites_v1.csv",
    encoding="utf-8",
    index=False,
)

# Also save as a Parquet file as it takes less space on server
df.to_parquet("../data/scraped_sites_v1.parquet", engine="pyarrow")

In [None]:
driver.quit()  # Close the WebDriver instance now that scraping is finished