In [None]:
#Libraries
import os
import json
import time
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from collections import deque


# v15 Scraper - Chat Context Limited

In [None]:
# Allow-list for the crawler: a discovered link is only followed when it falls
# under one of these entries. Most entries are bare hostnames; a few are full
# URLs (scheme + path). is_valid_url decides how each entry is compared against
# a candidate link.
VALID_DOMAINS = [
    'jindal.utdallas.edu',
    'infosystems.utdallas.edu',
    'fin.utdallas.edu',
    'accounting.utdallas.edu',
    'osim.utdallas.edu',
    'om.utdallas.edu',
    'marketing.utdallas.edu',
    'sem.utdallas.edu',
    'mba.utdallas.edu',
    'execed.utdallas.edu',
    'https://catalog.utdallas.edu/now/graduate/programs/jsom/',
    'utdsolv.utdallas.edu',
    'cometscommunity.utdallas.edu',
    'www.utdallas.edu',
    'jsom.utdallas.edu',
    'https://jindal.utdallas.edu/faculty/',
    'https://chairs.utdallas.edu/biographies/',
    'https://policy.utdallas.edu/',
    'http://deanofstudents.utdallas.edu/policies/',
]

In [None]:
# Deny-list: hosts whose pages only produced noisy data (because of their
# design or dynamic nature) and are therefore explicitly kept out of the crawl.
EXCLUDE_DOMAINS = [
    'coursebook.utdallas.edu',
    'news.utdallas.edu',
    'math.utdallas.edu',
]

In [None]:
# Optional extra links: if extra_links.json exists next to the notebook it is
# loaded here (it is expected to hold a JSON array of URL strings).
extra_links = []
if os.path.exists("extra_links.json"):
    with open("extra_links.json", "r", encoding="utf-8") as file:
        extra_links = json.load(file)

# `seed_urls` is not defined by any earlier cell, so referencing it directly
# raises NameError on a fresh kernel. Use it when a previous cell has defined
# it and fall back to an empty list otherwise.
# dict.fromkeys de-duplicates while keeping a stable, reproducible order
# (set() ordering varies between runs).
start_urls = list(dict.fromkeys(list(globals().get("seed_urls", [])) + list(extra_links)))


In [None]:
# Progress is written to disk every 500 scraped pages so that a premature
# stop does not lose the whole run.
CHECKPOINT_INTERVAL = 500

# Mutable crawl state shared by the functions below.
data = []            # one record per scraped page
visited = set()      # URLs that have already been scraped
scraped_count = 0    # number of pages scraped so far

def init_driver():
    """Create and return a Chrome WebDriver configured with the crawler's flags.

    The driver is started with '--headless', '--no-sandbox' and
    '--disable-dev-shm-usage'.
    """
    opts = Options()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

def _split_allowed_entry(entry):
    """Split an allow/deny-list entry into (lowercase host, path prefix)."""
    entry = entry.strip()
    # A bare hostname has no scheme; prefixing '//' makes urlparse read it as
    # the network location instead of as a path.
    parsed = urlparse(entry if '://' in entry else '//' + entry)
    host = (parsed.hostname or '').lower()
    # Drop the trailing slash so that '/faculty/' also admits '/faculty'.
    return host, parsed.path.rstrip('/')


def _host_within(host, allowed_host):
    """Return True when host equals allowed_host or is a sub-domain of it."""
    if not allowed_host:
        return False
    return host == allowed_host or host.endswith('.' + allowed_host)


def is_valid_url(url, valid_domains=None, exclude_domains=None):
    """Decide whether a discovered link may be crawled.

    A URL is accepted when it is an http(s) URL whose host is not on the
    exclusion list and which falls under at least one allow-list entry.

    Allow-list entries may be:
      * a bare hostname ('jindal.utdallas.edu') - matches that host and any
        of its sub-domains, on a dot boundary;
      * a full URL ('https://catalog.utdallas.edu/now/graduate/programs/jsom/')
        - matches that host (or a sub-domain) only for paths under the given
        path prefix.

    Args:
        url: Candidate URL; None or '' is rejected.
        valid_domains: Allow-list; defaults to the module-level VALID_DOMAINS.
        exclude_domains: Deny-list; defaults to the module-level EXCLUDE_DOMAINS.

    Returns:
        True if the URL should be crawled, otherwise False.
    """
    if not url:
        return False
    if valid_domains is None:
        valid_domains = VALID_DOMAINS
    if exclude_domains is None:
        exclude_domains = EXCLUDE_DOMAINS

    try:
        parsed = urlparse(url.strip())
        host = (parsed.hostname or '').lower()
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
        # brackets); such a link is simply not crawlable.
        return False

    # Comparing the hostname (not netloc) ignores ports and credentials;
    # mailto:, javascript: and similar links carry no host and are rejected.
    if parsed.scheme not in ('http', 'https') or not host:
        return False

    for excluded in exclude_domains:
        excluded_host, _ = _split_allowed_entry(excluded)
        if _host_within(host, excluded_host):
            return False

    path = parsed.path
    for allowed in valid_domains:
        allowed_host, path_prefix = _split_allowed_entry(allowed)
        if not _host_within(host, allowed_host):
            continue
        # An empty prefix means the entry covers the whole host.
        if not path_prefix or path == path_prefix or path.startswith(path_prefix + '/'):
            return True
    return False

def scrape_data(driver, url):
    """Load a page and return its URL, title and paragraph text.

    Args:
        driver: WebDriver-like object used to load the page.
        url: Address of the page to scrape.

    Returns:
        dict with keys 'url', 'title' and 'text'. 'text' is the text of all
        <p> elements joined by single spaces. If loading or reading the page
        fails, the error is printed and 'title' / 'text' are set to
        'No title found' / 'No text found' instead of raising.
    """
    page = {'url': url, 'title': 'No title found', 'text': 'No text found'}
    try:
        # Navigation and the title lookup are inside the try block so that a
        # single unreachable page cannot abort the whole crawl.
        driver.get(url)
        title = driver.title
        paragraphs = driver.find_elements(By.TAG_NAME, 'p')
        # Read each element's text once; every .text access is a separate
        # round trip to the browser.
        texts = [p.text for p in paragraphs]
        page['title'] = title
        page['text'] = ' '.join(t for t in texts if t)
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")               #Error handling during scraping
        page['title'] = 'No title found'
        page['text'] = 'No text found'
    return page

def get_links(driver):
    """Collect the crawlable links on the page currently loaded in driver.

    Args:
        driver: WebDriver-like object positioned on the page to inspect.

    Returns:
        List of href values that pass is_valid_url, in page order and without
        duplicates. An empty list is returned if the anchors cannot be read.
    """
    links = []
    seen = set()
    try:
        elements = driver.find_elements(By.TAG_NAME, 'a')
    except Exception as e:
        # Best effort: a page whose anchors cannot be listed contributes no links.
        print(f"Error collecting links: {str(e)}")
        return links

    for element in elements:
        try:
            href = element.get_attribute('href')
            if href and href not in seen and is_valid_url(href):
                seen.add(href)
                links.append(href)
        except Exception as e:
            # Skip only the offending anchor (for example one that left the
            # DOM while iterating) and keep the remaining links of the page.
            print(f"Skipping link: {str(e)}")
    return links

def save_checkpoint():
    """Write everything scraped so far to a numbered checkpoint file.

    The file name embeds the current value of scraped_count and the content
    is the module-level data list serialized as indented JSON.
    """
    checkpoint_path = f"recursive_checkpoint_{scraped_count}.json"
    serialized = json.dumps(data, indent=2)
    with open(checkpoint_path, 'w') as out:
        out.write(serialized)
    # Confirmation that the periodic checkpoint was written
    print(f"\n💾 Checkpoint saved: {checkpoint_path}")

MAX_DEPTH = 4  # Pages more than this many link hops from a seed URL are not scraped

def crawl(seed_urls):
    """Breadth-first crawl starting from seed_urls.

    Every page is scraped with scrape_data and appended to the module-level
    data list; links returned by get_links are followed up to MAX_DEPTH hops
    from a seed. A checkpoint is saved every CHECKPOINT_INTERVAL pages.

    Args:
        seed_urls: Iterable of URLs to start from (depth 0).

    Returns:
        The module-level data list holding one record per scraped page.

    Raises:
        Any exception raised while crawling is re-raised after the progress
        made so far has been checkpointed and the browser has been closed.
    """
    global scraped_count
    seeds = list(seed_urls)
    driver = init_driver()
    queue = deque((url, 0) for url in seeds)
    # URLs already placed in the queue, so the same link found on many pages
    # is enqueued only once.
    queued = set(seeds)

    try:
        while queue:
            url, depth = queue.popleft()
            if url in visited or depth > MAX_DEPTH:
                continue
            visited.add(url)
            page = scrape_data(driver, url)
            data.append(page)
            scraped_count += 1

            new_links = []
            # Links found on a page at MAX_DEPTH could never be scraped, so
            # they are not collected at all.
            if depth < MAX_DEPTH:
                for link in get_links(driver):
                    # A '#fragment' addresses a position inside the same
                    # document; strip it so one page is not scraped once per anchor.
                    target = link.split('#', 1)[0]
                    if not target or target in visited or target in queued:
                        continue
                    queued.add(target)
                    new_links.append(target)
            queue.extend((link, depth + 1) for link in new_links)

            print(f"Scraped: {scraped_count:<5}, To Scrape: {len(queue):<5} ({len(new_links)} links added) | Depth: {depth} | {url}")     #Real time log functionality to monitor scraper progress

            if scraped_count % CHECKPOINT_INTERVAL == 0:
                save_checkpoint()
    except BaseException:
        # Includes KeyboardInterrupt: keep whatever was scraped since the last
        # periodic checkpoint, then let the original error propagate.
        if data:
            save_checkpoint()
        raise
    finally:
        # Always release the browser process, even when the crawl fails.
        driver.quit()
    return data

#Seed URLs
sitemap_urls = [
    'https://accounting.utdallas.edu/sitemap/', 'https://fin.utdallas.edu/sitemap/',
    'https://osim.utdallas.edu/sitemap/', 'https://om.utdallas.edu/sitemap/',
    'https://marketing.utdallas.edu/sitemap/', 'https://jindal.utdallas.edu/masters-programs/ms-mba-engineering-management/',
    'https://sem.utdallas.edu/', 'https://mba.utdallas.edu/sitemap/',
    'https://execed.utdallas.edu/sitemap/', 'https://jindal.utdallas.edu/masters-programs/double-degree/',
    'https://infosystems.utdallas.edu/sitemap/', 'https://jindal.utdallas.edu/sitemap/', 'https://catalog.utdallas.edu/now/graduate/programs/jsom/',
    'https://jindal.utdallas.edu/faculty/', 'https://chairs.utdallas.edu/biographies/', 'https://policy.utdallas.edu/', 'http://deanofstudents.utdallas.edu/policies/'
]

# Links loaded from extra_links.json (when that cell has been run) are crawled
# together with the built-in seeds; without them the seed list is unchanged.
# dict.fromkeys removes duplicates while keeping the listed order.
crawl_seeds = list(dict.fromkeys(sitemap_urls + list(globals().get("extra_links", []))))

scraped_data = crawl(crawl_seeds)

# Written with the same explicit encoding the cleaning step reads it with.
with open("recursive_full_scraped_data.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=2)

print("\n✅ Full recursive scrape complete.")                               #File saved name and confirmation


# v15 Data Cleaner

In [None]:
import json
import re

# Load the raw crawl output written by the scraper cell.
# NOTE(review): this rebinds the module-level name `data`, which the crawler
# functions above also use as their shared results list.
with open('recursive_full_scraped_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Boilerplate patterns removed from page text. clean_text applies them one
# after another in list order, so an earlier pattern can consume text that a
# later pattern was written to match.
footer_patterns = [
    # DOTALL: removes everything from the university name up to the nearest
    # following "Back to Top", across line breaks.
    re.compile(r"The University of Texas at Dallas.*?Back to Top", re.DOTALL | re.IGNORECASE),
    re.compile(r"©? ?\d{4},? All rights reserved.*?\(?972\)?[-\d\s]+", re.IGNORECASE),
    re.compile(r"View Published Works", re.IGNORECASE),
    # NOTE(review): a trailing lazy ".*?" always matches the empty string, so
    # the next two patterns remove only the literal words "Sitemap" and
    # "Last Updated", not the text that follows them.
    re.compile(r"Sitemap.*?", re.IGNORECASE),
    re.compile(r"Last Updated.*?", re.IGNORECASE),
    re.compile(r"Policy Form: UT Dallas Faculty Authored Textbook Approval Form.*?Return to Top", re.DOTALL | re.IGNORECASE),
    re.compile(r"Naveen Jindal School of Management Business School", re.IGNORECASE),
    # NOTE(review): by the time this pattern runs, the "Last Updated.*?" entry
    # above has already removed the words "Last Updated", so this pattern has
    # effectively nothing left to match. Confirm the intended order before
    # changing it: without DOTALL, ".*" removes the rest of the line.
    re.compile(r"Last Updated:.*", re.IGNORECASE),
    re.compile(r"Copyright © 2025, all rights reserved\.", re.IGNORECASE),
    re.compile(r"800 w campbell road,", re.IGNORECASE),
    re.compile(r"richardson, tx 75080, usa · \(972\) 883-2705", re.IGNORECASE),
    # Removes every remaining occurrence of the university name (any case).
    re.compile(r"the university of texas at dallas", re.IGNORECASE),
    # Literal run of degree-programme names, removed verbatim.
    re.compile(r"- MA Arts, Technology, and Emerging Communication - MFA Arts, Technology, and Emerging Communication - PhD Audiology - AuD Bioinformatics and Computational Biology - MS Biomedical Engineering - MS Biomedical Engineering - PhD Biotechnology - MS Business Analytics - MS Business Administration", re.IGNORECASE)
]

# Contact-detail patterns used by clean_text to label values in the text.
email_regex = re.compile(r"[a-zA-Z0-9_.+-]+@utdallas\.edu")  # addresses ending in @utdallas.edu
phone_regex = re.compile(r"\(\d{3}\) \d{3}-\d{4}")           # "(ddd) ddd-dddd"
room_regex = re.compile(r"JSOM \d+\.\d+")                    # "JSOM <digits>.<digits>"

def clean_title(title):
    """Strip site boilerplate from a page title.

    Removes the school / university names and a trailing "| UT Dallas" or
    "- UT Dallas" suffix, drops separator characters left behind, and
    collapses whitespace. Hyphens inside words (e.g. "MS-MBA") are kept;
    only pipes and free-standing hyphens are treated as separators.

    Args:
        title: Raw page title; may be None or empty.

    Returns:
        The cleaned title, or "Untitled" when the input is empty or nothing
        but boilerplate remains.
    """
    if not title:
        return "Untitled"

    for phrase in ("Naveen Jindal School of Management",
                   "The University of Texas at Dallas"):
        title = title.replace(phrase, "")
    title = re.sub(r"\s*[|\-]\s*UT Dallas", "", title)

    # Pipes are always separators; replace with a space so the neighbouring
    # words are not glued together.
    title = title.replace("|", " ")
    # A hyphen run is a separator only when it stands alone (whitespace or a
    # string edge on both sides); hyphens inside a word are part of the title.
    title = re.sub(r"(?<!\S)-+(?!\S)", " ", title)

    title = re.sub(r"\s+", " ", title).strip()
    return title or "Untitled"

def _label_exact_matches(regex, target, prefix, text):
    """Prefix each match of regex whose full text equals target; leave others unchanged."""
    def _tag(match):
        value = match.group(0)
        return prefix + value if value == target else value
    return regex.sub(_tag, text)


def clean_text(text, faculty_name=""):
    """Remove footer boilerplate from page text and label contact details.

    Every pattern in footer_patterns is removed in list order. The first
    e-mail address, phone number and room number found are then prefixed
    with "<faculty_name> email id: ", "<faculty_name> phone number: " and
    "<faculty_name> office room number: " wherever that same value occurs.
    Finally all whitespace runs are collapsed to single spaces.

    Args:
        text: Raw page text; None or '' yields ''.
        faculty_name: Name placed in front of each contact-detail label.

    Returns:
        The cleaned, annotated text.
    """
    if not text:
        return ""

    for pattern in footer_patterns:
        text = pattern.sub("", text)

    # Locate all three values before annotating so that inserted labels
    # cannot influence which value is picked.
    found = [
        (email_regex, "email id", email_regex.search(text)),
        (phone_regex, "phone number", phone_regex.search(text)),
        (room_regex, "office room number", room_regex.search(text)),
    ]
    for regex, label, match in found:
        if match:
            # Only whole matches equal to the first value are labelled. A plain
            # str.replace would also rewrite the inside of a longer token that
            # merely contains it (e.g. "doe@..." inside "jdoe@...").
            text = _label_exact_matches(regex, match.group(0), f"{faculty_name} {label}: ", text)

    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Build one cleaned record per scraped page.
cleaned_data = []
for entry in data:
    title = clean_title(entry.get("title", ""))
    text = entry.get("text", "")
    # The name used for contact-detail labels is the part of the title before
    # the first comma (the whole title when there is no comma).
    faculty_name = title.partition(",")[0].strip()

    cleaned_data.append({
        "url": entry.get("url", "").strip(),
        "title": title,
        "text": clean_text(text, faculty_name),
    })

# Persist the cleaned records as UTF-8 JSON, keeping non-ASCII characters readable.
with open('v15_owlie_cleaned.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(cleaned_data, ensure_ascii=False, indent=2))

print("✅ All pages cleaned and saved to 'v15_owlie_cleaned.json'.")


# JSON to XLSX converter
Exports the cleaned data to Excel so it can be checked manually for cleanliness.

In [None]:
# Read the cleaned records back from disk.
with open('v15_owlie_cleaned.json', 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

# One row per page, one column per record key; the index is not exported.
df = pd.DataFrame(data)
df.to_excel('v15_owlie_cleaned.xlsx', index=False)

print("✅ JSON successfully converted to Excel: v15_owlie_cleaned.xlsx")