In [53]:
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urlparse, urljoin

import os
from tqdm import tqdm

import pdfplumber
from docx import Document
import pandas as pd
from urllib.parse import urlparse

In [55]:
def download_file(url, save_dir="downloaded_files"):
    """Downloads a file from a URL and saves it locally."""
    os.makedirs(save_dir, exist_ok=True)  # Ensure directory exists
    filename = os.path.join(save_dir, os.path.basename(urlparse(url).path))
    
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(filename, "wb") as f:
                f.write(response.content)
            return filename
        else:
            return None
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return None

def extract_text_from_html(url):
    """Extracts text from an HTML webpage."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        text = "\n".join([p.get_text(strip=True) for p in soup.find_all("p")])
        return text.strip() if text else None
    except Exception as e:
        return None

def extract_text_from_pdf(file_path):
    """Extracts text from a PDF file."""
    try:
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        
        print(f"text: {text}")
        return text.strip() if text else None
    except Exception as e:
        print(e)
        return None

def extract_text_from_docx(file_path):
    """Extracts text from a DOCX file."""
    try:
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text.strip() if text else None
    except Exception as e:
        print(e)
        return None

def extract_text_from_excel(file_path):
    """Extracts text from an Excel file (reads all sheets)."""
    try:
        text = ""
        xls = pd.ExcelFile(file_path)
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)
            text += df.to_string(index=False) + "\n"
        return text.strip() if text else None
    except Exception as e:
        print(e)
        return None

def extract_text_from_plaintext(file_path):
    """Extracts text from a TXT file."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except Exception as e:
        print(e)
        return None

def extract_text_from_json(file_path):
    """Extracts text from a JSON file by flattening its values."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return json.dumps(data, indent=2)  # Convert JSON structure into readable text
    except Exception as e:
        print(e)
        return None

def deal_file_url(url, file_type):
    """Processes a URL, extracts text, and returns a RAG-compatible JSON dictionary."""
    text = None

    # Directly extract HTML text without downloading
    if file_type == "html":
        text = extract_text_from_html(url)

    else:
        # Download the file first
        file_path = download_file(url)
        if not file_path:
            return None
        

        # Extract text based on file type
        if file_type == "pdf":
            text = extract_text_from_pdf(file_path)
        elif file_type == "docx":
            text = extract_text_from_docx(file_path)
        elif file_type == "excel":
            text = extract_text_from_excel(file_path)
        elif file_type == "plaintext":
            text = extract_text_from_plaintext(file_path)
        elif file_type == "json":
            text = extract_text_from_json(file_path)

    if not text:
        return None  # Return None if text extraction failed

    # Format JSON entry for RAG
    json_entry = {
        "id": os.path.basename(urlparse(url).path).split(".")[0],
        "url": url,
        "title": os.path.basename(urlparse(url).path),
        "file_type": file_type,
        "content": text
    }

    return json_entry

In [58]:
def clean_text(text):
    """Remove extra spaces, newlines, and unwanted characters."""
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\[.*?\]", "", text)  # Remove references like [1]
    return text

def filter_links(links, base_url):
    base_domain = urlparse(base_url).netloc

    # Convert relative links to absolute URLs
    absolute_links = [urljoin(base_url, link) for link in links]

    useful_links = []
    for link in absolute_links:
        parsed_link = urlparse(link)
        netloc = parsed_link.netloc
        path = parsed_link.path.lower()

        # ✅ Remove external links
        if netloc != base_domain:
            continue

        # ✅ Remove unwanted navigation and UI links
        unwanted_keywords = [
            "login", "signup", "account", "profile", "settings", "cart", "terms", "privacy",
            "help", "contact", "about", "faq"
        ]
        if any(word in path for word in unwanted_keywords):
            continue

        # ✅ Remove JavaScript, email, and phone links
        if link.startswith(("javascript:", "mailto:", "tel:")):
            continue

        # ✅ Remove pagination links
        if "page=" in path or "offset=" in path:
            continue

        # ✅ Remove tracking, ad, and referral links
        if any(param in link for param in ["utm_", "ref=", "tracking"]):
            continue

        useful_links.append(link)
    
    return useful_links

def check_valid_links(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.head(url, headers=headers, timeout=5, allow_redirects=True)

        # Extract Content-Type from headers
        content_type = response.headers.get("Content-Type", "").lower()

        if "text/html" in content_type:
            return "Web"
        
        if "text/html" in content_type:
            return "html"
        elif "application/json" in content_type:
            return "json"
        elif "application/pdf" in content_type:
            return "pdf"
        elif "application/msword" in content_type or "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
            return "docx"
        elif "application/vnd.ms-excel" in content_type or "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" in content_type:
            return "excel"
        elif "text/plain" in content_type:
            return "plaintext"

        return None
    except requests.RequestException as e:
        print(f"Error checking URL: {url} - {e}")
        return None
    
def extract_website_content(url):
    """Fetch and parse content from a webpage, then convert it into a structured JSON entry."""

    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    }

    try:

        response = requests.get(url, headers=headers)
    except Exception as e:
        print(f"Can't open {url} gor error {e}")
        return {}, []
    
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code: {response.status_code}")
        return {}, []
    
    link_type = check_valid_links(url)


    if not link_type:
        return {}, []

    if link_type != "Web":
        print(f'File url: {url}')
        return deal_file_url(url, link_type), []

    try:
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        print(e)
        print(response)
        print(url)
        print(response.text)

    # Extract title
    title = soup.title.text if soup.title else "No Title Found"

    # Extract main content (paragraphs)
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
    content = clean_text("\n".join(paragraphs))
    
    # Extract keywords (based on meta tags)
    meta_keywords = soup.find("meta", {"name": "keywords"})
    keywords = meta_keywords["content"].split(",") if meta_keywords else []
    links = filter_links([a["href"] for a in soup.find_all("a", href=True)], url)

    # Build JSON entry
    json_entry = {
        "url": url,
        "title": title,
        "content": content,
        "keywords": keywords
    }


    return json_entry, links


In [59]:
def BFS_links_web(base_url, visited):
    web_dicts = []

    links_to_visit = [base_url]

    while len(links_to_visit)> 0:
        url = links_to_visit.pop(0)

        if url in visited:
            continue

        if len(visited)%100 == 0:
            print(len(visited))
            
        visited.add(url)

        json_entry, links = extract_website_content(url)
        if not json_entry or len(json_entry) == 0:
            continue

        links_to_visit.extend(links)

        web_dicts.append(json_entry)

    return web_dicts, visited

In [60]:
url = "https://www.pittsburghsymphony.org/" # 403
url = "https://pittsburghopera.org/" # Yes
url = "https://trustarts.org/" #403
url = "https://carnegiemuseums.org/" # Yes
url = "https://www.heinzhistorycenter.org/" # Yes
url = "https://www.thefrickpittsburgh.org/" # Yes
url = "https://www.visitpittsburgh.com/events-festivals/food-festivals/" # Yes
url = "https://www.picklesburgh.com/" # Yes
url = "https://www.pghtacofest.com/" # Yes
url = "https://pittsburghrestaurantweek.com/"
url = "https://littleitalydays.com/"
url = "https://bananasplitfest.com/"

visited = set()

web_pages = []

for url in ["https://www.heinzhistorycenter.org/"]:
    print(url)
    results, visited = BFS_links_web(url, visited)
    web_pages.extend(results)

# extract_website_content(url)

https://www.heinzhistorycenter.org/
0
100
200
300
Failed to fetch https://www.heinzhistorycenter.org/?page_id=1533, status code: 404
Failed to fetch https://www.heinzhistorycenter.org/?page_id=1542, status code: 404
400
500
Failed to fetch https://www.heinzhistorycenter.org/wp-content/uploads/2022/10/Time-Capsule_World's-Fair.mp3, status code: 404
600
700
800
900
1000
1100
File url: https://www.heinzhistorycenter.org/wp-content/uploads/2023/10/HHC-Reproductions-Permissions.pdf
text: Detre Library & Archives
Senator John Heinz History Center
Reproduction and Permission Fees
Reproduction Fees
Digital Images
Low resolution/preview image (avg. 72-150 $15 (refunded if high resolution image is
dpi) subsequently ordered)
High resolution (avg. 400 dpi) $30
Custom extra-high resolution $50
New photography Add $20
Oversize images/maps/prints Pricing may be available upon request
Text
PDF $.50 per page plus a $10.00 service fee for
orders up to 50 pages. An additional $10.00
fee will be charged f

In [61]:
# url = 'https://www.heinzhistorycenter.org/wp-content/uploads/2022/10/Time-Capsule_Unconquered.mp3'

# url = 'https://www.heinzhistorycenter.org/wp-content/uploads/2023/10/HHC-Reproductions-Permissions.pdf'


# extract_text_from_pdf("downloaded_files/HHC-Reproductions-Permissions.pdf")

with open("heinzhistorycenter.json", "w") as json_file:
    json.dump({"documents": web_pages}, json_file, indent=4) 