In [None]:
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urlparse, urljoin

import os
from tqdm import tqdm

In [8]:
def derive_text_links(web_link):
    response = requests.get(web_link)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract all text from the page
        text = soup.get_text(separator="\n", strip=True)
        links = [a["href"] for a in soup.find_all("a", href=True)]

        # Print extracted text
        return text, links
    else:
        return "", []

In [52]:
def clean_text(text):
    """Remove extra spaces, newlines, and unwanted characters."""
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\[.*?\]", "", text)  # Remove references like [1]
    return text

def filter_links(links, base_url):
    base_domain = urlparse(base_url).netloc

    # Convert relative links to absolute URLs
    absolute_links = [urljoin(base_url, link) for link in links]

    useful_links = []
    for link in absolute_links:
        parsed_link = urlparse(link)
        netloc = parsed_link.netloc
        path = parsed_link.path.lower()

        # ✅ Remove external links
        if netloc != base_domain:
            continue

        # ✅ Remove unwanted navigation and UI links
        unwanted_keywords = [
            "login", "signup", "account", "profile", "settings", "cart", "terms", "privacy",
            "help", "contact", "about", "faq"
        ]
        if any(word in path for word in unwanted_keywords):
            continue

        # ✅ Remove JavaScript, email, and phone links
        if link.startswith(("javascript:", "mailto:", "tel:")):
            continue

        # ✅ Remove pagination links
        if "page=" in path or "offset=" in path:
            continue

        # ✅ Remove tracking, ad, and referral links
        if any(param in link for param in ["utm_", "ref=", "tracking"]):
            continue

        useful_links.append(link)
    
    return useful_links

def extract_website_content(url):
    """Fetch and parse content from a webpage, then convert it into a structured JSON entry."""

    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    }

    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Failed to fetch {url}, status code: {response.status_code}")
        return {}, []

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract title
    title = soup.title.text if soup.title else "No Title Found"

    # Extract main content (paragraphs)
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
    content = clean_text("\n".join(paragraphs))
    
    # Extract keywords (based on meta tags)
    meta_keywords = soup.find("meta", {"name": "keywords"})
    keywords = meta_keywords["content"].split(",") if meta_keywords else []
    links = filter_links([a["href"] for a in soup.find_all("a", href=True)], url)

    # Build JSON entry
    json_entry = {
        "url": url,
        "title": title,
        "content": content,
        "keywords": keywords
    }
    return json_entry, links


In [53]:
def BFS_links_web(base_url, visited):
    web_dicts = []

    links_to_visit = [base_url]

    while len(links_to_visit)> 0:
        url = links_to_visit.pop(0)

        if url in visited:
            continue
            
        visited.add(url)

        json_entry, links = extract_website_content(url)
        if len(json_entry) == 0:
            continue

        links_to_visit.extend(links)

        web_dicts.append(json_entry)

    return web_dicts, visited

In [54]:
url = "https://www.pittsburghsymphony.org/" # 403
url = "https://pittsburghopera.org/" # Yes
url = "https://trustarts.org/" #403
url = "https://carnegiemuseums.org/" # Yes
url = "https://www.heinzhistorycenter.org/" # Yes
url = "https://www.thefrickpittsburgh.org/" # Yes
url = "https://www.visitpittsburgh.com/events-festivals/food-festivals/" # Yes
url = "https://www.picklesburgh.com/" # Yes
url = "https://www.pghtacofest.com/" # Yes
url = "https://pittsburghrestaurantweek.com/"
url = "https://littleitalydays.com/"
url = "https://bananasplitfest.com/"

visited = set()

web_pages = []

for url in ["https://pittsburghopera.org/", "https://carnegiemuseums.org/", "https://www.heinzhistorycenter.org/", "https://www.thefrickpittsburgh.org/", "https://www.visitpittsburgh.com/events-festivals/food-festivals/", "https://www.picklesburgh.com/"
            "https://www.pghtacofest.com/", "https://pittsburghrestaurantweek.com/", "https://littleitalydays.com/", "https://bananasplitfest.com/"]:
    print(url)
    results, visited = BFS_links_web(url, visited)
    web_pages.extend(results)

# extract_website_content(url)

https://pittsburghopera.org/
Failed to fetch https://pittsburghopera.org/current-media-releases/PittsburghOpera_AntonyWalker_extension_2024_DRAFT.doc?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/PghProject?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/show/the-marriage-of-figaro?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/show/Nabucco?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/show/little-women?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/show/Twenty-Seven?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/show/the-rakes-progress?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/show/nabucco?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/resident-artists/2021-22-resident-artists/maire-carmack/?hsLang=en, status code: 404
Failed to fetch https://pittsburghopera.org/resident-artists/202

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))