In [1]:
def build_file_name(url):
    """Map a URL to a JSON path under ``raw_documents/``.

    The URL is split on ``/`` and the first two pieces are dropped (for an
    absolute URL such as ``https://host/path`` that removes the scheme and
    the empty piece after it). The remaining pieces are joined with ``-``,
    every ``.`` becomes ``-``, and ``.json`` is appended.
    """
    kept_segments = url.split("/")[2:]
    stem = "-".join(kept_segments).replace(".", "-")
    return f"raw_documents/{stem}.json"

## Wikipedia

In [64]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_wikipedia_to_json(url, output_file):
    """Scrape a Wikipedia article into a nested JSON file keyed by section headings.

    The article body (``div`` with id ``mw-content-text``) is walked in
    document order. Each ``h2`` becomes a top-level key; each ``h3``..``h6``
    becomes a key nested under the currently open heading one level up; each
    ``p`` is appended to the ``'content'`` list of the deepest open heading.
    Paragraphs that appear before the first ``h2`` are skipped, and so is any
    heading whose parent level is not currently open.

    Args:
        url: Address of the article to download.
        output_file: Path of the JSON file to write (UTF-8, indent 4).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx response.
        ValueError: If the page has no ``mw-content-text`` div.
    """
    response = requests.get(url, timeout=30)
    # Fail fast instead of parsing an HTTP error page into an empty document.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', {'id': 'mw-content-text'})
    if content is None:
        raise ValueError(f"No 'mw-content-text' div found at {url}")

    heading_levels = ['h2', 'h3', 'h4', 'h5', 'h6']
    data = {}
    # open_titles[i] is the title of the currently open heading at
    # heading_levels[i], or None when that level is closed.
    open_titles = [None] * len(heading_levels)

    def section_at(depth):
        """Return the dict reached by following the first `depth` open titles."""
        node = data
        for title in open_titles[:depth]:
            node = node[title]
        return node

    for tag in content.find_all(heading_levels + ['p']):
        text = tag.get_text(strip=True)

        if tag.name == 'p':
            # Count how many consecutive levels, starting from the top, are open.
            depth = 0
            for index, title in enumerate(open_titles):
                if not title:
                    break
                depth = index + 1
            if depth:
                section_at(depth).setdefault('content', []).append(text)
            continue

        level = heading_levels.index(tag.name)
        if level > 0 and not open_titles[level - 1]:
            # Heading skips a level (e.g. h4 with no open h3): ignore it.
            continue
        open_titles[level] = text
        section_at(level)[text] = {}
        # A new heading closes every deeper level.
        open_titles[level + 1:] = [None] * (len(heading_levels) - level - 1)

    # Save the dictionary as a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Content successfully written to {output_file}")


In [65]:
# Article slugs; each one yields both the source URL and the output file name.
wikipedia_pages = ["Pittsburgh", "History_of_Pittsburgh", "Carnegie_Mellon_University"]
url_list = [f"https://en.wikipedia.org/wiki/{page}" for page in wikipedia_pages]
output_file_list = [f"{page}_Wikipedia.json" for page in wikipedia_pages]
for url, output_file in zip(url_list, output_file_list):
    scrape_wikipedia_to_json(url, output_file)

Content successfully written to Pittsburgh_Wikipedia.json
Content successfully written to History_of_Pittsburgh_Wikipedia.json
Content successfully written to Carnegie_Mellon_University_Wikipedia.json


#### ---- scripts

In [45]:
# Scratch: inspect the type of the global `content` (no module-level `content` is defined above this cell).
type(content)

bs4.element.Tag

In [56]:
# Scratch: select the 'bodyContent' div; `soup` must be a global left behind by another cell.
content = soup.find('div', {'id': 'bodyContent'})

In [None]:
# Display the selected element.
content

In [None]:
# Print the tag name of every heading and paragraph inside `content`, in document order.
for tag in content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
    print(tag.name)

In [60]:
# Scratch copy of the heading-nesting loop, run against the global `content`.
# `data` maps heading text -> nested dicts; paragraphs go into a 'content' list.
data = {}
current_h1 = None  # never read below: 'h1' is not in the find_all list
current_h2 = None
current_h3 = None
current_h4 = None
current_h5 = None
current_h6 = None

# Iterate over headings and paragraphs
for tag in content.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
    if tag.name == 'h2':
        # A new top-level section closes every deeper level.
        current_h2 = tag.get_text(strip=True)
        data[current_h2] = {}
        current_h3 = current_h4 = current_h5 = current_h6 = None
    elif tag.name == 'h3' and current_h2:
        current_h3 = tag.get_text(strip=True)
        data[current_h2][current_h3] = {}
        current_h4 = current_h5 = current_h6 = None
    elif tag.name == 'h4' and current_h3:
        # An h4 with no open h3 fails this test and is ignored.
        current_h4 = tag.get_text(strip=True)
        data[current_h2][current_h3][current_h4] = {}
        current_h5 = current_h6 = None
    elif tag.name == 'h5' and current_h4:
        current_h5 = tag.get_text(strip=True)
        data[current_h2][current_h3][current_h4][current_h5] = {}
        current_h6 = None
    elif tag.name == 'h6' and current_h5:
        current_h6 = tag.get_text(strip=True)
        data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
    elif tag.name == 'p':
        # Attach the paragraph to the deepest open heading; with no open h2 it is dropped.
        if current_h6:
            data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h5:
            data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h4:
            data[current_h2][current_h3][current_h4].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h3:
            data[current_h2][current_h3].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h2:
            data[current_h2].setdefault('content', []).append(tag.get_text(strip=True))

In [None]:
# Display the nested dict built by the loop in the previous cell.
data

## pittsburghpa.gov

### Get subpage URLs

In [None]:
import requests
from bs4 import BeautifulSoup

def get_subpage_links(url):
    """Collect the ``href`` of every anchor on a page.

    Args:
        url: Page to download and scan.

    Returns:
        list[str]: One entry per ``<a>`` tag that has an ``href``, in the
        order ``find_all`` yields them. Hrefs starting with ``/`` are
        resolved against ``url``; all other hrefs are returned unchanged.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx response.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        if link.startswith('/'):
            # Resolve against the page URL. Plain concatenation would turn
            # "https://host/index.html" + "/path" into ".../index.html/path".
            link = urljoin(url, link)
        links.append(link)

    return links

# Example usage: list every link found on the city's index page.
# NOTE: `url` and `link` remain defined as notebook globals after this cell runs.
url = "https://pittsburghpa.gov/index.html"  # Replace with the base URL
subpage_links = get_subpage_links(url)
print(f"Found {len(subpage_links)} subpage links:")
for link in subpage_links:
    print(link)

In [8]:
# Collect every link inside the 'opened-for-codepen' div of the tax-forms page.
response = requests.get('https://pittsburghpa.gov/finance/tax-forms', timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')

div = soup.find('div', class_='opened-for-codepen')

gov_finance_urls = []
if div:
    # href=True skips anchors without an href, which would raise KeyError below.
    links = div.find_all('a', href=True)
    for link in links:
        gov_finance_urls.append(link['href'])
else:
    # Message reads: "no div tag with class='opened-for-codepen' found".
    print("未找到class='opened-for-codepen'的div标签")

In [15]:
# Collect every link inside the first 'accordion' div of the events page.
response = requests.get('https://pittsburghpa.gov/events/index.html', timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')
div = soup.find('div', class_='accordion')

gov_events_urls = []
if div:
    # href=True skips anchors without an href, which would raise KeyError below.
    links = div.find_all('a', href=True)
    for link in links:
        gov_events_urls.append(link['href'])


In [40]:
# Collect every link inside the first 'accordion' div of the policies page.
response = requests.get('https://pittsburghpa.gov/city-info/policies', timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')
div = soup.find('div', class_='accordion')

gov_policy_urls = []
if div:
    # href=True skips anchors without an href, which would raise KeyError below.
    links = div.find_all('a', href=True)
    for link in links:
        gov_policy_urls.append(link['href'])
gov_policy_urls

['https://pittsburghpa.gov/city-info/press-releases',
 'https://pittsburghpa.gov/city-info/socialmedia',
 'https://pittsburghpa.gov/city-info/frequent-numbers',
 'https://pittsburghpa.gov/city-info/executive-orders',
 'https://pittsburghpa.gov/city-info/policies']

### build documents

In [24]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_gov_to_json(url, output_file):
    """Save the paragraph text of a pittsburghpa.gov page as JSON.

    The first ``div`` matched by class ``col-md-12`` is taken as the page
    body. The stripped text of each ``p`` inside it is collected into a list
    stored under the ``'content'`` key. When the container holds no
    paragraphs an empty object is written.

    Args:
        url: Page to download.
        output_file: Path of the JSON file to write (UTF-8, indent 4).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx response.
        ValueError: If the page has no ``col-md-12`` div.
    """
    response = requests.get(url, timeout=30)
    # Fail fast instead of parsing an HTTP error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', {'class': 'col-md-12'})
    if content is None:
        raise ValueError(f"No 'col-md-12' div found at {url}")

    paragraphs = [passage.get_text(strip=True) for passage in content.find_all('p')]
    # The 'content' key is only present when at least one paragraph was found.
    data = {'content': paragraphs} if paragraphs else {}

    # Save the dictionary as a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Content successfully written to {output_file}")

def gov_url_list_to_docuemnts(url_list):
    """Scrape each URL in ``url_list`` into its own JSON document.

    URLs ending in ``pdf`` are reported and skipped. For every other URL the
    output path comes from ``build_file_name`` and the page is written by
    ``scrape_gov_to_json``. Any exception raised for a URL is printed and the
    loop moves on to the next one.
    """
    for page_url in url_list:
        if page_url.endswith('pdf'):
            print(f"Skipping PDF file: {page_url}")
        else:
            try:
                target_path = build_file_name(page_url)
                scrape_gov_to_json(page_url, target_path)
            except Exception as err:
                # Best effort: one bad page must not abort the whole batch.
                print(f"Failed to scrape {page_url}: {err}")


In [None]:
# Hand-picked pittsburghpa.gov pages, e.g. https://pittsburghpa.gov/pittsburgh/pgh-about
gov_urls = [
    f"https://pittsburghpa.gov/{page_path}"
    for page_path in (
        "pittsburgh/pgh-about",
        "pittsburgh/pgh-sports",
        "pittsburgh/cultural-activities",
        "pittsburgh/flag-seal",
        "mayor/pghmayors",
    )
]

gov_url_list_to_docuemnts(gov_urls)

In [None]:
# Scrape the links collected from the tax-forms page; PDF links are skipped by the helper.
gov_url_list_to_docuemnts(gov_finance_urls)

In [None]:
# Scrape the links collected from the events page.
gov_url_list_to_docuemnts(gov_events_urls)

In [41]:
# Scrape the links collected from the policies page.
gov_url_list_to_docuemnts(gov_policy_urls)

Content successfully written to raw_documents/pittsburghpa-gov-city-info-press-releases.json
Content successfully written to raw_documents/pittsburghpa-gov-city-info-socialmedia.json
Content successfully written to raw_documents/pittsburghpa-gov-city-info-frequent-numbers.json
Content successfully written to raw_documents/pittsburghpa-gov-city-info-executive-orders.json
Content successfully written to raw_documents/pittsburghpa-gov-city-info-policies.json


### pdf

In [31]:
import requests
import pdfplumber

# 1. Download a PDF from a URL
def download_pdf(url, output_path):
    """Download ``url`` and write the response body to ``output_path``.

    Args:
        url: Address of the PDF.
        output_path: Destination file, opened in binary write mode.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx response.
    """
    response = requests.get(url, timeout=60)
    # Without this check an HTTP error body would be saved under a .pdf name.
    response.raise_for_status()
    with open(output_path, 'wb') as file:
        file.write(response.content)

# 2. Convert a PDF to text
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        str: The page texts concatenated in page order with no separator.
        A page for which ``extract_text()`` yields nothing contributes an
        empty string.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            print(page)  # progress trace: one line per page
            # Guard against extract_text() returning None for a page with no
            # extractable text; `text += None` would raise TypeError.
            page_texts.append(page.extract_text() or '')
    return ''.join(page_texts)

# Example usage
# NOTE: `url` remains defined as a notebook global after this cell runs.
url = "https://apps.pittsburghpa.gov/redtail/images/25073_SummerGuide-2024-WEB.pdf"
pdf_path = "raw_documents/gov_summer_event_guide.pdf"

# Download the PDF
download_pdf(url, pdf_path)

# Convert the PDF to text
pdf_text = pdf_to_text(pdf_path)
print(pdf_text)

<Page:1>
<Page:2>
<Page:3>
<Page:4>
<Page:5>
<Page:6>
<Page:7>
<Page:8>



In [37]:
pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading PyMuPDF-1.24.11-cp38-abi3-macosx_11_0_arm64.whl (18.2 MB)
[K     |████████████████████████████████| 18.2 MB 2.5 MB/s eta 0:00:01
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.24.11
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [38]:
import fitz  # PyMuPDF

def pdf_to_text_pymupdf(pdf_path):
    """Extract the plain text of every page of a PDF with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        str: The page texts concatenated in page order with no separator.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        return ''.join(
            # "text" mode extracts plain text.
            pdf_document.load_page(page_num).get_text("text")
            for page_num in range(pdf_document.page_count)
        )

# Re-extract the same PDF with PyMuPDF; `pdf_path` comes from the pdfplumber example cell.
pdf_text = pdf_to_text_pymupdf(pdf_path)
print(pdf_text)




## britannica.com/place/Pittsburgh

In [47]:
# https://www.britannica.com/place/Pittsburgh
# Save every paragraph of the 'reading-channel' div to raw_documents/Pittsburgh_Britannica.json.

response = requests.get('https://www.britannica.com/place/Pittsburgh', timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')
content = soup.find('div', class_='reading-channel')
if content is None:
    # Clearer than the AttributeError that content.find_all would raise.
    raise ValueError("No 'reading-channel' div found on the Britannica page")

data = {}
for passage in content.find_all('p'):
    data.setdefault('content', []).append(passage.get_text(strip=True))

with open('raw_documents/Pittsburgh_Britannica.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)


## visitPittsburgh

event

In [66]:
# https://www.visitpittsburgh.com/events-festivals/annual-events/
# Extract the subpage links from the <main> element with class 'content--primary'.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

visit_pitt_event_url = 'https://www.visitpittsburgh.com/events-festivals/annual-events/'
response = requests.get(visit_pitt_event_url, timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags
visit_even_links = []
div = soup.find('main', class_='content--primary')
if div is None:
    raise ValueError(f"No <main class='content--primary'> found at {visit_pitt_event_url}")
for a_tag in div.find_all('a', href=True):
    link = a_tag['href']
    if link.startswith('/'):
        # Resolve against the page that was just fetched. The previous code
        # used the global `url`, which this cell never sets.
        link = urljoin(visit_pitt_event_url, link)
    # Keep only links with more than four '/'-separated pieces.
    if len(link.split('/')) > 4:
        visit_even_links.append(link)

In [74]:
def scrape_visit_to_json(url, output_file):
    """Scrape a visitpittsburgh.com page into a nested JSON file keyed by headings.

    The ``<main class="content--primary">`` element is walked in document
    order. Each ``h1`` becomes a top-level key; each ``h2``..``h6`` becomes a
    key nested under the currently open heading one level up; each ``p`` is
    appended to the ``'content'`` list of the deepest open heading.
    Paragraphs that appear before the first ``h1`` are skipped, and so is any
    heading whose parent level is not currently open.

    Args:
        url: Page to download.
        output_file: Path of the JSON file to write (UTF-8, indent 4).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx response.
        ValueError: If the page has no ``content--primary`` main element.
    """
    response = requests.get(url, timeout=30)
    # Fail fast instead of parsing an HTTP error page into an empty document.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('main', {'class': 'content--primary'})
    if content is None:
        raise ValueError(f"No <main class='content--primary'> found at {url}")

    heading_levels = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    data = {}
    # open_titles[i] is the title of the currently open heading at
    # heading_levels[i], or None when that level is closed.
    open_titles = [None] * len(heading_levels)

    def section_at(depth):
        """Return the dict reached by following the first `depth` open titles."""
        node = data
        for title in open_titles[:depth]:
            node = node[title]
        return node

    for tag in content.find_all(heading_levels + ['p']):
        text = tag.get_text(strip=True)

        if tag.name == 'p':
            # Count how many consecutive levels, starting from the top, are open.
            depth = 0
            for index, title in enumerate(open_titles):
                if not title:
                    break
                depth = index + 1
            if depth:
                section_at(depth).setdefault('content', []).append(text)
            continue

        level = heading_levels.index(tag.name)
        if level > 0 and not open_titles[level - 1]:
            # Heading skips a level (e.g. h3 with no open h2): ignore it.
            continue
        open_titles[level] = text
        section_at(level)[text] = {}
        # A new heading closes every deeper level.
        open_titles[level + 1:] = [None] * (len(heading_levels) - level - 1)

    # Save the dictionary as a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Content successfully written to {output_file}")

In [75]:
def visit_pitt_event_url_to_docuemnts(url_list):
    """Scrape each event URL in ``url_list`` into its own JSON document.

    URLs ending in ``pdf`` are reported and skipped. For every other URL the
    output path comes from ``build_file_name`` and the page is written by
    ``scrape_visit_to_json``. Any exception raised for a URL is printed and
    the loop moves on to the next one.
    """
    for page_url in url_list:
        if page_url.endswith('pdf'):
            print(f"Skipping PDF file: {page_url}")
        else:
            try:
                target_path = build_file_name(page_url)
                scrape_visit_to_json(page_url, target_path)
            except Exception as err:
                # Best effort: one bad page must not abort the whole batch.
                print(f"Failed to scrape {page_url}: {err}")


In [None]:
# Scrape every link collected from the annual-events page.
visit_pitt_event_url_to_docuemnts(visit_even_links)

sports

In [98]:
from urllib.parse import urljoin

visit_pitt_sports_url = 'https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/'
response = requests.get(visit_pitt_sports_url, timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags
visit_sports_links = []
div = soup.find('main', class_='content--primary')
if div is None:
    raise ValueError(f"No <main class='content--primary'> found at {visit_pitt_sports_url}")
for a_tag in div.find_all('a', href=True):
    link = a_tag['href']
    if link.startswith('/'):
        # Resolve against the page that was just fetched. The previous code
        # used the global `url`, which this cell never sets.
        link = urljoin(visit_pitt_sports_url, link)
    # Keep only links with more than four '/'-separated pieces.
    if len(link.split('/')) > 4:
        visit_sports_links.append(link)

In [99]:
# Also scrape the sports landing page itself.
visit_sports_links.append('https://www.visitpittsburgh.com/things-to-do/pittsburgh-sports-teams/')

In [100]:
# Display how many sports links will be scraped.
len(visit_sports_links)

61

In [110]:
def scrape_visit_sport_to_json(url, output_file):
    """Scrape a visitpittsburgh.com page into a nested JSON file keyed by headings.

    The container searched depends on the URL:

    * ``'blog'`` in ``url``: ``<article class="detail__inner">``
    * ``'directory'`` in ``url``: ``<div class="detail__primary-inner row">``
    * otherwise: ``<main class="content--primary">``

    The container is walked in document order. Each ``h1`` becomes a
    top-level key; each ``h2``..``h6`` becomes a key nested under the
    currently open heading one level up; each ``p`` is appended to the
    ``'content'`` list of the deepest open heading. Paragraphs that appear
    before the first ``h1`` are skipped, and so is any heading whose parent
    level is not currently open.

    Args:
        url: Page to download.
        output_file: Path of the JSON file to write (UTF-8, indent 4).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            4xx/5xx response.
        ValueError: If the expected container is not on the page.
    """
    response = requests.get(url, timeout=30)
    # Fail fast instead of parsing an HTTP error page into an empty document.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Pick the container that matches the page layout.
    if 'blog' in url:
        content = soup.find('article', class_='detail__inner')
    elif 'directory' in url:
        content = soup.find('div', class_='detail__primary-inner row')
    else:
        content = soup.find('main', {'class': 'content--primary'})
    if content is None:
        raise ValueError(f"Expected content container not found at {url}")

    heading_levels = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    data = {}
    # open_titles[i] is the title of the currently open heading at
    # heading_levels[i], or None when that level is closed.
    open_titles = [None] * len(heading_levels)

    def section_at(depth):
        """Return the dict reached by following the first `depth` open titles."""
        node = data
        for title in open_titles[:depth]:
            node = node[title]
        return node

    for tag in content.find_all(heading_levels + ['p']):
        text = tag.get_text(strip=True)

        if tag.name == 'p':
            # Count how many consecutive levels, starting from the top, are open.
            depth = 0
            for index, title in enumerate(open_titles):
                if not title:
                    break
                depth = index + 1
            if depth:
                section_at(depth).setdefault('content', []).append(text)
            continue

        level = heading_levels.index(tag.name)
        if level > 0 and not open_titles[level - 1]:
            # Heading skips a level (e.g. h3 with no open h2): ignore it.
            continue
        open_titles[level] = text
        section_at(level)[text] = {}
        # A new heading closes every deeper level.
        open_titles[level + 1:] = [None] * (len(heading_levels) - level - 1)

    # Save the dictionary as a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print(f"Content successfully written to {output_file}")

def visit_pitt_sport_url_to_docuemnts(url_list):
    """Scrape each URL in ``url_list`` into its own JSON document.

    URLs ending in ``pdf`` are reported and skipped. For every other URL the
    output path comes from ``build_file_name`` and the page is written by
    ``scrape_visit_sport_to_json``. Any exception raised for a URL is printed
    and the loop moves on to the next one.
    """
    for page_url in url_list:
        if page_url.endswith('pdf'):
            print(f"Skipping PDF file: {page_url}")
        else:
            try:
                target_path = build_file_name(page_url)
                scrape_visit_sport_to_json(page_url, target_path)
            except Exception as err:
                # Best effort: one bad page must not abort the whole batch.
                print(f"Failed to scrape {page_url}: {err}")


In [None]:
# Scrape every sports link, including the landing page appended to the list.
visit_pitt_sport_url_to_docuemnts(visit_sports_links)

food

In [103]:
import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

visit_pitt_food_url = 'https://www.visitpittsburgh.com/restaurants-culinary/'
response = requests.get(visit_pitt_food_url, timeout=30)
response.raise_for_status()  # do not parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags
visit_food_links = []
div = soup.find('main', class_='content--primary')
if div is None:
    raise ValueError(f"No <main class='content--primary'> found at {visit_pitt_food_url}")
for a_tag in div.find_all('a', href=True):
    link = a_tag['href']
    if link.startswith('/'):
        # Resolve against the page that was just fetched. The previous code
        # used the global `url`, which this cell never sets.
        link = urljoin(visit_pitt_food_url, link)
    # Keep only links with more than four '/'-separated pieces.
    if len(link.split('/')) > 4:
        visit_food_links.append(link)

In [None]:
# Food links are scraped with the sports helper, which also handles 'blog' and 'directory' pages.
visit_pitt_sport_url_to_docuemnts(visit_food_links)

In [108]:
# Scratch: fetch one blog post to inspect its container element.
# NOTE: this overwrites the notebook-global `url`.
# Send request to the URL
url = 'https://www.visitpittsburgh.com/blog/unique-pittsburgh-pierogi-joints/'
response = requests.get(url)

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the correct div based on the updated structure
# `content` is only reassigned when the URL contains 'blog'.
if 'blog' in url:
    content = soup.find('article',class_= 'detail__inner')

main pages / other pages

In [113]:
# Home page plus hand-picked pages, listed as paths relative to the site root.
visit_other_url = [
    f'https://www.visitpittsburgh.com/{page_path}'
    for page_path in (
        '',
        'nfl-draft-pittsburgh/',
        'blog/fall-fairs-festivals-and-events/',
        'blog/pittsburgh-fall-date-ideas/',
        'blog/guide-to-football-in-pittsburgh/',
        'blog/upcoming-concerts-in-pittsburgh/',
        'blog/how-to-ride-the-pittsburgh-inclines/',
        'blog/pittsburgh-theatre-upcoming-performances/',
        'blog/penguins-penguins-penguins/',
        'blog/halloween-pop-ups-bars-and-restaurants-in-pittsburgh/',
        'blog/jackworth-ginger-beer-pittsburgh/',
        'restaurants-culinary/farms-farmers-markets/guide-to-pittsburgh-farmers-markets/',
        'restaurants-culinary/farms-farmers-markets/pittsburgh-farm-fresh-meats/',
        'restaurants-culinary/farms-farmers-markets/pittsburghs-csas/',
        'restaurants-culinary/farms-farmers-markets/u-pick-farms/',
    )
]

In [None]:
# Scrape the hand-picked pages with the same helper used for the sports links.
visit_pitt_sport_url_to_docuemnts(visit_other_url)

## Gov Tax Forms

参考：如何把 structured data（结构化数据）和 RAG 结合：https://medium.com/intel-tech/tabular-data-rag-llms-improve-results-through-data-table-prompting-bcb42678914b

In [None]:
# https://pittsburghpa.gov/finance/tax-forms

## 2024 Operating Budget


In [None]:
# https://apps.pittsburghpa.gov/redtail/images/23255_2024_Operating_Budget.pdf

## CMU about

In [None]:
# https://www.cmu.edu/about/