In [94]:
def build_file_name(url):
    return '-'.join(url.split("/")[2:]).replace('.','-') + ".json"

## Wikipedia

In [64]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_wikipedia_to_json(url, output_file):
    # Send request to the URL
    response = requests.get(url)
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the correct div based on the updated structure
    content = soup.find('div', {'id': 'mw-content-text'})

    # Dictionary to store the result
    data = {}
    current_h1 = None
    current_h2 = None
    current_h3 = None
    current_h4 = None
    current_h5 = None
    current_h6 = None

    # Iterate over headings and paragraphs
    for tag in content.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        if tag.name == 'h2':
            current_h2 = tag.get_text(strip=True)
            data[current_h2] = {}
            current_h3 = current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h3' and current_h2:
            current_h3 = tag.get_text(strip=True)
            data[current_h2][current_h3] = {}
            current_h4 = current_h5 = current_h6 = None
        elif tag.name == 'h4' and current_h3:
            current_h4 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4] = {}
            current_h5 = current_h6 = None
        elif tag.name == 'h5' and current_h4:
            current_h5 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5] = {}
            current_h6 = None
        elif tag.name == 'h6' and current_h5:
            current_h6 = tag.get_text(strip=True)
            data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
        elif tag.name == 'p':
            if current_h6:
                data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h5:
                data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h4:
                data[current_h2][current_h3][current_h4].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h3:
                data[current_h2][current_h3].setdefault('content', []).append(tag.get_text(strip=True))
            elif current_h2:
                data[current_h2].setdefault('content', []).append(tag.get_text(strip=True))
    
    # Save the dictionary as a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    
    print(f"Content successfully written to {output_file}")


In [65]:
url_list = ["https://en.wikipedia.org/wiki/Pittsburgh", "https://en.wikipedia.org/wiki/History_of_Pittsburgh",
"https://en.wikipedia.org/wiki/Carnegie_Mellon_University"]
output_file_list = ["Pittsburgh_Wikipedia.json", "History_of_Pittsburgh_Wikipedia.json", "Carnegie_Mellon_University_Wikipedia.json"]
for url, output_file in zip(url_list, output_file_list):
    scrape_wikipedia_to_json(url, output_file)

Content successfully written to Pittsburgh_Wikipedia.json
Content successfully written to History_of_Pittsburgh_Wikipedia.json
Content successfully written to Carnegie_Mellon_University_Wikipedia.json


#### ---- scripts

In [45]:
type(content)

bs4.element.Tag

In [56]:
content = soup.find('div', {'id': 'bodyContent'})

In [None]:
content

In [None]:
for tag in content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
    print(tag.name)

In [60]:
# Dictionary to store the result
data = {}
current_h1 = None
current_h2 = None
current_h3 = None
current_h4 = None
current_h5 = None
current_h6 = None

# Iterate over headings and paragraphs
for tag in content.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p']):
    if tag.name == 'h2':
        current_h2 = tag.get_text(strip=True)
        data[current_h2] = {}
        current_h3 = current_h4 = current_h5 = current_h6 = None
    elif tag.name == 'h3' and current_h2:
        current_h3 = tag.get_text(strip=True)
        data[current_h2][current_h3] = {}
        current_h4 = current_h5 = current_h6 = None
    elif tag.name == 'h4' and current_h3:
        current_h4 = tag.get_text(strip=True)
        data[current_h2][current_h3][current_h4] = {}
        current_h5 = current_h6 = None
    elif tag.name == 'h5' and current_h4:
        current_h5 = tag.get_text(strip=True)
        data[current_h2][current_h3][current_h4][current_h5] = {}
        current_h6 = None
    elif tag.name == 'h6' and current_h5:
        current_h6 = tag.get_text(strip=True)
        data[current_h2][current_h3][current_h4][current_h5][current_h6] = {}
    elif tag.name == 'p':
        if current_h6:
            data[current_h2][current_h3][current_h4][current_h5][current_h6].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h5:
            data[current_h2][current_h3][current_h4][current_h5].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h4:
            data[current_h2][current_h3][current_h4].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h3:
            data[current_h2][current_h3].setdefault('content', []).append(tag.get_text(strip=True))
        elif current_h2:
            data[current_h2].setdefault('content', []).append(tag.get_text(strip=True))

In [None]:
data

### pittsburghpa.gov

In [90]:
import requests
from bs4 import BeautifulSoup

def get_subpage_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all <a> tags
    links = []
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        if link.startswith('/'):  # If it's a relative link, prepend the base URL
            link = f"{url.rstrip('/')}{link}"
        links.append(link)
    
    return links

# Example usage
url = "https://pittsburghpa.gov/index.html"  # Replace with the base URL
subpage_links = get_subpage_links(url)
print(f"Found {len(subpage_links)} subpage links:")
for link in subpage_links:
    print(link)

Found 346 subpage links:
#
#
../index.html
https://www.votespa.com/Pages/default.aspx
https://pittsburghpa.gov/guia-para-residentes-de-la-ciudad-de-pittsburgh/introduccion
https://pittsburghpa.gov/311
https://pittsburghpa.gov/mayor/covid-updates
https://pittsburghpa.gov/dcp/ccb-ada
https://pittsburghpa.gov/city-info/frequent-numbers
https://pittsburghpa.gov/city-info/socialmedia
page.html
https://pittsburghpa.gov/311
https://pittsburghpa.gov/citiparks/parks.html
https://cprbpgh.org/
https://pittsburghpa.gov/dcp/index.html
https://pittsburghpa.gov/chr/index.html
https://engage.pittsburghpa.gov/
https://pittsburghpa.gov/ehb/index.html
https://pittsburghpa.gov/finance/finance.html
https://pittsburghpa.gov/innovation-performance/index.html
https://pittsburghpa.gov/humanresources/index.html
https://www.governmentjobs.com/careers/pittsburgh
https://pittsburghpa.gov/domi/index.html
https://pittsburghpa.gov/ochs/index.html
https://pittsburghpa.gov/publicsafety/ofem
https://pittsburghpa.gov/pli

In [95]:
# https://pittsburghpa.gov/pittsburgh/pgh-about
gov_urls = ["https://pittsburghpa.gov/pittsburgh/pgh-about", "https://pittsburghpa.gov/pittsburgh/pgh-sports", 
"https://pittsburghpa.gov/pittsburgh/cultural-activities", "https://pittsburghpa.gov/pittsburgh/flag-seal", 
"https://pittsburghpa.gov/mayor/pghmayors"]


gov_files = []
for url in gov_urls:
    gov_files.append(build_file_name(url))

for url, output_file in zip(gov_urls, gov_files):
    scrape_gov_to_json(url, output_file)

pittsburghpa-gov-pittsburgh-pgh-about.json
pittsburghpa-gov-pittsburgh-pgh-sports.json
pittsburghpa-gov-pittsburgh-cultural-activities.json
pittsburghpa-gov-pittsburgh-flag-seal.json
pittsburghpa-gov-mayor-pghmayors.json


In [88]:
def scrape_gov_to_json(url, output_file):
    # Send request to the URL
    response = requests.get(url)
    
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the correct div based on the updated structure
    content = soup.find('div', {'class': 'col-md-12'})

    # Dictionary to store the result
    data = {}
    for passage in content.find_all('p'):
        data.setdefault('content', []).append(passage.get_text(strip=True))
    
    # Save the dictionary as a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    
    print(f"Content successfully written to {output_file}")


Content successfully written to Pittsburgh_Gov_About.json
Content successfully written to Pittsburgh_Gov_Sports.json
