# GTSWebscrape

### Runs through all pages and prevents duplicate H3 elements

In [73]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

# Scrape the STS public-reporting GTSD site: walk the paginated participant
# listing, visit each participant page once, flatten every table that follows
# an <h3> heading (from the second heading on) into rows, and export all rows
# to 'GTSWebscrape.csv'.

SITE_ROOT = 'https://publicreporting.sts.org'

# Without a timeout a stalled connection would block the scrape forever.
REQUEST_TIMEOUT = 30  # seconds

# Matches '/gtsd-participant/<digits>'; the captured group is the participant id.
# Compiled once here instead of on every page iteration.
pattern = re.compile(r'/gtsd-participant/(\d+)')

# The location line is split on an em dash. The separator literal used
# previously was the mis-encoded (mojibake) form of that character, which does
# not occur in correctly decoded page text; the real em dash is matched here,
# and the mojibake spellings are still accepted in case a page is mis-decoded.
location_separator = re.compile('\u2014|\u00e2\u20ac[\u201d\u2014]')

# Text inside the first pair of parentheses, e.g. the reporting period.
timeframe_pattern = re.compile(r'\(([^)]+)\)')


def _text_or(tag, default):
    """Return the stripped text of *tag*, or *default* when the tag is missing."""
    return tag.text.strip() if tag is not None else default


# Rows collected for the CSV export, and the set of participant ids already scraped.
data_for_csv = []
processed_pids = set()

# Loop through listing pages until one contains no participant links.
page = 0
while True:
    base_url = f'{SITE_ROOT}/gtsd?page={page}'
    try:
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Stop paging but still fall through to the export, so rows that were
        # already collected are not lost.
        print(f'Stopping: could not fetch listing page {page}: {exc}')
        break
    bs = BeautifulSoup(response.content, 'html.parser')

    # Find all links that point at a participant page.
    links = bs.find_all('a', href=pattern)
    print(f'Processing page: {page}, found {len(links)} links')

    # If no links are found on the current page, exit the loop.
    if not links:
        break

    for link in links:
        href = link['href']
        # find_all already guaranteed that the pattern matches this href, so
        # the id is read from the digits that follow '/gtsd-participant/'.
        pid = int(pattern.search(href).group(1))

        # The listing links the same participant several times; visit it once.
        if pid in processed_pids:
            print(f'Skipping already processed PID: {pid}')
            continue
        processed_pids.add(pid)

        url = f'{SITE_ROOT}{href}'
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable participant page should not abort the whole run.
            print(f'Skipping PID {pid}: could not fetch {url}: {exc}')
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        print(f'Processing link: {url}')

        # Extract hospital and location information.
        hospital = _text_or(soup.find('h1'), "No Hospital Name")
        location_full = _text_or(soup.find('div', {'class': 'hospital'}), "No Location")
        parts = location_separator.split(location_full)
        location = parts[1].strip() if len(parts) > 1 else "Not available"

        # The procedure timeframe is the parenthesised text in the second <h3>.
        h3_tags = soup.find_all('h3')
        procedure_timeframe = "Not available"
        if len(h3_tags) >= 2:
            match = timeframe_pattern.search(h3_tags[1].get_text(strip=True))
            if match:
                procedure_timeframe = match.group(1)

        # Process the page content starting from the second h3 element.
        for h3 in h3_tags[1:]:
            category = h3.text.strip()

            # First following sibling that is a <table>, if any.
            table = h3.find_next_sibling('table')
            if table is None:
                continue

            for row in table.find_all('tr'):
                # Add one CSV row per table row.
                data_for_csv.append({
                    'PID': pid,
                    'Hospital': hospital,
                    'Location': location,
                    'Procedure Timeframe': procedure_timeframe,
                    'Category': category,
                    'Description': _text_or(row.find('td'), "No Description"),
                    'Score': _text_or(row.find('div', class_='score'), "No Score"),
                    'Interval': _text_or(row.find('div', class_='interval'), "No Interval"),
                    'Report Card Score': _text_or(
                        row.find('div', class_='report-card-score'),
                        "No Report Card Score",
                    ),
                })

    # Move on to the next listing page.
    page += 1

# Create a DataFrame from the list of dictionaries and export to CSV.
if data_for_csv:
    df = pd.DataFrame(data_for_csv)
    df.to_csv('GTSWebscrape.csv', index=False)
    print("Data exported to CSV")
else:
    print("No data available for CSV export.")

Processing page: 0, found 46 links
Processing link: https://publicreporting.sts.org/gtsd-participant/41009
Skipping already processed PID: 41009
Processing link: https://publicreporting.sts.org/gtsd-participant/40013
Skipping already processed PID: 40013
Skipping already processed PID: 40013
Skipping already processed PID: 40013
Skipping already processed PID: 40013
Processing link: https://publicreporting.sts.org/gtsd-participant/40216
Skipping already processed PID: 40216
Processing link: https://publicreporting.sts.org/gtsd-participant/40324
Skipping already processed PID: 40324
Processing link: https://publicreporting.sts.org/gtsd-participant/40326
Skipping already processed PID: 40326
Processing link: https://publicreporting.sts.org/gtsd-participant/40325
Skipping already processed PID: 40325
Processing link: https://publicreporting.sts.org/gtsd-participant/40419
Skipping already processed PID: 40419
Processing link: https://publicreporting.sts.org/gtsd-participant/40051
Skipping 

Processing link: https://publicreporting.sts.org/gtsd-participant/40462
Skipping already processed PID: 40462
Processing link: https://publicreporting.sts.org/gtsd-participant/40018
Skipping already processed PID: 40018
Processing link: https://publicreporting.sts.org/gtsd-participant/40021
Skipping already processed PID: 40021
Processing link: https://publicreporting.sts.org/gtsd-participant/40022
Skipping already processed PID: 40022
Skipping already processed PID: 40022
Processing link: https://publicreporting.sts.org/gtsd-participant/40229
Skipping already processed PID: 40229
Processing link: https://publicreporting.sts.org/gtsd-participant/40228
Skipping already processed PID: 40228
Processing link: https://publicreporting.sts.org/gtsd-participant/40460
Skipping already processed PID: 40460
Processing link: https://publicreporting.sts.org/gtsd-participant/40398
Skipping already processed PID: 40398
Skipping already processed PID: 40398
Skipping already processed PID: 40398
Skippi

Processing link: https://publicreporting.sts.org/gtsd-participant/40093
Skipping already processed PID: 40093
Processing link: https://publicreporting.sts.org/gtsd-participant/40199
Skipping already processed PID: 40199
Processing link: https://publicreporting.sts.org/gtsd-participant/40215
Skipping already processed PID: 40215
Processing link: https://publicreporting.sts.org/gtsd-participant/41165
Skipping already processed PID: 41165
Processing link: https://publicreporting.sts.org/gtsd-participant/40348
Skipping already processed PID: 40348
Processing link: https://publicreporting.sts.org/gtsd-participant/40028
Skipping already processed PID: 40028
Processing link: https://publicreporting.sts.org/gtsd-participant/40150
Skipping already processed PID: 40150
Processing link: https://publicreporting.sts.org/gtsd-participant/40017
Skipping already processed PID: 40017
Processing link: https://publicreporting.sts.org/gtsd-participant/40006
Skipping already processed PID: 40006
Processing