In [None]:
from curl_cffi.requests import Session # Make sure you've pip installed curl-cffi
import json
import re
import pandas as pd
import time
import random
import html 
import os

# Local directory that receives the generated CSV files.
# Replace it with the directory where you want the CSVs saved.
DOWNLOAD_DIR = "/Users/jiturner/Downloads"

# Specialty slugs; each one is appended to the rankings URL and scraped in
# the order listed here.
specialties = [
    "cancer",
    "cardiology-and-heart-surgery",
    "diabetes-and-endocrinology",
    "ear-nose-and-throat",
    "gastroenterology-and-gi-surgery",
    "geriatric-care",
    "kidney-failure",
    "gynecology",
    "neurology-and-neurosurgery",
    "ophthalmology",
    "orthopedics",
    "psychiatry",
    "pulmonology",
    "rehabilitation",
    "rheumatology",
    "urology",
]

# Pool of desktop browser User-Agent strings. The scraping loop picks one at
# random for each specialty and stores it in the session headers.
USER_AGENTS = [
    # Chrome on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    # Chrome on macOS (previously the malformed "Mozilla/5.G (Macintosh; ...")
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    # Firefox on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0',
    # Safari on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
]

# Site root; page URLs, the search endpoint and hospital links are built on it.
base_url = "https://health.usnews.com"

# Value assigned to the HTTP session's timeout.
REQUEST_TIMEOUT = 20

# Accumulates one dict per scraped hospital ranking across all specialties.
all_rankings = []

# Captures the object literal assigned to window['__PAGE_CONTEXT_QUERY_STATE__']
# in the page source. DOTALL lets the lazy match span newlines; the capture
# ends at the first "};" that follows the opening brace.
json_pattern = re.compile(
    r"window\['__PAGE_CONTEXT_QUERY_STATE__'\] = (\{.*?\});",
    flags=re.DOTALL,
)

# One HTTP session reused for every request made by the scraping loop; the
# timeout is handed to the constructor, which stores it as session.timeout.
session = Session(timeout=REQUEST_TIMEOUT)

# Maximum number of result pages collected per specialty. Page 1 comes from
# the rankings HTML page; pages 2..MAX_PAGES come from the search endpoint.
MAX_PAGES = 5

# Key inside the embedded page state under which the search payload is read.
PAGE_DATA_KEY = 'src/containers/pages/health/hospitals/search/index.js'

# Endpoint queried for every result page after the first one.
SEARCH_API_URL = f"{base_url}/best-hospitals/search-data"

# JavaScript `undefined` is not valid JSON; these rewrites turn it into `null`.
_UNDEFINED_REPLACEMENTS = (
    (':undefined', ':null'),
    (',undefined', ',null'),
    ('[undefined', '[null'),
)

# Attribute-stripping patterns, applied in this order. For each attribute the
# first pattern is the strict form and the second is the fallback that catches
# whatever the strict form left behind.
_ATTRIBUTE_CLEANUP_PATTERNS = (
    re.compile(r'\s+link-data=\\"[^>]*?\\"(?=>)'),
    re.compile(r'\s+link-data=\\"[^\u003E]*?\\"'),
    re.compile(r'\s+style=\\"[^"]*?\\"'),
    re.compile(r'\s+style=\\"[^\u003E]*?\\"'),
)


def _sanitize_json_text(raw_text):
    """Return *raw_text* cleaned up so that ``json.loads`` can attempt it.

    The steps are: un-escape HTML entities (e.g. ``&amp;``), replace
    ``undefined`` tokens with ``null``, then strip ``link-data`` and
    ``style`` attributes from the embedded markup.
    """
    cleaned = html.unescape(raw_text)
    for needle, replacement in _UNDEFINED_REPLACEMENTS:
        cleaned = cleaned.replace(needle, replacement)
    for pattern in _ATTRIBUTE_CLEANUP_PATTERNS:
        cleaned = pattern.sub('', cleaned)
    return cleaned


def _hospital_row(hospital):
    """Build the flat record stored in ``all_rankings`` for one hospital entry.

    Raises ``KeyError`` when the entry lacks ``ranking``, ``name`` or ``url``.
    """
    ranking = hospital['ranking']
    return {
        'specialty': ranking['specialty_name'],
        'rank': ranking['rank'],
        'name': hospital['name'],
        'url': f"{base_url}{hospital['url']}"
    }


for specialty in specialties:
    print(f"--- Scraping {specialty} ---")
    page_url = f"{base_url}/best-hospitals/rankings/{specialty}"

    # Rotate the User-Agent and pause a random interval before each specialty.
    session.headers['User-Agent'] = random.choice(USER_AGENTS)
    time.sleep(random.uniform(2.0, 5.0))

    # Any failure below is reported and the loop moves on to the next
    # specialty; rows already appended for this specialty are kept.
    try:
        # 1. Fetch the rankings page and locate the embedded page state.
        print(f"Requesting {page_url}...")
        response = session.get(page_url, impersonate="chrome110")
        response.raise_for_status()

        match = json_pattern.search(response.text)
        if not match:
            print(f"Could not find JSON data for {specialty}")
            continue

        # 2. Clean the captured text and parse it as JSON.
        json_text = _sanitize_json_text(match.group(1))

        try:
            data = json.loads(json_text)
        except json.JSONDecodeError as e:
            # If it still fails, save the cleaned text for inspection and move on.
            print(f"!!! JSON PARSE FAILED for {specialty}: {e}")
            filename = f"failed_json_FINAL_{specialty}.txt"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(json_text)
            print(f"Saved final failed text to {filename}")

            # Print up to 150 characters on either side of the failure position.
            error_pos = e.pos
            start = max(0, error_pos - 150)
            end = min(len(json_text), error_pos + 150)
            print(f"\nContext around position {error_pos}:")
            print(json_text[start:end])

            continue

        # 3. Pull the search payload, its first page of matches and the
        #    pagination flag out of the page state.
        page_data = data.get(PAGE_DATA_KEY, {}).get('data', {})
        matches = page_data.get('matches', [])
        has_next_page = page_data.get('hasNextPage', False)

        if not matches:
            print(f"No matches found for {specialty}")
            continue

        # The first match supplies the specialty_id sent to the search endpoint.
        specialty_id = matches[0]['ranking']['specialty_id']
        print(f"Found {len(matches)} initial hospitals (Page 1). (hasNextPage: {has_next_page})")

        for hospital in matches:
            all_rankings.append(_hospital_row(hospital))

        # 4. Follow pagination through the search endpoint, up to MAX_PAGES.
        page = 2

        while has_next_page and page <= MAX_PAGES:
            print(f"Fetching page {page} for {specialty}...")
            time.sleep(random.uniform(1.0, 3.0))

            params = {
                'page': page,
                'specialty_id': specialty_id,
                'type': 'adult',
                'sort': 'ranking'
            }

            api_res = session.get(SEARCH_API_URL, params=params, impersonate="chrome110")
            api_res.raise_for_status()  # Surface bad API responses as exceptions

            # The API response IS the data; there is no enclosing 'data' key.
            api_data = api_res.json()

            page_matches = api_data.get('matches', [])
            has_next_page = api_data.get('hasNextPage', False)

            for hospital in page_matches:
                all_rankings.append(_hospital_row(hospital))

            page += 1

        if has_next_page and page > MAX_PAGES:
            print(f"Reached page limit ({MAX_PAGES}). Stopping pagination for {specialty}.")

    except Exception as e:
        print(f"Error processing {specialty}: {e}")
        print("Moving to next specialty...")

def _institution_from_url(url):
    """Derive a display name for a hospital from its profile URL.

    Drops the ``/best-hospitals/area/`` prefix, takes the second remaining
    path segment, turns hyphens into spaces, removes every digit, trims
    surrounding whitespace and title-cases the result.

    Raises ``IndexError`` when fewer than two path segments remain after the
    prefix is removed.
    """
    slug = url.replace('https://health.usnews.com/best-hospitals/area/', '').split('/')[1]
    words = slug.replace('-', ' ')
    without_digits = ''.join(ch for ch in words if not ch.isdigit())
    return without_digits.strip().title()


# --- All done, show the results ---
df = pd.DataFrame(all_rankings)
print("\n\n--- SCRAPING COMPLETE ---")

if not df.empty:

    # Add the Institution column and make it the first column.
    df['Institution'] = df['url'].apply(_institution_from_url)
    cols = ['Institution'] + [col for col in df.columns.tolist() if col != 'Institution']
    df = df[cols]

    # Save the raw, one-row-per-ranking table. The printed name is taken from
    # the same variable as the path so the two cannot drift apart.
    raw_csv_name = "raw_hospital_rankings.csv"
    df.to_csv(os.path.join(DOWNLOAD_DIR, raw_csv_name), index=False)
    print(f"\nSaved to {raw_csv_name}")

    # Now save a pivoted version with cumulative columns: one row per
    # institution, one column per specialty, cell value = rank. pivot_table
    # applies its default aggregation to duplicate (Institution, specialty)
    # pairs; unranked combinations come out as NaN.
    pivot_df = df.pivot_table(
        index='Institution',
        columns='specialty',
        values='rank'
    )

    pivot_df_with_cumulative_columns = pivot_df.copy()

    # 1. Count of specialties in which the institution has a rank (non-NaN cells)
    pivot_df_with_cumulative_columns['#n_ranked_specialties'] = pivot_df.count(axis=1)

    # 2. Count of specialties ranked in the Top 10 (rank <= 10)
    pivot_df_with_cumulative_columns['#n_top10_specialties'] = (pivot_df <= 10).sum(axis=1)

    # 3. Count of specialties ranked #1
    pivot_df_with_cumulative_columns['#n_top1_specialties'] = (pivot_df == 1).sum(axis=1)

    # Institutions ranked in the most specialties come first.
    pivot_df_with_cumulative_columns = pivot_df_with_cumulative_columns.sort_values(
        by='#n_ranked_specialties',
        ascending=False
    )

    # Turn the Institution index back into a regular column and clear the
    # leftover 'specialty' name on the column axis.
    pivot_df_with_cumulative_columns = pivot_df_with_cumulative_columns.reset_index()
    pivot_df_with_cumulative_columns.columns.name = None

    pivoted_csv_name = "hospital_rankings_pivoted.csv"
    pivot_df_with_cumulative_columns.to_csv(os.path.join(DOWNLOAD_DIR, pivoted_csv_name), index=False)
    print(f"Saved to {pivoted_csv_name}")

else:
    print("No data was scraped.")


Saved to hospital_rankings.csv
Saved to hospital_rankings_pivoted.csv
