First pass

In [5]:
import io
import re

import pandas as pd
import requests
from PyPDF2 import PdfReader

# Names of the 50 U.S. states in alphabetical order, one initial letter per row.
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas",
    "California", "Colorado", "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky",
    "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana",
    "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming",
]

def classify_pdf_by_state(pdf_url):
    """Download a PDF and label it with the single U.S. state its text mentions.

    State names are matched case-insensitively as whole words, so a name that
    merely occurs inside a longer word or a longer state name does not count
    ("Kansas" inside "Arkansas", "Virginia" inside "West Virginia", "maine"
    inside "remained").

    Args:
        pdf_url: URL of the PDF to download (fetched with a 15-second timeout).

    Returns:
        The state name when exactly one entry of ``states`` is found in the
        extracted text, ``'Ambiguous'`` when more than one is found, and
        ``'Other'`` when none is found, when the HTTP status is not 200, or
        when any exception is raised while downloading or parsing (the error
        is printed, not re-raised).
    """
    try:
        response = requests.get(pdf_url, timeout=15)
        if response.status_code != 200:
            return 'Other'

        reader = PdfReader(io.BytesIO(response.content))

        # Pages for which extract_text() returns nothing are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = " ".join(text for text in page_texts if text)

        # Longest names are handled first and each hit is masked out of the
        # text, so "West Virginia" is consumed before "Virginia" is searched.
        # \b enforces whole-word matches; \s+ lets a two-word name match even
        # when text extraction put a line break between its words.
        found_states = set()
        remaining = full_text
        for state in sorted(states, key=len, reverse=True):
            pattern = r"\b" + r"\s+".join(map(re.escape, state.split())) + r"\b"
            remaining, hits = re.subn(pattern, "|", remaining, flags=re.IGNORECASE)
            if hits:
                found_states.add(state)

        if not found_states:
            return 'Other'
        if len(found_states) == 1:
            return found_states.pop()
        return 'Ambiguous'

    except Exception as e:
        # Deliberate best-effort: one bad PDF must not abort the whole batch.
        print(f"Error processing {pdf_url}: {e}")
        return 'Other'

def classify_pdfs(input_csv_path, output_csv_path, link_col='PDF Link'):
    """Classify every PDF listed in a CSV and save the rows with a 'State' column.

    Args:
        input_csv_path: Path of the CSV file to read.
        output_csv_path: Path of the CSV file to write (index omitted).
        link_col: Name of the column holding the PDF URLs. Defaults to
            ``'PDF Link'``, the column name this function used to hard-code.

    Raises:
        KeyError: If ``link_col`` is not a column of the input CSV.
    """
    df = pd.read_csv(input_csv_path)
    if link_col not in df.columns:
        # Fail with a readable message before any PDF is downloaded.
        raise KeyError(f"Column '{link_col}' not found in CSV.")

    df['State'] = df[link_col].apply(classify_pdf_by_state)
    df.to_csv(output_csv_path, index=False)

In [None]:
# First-pass run over the 20-row sample; the result is written to the same folder.
input_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_20_rows.csv'
output_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/classified_state_pdfs.csv'
classify_pdfs(input_csv_path=input_csv, output_csv_path=output_csv)

Error processing https://dbr.ri.gov/sites/g/files/xkgbur696/files/documents/divisions/insurance/financial_info/2021/Metropolitan_Property_and_Casualty_Insurance_Company_03_31_2021.pdf: PyCryptodome is required for AES algorithm


Second pass (URL-based classification, then keyword-based classification)

In [1]:
import pandas as pd
import requests
import io
import re
from urllib.parse import urlparse
from PyPDF2 import PdfReader

# The 50 U.S. state names in alphabetical order, five per row.
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
    "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
    "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming",
]

# State government domains mapped to full state names
gov_domain_to_state = {
    "alabama.gov": "Alabama", "alaska.gov": "Alaska", "az.gov": "Arizona", "arkansas.gov": "Arkansas", "ca.gov": "California",
    "colorado.gov": "Colorado", "ct.gov": "Connecticut", "delaware.gov": "Delaware", "myflorida.com": "Florida",
    "georgia.gov": "Georgia", "hawaii.gov": "Hawaii", "idaho.gov": "Idaho", "illinois.gov": "Illinois", "in.gov": "Indiana",
    "iowa.gov": "Iowa", "kansas.gov": "Kansas", "ky.gov": "Kentucky", "louisiana.gov": "Louisiana", "maine.gov": "Maine",
    "maryland.gov": "Maryland", "mass.gov": "Massachusetts", "michigan.gov": "Michigan", "mn.gov": "Minnesota",
    "ms.gov": "Mississippi", "mo.gov": "Missouri", "mt.gov": "Montana", "nebraska.gov": "Nebraska", "nv.gov": "Nevada",
    "nh.gov": "New Hampshire", "nj.gov": "New Jersey", "nm.gov": "New Mexico", "ny.gov": "New York", "nc.gov": "North Carolina",
    "nd.gov": "North Dakota", "ohio.gov": "Ohio", "oklahoma.gov": "Oklahoma", "oregon.gov": "Oregon", "pa.gov": "Pennsylvania",
    "ri.gov": "Rhode Island", "sc.gov": "South Carolina", "sd.gov": "South Dakota", "tn.gov": "Tennessee", "texas.gov": "Texas",
    "utah.gov": "Utah", "vermont.gov": "Vermont", "virginia.gov": "Virginia", "wa.gov": "Washington", "wv.gov": "West Virginia",
    "wisconsin.gov": "Wisconsin", "wyoming.gov": "Wyoming"
}

def classify_by_url(url):
    """Return the state whose government domain hosts ``url``, or ``None``.

    The host must equal a key of ``gov_domain_to_state`` or be a subdomain of
    one. A plain substring test is not enough: "wisconsin.gov" contains
    "in.gov" and "epa.gov" contains "pa.gov", which would mislabel them as
    Indiana and Pennsylvania.

    Args:
        url: The link to classify. Anything that is not a string (for example
            a NaN standing in for a missing value) is treated as "no match".

    Returns:
        The full state name, or ``None`` when the URL has no host, cannot be
        parsed, or its host is not under a known state domain.
    """
    if not isinstance(url, str):
        return None

    try:
        # .hostname is lower-cased and excludes any port or user-info part.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal).
        return None
    if not host:
        return None

    # Try the host itself, then each parent domain: a.b.ri.gov -> b.ri.gov -> ri.gov -> gov.
    labels = host.rstrip(".").split(".")
    for start in range(len(labels)):
        state = gov_domain_to_state.get(".".join(labels[start:]))
        if state is not None:
            return state
    return None

def classify_by_pdf_content(url):
    """Download a PDF and label it with the single U.S. state its text mentions.

    State names are matched case-insensitively as whole words, so a name that
    merely occurs inside a longer word or a longer state name does not count
    ("Kansas" inside "Arkansas", "Virginia" inside "West Virginia", "maine"
    inside "remained").

    Args:
        url: URL of the PDF to download (fetched with a 15-second timeout).

    Returns:
        The state name when exactly one entry of ``states`` is found in the
        extracted text, ``'Ambiguous'`` when more than one is found, and
        ``'Other'`` when none is found, when the HTTP status is not 200, or
        when any exception is raised while downloading or parsing (the error
        is printed, not re-raised).
    """
    try:
        response = requests.get(url, timeout=15)
        if response.status_code != 200:
            return 'Other'

        reader = PdfReader(io.BytesIO(response.content))

        # Pages for which extract_text() returns nothing are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = " ".join(text for text in page_texts if text)

        # Longest names are handled first and each hit is masked out of the
        # text, so "West Virginia" is consumed before "Virginia" is searched.
        # \b enforces whole-word matches; \s+ lets a two-word name match even
        # when text extraction put a line break between its words.
        found_states = set()
        remaining = full_text
        for state in sorted(states, key=len, reverse=True):
            pattern = r"\b" + r"\s+".join(map(re.escape, state.split())) + r"\b"
            remaining, hits = re.subn(pattern, "|", remaining, flags=re.IGNORECASE)
            if hits:
                found_states.add(state)

        if not found_states:
            return 'Other'
        if len(found_states) == 1:
            return next(iter(found_states))
        return 'Ambiguous'

    except Exception as e:
        # Deliberate best-effort: one bad PDF must not abort the whole batch.
        print(f"Error processing {url}: {e}")
        return 'Other'

def classify_pdf(url):
    """Classify a link by its URL first, falling back to the PDF's text.

    Returns the result of ``classify_by_url`` when that result is truthy;
    otherwise returns whatever ``classify_by_pdf_content`` produces.
    """
    return classify_by_url(url) or classify_by_pdf_content(url)

def classify_pdfs(input_csv_path, output_csv_path, link_col='link'):
    """Label each row of a CSV with a state and write the result to a new CSV.

    Args:
        input_csv_path: Path of the CSV file to read.
        output_csv_path: Path of the CSV file to write (index omitted).
        link_col: Name of the column whose values are passed to ``classify_pdf``.

    Raises:
        KeyError: If ``link_col`` is not a column of the input CSV.
    """
    table = pd.read_csv(input_csv_path)
    if link_col not in table.columns:
        raise KeyError(f"Column '{link_col}' not found in CSV.")

    # The classifier's answer for each link becomes the 'State' column.
    labelled = table.assign(State=table[link_col].apply(classify_pdf))
    labelled.to_csv(output_csv_path, index=False)



In [3]:
# Second-pass run over the 20-row sample; links are read from the 'PDF Link' column.
input_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_20_rows.csv'
output_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/classified_state_pdfs2.csv'
classify_pdfs(input_csv_path=input_csv, output_csv_path=output_csv, link_col='PDF Link')

Third pass (classifies documents only by state-government URL domain; final version)

In [5]:
import pandas as pd
from urllib.parse import urlparse

# State government domains mapped to full state names
gov_domain_to_state = {
    "alabama.gov": "Alabama", "alaska.gov": "Alaska", "az.gov": "Arizona", "arkansas.gov": "Arkansas", "ca.gov": "California",
    "colorado.gov": "Colorado", "ct.gov": "Connecticut", "delaware.gov": "Delaware", "myflorida.com": "Florida",
    "georgia.gov": "Georgia", "hawaii.gov": "Hawaii", "idaho.gov": "Idaho", "illinois.gov": "Illinois", "in.gov": "Indiana",
    "iowa.gov": "Iowa", "kansas.gov": "Kansas", "ky.gov": "Kentucky", "louisiana.gov": "Louisiana", "maine.gov": "Maine",
    "maryland.gov": "Maryland", "mass.gov": "Massachusetts", "michigan.gov": "Michigan", "mn.gov": "Minnesota",
    "ms.gov": "Mississippi", "mo.gov": "Missouri", "mt.gov": "Montana", "nebraska.gov": "Nebraska", "nv.gov": "Nevada",
    "nh.gov": "New Hampshire", "nj.gov": "New Jersey", "nm.gov": "New Mexico", "ny.gov": "New York", "nc.gov": "North Carolina",
    "nd.gov": "North Dakota", "ohio.gov": "Ohio", "oklahoma.gov": "Oklahoma", "oregon.gov": "Oregon", "pa.gov": "Pennsylvania",
    "ri.gov": "Rhode Island", "sc.gov": "South Carolina", "sd.gov": "South Dakota", "tn.gov": "Tennessee", "texas.gov": "Texas",
    "utah.gov": "Utah", "vermont.gov": "Vermont", "virginia.gov": "Virginia", "wa.gov": "Washington", "wv.gov": "West Virginia",
    "wisconsin.gov": "Wisconsin", "wyoming.gov": "Wyoming"
}

def classify_by_url(url):
    """Return the state whose government domain hosts ``url``, or ``"NA"``.

    The host must equal a key of ``gov_domain_to_state`` or be a subdomain of
    one. The host is taken without any port or user-info part, so
    "https://dbr.ri.gov:443/x.pdf" is still recognised as Rhode Island.

    Args:
        url: The link to classify. Anything that is not a string (for example
            a NaN standing in for a missing value) is treated as "no match".

    Returns:
        The full state name, or the string ``"NA"`` when the URL has no host,
        cannot be parsed, or its host is not under a known state domain.
    """
    if not isinstance(url, str):
        return "NA"

    try:
        # .hostname is lower-cased and excludes any port or user-info part.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal).
        return "NA"
    if not host:
        return "NA"

    # Try the host itself, then each parent domain: a.b.ri.gov -> b.ri.gov -> ri.gov -> gov.
    labels = host.rstrip(".").split(".")
    for start in range(len(labels)):
        state = gov_domain_to_state.get(".".join(labels[start:]))
        if state is not None:
            return state
    return "NA"


def classify_pdfs(input_csv_path, output_csv_path, link_col='link'):
    """Add a 'State' column derived from each row's link and save the CSV.

    Args:
        input_csv_path: Path of the CSV file to read.
        output_csv_path: Path of the CSV file to write (index omitted).
        link_col: Name of the column whose values are passed to ``classify_by_url``.

    Raises:
        KeyError: If ``link_col`` is not a column of the input CSV.
    """
    links_table = pd.read_csv(input_csv_path)
    if link_col not in links_table.columns:
        raise KeyError(f"Column '{link_col}' not found in CSV.")

    # Only the URL is inspected here; no document is downloaded.
    state_labels = links_table[link_col].apply(classify_by_url)
    links_table.assign(State=state_labels).to_csv(output_csv_path, index=False)


In [21]:
# Third-pass (URL-only) run over the 20-row sample.
input_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_20_rows.csv'
output_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/classified_state_pdfs3.csv'
classify_pdfs(input_csv_path=input_csv, output_csv_path=output_csv, link_col='PDF Link')

In [6]:
# Third-pass (URL-only) run over the 25-row sample.
input_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/sampled_25_rows.csv'
output_csv = '/Users/winnie/Documents/GitHub/MedAI/classify_by_state/classified_state_pdfs4.csv'
classify_pdfs(input_csv_path=input_csv, output_csv_path=output_csv, link_col='PDF Link')