In [4]:
import pandas as pd
from urllib.parse import urlparse

# Lookup table: official state government web domain -> full state name.
# Keys are lowercase registrable domains (note Florida uses a .com domain).
# Each state has exactly one entry here, so hosts on any other domain a state
# may use will not be recognised by a lookup against this table.
gov_domain_to_state = {
    "alabama.gov": "Alabama",
    "alaska.gov": "Alaska",
    "az.gov": "Arizona",
    "arkansas.gov": "Arkansas",
    "ca.gov": "California",
    "colorado.gov": "Colorado",
    "ct.gov": "Connecticut",
    "delaware.gov": "Delaware",
    "myflorida.com": "Florida",
    "georgia.gov": "Georgia",
    "hawaii.gov": "Hawaii",
    "idaho.gov": "Idaho",
    "illinois.gov": "Illinois",
    "in.gov": "Indiana",
    "iowa.gov": "Iowa",
    "kansas.gov": "Kansas",
    "ky.gov": "Kentucky",
    "louisiana.gov": "Louisiana",
    "maine.gov": "Maine",
    "maryland.gov": "Maryland",
    "mass.gov": "Massachusetts",
    "michigan.gov": "Michigan",
    "mn.gov": "Minnesota",
    "ms.gov": "Mississippi",
    "mo.gov": "Missouri",
    "mt.gov": "Montana",
    "nebraska.gov": "Nebraska",
    "nv.gov": "Nevada",
    "nh.gov": "New Hampshire",
    "nj.gov": "New Jersey",
    "nm.gov": "New Mexico",
    "ny.gov": "New York",
    "nc.gov": "North Carolina",
    "nd.gov": "North Dakota",
    "ohio.gov": "Ohio",
    "oklahoma.gov": "Oklahoma",
    "oregon.gov": "Oregon",
    "pa.gov": "Pennsylvania",
    "ri.gov": "Rhode Island",
    "sc.gov": "South Carolina",
    "sd.gov": "South Dakota",
    "tn.gov": "Tennessee",
    "texas.gov": "Texas",
    "utah.gov": "Utah",
    "vermont.gov": "Vermont",
    "virginia.gov": "Virginia",
    "wa.gov": "Washington",
    "wv.gov": "West Virginia",
    "wisconsin.gov": "Wisconsin",
    "wyoming.gov": "Wyoming",
}

def classify_by_url(url, domain_to_state=None):
    """Return the state whose government domain hosts ``url``, or "NA".

    The URL's hostname is compared against every key of the domain map; a key
    matches when the hostname equals it or is a subdomain of it (i.e. ends
    with "." + key).

    Args:
        url: URL to classify. Non-string values (such as the float NaN that
            pandas produces for empty CSV cells, or None) yield "NA" rather
            than raising.
        domain_to_state: Optional mapping of lowercase domain -> state name.
            When omitted, the module-level ``gov_domain_to_state`` is used.

    Returns:
        The mapped state name, or the string "NA" when no domain matches or
        the URL cannot be parsed.
    """
    if not isinstance(url, str):
        return "NA"
    if domain_to_state is None:
        domain_to_state = gov_domain_to_state

    url = url.strip()
    try:
        parsed = urlparse(url)
        if not parsed.scheme and not parsed.netloc:
            # Scheme-less input such as "www.ca.gov/doc.pdf" is parsed as a
            # bare path; a leading "//" makes urlparse read it as a host.
            parsed = urlparse(f"//{url}")
        # Unlike .netloc, .hostname excludes userinfo and port and is
        # already lowercased, so "WWW.CA.GOV:443" compares as "www.ca.gov".
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6 brackets).
        return "NA"

    if not host:
        return "NA"
    # A fully-qualified name may carry a trailing root dot ("www.ca.gov.").
    host = host.rstrip(".")

    for domain, state in domain_to_state.items():
        if host == domain or host.endswith(f".{domain}"):
            return state
    return "NA"


def classify_pdfs(input_csv_path, output_csv_path, link_col='URL'):
    """Label every row of a CSV with a state derived from its link column.

    Reads ``input_csv_path``, applies ``classify_by_url`` to each value in
    ``link_col`` to produce a ``State`` column, and writes a CSV containing
    only ``link_col`` and ``State`` (no index column) to ``output_csv_path``.

    Args:
        input_csv_path: Path of the CSV to read.
        output_csv_path: Path of the CSV to write.
        link_col: Name of the column holding the URLs.

    Raises:
        KeyError: If ``link_col`` is not a column of the input CSV.
    """
    records = pd.read_csv(input_csv_path)
    if link_col not in records.columns:
        raise KeyError(f"Column '{link_col}' not found in CSV.")

    states = records[link_col].apply(classify_by_url)
    labelled = records.assign(State=states)
    labelled[[link_col, 'State']].to_csv(output_csv_path, index=False)

In [7]:
# Run the classification: read FINAL_dataset.csv and write the per-URL state
# labels to state_results.csv.
# NOTE(review): both paths are absolute and specific to one machine; they
# must be edited before this cell can run anywhere else.
input_csv = '/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/FINAL_dataset.csv'
output_csv = '/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_state/state_results.csv'
classify_pdfs(input_csv, output_csv, link_col='URL')

In [None]:
import csv
import json

def csv_to_json(csv_file_path, json_file_path):
    """Convert a CSV file into a JSON array of row objects.

    Each CSV row becomes one JSON object keyed by the header names; all
    values are kept as the strings read from the file. The output is written
    as UTF-8 with a 4-space indent.

    Args:
        csv_file_path: Path of the UTF-8 CSV file to read.
        json_file_path: Path of the JSON file to create or overwrite.
    """
    # Load every row up front so the input file is closed before writing.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as source:
        rows = [record for record in csv.DictReader(source)]

    with open(json_file_path, 'w', encoding='utf-8') as sink:
        json.dump(rows, sink, indent=4)

# Convert state_results.csv to state_results.json in the same directory.
# This call runs whenever the cell is executed.
# NOTE(review): both paths are absolute and specific to one machine.
csv_to_json('/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_state/state_results.csv', '/Users/winnie/Documents/GitHub/MedAI/Analysis&NLP/classify_by_state/state_results.json')
