In [1]:
import striprtf
import re
import os
import pandas as pd
import geopandas as gpd
import requests
import PyPDF2
from docx import Document
from striprtf.striprtf import rtf_to_text

## Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [35]:
## "scraping" the NACHC website for all congressional district map PDFs

# District count per state code. The downloader walks 1..count for each entry
# and requests "<STATE><NN>.pdf" (NN zero-padded to two digits).
state_districts = {
    "FL": 28, "GA": 14, "HI": 2, "ID": 2, "IL": 17, "IN": 9,
    "IA": 4, "KS": 4, "KY": 6, "LA": 6, "ME": 2, "MA": 9,
    "MD": 8, "MI": 13, "MN": 8, "MS": 4, "MO": 8, "MT": 2,
    "NE": 3, "NV": 4, "NH": 2, "NJ": 12, "NM": 3, "NY": 26,
    "NC": 14, "OH": 15, "OK": 5, "OR": 6, "PA": 17, "RI": 2,
    "SC": 7, "TN": 9, "TX": 38, "UT": 4, "VA": 11, "WA": 10,
    "WV": 2, "WI": 8,
}

# Remote folder the PDF URLs are built from.
base_url = "https://www.nachc.org/wp-content/uploads/2023/02"
# Local folder the downloaded PDFs are written to.
download_dir = "../misc/CHC_pdf"

def download_congressional_district_pdfs(state_code, num_districts, base_url, download_dir, timeout=30):
    """Download the per-district PDFs of one state into ``download_dir``.

    For every district number from 1 to ``num_districts`` the URL
    ``{base_url}/{state_code}{NN}.pdf`` is requested (``NN`` is the district
    number zero-padded to two digits). A response with status 200 is written
    to ``download_dir`` under the same file name; any other status is reported
    and skipped.

    Parameters
    ----------
    state_code : str
        Prefix of the remote and local file names (e.g. ``"MD"``).
    num_districts : int
        Highest district number to request; ``0`` requests nothing.
    base_url : str
        URL prefix without a trailing slash.
    download_dir : str
        Local target folder. It is created if it does not exist yet.
    timeout : float, optional
        Seconds to wait for each HTTP request. Without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    None
        Progress is reported via ``print``.
    """
    # open() below cannot create missing parent folders, so make sure the
    # target exists before the first write.
    os.makedirs(download_dir, exist_ok=True)

    for district in range(1, num_districts + 1):
        district_code = str(district).zfill(2)
        file_name = f"{state_code}{district_code}.pdf"
        pdf_url = f"{base_url}/{file_name}"

        try:
            response = requests.get(pdf_url, timeout=timeout)
        except requests.RequestException as exc:
            # A single failed request must not abort the remaining downloads.
            print(f"Request failed for {state_code} District {district_code}: {exc}")
            continue

        if response.status_code == 200:
            file_path = os.path.join(download_dir, file_name)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {file_name} to {download_dir}")
        else:
            print(f"PDF not found for {state_code} District {district_code}")
            
# Fetch the PDFs of every state listed in state_districts, one state at a time.
for state_code, num_districts in state_districts.items():
    download_congressional_district_pdfs(
        state_code=state_code,
        num_districts=num_districts,
        base_url=base_url,
        download_dir=download_dir,
    )


Downloaded MD01.pdf to ../misc/CHC_pdf
Downloaded MD02.pdf to ../misc/CHC_pdf
Downloaded MD03.pdf to ../misc/CHC_pdf
Downloaded MD04.pdf to ../misc/CHC_pdf
Downloaded MD05.pdf to ../misc/CHC_pdf
Downloaded MD06.pdf to ../misc/CHC_pdf
Downloaded MD07.pdf to ../misc/CHC_pdf
Downloaded MD08.pdf to ../misc/CHC_pdf


In [10]:
## combining PDFs of the same state into an RTF

# State codes whose downloaded PDFs get merged, in processing order.
# NOTE(review): CT, DC, DE, ND, SD, VT and WY appear here but not in
# state_districts -- confirm their PDFs were obtained some other way.
states = (
    "CT DC DE FL GA HI ID IL IN "
    "IA KS KY LA ME MA MD MI MN "
    "MS MO MT NE ND NV NH NJ NM "
    "NY NC OH OK OR PA RI SC SD "
    "TN TX UT VA VT WA WV WI WY"
).split()

# Folder the per-district PDFs are read from.
download_dir = "../misc/CHC_pdf"
# Folder the combined per-state files are written to.
final_dir = "../data/CHC_data"

def _escape_rtf_text(text):
    """Encode plain text as the 7-bit ASCII body of an RTF document."""
    pieces = []
    # Normalise line endings so every line break becomes exactly one \par.
    for ch in text.replace("\r\n", "\n").replace("\r", "\n"):
        code = ord(ch)
        if ch in "\\{}":
            # Backslash and braces are RTF syntax and must be escaped.
            pieces.append("\\" + ch)
        elif ch == "\n":
            pieces.append("\\par\n")
        elif ch == "\t":
            pieces.append("\\tab ")
        elif 32 <= code < 127:
            pieces.append(ch)
        elif code < 32 or code == 127:
            # Remaining control characters go out as hex escapes.
            pieces.append(f"\\'{code:02x}")
        else:
            # \uN takes a signed 16-bit value, so emit UTF-16 code units
            # (non-BMP characters become a surrogate pair). "?" is the
            # fallback character for readers without Unicode support.
            utf16 = ch.encode("utf-16-le", errors="surrogatepass")
            for i in range(0, len(utf16), 2):
                unit = int.from_bytes(utf16[i:i + 2], "little")
                if unit > 32767:
                    unit -= 65536
                pieces.append(f"\\u{unit}?")
    return "".join(pieces)


def combine_pdfs_to_rtf(state_code, download_dir, final_dir):
    """Merge the text of one state's PDFs into ``{state_code}_combined.rtf``.

    Every file in ``download_dir`` whose name starts with ``state_code`` and
    ends with ``.pdf`` is read in sorted name order. The first page of each
    PDF is skipped; the extracted text of the remaining pages is concatenated
    without separators and written to ``final_dir``.

    The output is a genuine RTF text file. The previous implementation saved
    a python-docx ``Document`` under an ``.rtf`` name, which produces a DOCX
    (zip) package that an RTF parser cannot read.

    Parameters
    ----------
    state_code : str
        File-name prefix selecting the PDFs to merge.
    download_dir : str
        Folder containing the source PDFs.
    final_dir : str
        Folder receiving the combined file. It is created if missing.

    Returns
    -------
    None
        The output path is reported via ``print``.
    """
    pdf_files = sorted(
        name for name in os.listdir(download_dir)
        if name.startswith(state_code) and name.endswith('.pdf')
    )

    # Collect page texts in a list and join once instead of growing a string.
    page_texts = []
    for pdf_file in pdf_files:
        file_path = os.path.join(download_dir, pdf_file)
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                if page_num == 0:  # skip the first page
                    continue
                # Guard against an extractor that yields None for an empty page.
                page_texts.append(page.extract_text() or "")
    combined_text = "".join(page_texts)

    rtf_file_name = f"{state_code}_combined.rtf"
    rtf_file_path = os.path.join(final_dir, rtf_file_name)
    os.makedirs(final_dir, exist_ok=True)

    rtf_header = r"{\rtf1\ansi\deff0\uc1{\fonttbl{\f0\fnil Helvetica;}}\f0\fs24 "
    # The escaped body is pure ASCII, so the file is also valid UTF-8.
    with open(rtf_file_path, "w", encoding="ascii", newline="\n") as rtf_file:
        rtf_file.write(rtf_header + _escape_rtf_text(combined_text) + "}")
    print(f"Combined PDFs for {state_code} into {rtf_file_path}")
    
# Build one combined file per state code in `states`.
for state_code in states:
    combine_pdfs_to_rtf(
        state_code=state_code,
        download_dir=download_dir,
        final_dir=final_dir,
    )

Combined PDFs for OH into ../data/CHC_data/OH_combined.rtf


In [3]:
## cleaning text data from combined RTF for street addresses; geocoding in "batches"

def extract_addresses_from_rtfs(folder_path):
    """Collect street-address-like strings from every ``.rtf`` file in a folder.

    Each file is converted to plain text with ``rtf_to_text``. The text found
    between two ``|`` characters is collected, and within those pieces every
    run matching "1-5 digits, whitespace, non-digit text, 5-digit ZIP with an
    optional ``-NNNN`` suffix" is kept.

    Files are processed in sorted name order. ``os.listdir`` gives no ordering
    guarantee, and the position of an address in the result decides which
    geocoding batch it later lands in.

    Parameters
    ----------
    folder_path : str
        Folder to scan (not recursive).

    Returns
    -------
    list of str
        Stripped matches, in file order and then in order of appearance.
        Duplicates are kept.
    """
    # Compile once instead of re-parsing the patterns for every file.
    cell_pattern = re.compile(r"(?<=\|)\s*([^|]+)\s*\|")
    address_pattern = re.compile(r"(?<!\d)\d{1,5}\s+[^\d\n]+\s*\d{5}(?:-\d{4})?")

    addresses = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".rtf"):
            continue
        rtf_path = os.path.join(folder_path, filename)
        with open(rtf_path, encoding="utf-8") as infile:
            content = infile.read()
        text = rtf_to_text(content)
        # One cell per line, so a single address cannot span two cells
        # (the address pattern never crosses a newline before the ZIP).
        cells = cell_pattern.findall(text)
        candidates = address_pattern.findall("\n".join(cells))
        addresses.extend(addr.strip() for addr in candidates if addr.strip())
    return addresses

# Author's note from an earlier run: len(addresses) was 13290.
# Extract every address-like string from the combined per-state RTF files.
addresses = extract_addresses_from_rtfs("../data/CHC_data")

# Disabled: split the address list into two halves and export each half to CSV.
# A later cell reads "addresses1.csv", so re-enable this if that file is missing.
# addresses1 = addresses[:len(addresses)//2]
# addresses2 = addresses[len(addresses)//2:]

# addresses_df1 = pd.DataFrame(addresses1)
# addresses_df2 = pd.DataFrame(addresses2)

# addresses_df1.to_csv("addresses1.csv")
# addresses_df2.to_csv("addresses2.csv")



In [4]:
items = addresses  # Your list of 13,290 items

# Aim for roughly 30 batches. Integer division yields 0 for fewer than 30
# addresses, and range() rejects a step of 0, so never go below 1.
# When len(items) is not a multiple of chunk_size a shorter final batch is added.
chunk_size = max(1, len(items) // 30)

result = [items[i:i+chunk_size] for i in range(0, len(items), chunk_size)]

# Folder the per-batch geocoding CSVs are written to.
final_dir = "../data/CHC_data/geocode"
    

# Define a function to process each sublist
def process_sublist(sublist, out_dir=None):
    """Geocode one batch of addresses and save the result as a CSV file.

    The batch is passed to ``gpd.tools.geocode`` and the returned frame is
    written to ``geocodes_<first>_<last>.csv``, where ``<first>`` and
    ``<last>`` are the first and last entries of the batch with every run of
    characters other than letters, digits and ``-`` replaced by ``_``. Raw
    addresses can contain ``/`` and other characters that are unsafe in a
    file name.

    Parameters
    ----------
    sublist : list of str
        Addresses to geocode. An empty batch is skipped.
    out_dir : str, optional
        Target folder. Defaults to the notebook-level ``final_dir``. The
        folder is created if it does not exist.

    Returns
    -------
    None
    """
    if not sublist:
        # Nothing to geocode, and no first/last entry to name the file after.
        return

    target_dir = final_dir if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)

    # Perform geocoding for the sublist
    geocodes = gpd.tools.geocode(sublist, timeout=100)

    def _slug(value):
        # Keep the name readable but free of path separators and punctuation.
        return re.sub(r"[^A-Za-z0-9-]+", "_", str(value)).strip("_")

    # Save geocodes as a CSV file named after the first and last address
    geocode_name = f"geocodes_{_slug(sublist[0])}_{_slug(sublist[-1])}.csv"
    geocode_path = os.path.join(target_dir, geocode_name)
    geocodes.to_csv(geocode_path, index=False)  # Save geocodes DataFrame to CSV
    
# Call the process_sublist function for each sublist
# Geocode the batches one after another; each call writes its own CSV.
for sublist in result:
    process_sublist(sublist=sublist)


In [42]:
## GEOCODING
# import geopandas as gpd

# geocodes = gpd.tools.geocode(addresses1, timeout = 100)

# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderTimedOut

# def geocode_with_retry(address, geolocator):
#     try:
#         location = geolocator.geocode(address)
#         return location
#     except GeocoderTimedOut:
#         # If a timeout occurs, retry the request
#         return geocode_with_retry(address, geolocator)

# def batch_geocode(addresses):
#     geolocator = Nominatim(user_agent="my_app")
#     location_list = []

#     for address in addresses:
#         location = geocode_with_retry(address, geolocator)
#         if location is not None:
#             location_list.append({
#                 "address": address,
#                 "latitude": location.latitude,
#                 "longitude": location.longitude
#             })

#     return location_list

# # Example usage:
# geocoded_data = batch_geocode(addresses)


KeyboardInterrupt: 

In [23]:
# TomTom batch-search result URL for one previously submitted batch.
# The API key comes from the TOMTOM_API_KEY environment variable so it is not
# stored in the notebook. The key that used to be embedded here must be treated
# as exposed and rotated. It was also wrapped in literal "{...}" braces, which
# were sent to the server as part of the key.
tomtom_api_key = os.environ.get("TOMTOM_API_KEY", "")
url = f"https://api.tomtom.com/search/2/batch/45e0909c-625a-4822-a060-8f7f88498c0e?key={tomtom_api_key}"

610

In [41]:
## Trying out Google Maps API
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so it
# is not stored in the notebook. The key that used to be embedded in this cell
# must be treated as exposed and rotated.

import csv
import os
import pandas as pd
import googlemaps

data = pd.read_csv("addresses1.csv",encoding = "ISO-8859-1")
df = data.copy()
df.head()

gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not gmaps_api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running this cell.")

# Despite its name this is the API client object, not the key string.
gmaps_key = googlemaps.Client(key = gmaps_api_key)

# Geocode only the first address as a smoke test; column "0" holds the address text.
add_1 = df.loc[0, "0"]
g = gmaps_key.geocode(add_1)
if g:
    lat = g[0]["geometry"]["location"]["lat"]
    long = g[0]["geometry"]["location"]["lng"]
    print('Latitude: '+str(lat)+', Longitude: '+str(long))
else:
    # An empty result list means no match; indexing g[0] would raise IndexError.
    print('No geocoding result for: '+str(add_1))

Unnamed: 0.1,Unnamed: 0,0
0,0,"3900 Calhoun St Gary, IN 46408-1753"
1,1,"3503 Martin Luther King Dr Gary, IN 46409-1233"
2,2,"5926 Calumet Ave Hammond, IN 46320-2505"
3,3,"6100 Broadway Merrillville, IN 46410-3002"
4,4,"1015 N Shelby St Gary, IN 46403-1446"


ApiError: REQUEST_DENIED (This API project is not authorized to use this API.)