In [3]:
from bs4 import BeautifulSoup # essential package for parsing in Python
import requests # for web requests

In [7]:
url = 'https://www.nachc.org/state-level-data-maps/'

html = requests.get(url)

soup = BeautifulSoup(html.text)

print(soup.prettify()[:1200])

<!DOCTYPE html>
<html class="no-js" itemscope="" itemtype="https://schema.org/WebPage" lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="The National Association of Community Health Centers (NACHC) was founded in 1971 to promote efficient, high quality, comprehensive health care that is accessible, culturally and linguistically competent, community directed, and patient centered for all." name="description"/>
  <link href="https://www.nachc.org/state-level-data-maps/" hreflang="en-US" rel="alternate"/>
  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-3726592-1">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'UA-3726592-1');
  </script>
  <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots"/>
  <meta content="telephone=no" name

In [27]:
for a in soup.find_all('a', href=True): # first 10 paragraphs via <p> tag
    print(a["href"])

https://www.nachc.org
#
/about/about-nachc/
https://www.nachc.org/about/about-nachc/
https://www.nachc.org/about-nachc/leadership/
https://www.nachc.org/about-nachc/staff-directory/
https://www.nachc.org/about-nachc/state-affiliates/state-regional-pca-listing/
http://iweb.nachc.com/Purchase/SearchCatalog.aspx
https://www.nachc.org/media-center/
https://nachc.recruiterbox.com/
https://www.nachc.org/about/about-our-health-centers/
https://www.nachc.org/about/about-our-health-centers/what-is-a-health-center/
http://www.findahealthcenter.hrsa.gov
http://www.hcadvocacy.org/join
https://www.nachc.org/media-center/nachc-podcasts/
https://www.nachc.org/job-board/
https://www.nachc.org/job-board/
https://www.nachc.org/job-board/instructions-for-posting-a-new-position/
https://www.nachc.org/job-board/submit-a-job/
/focus-areas/policy-matters/
https://www.nachc.org/focus-areas/policy-matters/
https://www.nachc.org/focus-areas/policy-matters/health-center-funding/
https://www.nachc.org/focus-areas

In [28]:
for img in soup.find_all('img'):
    print(img)

<img alt="State Level Health Center Data &amp; Maps" class="logo-main scale-with-grid" data-height="" data-no-retina="" data-retina="" src="/wp-content/uploads/2019/03/NACHC_LOGO_2C_R_transparent.png"/>
<img alt="State Level Health Center Data &amp; Maps" class="logo-sticky scale-with-grid" data-height="" data-no-retina="" data-retina="" src="/wp-content/uploads/2019/03/NACHC_LOGO_2C_R_transparent.png"/>
<img alt="State Level Health Center Data &amp; Maps" class="logo-mobile scale-with-grid" data-height="" data-no-retina="" data-retina="" src="/wp-content/uploads/2019/03/NACHC_LOGO_2C_R_transparent.png"/>
<img alt="State Level Health Center Data &amp; Maps" class="logo-mobile-sticky scale-with-grid" data-height="" data-no-retina="" data-retina="" src="/wp-content/uploads/2019/03/NACHC_LOGO_2C_R_transparent.png"/>
<img alt="NACHC Logo" class="logo-main scale-with-grid jetpack-lazy-image" data-lazy-src="/wp-content/uploads/2019/03/NACHC_LOGO_2C_R_transparent.png?is-pending-load=1" decodi

In [1]:
import striprtf
import re
import os

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [35]:
## "scraping" the NACHC website for all congressional district map PDFs

import requests

state_districts = {
    "FL": 28,
    "GA": 14,
    "HI": 2,
    "ID": 2,
    "IL": 17,
    "IN": 9,
    "IA": 4,
    "KS": 4,
    "KY": 6,
    "LA": 6,
    "ME": 2,
    "MA": 9,
    "MD": 8,
    "MI": 13,
    "MN": 8,
    "MS": 4,
    "MO": 8,
    "MT": 2,
    "NE": 3,
    "NV": 4,
    "NH": 2,
    "NJ": 12,
    "NM": 3,
    "NY": 26,
    "NC": 14,
    "OH": 15,
    "OK": 5,
    "OR": 6,
    "PA": 17,
    "RI": 2,
    "SC": 7,
    "TN": 9,
    "TX": 38,
    "UT": 4,
    "VA": 11,
    "WA": 10,
    "WV": 2,
    "WI": 8
}

base_url = "https://www.nachc.org/wp-content/uploads/2023/02"
download_dir = "../misc/CHC_pdf"  # Specify the directory here

def download_congressional_district_pdfs(state_code, num_districts, base_url, download_dir):
    for district in range(1, num_districts + 1):
        district_code = str(district).zfill(2)
        pdf_url = f"{base_url}/{state_code}{district_code}.pdf"
        response = requests.get(pdf_url)
        if response.status_code == 200:
            file_name = f"{state_code}{district_code}.pdf"
            file_path = os.path.join(download_dir, file_name)
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {file_name} to {download_dir}")
        else:
            print(f"PDF not found for {state_code} District {district_code}")
            
for state_code, num_districts in state_districts.items():
    download_congressional_district_pdfs(state_code, num_districts, base_url, download_dir)

# # Example usage:
# state_code = "CA"  # State code for Maryland
# num_districts = 52  # Number of districts in Maryland
# base_url = "https://www.nachc.org/wp-content/uploads/2023/02"
# download_dir = "../misc/CHC_pdf"  # Specify the directory here

# download_congressional_district_pdfs(state_code, num_districts, base_url, download_dir)


Downloaded MD01.pdf to ../misc/CHC_pdf
Downloaded MD02.pdf to ../misc/CHC_pdf
Downloaded MD03.pdf to ../misc/CHC_pdf
Downloaded MD04.pdf to ../misc/CHC_pdf
Downloaded MD05.pdf to ../misc/CHC_pdf
Downloaded MD06.pdf to ../misc/CHC_pdf
Downloaded MD07.pdf to ../misc/CHC_pdf
Downloaded MD08.pdf to ../misc/CHC_pdf


In [10]:
import PyPDF2
from docx import Document

states = [
    "CT",
    "DC",
    "DE",
    "FL",
    "GA",
    "HI",
    "ID",
    "IL",
    "IN",
    "IA",
    "KS",
    "KY",
    "LA",
    "ME",
    "MA",
    "MD",
    "MI",
    "MN",
    "MS",
    "MO",
    "MT",
    "NE",
    "ND",
    "NV",
    "NH",
    "NJ",
    "NM",
    "NY",
    "NC",
    "OH",
    "OK",
    "OR",
    "PA",
    "RI",
    "SC",
    "SD",
    "TN",
    "TX",
    "UT",
    "VA",
    "VT",
    "WA",
    "WV",
    "WI",
    "WY"
]

download_dir = "../misc/CHC_pdf"
final_dir = "../data/CHC_data"

def combine_pdfs_to_rtf(state_code, download_dir, final_dir):
    pdf_files = [file for file in os.listdir(download_dir) if file.startswith(state_code) and file.endswith('.pdf')]
    pdf_files.sort()

    combined_text = ""
    for pdf_file in pdf_files:
        file_path = os.path.join(download_dir, pdf_file)
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                if page_num == 0:  # skip the first page
                    continue
                combined_text += page.extract_text()

    rtf_file_name = f"{state_code}_combined.rtf"
    rtf_file_path = os.path.join(final_dir, rtf_file_name)
    document = Document()
    document.add_paragraph(combined_text)
    document.save(rtf_file_path)
    print(f"Combined PDFs for {state_code} into {rtf_file_path}")

# # Example usage:
# state_code = "AL"  # Specify the state code
# download_dir = "../misc/CHC_pdf"  # Specify the directory here
# final_dir = "../data/CHC_data" # final directory

for state_code in states:
     combine_pdfs_to_rtf(state_code, download_dir, final_dir)

Combined PDFs for OH into ../data/CHC_data/OH_combined.rtf


In [11]:
import re
from striprtf.striprtf import rtf_to_text

def extract_addresses_from_rtfs(folder_path):
    addresses = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".rtf"):
            rtf_path = os.path.join(folder_path, filename)
            with open(rtf_path, encoding = "utf-8") as infile:
                content = infile.read()
                text = rtf_to_text(content)
                pattern = r"(?<=\|)\s*([^|]+)\s*\|"
                addresses_dirty = re.findall(pattern, text)
                pattern1 = r"(?<!\d)\d{1,5}\s+[^\d\n]+\s*\d{5}(?:-\d{4})?"
                addresses_clean = re.findall(pattern1, "\n".join(addresses_dirty), re.MULTILINE)
                addresses.extend([addr.strip() for addr in addresses_clean if addr.strip()])
                # addresses.append(addresses_clean)
    return addresses


addresses = extract_addresses_from_rtfs("../data/CHC_data")
addresses
len(addresses)

# a1 = [x for x in addresses[0] if x]
# a2 = [x for x in addresses[1] if x]
# a3 = [x for x in addresses[2] if x]
# a4 = [x for x in addresses[3] if x]

# addresses_al = a1 + a2 + a3 + a4

# addresses_al

['3900 Calhoun St Gary, IN 46408-1753',
 '3503 Martin Luther King Dr Gary, IN 46409-1233',
 '5926 Calumet Ave Hammond, IN 46320-2505',
 '6100 Broadway Merrillville, IN 46410-3002',
 '1015 N Shelby St Gary, IN 46403-1446',
 '5927 Columbia Ave Hammond, IN 46320-2611',
 '6915 Grand Ave Hammond, IN 46323-2587',
 '2401 Valley Dr Valparaiso, IN 46383-2520',
 '1313 W Chicago Ave East Chicago, IN 46312-3316',
 '710 Franklin St Michigan City, IN 46360-3563',
 '1001 Sturdy Rd Valparaiso, IN 46383-4126',
 '701 Wall St Valparaiso, IN 46383-2514',
 '2323 Broadway St East Chicago, IN 46312-2264',
 '1 Valparaiso, IN 46385',
 '400 Teegarden St Laporte, IN 46350-3175',
 '1001 Sturdy Rd Valparaiso, IN 46383-4126',
 '1008 Broadway Chesterton, IN 46304-2149',
 '801 Broadway Chesterton, IN 46304-2230',
 '407 W Indiana Ave Chesterton, IN 46304-2350',
 '168 Chesterton, IN 46304-2243',
 '10851 Broadway Crown Point, IN 46307-7303',
 '3099 Central Ave Lake Station, IN 46405-2207',
 '3304 Parkside Ave Lake Stati

13290

In [20]:
## GEOCODING
import geopandas as gpd

geocodes = gpd.tools.geocode(addresses, timeout = 100)

# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderTimedOut

# def geocode_with_retry(address, geolocator):
#     try:
#         location = geolocator.geocode(address)
#         return location
#     except GeocoderTimedOut:
#         # If a timeout occurs, retry the request
#         return geocode_with_retry(address, geolocator)

# def batch_geocode(addresses):
#     geolocator = Nominatim(user_agent="my_app")
#     location_list = []

#     for address in addresses:
#         location = geocode_with_retry(address, geolocator)
#         if location is not None:
#             location_list.append({
#                 "address": address,
#                 "latitude": location.latitude,
#                 "longitude": location.longitude
#             })

#     return location_list

# # Example usage:
# geocoded_data = batch_geocode(addresses)


KeyboardInterrupt: 