# Get PA Nursing Home Data

This notebook scrapes data from the Pennsylvania Department of Health's Nursing Care Facility Information database.

The goal is to create a database of nursing homes in Montgomery County, PA that accept Medicaid payments.

## Import dependencies

In [1]:
import re
from pathlib import Path

from bs4 import BeautifulSoup
import pandas as pd
import requests

## Get the data

In [2]:
# Download the Montgomery County listing page and parse it into a BeautifulSoup tree.
url = 'https://sais.health.pa.gov/commonpoc/content/publicweb/nhinformation2.asp?COUNTY=Montgomery'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
html_page = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page as if it were data.
html_page.raise_for_status()
soup = BeautifulSoup(html_page.content, 'html.parser')

In [3]:
# Use to inspect organized/indented HTML
#print(soup.prettify())

### Note
Inspecting the HTML shows that the table rows (`tr`) are nested: there are `tr` elements within `tr` elements, several levels deep.  
So the loop over the rows, below, has to start at the third `tr` (that is, `table_rows[2:]`).

## Extract the HTML table, with the target data
The target data are in the last table (`tables[-1]`) on the page.

In [4]:
# Every <table> element in the parsed page, in document order.
tables = soup.find_all(name='table')

### Get column header info

In [5]:
# Header cells (<th>) of the last table on the page, which holds the target data.
data_table = tables[-1]
table_headers = data_table.find_all('th')

In [6]:
# Column names are the text content of each header cell, in the order they appear.
columns = [header.getText() for header in table_headers]

### Make DataFrame

In [7]:
# All <tr> elements found inside the last table on the page.
table_rows = tables[-1].find_all('tr')

# Because of the nested <tr> elements, the first two matches are skipped
# (hence the [2:] slice). Each remaining row becomes a list holding the text
# of its <td> cells.
list_of_rows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows[2:]
]

df = pd.DataFrame(list_of_rows, columns=columns)

## Select just the nursing homes that have Medicaid beds
(And drop the 'Select' column, too.)

In [50]:
# Keep only the rows whose 'Payment Options' text mentions Medicaid, then drop
# the 'Select' column.
# na=False counts a missing 'Payment Options' value as "no match"; without it
# the mask would contain NaN and the boolean indexing below would raise.
has_medicaid = df['Payment Options'].str.contains('Medicaid', na=False)
df_Medicaid = df[has_medicaid].drop(columns='Select')

## Clean the DataFrame

### Remove phone numbers from Name/Address/Phone and put in own column

In [38]:
# Regex pattern for a phone number in these formats: (XXX) XXX-XXXX, (XXX)XXX-XXXX
# Phone number with a parenthesised area code. The space after the closing
# parenthesis is optional, so both (XXX) XXX-XXXX and (XXX)XXX-XXXX match.
pattern = r"(\(\d{3}\) ?\d{3}-\d{4})"

# Quick sanity check of the pattern against a sentence with an embedded number.
test_str = "This string contains a phone number: (215) 483-7799, let's see if I can pick it out!"

foo = re.search(pattern, test_str)[0]  # index 0 of a match is the entire matched text
foo

'(215) 483-7799'

In [51]:
# Pull the phone number out of the combined Name/Address/Phone text.
# expand=False makes str.extract return a Series (one capture group) rather
# than a one-column DataFrame, which is what DataFrame.insert expects.
phone_numbers = df_Medicaid['Name/Address/Phone'].str.extract(pattern, expand=False)

# Normalise every number to "(XXX) XXX-XXXX": replace the closing parenthesis
# together with its optional following space by exactly ") ", so numbers that
# already contain the space do not end up with two.
phone_numbers = phone_numbers.str.replace(r'\) ?', ') ', regex=True)

# Place the new column directly after the first column.
df_Medicaid.insert(1, 'phone_number', phone_numbers)

In [53]:
# The phone number now has its own column, so the combined column's label is
# shortened to match.
column_renames = {'Name/Address/Phone': 'Name/Address'}
df_Medicaid.rename(columns=column_renames, inplace=True)

In [58]:
# Remove the phone number from the combined name/address text.
# `pattern` is a regular expression, so regex=True has to be explicit: since
# pandas 2.0 str.replace matches literally by default, which would leave every
# phone number in place.
df_Medicaid['Name/Address'] = df_Medicaid['Name/Address'].str.replace(pattern, '', regex=True)

## Write out the data to a CSV file

In [56]:
# Write the cleaned table to CSV without the index column.
output_path = Path('output/Montgomery_County_PA_nursing_homes_with_Medicaid_beds.csv')
# to_csv does not create missing directories, so make sure output/ exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_Medicaid.to_csv(output_path, index=False)