# Get PA Nursing Home Data

This notebook scrapes data from the Pennsylvania Department of Health's Nursing Care Facility Information database.

The goal is to create a database of nursing homes in Montgomery County, PA that accept Medicaid payments.

## Import dependencies

In [2]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get the data

In [3]:
url = 'https://sais.health.pa.gov/commonpoc/content/publicweb/nhinformation2.asp?COUNTY=Montgomery'

# A timeout keeps this cell from hanging indefinitely if the server stops responding.
html_page = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page as if it were the data.
html_page.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(html_page.content, 'html.parser')

In [4]:
# Uncomment the line below to inspect the organized/indented HTML
#print(soup.prettify())

<script id="clientEventHandlersJS" language="javascript">
 var intNhInfo
<!--
function submit1_onclick(thisForm)
{
/* If the user does not check anything this checks everything and submits */
var selectcount=0;
if (typeof(thisForm.length) == "undefined")
    {
     thisForm.checked = true;
    }
else
    {
     for (i = 0;i<thisForm.length; i++)
        {
        if(thisForm[i].checked)
        selectcount=selectcount+1;
        } 
    }

{
if (selectcount == 0) 
    submit2_onclick(thisForm);
    else
    thisForm.submit();
}   
}


function submit2_onclick(thisForm) {
/* Compare all facilities  */
if (typeof(thisForm.length) == "undefined") {
    thisForm.checked = true;
}
else
{
for (i = 0;i<thisForm.length; i++)
{
thisForm[i].checked = true;
}
}
thisForm.submit();

}

function btnDefinitions_onclick() {
    // Create an inter window and put text into it
   var winTop = (screen.height /2)- 225;
   var winLeft = (screen.width /2)- 175;


### Note
Inspecting the HTML shows that the table rows (`tr`) are nested: there are `tr` elements within `tr` elements within `tr` elements.   
So, when looping through the rows below, start with the third `tr` (that is, `table_rows[2:]`).

## Extract the HTML table, with the target data
The target data are in the last table (`tables[-1]`) on the page.

In [53]:
# Gather every <table> element found in the parsed page.
tables = soup.find_all(name='table')

### Get column header info

In [54]:
# The target data live in the last table on the page; pull its header (<th>) cells.
target_table = tables[-1]
table_headers = target_table.find_all('th')

In [55]:
# Column names are the text of each header cell, in page order.
columns = [th_cell.getText() for th_cell in table_headers]

### Make DataFrame

In [57]:
# Every <tr> in the last table, including the nested ones.
table_rows = tables[-1].find_all('tr')

# The page nests tr within tr within tr, so the first two matches are skipped
# and extraction starts at the third one. Each remaining row becomes a list
# holding the text of its <td> cells.
list_of_rows = [
    [cell.text for cell in row_tag.find_all('td')]
    for row_tag in table_rows[2:]
]

# One DataFrame row per extracted table row, labelled with the header names.
df = pd.DataFrame(list_of_rows, columns=columns)

## Select just the nursing homes that have Medicaid beds

In [65]:
# Keep only facilities whose payment options mention Medicaid.
# na=False treats a missing 'Payment Options' value as "no match"; without it
# str.contains yields NaN for that row and the boolean mask raises.
accepts_medicaid = df['Payment Options'].str.contains('Medicaid', na=False)

# .copy() makes df_Medicaid an independent frame, so later cleaning steps do not
# operate on a slice of df (the source of SettingWithCopyWarning).
df_Medicaid = df[accepts_medicaid].copy()

## Clean the DataFrame

In [61]:
# Remove the 'Select' column, which is not wanted in the output.
# Reassigning instead of using inplace=True avoids mutating a filtered slice of df,
# and errors='ignore' lets this cell be re-run after the column is already gone.
df_Medicaid = df_Medicaid.drop(columns='Select', errors='ignore')

## Write out the data to a CSV file

In [68]:
output_path = Path('output/Montgomery_County_PA_nursing_homes_with_Medicaid_beds.csv')

# to_csv does not create missing folders, so make sure 'output/' exists first
# (e.g. on a fresh checkout of the repository).
output_path.parent.mkdir(parents=True, exist_ok=True)

df_Medicaid.to_csv(output_path)