# PART 1 : New York Hospital Data Web Scraping

In [1]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

print('Libraries imported.')

Libraries imported.


## Scraping with selenium

Selenium is a framework designed to automate tests for web applications. It can be used to control browser interactions automatically, such as link clicks and form submissions. Since in our case a click is needed first to unfold the table before it can be fetched, I use Selenium instead of <code>BeautifulSoup</code>.

### Extract Hospital Beds Table and Hospital Name

#### Set up a browser driver to simulate browser sessions (here I use the Chrome driver)

I will try for just one record first using hospital id = '102940' (New York-Presbyterian/Lower Manhattan Hospital) 

In [51]:
# Start a Chrome session and load the profile page of the sample hospital (id 102940).
driver = webdriver.Chrome()
url = 'https://profiles.health.ny.gov/hospital/view/102940'
driver.get(url)

#### Simulate a click on the <code>Bed Types</code> to unhide the table we need

In [None]:
# Click the "Bed Types" link to unhide the table we want to read.
# `find_element_by_xpath` was removed in Selenium 4.3; the `By`-locator form
# below is accepted by both Selenium 3 and Selenium 4.
driver.find_element(By.XPATH, './/*[@id="bed-types"]/a').click()

**Hospital Name:**

In [183]:
# Read the hospital name from the <h2> heading inside the main content area.
name_list = driver.find_elements(By.XPATH, './/*[@id="main-content"]/div[2]/div/h2')
if not name_list:
    # Fail loudly instead of leaving `hospital_name` undefined (fresh kernel)
    # or silently keeping a value from an earlier run.
    raise LookupError('Hospital name heading not found on ' + driver.current_url)
# As before, when several headings match, the last one wins.
hospital_name = name_list[-1].text
hospital_name

'New York-Presbyterian/Lower Manhattan Hospital'

**Hospital Bed Types and Count Table:**

In [182]:
# Pause once so the freshly expanded section can render; the previous version
# paused 3 seconds for every table row, which only multiplied the waiting time.
time.sleep(3)

# Collect the text of the <td> cells of every row in the bed table.
table = []
for rows in driver.find_elements(By.XPATH, '//*//*[@id="number-of-beds"]//tr'):
    row = [item.text for item in rows.find_elements(By.XPATH, ".//*[self::td]")]
    if row:  # rows without any <td> cell carry no data
        table.append(row)

# One row per bed type; the count column is labelled with the hospital name.
df = pd.DataFrame(table, columns=['Bed_Type', hospital_name])
df

Unnamed: 0,Bed_Type,New York-Presbyterian/Lower Manhattan Hospital
0,Coronary Care Beds,8
1,Intensive Care Beds,13
2,Maternity Beds,24
3,Medical / Surgical Beds,127
4,Neonatal Intermediate Care Beds,8
5,Total Beds,180


### Scrape all the relevant hospital beds data of NYC

A list of hospital IDs was manually collected from the NYS Health Profiles website. These IDs are used to fetch each hospital's bed types and bed counts with Selenium, because a simulated click is needed to reveal the data we need.

In [186]:
# URL template of a hospital profile page; `{}` is filled with the hospital id.
ROOT_URL = "https://profiles.health.ny.gov/hospital/view/{}"

# New York Metro: New York City hospitals' IDs
NYM_NYC = [
    103016, 106804, 102908, 103035, 102934, 1256608, 105117, 103009,
    102974, 103006, 103041, 105086, 103056, 103086, 102973, 102970,
    102950, 103074, 103008, 103007, 102985, 103012, 106809, 102937,
    103068, 102944, 102995, 106803, 102916, 105109, 102914, 102960,
    103038, 106810, 106811, 102961, 102940, 102933, 103078, 254693,
    103065, 103021, 103080, 103033, 102919, 105116, 106825, 103084,
    103087, 102989, 102929, 106817, 106819, 103073, 103085, 103025,
]

# New York Metro: Long Island hospitals' IDs
NYM_LI = [
    102999, 103062, 102928, 103002, 102980, 103077, 103049, 103011,
    102918, 102965, 102994, 102966, 103069, 1189331, 102926, 103088,
    103045, 103000, 103070, 105137, 103082, 102954, 103072,
]

# New York Metro: Bronx hospitals' IDs
BRONX = [
    102908, 106804, 105117, 102973, 102950, 106809,
    102937, 103068, 102944, 103078, 103087,
]

# New York Metro: Queens hospitals' IDs
QUEENS = [
    102974, 103006, 102912, 103074, 103008,
    105109, 102933, 103033, 103084,
]

# The regional lists overlap, so collapse them into one list of distinct ids.
HOSPITALS = list(set(NYM_LI + NYM_NYC + BRONX + QUEENS))
print(f'Total hospitals {len(HOSPITALS)}')

Total hospitals 80


In [202]:
def _scrape_bed_table(driver):
    """Build the bed-type table for the hospital page currently loaded in `driver`.

    Clicks the "Bed Types" link, reads the hospital name from the page heading
    and collects the <td> cell texts of every row under the "bed-types" element.

    Returns:
        A DataFrame with a 'Bed_Type' column and one numeric column named after
        the hospital.

    Raises:
        LookupError: if the name heading or the bed rows cannot be found.
        Selenium and pandas errors propagate unchanged (e.g. a missing link or
        a non-numeric bed count).
    """
    # `find_element(s)_by_xpath` was removed in Selenium 4.3; the `By` form
    # works on both Selenium 3 and Selenium 4.
    driver.find_element(By.XPATH, '//*[@id="bed-types"]/a').click()
    # Pause once after expanding the section instead of 2 seconds per table row.
    time.sleep(2)

    name_list = driver.find_elements(By.XPATH, './/*[@id="main-content"]/div[2]/div/h2')
    if not name_list:
        # Without this guard the name of the previously scraped hospital would
        # be reused and this hospital's beds would be booked under that name.
        raise LookupError('hospital name heading not found')
    hospital_name = name_list[-1].text  # as before, the last matching heading wins

    table = []
    for rows in driver.find_elements(By.XPATH, '//*//*[@id="bed-types"]//tr'):
        row = [item.text for item in rows.find_elements(By.XPATH, ".//*[self::td]")]
        if row:  # rows without any <td> cell carry no data
            table.append(row)
    if not table:
        # Keep skipping hospitals without bed rows rather than storing an empty frame.
        raise LookupError('no bed rows found')

    data = pd.DataFrame(table, columns=['Bed_Type', hospital_name])
    data[hospital_name] = pd.to_numeric(data[hospital_name])
    return data


hospital_data = []
failed_ids = []  # ids whose page could not be scraped, kept for a later retry
for val in HOSPITALS:
    print("Processing hospital id", val)
    url = ROOT_URL.format(val)
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(3)
        hospital_data.append(_scrape_bed_table(driver))
    except Exception as e:
        # Best effort: one bad page must not stop the run, but record which one failed.
        failed_ids.append(val)
        print('Failed hospital id', val, '-', e)
    finally:
        # Always close the browser, even when the cell is interrupted.
        driver.quit()

print('End of scrapping!')
print(len(hospital_data),'hospitals data scrapped!')
if failed_ids:
    print('Hospital ids that could not be scraped:', failed_ids)

Processing hospital id 102912
Processing hospital id 102914
Processing hospital id 102916
Processing hospital id 102918
Processing hospital id 102919
Processing hospital id 102926
Processing hospital id 102928
Processing hospital id 102929
Processing hospital id 102933
Processing hospital id 102934
Processing hospital id 102937
Processing hospital id 102940
Processing hospital id 102944
Processing hospital id 102950
Processing hospital id 102954
Processing hospital id 102960
Processing hospital id 102961
Processing hospital id 102965
Processing hospital id 102966
Processing hospital id 102970
Processing hospital id 102973
Processing hospital id 102974
Processing hospital id 102980
Processing hospital id 102985
Processing hospital id 102989
Processing hospital id 102994
Processing hospital id 102995
Processing hospital id 102999
Processing hospital id 103000
Processing hospital id 103002
Processing hospital id 103006
Processing hospital id 103007
Processing hospital id 103008
Message: n

In [191]:
hospital_data

[             Bed_Type  Jamaica Hospital Medical Center
 0  Coronary Care Beds                                4,
               Bed_Type  Jamaica Hospital Medical Center
 0   Coronary Care Beds                                4
 1  Intensive Care Beds                                8,
               Bed_Type  Jamaica Hospital Medical Center
 0   Coronary Care Beds                                4
 1  Intensive Care Beds                                8
 2       Maternity Beds                               40,
                   Bed_Type  Jamaica Hospital Medical Center
 0       Coronary Care Beds                                4
 1      Intensive Care Beds                                8
 2           Maternity Beds                               40
 3  Medical / Surgical Beds                              228,
                         Bed_Type  Jamaica Hospital Medical Center
 0             Coronary Care Beds                                4
 1            Intensive Care Beds             

**Transform the list of dataframes into one dataframe** 

In [203]:
# Stack the per-hospital tables, total each bed type per hospital column, and
# flip the result so that every hospital becomes one row.
stacked = pd.concat(hospital_data, axis=0, sort=False)
df = stacked.groupby('Bed_Type').sum().astype(int).transpose()
df.index.name = 'Hospital'

# select bed types we are interested to use
bed_columns = ['Intensive Care Beds', 'Medical / Surgical Beds', 'Total Beds']
final_df = df[bed_columns].reset_index()

### Save hospital data as csv 

In [204]:
# Count how often each hospital name occurs in final_df; every count should be 1,
# i.e. no duplicates in hospital.
# NOTE(review): the names originate from the column labels of the concatenated
# frame, so two hospitals scraped under the same name would presumably already be
# merged into one column before this point - confirm this check can catch that.
final_df['Hospital'].value_counts().sort_values()

BronxCare Hospital Center                                               1
NYU Langone Orthopedic Hospital                                         1
Queens Hospital Center                                                  1
University Hospital - Stony Brook Eastern Long Island Hospital          1
Henry J. Carter Specialty Hospital                                      1
Calvary Hospital                                                        1
North Shore University Hospital                                         1
Plainview Hospital                                                      1
St Francis Hospital                                                     1
Syosset Hospital                                                        1
New York-Presbyterian Hospital - Columbia Presbyterian Center           1
Interfaith Medical Center                                               1
Wyckoff Heights Medical Center                                          1
Montefiore Med Center - Jack D Weiler 

In [205]:
# Write the CSV to the location the notebook reads it back from
# ('data_output/hospital_beds.csv'); it used to be written to the working
# directory, so the read cell failed on a fresh run.
output_path = Path('data_output') / 'hospital_beds.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder on first run
final_df.to_csv(output_path, index = False)

In [3]:
# Reload the saved file and preview its first ten rows.
saved_beds = pd.read_csv('data_output/hospital_beds.csv')
saved_beds.head(10)

Unnamed: 0,Hospital,Intensive Care Beds,Medical / Surgical Beds,Total Beds
0,Jamaica Hospital Medical Center,8,228,402
1,"New York Community Hospital of Brooklyn, Inc",7,127,134
2,Mount Sinai Hospital,85,639,1134
3,Nassau University Medical Center,22,158,530
4,Richmond University Medical Center,20,286,448
5,Southside Hospital,26,223,305
6,Huntington Hospital,12,253,348
7,Mount Sinai St. Luke's,24,294,495
8,New York-Presbyterian/Queens,29,393,535
9,Brooklyn Hospital Center - Downtown Campus,24,311,464
