# Hospital Facilities Insight 

Data obtained from Community Benefits Insight: http://www.communitybenefitinsight.org/api/get_hospitals.php

## Import Libraries

In [50]:
# import appropriate packages
import os
import requests
import json 
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql.cursors
from pymysql import IntegrityError

import warnings
warnings.filterwarnings("ignore") # suppress all warnings for the rest of the session

## Import Data

In [51]:
# API endpoint
api_url = 'http://www.communitybenefitinsight.org/api/get_hospitals.php'

# Request the hospital list. requests applies no timeout by default, so an
# explicit one keeps this cell from hanging forever on an unresponsive server.
response = requests.get(api_url, timeout=30)

# check status
if response.status_code == 200:
    data = response.json()

    # data update check: report the server's Last-Modified header when it is sent
    last_modified_header = response.headers.get('Last-Modified')

    if last_modified_header:
        try:
            last_modified = datetime.strptime(last_modified_header, '%a, %d %b %Y %H:%M:%S GMT')
            print(f"Data was last modified: {last_modified}")
        except ValueError:
            # The header is informational only; an unexpected format must not abort the load
            print(f"Data was last modified: {last_modified_header}")

    # Build the DataFrame in a single step. Slicing the payload into 100-row
    # chunks and concatenating them produced the same frame, and pd.concat
    # raised on an empty payload because there was nothing to concatenate.
    hospital_df = pd.DataFrame(data)

    # print df
    print(hospital_df)
else:
    # hospital_df is not (re)defined on this path, so the cells below cannot
    # run against fresh data until the request succeeds.
    print(f"Failed to fetch data. Status code: {response.status_code}")
    print(response.text)

     hospital_id hospital_org_id        ein  \
0              1               1  630307951   
1              2               2  630578923   
2              3               3  630312913   
3              4               4  630459034   
4              5               5  581973570   
...          ...             ...        ...   
3486        3487            2647  813040663   
3487        3488            2304  741109643   
3488        3489            2648  831954982   
3489        3490            2302  750800661   
3490        3491            2649  831869297   

                                     name  \
0                Mizell Memorial Hospital   
1                        St Vincents East   
2           Shelby Baptist Medical Center   
3            Callahan Eye Foundation Hosp   
4                 Cherokee Medical Center   
...                                   ...   
3486          Bsw Medical Center - Austin   
3487              Ascension Seton Bastrop   
3488         Texas Health Hosp

In [25]:
hospital_df # show the DataFrame as the cell output

Unnamed: 0,hospital_id,hospital_org_id,ein,name,name_cr,street_address,city,state,zip_code,fips_state_and_county_code,hospital_bed_count,chrch_affl_f,urban_location_f,children_hospital_f,memb_counc_teach_hosps_f,medicare_provider_number,county,hospital_bed_size,updated_dt
0,1,1,630307951,Mizell Memorial Hospital,Mizell Memorial Hospital,702 Main Street,Opp,AL,36462,01039,99,N,N,N,N,010007,Covington County,<100 beds,"November 20, 2023"
1,2,2,630578923,St Vincents East,St Vincents East,50 Medical Park Drive East,Birmingham,AL,35235,01073,362,N,Y,N,Y,010011,Jefferson County,>299 beds,"November 20, 2023"
2,3,3,630312913,Shelby Baptist Medical Center,Shelby Baptist Medical Center,1000 First Street North,Alabaster,AL,35007,01117,252,N,Y,N,N,010016,Shelby County,100-299 beds,"November 20, 2023"
3,4,4,630459034,Callahan Eye Foundation Hosp,Callahan Eye Foundation Hosp,1720 University Boulevard,Birmingham,AL,35233,01073,106,N,Y,N,Y,010018,Jefferson County,100-299 beds,"November 20, 2023"
4,5,5,581973570,Cherokee Medical Center,Cherokee Medical Center,400 Northwood Drive,Centre,AL,35960,01019,60,N,N,N,N,010022,Cherokee County,<100 beds,"November 20, 2023"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3486,3487,2647,813040663,Bsw Medical Center - Austin,Bsw Medical Center - Austin,5245 W Us 290,Austin,TX,78735,48453,16,N,Y,N,N,670136,Travis County,<100 beds,"November 20, 2023"
3487,3488,2304,741109643,Ascension Seton Bastrop,Ascension Seton Bastrop,630 Highway 71 W,Bastrop,TX,78602,48021,7,N,Y,N,N,670143,Bastrop County,<100 beds,"November 20, 2023"
3488,3489,2648,831954982,Texas Health Hospital Frisco,Texas Health Hospital Frisco,12400 N Dallas Parkway,Frisco,TX,75033,48121,63,N,Y,N,N,670260,Denton County,<100 beds,"November 20, 2023"
3489,3490,2302,750800661,Methodist Midlothian Medical Center,Methodist Midlothian Medical Center,1201 E Highway 287,Midlothian,TX,76065,48139,46,N,Y,N,N,670300,Ellis County,<100 beds,"November 20, 2023"


In [26]:
# Show 10 randomly chosen rows from the dataframe.
# No random_state is passed, so a different set of rows is drawn on every run.
hospital_df.sample(10)

Unnamed: 0,hospital_id,hospital_org_id,ein,name,name_cr,street_address,city,state,zip_code,fips_state_and_county_code,hospital_bed_count,chrch_affl_f,urban_location_f,children_hospital_f,memb_counc_teach_hosps_f,medicare_provider_number,county,hospital_bed_size,updated_dt
2779,2780,2194,571067254,Bon Secours St Francis Xavier Hospi,Bon Secours St Francis Xavier Hospi,2095 Henry Tecklenburg Drive,Charleston,SC,29414,45019,204,N,Y,N,N,420065,Charleston County,100-299 beds,"November 20, 2023"
573,574,349,590855412,Mease Hospital Countryside,Mease Hospital Countryside,3231 North Mcmullen Booth Rd,Safety Harbor,FL,33761,12103,300,N,Y,N,N,100265,Pinellas County,>299 beds,"November 20, 2023"
1389,1390,1070,42121377,The Shriners Hospital For Children,The Shriners Hospital For Children,51 Blossom Street,Boston,MA,2114,25025,30,N,Y,Y,N,223304,Suffolk County,<100 beds,"November 20, 2023"
3352,3353,2537,611649250,Aurora Lakeland Medical Center,Aurora Lakeland Medical Center,W3985 Highway Nn,Elkhorn,WI,53121,55127,99,N,N,N,N,520102,Walworth County,<100 beds,"November 20, 2023"
1989,1990,1558,161533232,Kaleida Health,Kaleida Health,100 High Street,Buffalo,NY,14203,36029,986,N,Y,N,Y,330005,Erie County,>299 beds,"November 20, 2023"
36,37,27,630307306,The Childrens Hospital Of Alabama,The Childrens Hospital Of Alabama,1600 7th Avenue South,Birmingham,AL,35233,1073,332,N,Y,Y,Y,13300,Jefferson County,>299 beds,"November 20, 2023"
832,833,606,362852553,St Margarets Health - Peru,St Margarets Health - Peru,925 West Street,Peru,IL,61354,17099,65,N,N,N,N,140234,LaSalle County,<100 beds,"November 20, 2023"
487,488,300,60646597,Danbury Hospital,Danbury Hospital,24 Hospital Ave,Danbury,CT,6810,9001,345,N,Y,N,Y,70033,Fairfield County,>299 beds,"November 20, 2023"
2241,2242,1738,911997979,Vidant Pungo Hospital,Pungo District Hospital Corporation,210 East Water St,Belhaven,NC,27810,37013,49,N,N,N,N,341310,Beaufort County,<100 beds,"November 20, 2023"
1981,1982,1553,850442957,Union County Gen Hospital,Union County Gen Hospital,301 Harding St,Clayton,NM,88415,35059,25,N,N,N,N,321304,Union County,<100 beds,"November 20, 2023"


## Data Inspection

In [27]:
# (number of rows, number of columns) of hospital_df
hospital_df.shape

(3491, 19)

In [28]:
hospital_df.info() # column names, non-null counts and dtypes; at this point every column is reported as object dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3491 entries, 0 to 3490
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   hospital_id                 3491 non-null   object
 1   hospital_org_id             3491 non-null   object
 2   ein                         3491 non-null   object
 3   name                        3491 non-null   object
 4   name_cr                     3491 non-null   object
 5   street_address              3491 non-null   object
 6   city                        3491 non-null   object
 7   state                       3491 non-null   object
 8   zip_code                    3491 non-null   object
 9   fips_state_and_county_code  3491 non-null   object
 10  hospital_bed_count          3491 non-null   object
 11  chrch_affl_f                3491 non-null   object
 12  urban_location_f            3491 non-null   object
 13  children_hospital_f         3491 non-null   obje

In [29]:
# Columns whose string values are parsed as numbers.
# NOTE(review): medicare_provider_number looks like an identifier, and numeric
# parsing drops its leading zeros ('010007' becomes 10007) - confirm this is
# what the target table expects.
columns_to_convert = ['hospital_bed_count', 'medicare_provider_number']

# Y/N flag columns that are recoded to 1/0.
binary_columns = ['chrch_affl_f', 'urban_location_f', 'children_hospital_f', 'memb_counc_teach_hosps_f']

# 'N' -> 0 and 'Y' -> 1. The identity entries for 0 and 1 make the cell safe to
# re-run: without them a second run would map the already-converted 0/1 values
# to NaN, because neither is a key of a plain {'N': 0, 'Y': 1} lookup.
flag_codes = {'N': 0, 'Y': 1, 0: 0, 1: 1}

# iterate through the list of columns
for col in columns_to_convert:
    # convert to numeric, coerce errors to NaN
    hospital_df[col] = pd.to_numeric(hospital_df[col], errors='coerce')

for col in binary_columns:
    # Any value other than N/Y/0/1 still becomes NaN, as before
    hospital_df[col] = hospital_df[col].map(flag_codes)

# check the data types after conversion
hospital_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3491 entries, 0 to 3490
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   hospital_id                 3491 non-null   object
 1   hospital_org_id             3491 non-null   object
 2   ein                         3491 non-null   object
 3   name                        3491 non-null   object
 4   name_cr                     3491 non-null   object
 5   street_address              3491 non-null   object
 6   city                        3491 non-null   object
 7   state                       3491 non-null   object
 8   zip_code                    3491 non-null   object
 9   fips_state_and_county_code  3491 non-null   object
 10  hospital_bed_count          3491 non-null   int64 
 11  chrch_affl_f                3491 non-null   int64 
 12  urban_location_f            3491 non-null   int64 
 13  children_hospital_f         3491 non-null   int6

In [30]:
hospital_df.nunique() # number of distinct values in each column

hospital_id                   3491
hospital_org_id               2377
ein                           2377
name                          3330
name_cr                       3072
street_address                3439
city                          2192
state                           51
zip_code                      3200
fips_state_and_county_code    1751
hospital_bed_count             696
chrch_affl_f                     2
urban_location_f                 2
children_hospital_f              2
memb_counc_teach_hosps_f         2
medicare_provider_number      3491
county                        1195
hospital_bed_size                3
updated_dt                       1
dtype: int64

In [31]:
hospital_df.isnull().sum() # number of null values in each column

hospital_id                   0
hospital_org_id               0
ein                           0
name                          0
name_cr                       0
street_address                0
city                          0
state                         0
zip_code                      0
fips_state_and_county_code    0
hospital_bed_count            0
chrch_affl_f                  0
urban_location_f              0
children_hospital_f           0
memb_counc_teach_hosps_f      0
medicare_provider_number      0
county                        1
hospital_bed_size             0
updated_dt                    0
dtype: int64

In [32]:
# Rows that repeat an earlier row in every column; the first occurrence is not flagged
duplicate_rows = hospital_df.loc[hospital_df.duplicated()]

print("Duplicate Rows except first occurrence:")
print(duplicate_rows)

# A non-empty selection means at least one row was flagged as a duplicate
print(
    "No duplicates found in the DataFrame."
    if duplicate_rows.empty
    else "Duplicates exist in the DataFrame."
)

Duplicate Rows except first occurrence:
Empty DataFrame
Columns: [hospital_id, hospital_org_id, ein, name, name_cr, street_address, city, state, zip_code, fips_state_and_county_code, hospital_bed_count, chrch_affl_f, urban_location_f, children_hospital_f, memb_counc_teach_hosps_f, medicare_provider_number, county, hospital_bed_size, updated_dt]
Index: []
No duplicates found in the DataFrame.


## Table Manipulation

In [33]:
# Hospital Information Table

# Surrogate address key: join the address parts with '_' and replace every
# distinct string by its dense rank (descending string order), so identical
# addresses share one integer id.
address_parts = ['city', 'state', 'zip_code', 'fips_state_and_county_code']
address_key = hospital_df['street_address'].str.cat(
    [hospital_df[part] for part in address_parts], sep='_'
)
hospital_df['address_id'] = address_key.rank(method='dense', ascending=False).astype(int)

# Surrogate capacity key, built the same way from the hospital id, the bed
# count and the bed-size category.
capacity_key = hospital_df['hospital_id'].astype(str).str.cat(
    [hospital_df['hospital_bed_count'].astype(str), hospital_df['hospital_bed_size']],
    sep='_',
)
hospital_df['capacity_id'] = capacity_key.rank(method='dense', ascending=False).astype(int)

# Columns that make up the hospital information table, in table order
hospital_columns = [
    'hospital_id',
    'hospital_org_id',
    'name',
    'ein',
    'medicare_provider_number',
    'address_id',
    'capacity_id',
    'updated_dt',
]
hospital = hospital_df[hospital_columns]
hospital

Unnamed: 0,hospital_id,hospital_org_id,name,ein,medicare_provider_number,address_id,capacity_id,updated_dt
0,1,1,Mizell Memorial Hospital,630307951,10007,609,2381,"November 20, 2023"
1,2,2,St Vincents East,630578923,10011,1075,1270,"November 20, 2023"
2,3,3,Shelby Baptist Medical Center,630312913,10016,3323,667,"November 20, 2023"
3,4,4,Callahan Eye Foundation Hosp,630459034,10018,2440,556,"November 20, 2023"
4,5,5,Cherokee Medical Center,581973570,10022,1337,445,"November 20, 2023"
...,...,...,...,...,...,...,...,...
3486,3487,2647,Bsw Medical Center - Austin,813040663,670136,949,730,"November 20, 2023"
3487,3488,2304,Ascension Seton Bastrop,741109643,670143,718,729,"November 20, 2023"
3488,3489,2648,Texas Health Hospital Frisco,831954982,670260,2901,728,"November 20, 2023"
3489,3490,2302,Methodist Midlothian Medical Center,750800661,670300,2967,726,"November 20, 2023"


In [34]:
# Church Affiliation Table: hospital id, church-affiliation flag and update date taken from hospital_df
church_affiliation = hospital_df.loc[:, ['hospital_id', 'chrch_affl_f', 'updated_dt']]
church_affiliation

Unnamed: 0,hospital_id,chrch_affl_f,updated_dt
0,1,0,"November 20, 2023"
1,2,0,"November 20, 2023"
2,3,0,"November 20, 2023"
3,4,0,"November 20, 2023"
4,5,0,"November 20, 2023"
...,...,...,...
3486,3487,0,"November 20, 2023"
3487,3488,0,"November 20, 2023"
3488,3489,0,"November 20, 2023"
3489,3490,0,"November 20, 2023"


In [35]:
# Hospital Capacity Table: capacity key, bed-size category, bed count and update date taken from hospital_df
hospital_capacity = hospital_df.loc[:, ['capacity_id', 'hospital_bed_size', 'hospital_bed_count', 'updated_dt']]
hospital_capacity

Unnamed: 0,capacity_id,hospital_bed_size,hospital_bed_count,updated_dt
0,2381,<100 beds,99,"November 20, 2023"
1,1270,>299 beds,362,"November 20, 2023"
2,667,100-299 beds,252,"November 20, 2023"
3,556,100-299 beds,106,"November 20, 2023"
4,445,<100 beds,60,"November 20, 2023"
...,...,...,...,...
3486,730,<100 beds,16,"November 20, 2023"
3487,729,<100 beds,7,"November 20, 2023"
3488,728,<100 beds,63,"November 20, 2023"
3489,726,<100 beds,46,"November 20, 2023"


In [36]:
# Children Hospital Table: hospital id, children's-hospital flag and update date taken from hospital_df
children_hospital = hospital_df.loc[:, ['hospital_id', 'children_hospital_f', 'updated_dt']]
children_hospital

Unnamed: 0,hospital_id,children_hospital_f,updated_dt
0,1,0,"November 20, 2023"
1,2,0,"November 20, 2023"
2,3,0,"November 20, 2023"
3,4,0,"November 20, 2023"
4,5,0,"November 20, 2023"
...,...,...,...
3486,3487,0,"November 20, 2023"
3487,3488,0,"November 20, 2023"
3488,3489,0,"November 20, 2023"
3489,3490,0,"November 20, 2023"


In [37]:
# Teaching Table: hospital id, memb_counc_teach_hosps_f flag and update date taken from hospital_df
teaching = hospital_df.loc[:, ['hospital_id', 'memb_counc_teach_hosps_f', 'updated_dt']]
teaching

Unnamed: 0,hospital_id,memb_counc_teach_hosps_f,updated_dt
0,1,0,"November 20, 2023"
1,2,1,"November 20, 2023"
2,3,0,"November 20, 2023"
3,4,1,"November 20, 2023"
4,5,0,"November 20, 2023"
...,...,...,...
3486,3487,0,"November 20, 2023"
3487,3488,0,"November 20, 2023"
3488,3489,0,"November 20, 2023"
3489,3490,0,"November 20, 2023"


In [38]:
# Urban Table: urban-location flag and update date taken from hospital_df, with hospital_id as the last column
urban = hospital_df.loc[:, ['urban_location_f', 'updated_dt', 'hospital_id']]
urban

Unnamed: 0,urban_location_f,updated_dt,hospital_id
0,0,"November 20, 2023",1
1,1,"November 20, 2023",2
2,1,"November 20, 2023",3
3,1,"November 20, 2023",4
4,0,"November 20, 2023",5
...,...,...,...
3486,1,"November 20, 2023",3487
3487,1,"November 20, 2023",3488
3488,1,"November 20, 2023",3489
3489,1,"November 20, 2023",3490


In [39]:
# connect to database

# Connection settings come from the environment so the password is never stored
# in the notebook. The password that used to be written here should be treated
# as exposed and rotated.
username = os.environ.get("ADS507_DB_USER", "jeremiahf24")
hostName = os.environ.get("ADS507_DB_HOST", "ads507finalproject.mysql.database.azure.com")
password = os.environ.get("ADS507_DB_PASSWORD")
if not password:
    raise RuntimeError("Set the ADS507_DB_PASSWORD environment variable before running this cell.")

# The ssl argument is a non-empty mapping whose key is only a placeholder.
# NOTE(review): presumably this is what switches TLS on; no CA file is supplied,
# so the server certificate is probably not verified - confirm against the
# PyMySQL documentation.
conn = pymysql.connect(host=hostName,
                       port=3306,
                       user=username,
                       passwd=password,
                       db="ads507final",
                       ssl={"fake_flag_to_enable_tls": True})

In [40]:
# List the tables that exist in the connected database
tableNames = pd.read_sql_query(sql="SHOW TABLES", con=conn)

# show the table names as the cell output
tableNames

Unnamed: 0,Tables_in_ads507final
0,address_location
1,children_hospital
2,church_affiliation
3,hospital
4,hospital_capacity
5,teaching
6,urban


In [46]:
# children hospital table - one tuple per DataFrame row, in column order
children_hospital_data = [tuple(x) for x in children_hospital.values]

# SQL query - insert data into children_hospital table
sql_insert_children_hospital = """
INSERT INTO children_hospital(hospital_id, children_hospital_f, updated_dt)
VALUES (%s, %s, %s)
"""

# Skip the database round-trip when there are no rows to insert
if children_hospital_data:
    # The with-block closes the cursor on every exit path
    with conn.cursor() as cursor:
        try:
            cursor.executemany(sql_insert_children_hospital, children_hospital_data)
            conn.commit()
        except IntegrityError as e:
            # Constraint violation (such as a duplicate key): report it and undo the batch
            print("Error", e)
            conn.rollback()
        except Exception:
            # Any other failure also discards the pending batch, so a later commit
            # on this shared connection cannot persist a partial insert; the
            # error is then re-raised unchanged.
            conn.rollback()
            raise
else:
    print("No new data to insert into children_hospital table.")

In [47]:
# hospital_capacity table - one tuple per DataFrame row, in column order
hospital_capacity_data = [tuple(x) for x in hospital_capacity.values]

# SQL query - insert data into hospital_capacity table
sql_insert_hospital_capacity = """
INSERT INTO hospital_capacity (capacity_id, hospital_bed_size, hospital_bed_count, updated_dt)
VALUES (%s, %s, %s, %s)
"""

# Skip the database round-trip when there are no rows to insert
if hospital_capacity_data:
    # The with-block closes the cursor on every exit path
    with conn.cursor() as cursor:
        try:
            cursor.executemany(sql_insert_hospital_capacity, hospital_capacity_data)
            conn.commit()
        except IntegrityError as e:
            # Constraint violation (such as a duplicate primary key when the rows
            # are already loaded): report it and undo the batch
            print("Error", e)
            conn.rollback()
        except Exception:
            # Any other failure also discards the pending batch, so a later commit
            # on this shared connection cannot persist a partial insert; the
            # error is then re-raised unchanged.
            conn.rollback()
            raise
else:
    print("No new data to insert into hospital_capacity table.")

Error (1062, "Duplicate entry '2381' for key 'hospital_capacity.PRIMARY'")


In [48]:
# teaching table - one tuple per DataFrame row, in column order
teaching_data = [tuple(x) for x in teaching.values]

# SQL query - insert data into teaching table
sql_insert_teaching = """
INSERT INTO teaching (hospital_id, memb_counc_teach_hosps_f, updated_dt)
VALUES (%s, %s, %s)
"""

# Skip the database round-trip when there are no rows to insert
if teaching_data:
    # The with-block closes the cursor on every exit path
    with conn.cursor() as cursor:
        try:
            cursor.executemany(sql_insert_teaching, teaching_data)
            conn.commit()
        except IntegrityError as e:
            # Constraint violation (such as a duplicate key): report it and undo the batch
            print("Error", e)
            conn.rollback()
        except Exception:
            # Any other failure also discards the pending batch, so a later commit
            # on this shared connection cannot persist a partial insert; the
            # error is then re-raised unchanged.
            conn.rollback()
            raise
else:
    print("No new data to insert into teaching table.")

In [44]:
# Print up to five rows from every table in the database as a quick check of
# what has been loaded. The cursor lives in a with-block so it is closed even
# when one of the queries fails (it was previously never closed).
with conn.cursor() as cursor:
    # Execute SQL query to get table names
    cursor.execute("SHOW TABLES")

    # Fetch all the tables
    tables = cursor.fetchall()

    for table in tables:
        table_name = table[0]

        # Identifiers cannot be sent as query parameters, so the name is
        # backtick-quoted (embedded backticks doubled) before interpolation.
        quoted_name = "`" + table_name.replace("`", "``") + "`"

        # Execute SQL query to fetch sample data from each table
        cursor.execute(f"SELECT * FROM {quoted_name} LIMIT 5")

        # Fetch and print sample data
        sample_data = cursor.fetchall()
        print(f"Sample data from table '{table_name}':")
        for row in sample_data:
            print(row)

Sample data from table 'address_location':
Sample data from table 'children_hospital':
(1, 0, '2023-11-20')
(2, 0, '2023-11-20')
(3, 0, '2023-11-20')
(4, 0, '2023-11-20')
(5, 0, '2023-11-20')
Sample data from table 'church_affiliation':
Sample data from table 'hospital':
Sample data from table 'hospital_capacity':
(1, '<100 beds', 69, '2023-11-20')
(2, '<100 beds', 25, '2023-11-20')
(3, '100-299 beds', 234, '2023-11-20')
(4, '100-299 beds', 150, '2023-11-20')
(5, '>299 beds', 649, '2023-11-20')
Sample data from table 'teaching':
(1, 0, '2023-11-20')
(2, 1, '2023-11-20')
(3, 0, '2023-11-20')
(4, 1, '2023-11-20')
(5, 0, '2023-11-20')
Sample data from table 'urban':
