### EDA for Hospital by County

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# importing the data
data = pd.read_csv('../data/hospitals_by_county.csv')

In [4]:
# lets look at sample data
data.head()

Unnamed: 0,county,todays_date,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
0,Plumas,2020-03-29,0.0,1.0,,,0.0,1.0,
1,Tehama,2020-03-29,0.0,0.0,,,0.0,0.0,2.0
2,Glenn,2020-03-29,,,,,,,
3,Mono,2020-03-29,0.0,1.0,,,0.0,0.0,2.0
4,Marin,2020-03-29,7.0,13.0,,,2.0,6.0,11.0


In [5]:
# number of null records

data.isna().sum()

county                                      0
todays_date                                 0
hospitalized_covid_confirmed_patients       8
hospitalized_suspected_covid_patients       8
hospitalized_covid_patients              1285
all_hospital_beds                        1375
icu_covid_confirmed_patients               29
icu_suspected_covid_patients               29
icu_available_beds                        804
dtype: int64

In [6]:

data.describe()

Unnamed: 0,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
count,16901.0,16901.0,15624.0,15534.0,16880.0,16880.0,16105.0
mean,113.187622,21.330395,137.904122,1266.172267,30.116706,3.284893,50.46855
std,441.167685,72.04045,497.901983,3020.403507,103.721833,11.331873,137.4166
min,0.0,0.0,0.0,0.0,0.0,0.0,-110.0
25%,1.0,0.0,1.0,56.0,0.0,0.0,3.0
50%,12.0,2.0,17.0,346.5,3.0,0.0,9.0
75%,67.0,15.0,84.0,1187.5,18.0,2.0,43.0
max,8098.0,1350.0,8422.0,23989.0,1731.0,244.0,1502.0


In [7]:
data.groupby(by= 'todays_date')['hospitalized_covid_patients'].count()

todays_date
2020-03-29     0
2020-03-30     0
2020-03-31     0
2020-04-01     0
2020-04-02     0
              ..
2021-01-20    56
2021-01-21    56
2021-01-22    56
2021-01-23    56
2021-01-24    56
Name: hospitalized_covid_patients, Length: 302, dtype: int64

In [8]:
data[data['hospitalized_covid_confirmed_patients'].isna()]

Unnamed: 0,county,todays_date,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
2,Glenn,2020-03-29,,,,,,,
11,Colusa,2020-03-29,,,,,,,
41,Inyo,2020-03-29,,,,,,,
46,Butte,2020-03-29,,,,,,,
54,Calaveras,2020-03-29,,,,,,,
99,Inyo,2020-03-30,,,,,,,
162,Inyo,2020-03-31,,83.0,,,0.0,0.0,6.0
205,Glenn,2020-04-01,,,,,,,


In [9]:
data[data['hospitalized_covid_patients'].isna()]

Unnamed: 0,county,todays_date,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
0,Plumas,2020-03-29,0.0,1.0,,,0.0,1.0,
1,Tehama,2020-03-29,0.0,0.0,,,0.0,0.0,2.0
2,Glenn,2020-03-29,,,,,,,
3,Mono,2020-03-29,0.0,1.0,,,0.0,0.0,2.0
4,Marin,2020-03-29,7.0,13.0,,,2.0,6.0,11.0
...,...,...,...,...,...,...,...,...,...
1280,Contra Costa,2020-04-20,37.0,28.0,,,13.0,8.0,75.0
1281,Mendocino,2020-04-20,0.0,0.0,,,0.0,0.0,7.0
1282,Calaveras,2020-04-20,0.0,1.0,,,0.0,0.0,11.0
1283,San Joaquin,2020-04-20,30.0,23.0,,,16.0,4.0,18.0


In [10]:
# the data in hospitalized Covid patients seems incorrect as it should be a sum of confirmed and 
# suspected
data['hospitalized_covid_patients'] = data['hospitalized_covid_confirmed_patients'] + data['hospitalized_suspected_covid_patients']

### FIX where there are NULLs for  hospital beds and  ICU beds 

### Approach
#### (1.) Lets take all the use cases where the Hospital and ICU beds are not null
#### (2.) Use this as the train data and set up a Linear Regression model
#### (3.) use this model once to predict the Hospital beds and once to predict the ICU beds


### Fix All  hospital beds are NULL

In [11]:
#  data set
data_train = data[data['all_hospital_beds'].notnull() & data['icu_available_beds'].notnull()][['all_hospital_beds','icu_available_beds']]
X = data_train.drop('all_hospital_beds', axis=1)
y  = data_train['all_hospital_beds']

In [12]:
# instantiate Linear Regression
from sklearn.linear_model import LinearRegression

In [13]:
# initiating Linear Regression
lr = LinearRegression()
# fitting the model
model = lr.fit(X, y)

In [14]:
# lets predict 'all_hospital_beds' based on icu_available_beds for all records where all_hospital_beds are null

data_hosp_bed_null = data[data['all_hospital_beds'].isna() & data['icu_available_beds'].notnull()][['all_hospital_beds','icu_available_beds']]



In [15]:
# Run the model on training data 
data_hosp_bed_null['all_hospital_beds'] = model.predict(data_hosp_bed_null[['icu_available_beds']]).round(0)

In [16]:
data_hosp_bed_null

Unnamed: 0,all_hospital_beds,icu_available_beds
1,308.0,2.0
3,308.0,2.0
4,493.0,11.0
5,266.0,0.0
6,4502.0,205.0
...,...,...
1939,349.0,4.0
1964,411.0,7.0
1981,308.0,2.0
2025,369.0,5.0


In [17]:
## update the original data frame for null hospital beds
data.loc[data['all_hospital_beds'].isna(),'all_hospital_beds']= data_hosp_bed_null['all_hospital_beds']

### Fix All  ICU beds  are NULL

In [18]:
#  data set

X = data_train.drop('icu_available_beds', axis=1)
y  = data_train['icu_available_beds']

In [19]:
# initiating Linear Regression
lr = LinearRegression()
# fitting the model
model = lr.fit(X, y)

In [20]:
# lets predict icu_available_beds  based on all_hospital_beds for all records where icu_available_beds are null

data_icu_bed_null = data[data['icu_available_beds'].isna() & data['all_hospital_beds'].notnull()][['all_hospital_beds','icu_available_beds']]




In [21]:
# Run the model on training data 
data_icu_bed_null['icu_available_beds'] = model.predict(data_icu_bed_null[['all_hospital_beds']]).round(0)

In [22]:
## update the original data frame for null ICU beds
data.loc[data['icu_available_beds'].isna(),'icu_available_beds']= data_icu_bed_null['icu_available_beds']

### There are still 216 ICU beds that have NULL values. lets manually eyeball and see how we can fix them

In [23]:
data[data['all_hospital_beds'].isna() & data['icu_available_beds'].isna()]

Unnamed: 0,county,todays_date,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
0,Plumas,2020-03-29,0.0,1.0,1.0,,0.0,1.0,
2,Glenn,2020-03-29,,,,,,,
7,Modoc,2020-03-29,0.0,0.0,0.0,,0.0,0.0,
9,Trinity,2020-03-29,0.0,0.0,0.0,,0.0,0.0,
11,Colusa,2020-03-29,,,,,,,
...,...,...,...,...,...,...,...,...,...
2967,Glenn,2020-05-21,0.0,0.0,0.0,,0.0,0.0,
3068,Glenn,2020-05-22,0.0,0.0,0.0,,0.0,0.0,
3098,Glenn,2020-05-23,0.0,0.0,0.0,,0.0,0.0,
3139,Glenn,2020-05-24,0.0,0.0,0.0,,0.0,0.0,


In [24]:
# understand what datasets have NULLs for all_hospital_beds
data[data['all_hospital_beds'].isna() & data['icu_available_beds'].isna()].groupby(by = 'county')['todays_date'].count()

county
Butte          1
Calaveras      1
Colusa         1
Del Norte      1
Glenn         57
Inyo           2
Lassen        24
Mariposa      21
Modoc         23
Plumas        23
Santa Cruz     1
Shasta         1
Sutter        36
Trinity       23
Tuolumne       1
Name: todays_date, dtype: int64

In [25]:
data[data['county'] == 'Tuolumne' ][['all_hospital_beds' ,'icu_available_beds' ]]

Unnamed: 0,all_hospital_beds,icu_available_beds
18,266.0,0.0
101,328.0,3.0
140,349.0,4.0
188,308.0,2.0
256,308.0,2.0
...,...,...
16663,84.0,2.0
16693,84.0,0.0
16779,84.0,2.0
16839,84.0,1.0


In [26]:
# update the null to the min value
data.loc[(data['county'] == 'Tuolumne') & (data['all_hospital_beds'].isna()) , 'all_hospital_beds'] = 251.0
# update the null to the min value
data.loc[(data['county'] == 'Tuolumne') & (data['icu_available_beds'].isna()) , 'icu_available_beds'] = 0.0

### Fix icu_covid_confirmed_patients  and icu_suspected_covid_patients 

In [27]:
# icu_covid_confirmed_patients
data.loc[(data['icu_covid_confirmed_patients'].isna()) , 'icu_covid_confirmed_patients'] = data['hospitalized_covid_confirmed_patients']

In [28]:
# icu_suspected_covid_patients
data.loc[(data['icu_suspected_covid_patients'].isna()) , 'icu_suspected_covid_patients'] = data['hospitalized_suspected_covid_patients']

In [29]:
### hospitalized_covid_confirmed_patients , hospitalized_suspected_covid_patients 

data.loc[(data['hospitalized_covid_confirmed_patients'].isna()) , 'hospitalized_covid_confirmed_patients'] = 0.0
data.loc[(data['hospitalized_suspected_covid_patients'].isna()) , 'hospitalized_suspected_covid_patients'] = 0.0

### Fix hospitalized_covid_patients 

In [30]:
data.loc[(data['hospitalized_covid_patients'].isna()) , 'hospitalized_covid_patients'] = data['hospitalized_covid_confirmed_patients']+data['hospitalized_suspected_covid_patients']

In [31]:
data

Unnamed: 0,county,todays_date,hospitalized_covid_confirmed_patients,hospitalized_suspected_covid_patients,hospitalized_covid_patients,all_hospital_beds,icu_covid_confirmed_patients,icu_suspected_covid_patients,icu_available_beds
0,Plumas,2020-03-29,0.0,1.0,1.0,,0.0,1.0,
1,Tehama,2020-03-29,0.0,0.0,0.0,308.0,0.0,0.0,2.0
2,Glenn,2020-03-29,0.0,0.0,0.0,,,,
3,Mono,2020-03-29,0.0,1.0,1.0,308.0,0.0,0.0,2.0
4,Marin,2020-03-29,7.0,13.0,20.0,493.0,2.0,6.0,11.0
...,...,...,...,...,...,...,...,...,...
16904,Marin,2021-01-24,28.0,2.0,30.0,328.0,7.0,1.0,8.0
16905,Plumas,2021-01-24,0.0,0.0,0.0,35.0,0.0,0.0,0.0
16906,Nevada,2021-01-24,12.0,0.0,12.0,124.0,3.0,0.0,3.0
16907,Sutter,2021-01-24,0.0,0.0,0.0,14.0,0.0,0.0,0.0


In [32]:
data.to_csv('../clean_data/hospitals_by_county.csv', index=False) 