Statistical estimation with a 95% confidence interval for number and percentage of inpatient beds occupied by COVID-19 patients for the given state and date

Data Dictionary:
1. state (string): The two digit state code
2. collection_date (date): Date estimated
3. Inpatient Beds Occupied by COVID-19 Patients Estimated (double): Estimated number of inpatient beds occupied by COVID-19 patients for the given state and date
4. Count LL (double): Estimated number of inpatient beds occupied by COVID-19 patients for the given state and date, lower limit, 95% confidence interval
5. Count UL (double): Estimated number of inpatient beds occupied by COVID-19 patients for the given state and date, upper limit, 95% confidence interval
6. Percentage of Inpatient Beds Occupied by COVID-19 Patients Estimated (double): Estimated percentage of inpatient beds occupied by COVID-19 patients for the given state and date
7. Percentage LL (double): Estimated percentage of inpatient beds occupied by COVID-19 patients for the given state and date, lower limit, 95% confidence interval
8. Percentage UL (double): Estimated percentage of inpatient beds occupied by COVID-19 patients for the given state and date, upper limit, 95% confidence interval

In [9]:
import pandas as pd

In [13]:
# Loading csv
csv = "estimated_beds.csv"
csv_df = pd.read_csv(csv)
csv_df

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
0,CW,2020-08-12,491077,488861,493293,68.74,68.17,69.31,696463,693474,699452
1,CW,2020-08-13,492601,490223,494978,68.71,68.07,69.34,699026,695947,702106
2,CW,2020-08-14,491212,489080,493345,68.29,67.74,68.84,701532,698728,704337
3,CW,2020-08-15,476285,473379,479190,66.30,65.23,67.37,700499,696701,704298
4,CW,2020-08-16,463508,460882,466133,64.79,63.89,65.70,697527,694125,700930
...,...,...,...,...,...,...,...,...,...,...,...
1691,WY,2020-09-10,563,487,639,37.51,33.75,41.27,1501,1276,1726
1692,WV,2020-09-11,3854,3854,3854,67.72,67.72,67.72,5691,5691,5691
1693,WY,2020-09-11,590,516,664,39.07,39.07,39.07,1510,1287,1733
1694,WV,2020-09-12,3702,3702,3702,65.24,65.24,65.24,5674,5674,5674


In [14]:
# dropped all rows with NaN values 
clean_df = csv_df.dropna(how='any')
clean_df

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
0,CW,2020-08-12,491077,488861,493293,68.74,68.17,69.31,696463,693474,699452
1,CW,2020-08-13,492601,490223,494978,68.71,68.07,69.34,699026,695947,702106
2,CW,2020-08-14,491212,489080,493345,68.29,67.74,68.84,701532,698728,704337
3,CW,2020-08-15,476285,473379,479190,66.30,65.23,67.37,700499,696701,704298
4,CW,2020-08-16,463508,460882,466133,64.79,63.89,65.70,697527,694125,700930
...,...,...,...,...,...,...,...,...,...,...,...
1691,WY,2020-09-10,563,487,639,37.51,33.75,41.27,1501,1276,1726
1692,WV,2020-09-11,3854,3854,3854,67.72,67.72,67.72,5691,5691,5691
1693,WY,2020-09-11,590,516,664,39.07,39.07,39.07,1510,1287,1733
1694,WV,2020-09-12,3702,3702,3702,65.24,65.24,65.24,5674,5674,5674


In [15]:
# checking names of states 
clean_df["state"].unique()

array(['CW', 'AL', 'DC', 'AR', 'CA', 'CT', 'GA', 'ID', 'IN', 'KS', 'LA',
       'MD', 'MI', 'AK', 'FL', 'CO', 'DE', 'IL', 'AZ', 'HI', 'IA', 'KY',
       'ME', 'MA', 'MN', 'MS', 'MT', 'NV', 'ND', 'NJ', 'NY', 'OR', 'PR',
       'RI', 'MO', 'NE', 'SD', 'TX', 'VT', 'NC', 'NH', 'NM', 'OH', 'OK',
       'PA', 'SC', 'VA', 'UT', 'WA', 'WI', 'TN', 'WV', 'WY'], dtype=object)

In [16]:
# checking number of states 
clean_df["state"].nunique()

53

In [17]:
# Dropping the state columns with the alue 'CW'
clean_states = csv_df.drop(csv_df.loc[csv_df['state']== 'CW'].index)
clean_states

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
32,AL,2020-08-12,10517,10290,10744,74.57,69.51,79.64,14103,13823,14383
33,DC,2020-08-12,2319,2319,2319,82.56,82.56,82.56,2809,2809,2809
34,AL,2020-08-13,10506,10492,10520,74.31,74.00,74.62,14138,14121,14155
35,DC,2020-08-13,2353,2353,2353,86.06,86.06,86.06,2734,2734,2734
36,AL,2020-08-14,10270,10214,10326,72.57,71.11,74.03,14152,14059,14245
...,...,...,...,...,...,...,...,...,...,...,...
1691,WY,2020-09-10,563,487,639,37.51,33.75,41.27,1501,1276,1726
1692,WV,2020-09-11,3854,3854,3854,67.72,67.72,67.72,5691,5691,5691
1693,WY,2020-09-11,590,516,664,39.07,39.07,39.07,1510,1287,1733
1694,WV,2020-09-12,3702,3702,3702,65.24,65.24,65.24,5674,5674,5674


In [18]:
# Dropping the state columns with the alue 'PR' 
df = clean_states.drop(clean_states.loc[clean_states['state']== 'PR'].index)
df

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
32,AL,2020-08-12,10517,10290,10744,74.57,69.51,79.64,14103,13823,14383
33,DC,2020-08-12,2319,2319,2319,82.56,82.56,82.56,2809,2809,2809
34,AL,2020-08-13,10506,10492,10520,74.31,74.00,74.62,14138,14121,14155
35,DC,2020-08-13,2353,2353,2353,86.06,86.06,86.06,2734,2734,2734
36,AL,2020-08-14,10270,10214,10326,72.57,71.11,74.03,14152,14059,14245
...,...,...,...,...,...,...,...,...,...,...,...
1691,WY,2020-09-10,563,487,639,37.51,33.75,41.27,1501,1276,1726
1692,WV,2020-09-11,3854,3854,3854,67.72,67.72,67.72,5691,5691,5691
1693,WY,2020-09-11,590,516,664,39.07,39.07,39.07,1510,1287,1733
1694,WV,2020-09-12,3702,3702,3702,65.24,65.24,65.24,5674,5674,5674


In [19]:
# Checking how many states are in the df after the deletions 
df["state"].nunique()

51

In [20]:
# Sorting data by state
sorted_df = df.sort_values(by=['state'])
sorted_df

Unnamed: 0,state,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
472,AK,2020-09-09,936,936,936,46.41,46.41,46.41,1211,1184,1238
470,AK,2020-09-08,868,865,870,41.82,38.63,45.02,1211,1182,1240
416,AK,2020-08-12,960,957,963,45.08,43.65,46.50,1280,1268,1292
418,AK,2020-08-13,921,921,921,43.95,43.95,43.95,1281,1281,1281
420,AK,2020-08-14,947,947,947,44.38,44.38,44.38,1280,1280,1280
...,...,...,...,...,...,...,...,...,...,...,...
1643,WY,2020-08-17,609,517,701,35.11,12.27,57.95,1734,1527,1941
1671,WY,2020-08-31,583,436,730,40.62,6.11,75.13,1435,1130,1740
1673,WY,2020-09-01,563,443,683,39.06,13.07,65.04,1441,1175,1707
1677,WY,2020-09-03,563,459,667,37.39,18.34,56.45,1505,1253,1757


In [28]:
# Number of dates per state
sorted_df.loc[:,["collection_date", "state"]].uni

Unnamed: 0,collection_date,state
472,2020-09-09,AK
470,2020-09-08,AK
416,2020-08-12,AK
418,2020-08-13,AK
420,2020-08-14,AK
...,...,...
1643,2020-08-17,WY
1671,2020-08-31,WY
1673,2020-09-01,WY
1677,2020-09-03,WY


In [21]:
# ## setting df index to state 
# state_df = sorted_df.set_index('state')
# state_df

Unnamed: 0_level_0,collection_date,Inpatient Beds Occupied Estimated,Count LL,Count UL,Percentage of Inpatient Beds Occupied Estimated,Percentage LL,Percentage UL,Total Inpatient Beds,Total LL,Total UL
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AK,2020-09-09,936,936,936,46.41,46.41,46.41,1211,1184,1238
AK,2020-09-08,868,865,870,41.82,38.63,45.02,1211,1182,1240
AK,2020-08-12,960,957,963,45.08,43.65,46.50,1280,1268,1292
AK,2020-08-13,921,921,921,43.95,43.95,43.95,1281,1281,1281
AK,2020-08-14,947,947,947,44.38,44.38,44.38,1280,1280,1280
...,...,...,...,...,...,...,...,...,...,...
WY,2020-08-17,609,517,701,35.11,12.27,57.95,1734,1527,1941
WY,2020-08-31,583,436,730,40.62,6.11,75.13,1435,1130,1740
WY,2020-09-01,563,443,683,39.06,13.07,65.04,1441,1175,1707
WY,2020-09-03,563,459,667,37.39,18.34,56.45,1505,1253,1757


In [29]:
# Writing df to csv 
state_df.to_csv('cleaned_estimated_beds.csv', index=False)

In [25]:
# Number of dates recorded
state_df["collection_date"].unique()

array(['2020-09-09', '2020-09-08', '2020-08-12', '2020-08-13',
       '2020-08-14', '2020-08-15', '2020-08-16', '2020-08-17',
       '2020-08-18', '2020-08-19', '2020-08-20', '2020-08-21',
       '2020-08-23', '2020-08-24', '2020-08-25', '2020-08-26',
       '2020-08-22', '2020-08-28', '2020-09-10', '2020-09-11',
       '2020-08-27', '2020-09-07', '2020-09-06', '2020-09-05',
       '2020-09-04', '2020-09-12', '2020-09-02', '2020-09-01',
       '2020-08-31', '2020-08-30', '2020-08-29', '2020-09-03'],
      dtype=object)

In [26]:
# Number of dates per state
state_df[["collection_date", "state"]].unique()

KeyError: "['state'] not in index"