Select Hospital Data - https://healthdata.gov/Hospital/COVID-19-Reported-Patient-Impact-and-Hospital-Capa/anag-cw7u

CDC de-identified individual cases (9 GB file!): https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data-with-Ge/n8mc-b4w4




# Download and Build Combined COVID Datasets #
This process takes roughly 40 minutes to complete.

### DataSets ###
**Johns Hopkins COVID Cases:** https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv

**Johns Hopkins COVID Deaths:** https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv

**CDC U.S. Vaccine Data:** https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD

**CDC County Vaccine Data:** https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD

**State Hospitalization Data:** https://healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD

In [2]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import time

from importlib import reload #//*** Reload library reloads an external python File.

#//*** Use the whole window in the IPYNB editor
#//*** display / HTML / clear_output all come from the public IPython.display module.
#//*** Importing display and HTML from IPython.core.display is deprecated and emits a
#//*** DeprecationWarning every time this cell runs.
from IPython.display import HTML, clear_output, display

#//*** Inject CSS so the notebook container spans 100% of the browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

from datetime import datetime
from pathlib import Path
import os


#//*** Pandas display limits: no truncation of cell text, up to 100 columns and 100 rows shown
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

#//*** Custom Functions outboarded here for Brevity
import process_covid


#//*** Folder handed to the process_covid helpers via their folder= argument
data_folder_name = 'raw_data'

#//*** Raw downloads: confirmed cases and deaths
confirmed_data_filename = 'z_us_confirmed.csv'
death_data_filename = 'z_us_death_cases.csv'

#//*** Processed files, each written by one step of the build and read back by a later one
county_daily_df_filename = 'z_county_daily_df.csv.zip'                    #//*** County cases + deaths
aggregate_hospital_filename = 'z_county_hospital_aggregate.csv.zip'       #//*** Hospital data aggregated by county
combined_daily_casevax_filename = 'z_county_daily_casevax.csv.zip'        #//*** Daily cases + deaths + vaccines
weekly_combined_filename = 'z_county_weekly_df.csv.zip'                   #//*** Weekly cases + deaths + vaccines

#//*** Weekly combined Cases, Deaths, Vaccines, Hospital
weekly_combined_cdvh_df_filename = 'z_county_weekly_combined_cdvh_df.csv.zip'


  from IPython.core.display import display, HTML


In [3]:

#//*** Re-import process_covid so edits made to the external file take effect
reload(process_covid)

#//******************************************************
#//*** Full download and dataset build starts here
#//******************************************************

#//*** Reference point for every elapsed-time message printed by this cell
start_time = time.time()

#//*** Download the latest data (the confirmed and deaths series update daily).
#//*** pc keeps whatever download_data() returns.
pc = process_covid.download_data()
print(f"Data Downloaded: {int(time.time() - start_time)}s")

#//*** Merge county-level cases and deaths; column-based days become row-based days.
#//*** The result is exported to county_daily_df_filename.
process_covid.build_county_case_death(
    export=county_daily_df_filename,
    death=death_data_filename,
    confirm=confirmed_data_filename,
    folder=data_folder_name,
)
print(f"Total Elapsed Time: {int(time.time() - start_time)}s")

#//***************************************************************************************************
#//*** Build weekly aggregated hospital values: sum all hospital stats for each county by week.
#//*** Exports the result to aggregate_hospital_filename.
#//***************************************************************************************************
#//*** 4 - 5 Minute Process!
#//***************************************************************************************************

#//*** Load raw hospital data
raw_hosp_df = process_covid.load_data(filename="z_county_hospital.csv.zip")

#//*** Descriptive classification columns: each takes a single value
base_cols = ['collection_week', 'fips_code', 'state']

#//*** Columns to process (sum): the weekly summed bed counts.
#//*** NOTE: an earlier variant of this cell aggregated the daily-average columns instead;
#//*** they carry the same names with the suffix "_7_day_avg" in place of "_7_day_sum".
process_cols = [
    'total_beds_7_day_sum',
    'inpatient_beds_7_day_sum',
    'inpatient_beds_used_7_day_sum',
    'inpatient_beds_used_covid_7_day_sum',
    'total_icu_beds_7_day_sum',
    'icu_beds_used_7_day_sum',
    'staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum',
]

#//*** Columns to display: the descriptive columns followed by the summed columns.
#//*** Derived rather than retyped so the three lists cannot drift apart.
disp_cols = base_cols + process_cols

#//***************************************************************************************************
#//*** Aggregate (sum) raw hospital data
#//*** Combine all hospitals in each county for a given collection week.
#//***************************************************************************************************
process_covid.aggregate_columns(raw_hosp_df,
    by = "fips_code", #//*** All granular values to sum: all hospitals in a county
    date_col = 'collection_week', #//*** Date column
    method = 'sum', #//*** Add columns
    base_cols = base_cols, #//*** Descriptive columns
    process_cols = process_cols, #//*** Columns to sum
    disp_cols = disp_cols, #//*** Columns to display
    folder = data_folder_name, #//*** Same configured folder the other build steps use
    export = aggregate_hospital_filename, #//*** Filename to export the DataFrame to
)
print(f"Total Elapsed Time: {int(time.time() - start_time)}s")


#//***************************************************************************************************
#//*** Merge Cases & Deaths with Vaccines (daily interval, by county)
#//***************************************************************************************************

#//*** Vaccine data: county level, daily interval
county_vax_df = process_covid.load_data(action="county_vaccine")
display(county_vax_df)

#//*** Confirmed cases and deaths; min_date is the earliest Date found in the vaccine data.
#//*** NOTE(review): this call passes file=..., while other load_data calls in this notebook
#//*** pass filename=... -- confirm that load_data accepts both keywords.
county_daily_df = process_covid.load_data(
    file=county_daily_df_filename,
    min_date=county_vax_df['Date'].min(),
)

#//*** Keep only the rows whose Population is greater than zero
county_daily_df = county_daily_df.loc[county_daily_df["Population"] > 0]
display(county_daily_df)

#//*** Merge vaccines with daily cases; the result is exported to combined_daily_casevax_filename
process_covid.merge_df(
    county_vax_df,       #//*** Left DF
    county_daily_df,     #//*** Right DF
    by="Date",
    left_col="FIPS",
    right_col="FIPS",
    export=combined_daily_casevax_filename,
)

#//***************************************************************************************************
#//*** Convert combined Cases & Deaths & Vaccines from a daily to a weekly interval, by county
#//***************************************************************************************************

#//*** Combined daily cases and vaccinations by county
daily_casevax_df = process_covid.load_data(filename=combined_daily_casevax_filename)

#//*** Hospital data already aggregated (summed) by county
week_hosp_df = process_covid.load_data(filename=aggregate_hospital_filename)

#//*** Convert county daily data to a weekly interval, passing the distinct hospital
#//*** collection_week values as dates. The result is exported to weekly_combined_filename.
#//*** 10 Minute Build
process_covid.create_weekly_data(
    daily_casevax_df,
    export=weekly_combined_filename,
    dates=week_hosp_df['collection_week'].unique(),
)

#//***************************************************************************************************
#//*** Combine aggregated hospital data with weekly Cases & Deaths & Vaccines
#//*** (weekly interval, by county)
#//***************************************************************************************************

#//*** Weekly case / death / vaccine data by county
casevax_weekly_df = process_covid.load_data(filename=weekly_combined_filename)

#//*** Parse collection_week into datetimes
week_hosp_df["collection_week"] = pd.to_datetime(week_hosp_df["collection_week"])

#//*** Rename the hospital key columns in place so they match the Date / FIPS columns
#//*** used as merge keys below: collection_week --> Date, fips_code --> FIPS
week_hosp_df.rename(columns={"collection_week": "Date", "fips_code": "FIPS"}, inplace=True)

#//*** Merge hospital and case / death / vaccine dataframes
process_covid.merge_df(
    casevax_weekly_df,                  #//*** Left DF
    week_hosp_df,                       #//*** Right DF
    by="Date",                          #//*** Aggregate using the by column (a date column, since this is a time series)
    left_col="FIPS",                    #//*** Left column to merge on
    right_col="FIPS",                   #//*** Right column to merge on
    remove_cols=['Recip_County'],       #//*** Columns removed for tidiness
    export=weekly_combined_cdvh_df_filename,
)

print(f"DONE! Total Elapsed Time: {int(time.time() - start_time)}s")


Elapsed Time: 0s
Exporting DataFrame to Disk.
DONE! Total Elapsed Time: 1211s


## The code below is designed to fail, in order to preserve the notebook output ##


In [3]:
#//*** Deliberate failure (see the heading above): this cell always raises NameError.
#//*** If `fail` is undefined, `del fail` raises. If `fail` happens to exist it is deleted,
#//*** and the bare reference on the next line raises instead.
#//*** Presumably this stops a "Run All" from continuing into the cells below -- confirm.
del fail
fail

NameError: name 'fail' is not defined

In [None]:
#//*** Disabled by default: change False to True to rebuild the summed outcomes pickle.
if False:
    #//*** The original dataset is 9gb and comprises 69 million rows.
    #//*** The data has been split into separate dataframes by state, compressed and pickled.
    #//*** process_patient_impact parses those files and sums the process_cols.
    reload(process_covid)


    pc = process_covid.process_covid()

    #//*** Build aggregated Dataframe from the original split dataset
    df = pc.process_patient_impact(

        #//*** Names of the columns to keep
        base_cols = ["case_month","res_state","state_fips_code","res_county","county_fips_code"],

        #//*** Columns to process (sum)
        process_cols = ['hosp_yn','icu_yn','death_yn','underlying_conditions_yn'],

        #//*** Rename columns: a list of (old_name, new_name) tuples
        rename_cols = [('hosp_yn','hosp'),('icu_yn','icu'),('death_yn','death'),('underlying_conditions_yn','uc')],
    )

    #//*** Save into the configured data folder. The original f-string had no placeholder;
    #//*** interpolating data_folder_name produces the same path.
    df.to_pickle(f"./{data_folder_name}/summed_outcomes.pkl.zip")

In [None]:
#//*** Aggregate combined data into monthly data
reload(process_covid)

#//*** Disabled by default; run as needed (takes about 4 minutes to process)
if False:
    #//*** County-level daily cases and deaths
    combined_df = process_covid.load_data(file=county_daily_df_filename)

    #//*** Aggregate each county into monthly data and save the result to a file
    process_covid.create_monthly_data(
        combined_df,
        export="z_county_monthly_df.csv.zip",
    )


In [None]:
#//*** Load and combine summed outcomes with monthly data.
#//*** Obsolete: the monthly data is inaccurate for hospitalization numbers.
reload(process_covid)

#//*** Monthly summed outcomes, built from individual ethno-demographic values
outcomes_df = process_covid.load_data(filename="summed_outcomes.pkl.zip")

#//*** Cases and deaths aggregated to monthly data, with the first and last dates trimmed
monthly_df = process_covid.load_data(
    filename="z_county_monthly_df.csv.zip",
    trim_first_date=True,
    trim_last_date=True,
)

#//*** Rename case_month --> Date in place, then keep only the columns used for the merge
outcomes_df.rename(columns={"case_month": "Date"}, inplace=True)
outcomes_df = outcomes_df[['Date', 'county_fips_code', 'case_count', 'hosp', 'icu', 'death', 'uc']]

#//*** Merge monthly cases with the monthly ethno-demographic hospital data
df = process_covid.merge_df(
    outcomes_df,                   #//*** Left DF
    monthly_df,                    #//*** Right DF
    by="Date",
    left_col="county_fips_code",
    right_col="FIPS",
)

#//*** Drop the left-hand merge key from the merged result
del df['county_fips_code']
df



#//*** Overall totals: New_Confirm, case_count, and the ratio case_count / New_Confirm
print(
    df['New_Confirm'].sum(),
    df['case_count'].sum(),
    (df['case_count'].sum() / df['New_Confirm'].sum()),
)

#//*** Column selections (cor_cols is only referenced by the commented-out line below)
cor_cols = ['hosp', 'icu', 'total_vaccinated_percent', 'total_vaccinated_count', 'tot_confirm', 'tot_deaths']
disp_cols = [
    'Date', 'Combined_Key', 'Population',
    'total_vaccinated_percent', 'total_vaccinated_count',
    'tot_confirm', 'tot_deaths',
    'New_Confirm', 'hosp', 'icu',
    'New_Deaths', 'death', 'case_count',
]

#//*** Other available columns: 'Recip_County', 'FIPS', 'Recip_State', 'New_Confirm_100k',
#//*** 'New_Confirm_avg_daily_100k', 'New_Deaths_100k', 'New_Deaths_avg_daily_100k'

#//*** Restrict to California rows
ca_df = df.loc[df['Recip_State'] == "CA"]
#ca_df = df [df['FIPS'] == 6037]
#tdf = ca_df[ca_df['Date'] == ca_df['Date'].max()].sort_values('New_Deaths_avg_daily_100k',ascending=False)[cor_cols]
#print(ca_df.columns)

#//*** Print everything to watch it not work: one table per county (FIPS), followed by
#//*** that county's New_Confirm and case_count totals
for _fips, county_df in ca_df.groupby("FIPS"):
    display(county_df[disp_cols])
    print(county_df['New_Confirm'].sum(), county_df['case_count'].sum())