In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import seaborn as sns
from datetime import datetime
import json

In [2]:
def covid_data(country = 'us', timeout = 30):
    """ downloads the daily covid data published by the COVID Tracking Project API
    
    The request goes to https://api.covidtracking.com/v2/{country}/daily.json
    
    Args:
        country(str): inputed country that the user wants information about;
        it is inserted into the request url as-is
        timeout(float): seconds to wait for the server before giving up, so a
        stalled connection cannot hang the notebook forever
        
    Returns:
        covid_data(list-of(dict)): the value stored under the 'data' key of the
        decoded JSON response (presumably one nested dict per reported day --
        verify against the API)
        
    Raises:
        requests.HTTPError: if the server answers with an error status code
        requests.Timeout: if the server does not answer within `timeout` seconds
    
    """ 
    
    # build on url
    url =  f'https://api.covidtracking.com/v2/{country}/daily.json'
    
    # fetch the page; fail loudly on 4xx/5xx instead of trying to parse an error body
    response = requests.get(url, timeout = timeout)
    response.raise_for_status()
    
    # decode the JSON body to a dict
    covid_data_dict = response.json()
    
    return covid_data_dict['data']

In [3]:
def covid_dict_to_df(dict_covid):
    """ takes the list of daily dicts and transforms it into a DataFrame 
    
    Args:
        dict_covid(list-of(dict)): a list of nested dictionaries(trees), one per
        day, each of which has a 'date' entry formatted as YYYY-MM-DD
        
    Returns:
        df_covid(DataFrame): one row per inputed dict and one column per key; the
        'date' column holds datetime.date objects and nested dicts are kept
        as-is inside their cells
        
    Raises:
        KeyError: if one of the dicts has no 'date' entry
        ValueError: if a 'date' entry is not formatted as YYYY-MM-DD
    
    """ 
    
    # collect the rows in a plain list and build the DataFrame once at the end;
    # concatenating onto a DataFrame inside the loop copies every previous row on
    # each iteration (quadratic time)
    records = []
    
    # iterates through the list of dict
    for item_in_dict in dict_covid:
        
        # shallow copy so the caller's dict is not modified
        record = dict(item_in_dict)
        
        # change the date formatting so that it's a datetime.date
        record['date'] = datetime.strptime(str(record['date']), '%Y-%m-%d').date()
        
        records.append(record)
    
    # an empty input yields an empty DataFrame
    df_covid = pd.DataFrame(records)
    
    return df_covid
    

In [4]:
def get_calc_val(list_val, val_dict, data_type): 
    """ pulls selected calculated values out of one nested covid data dict 

        Args: 
            list_val (list): names of the calculated values to look up 
            val_dict (dictionary): nested covid dictionary; the values are read 
                from val_dict['total']['calculated'] 
            data_type (string): label placed in front of every returned key 

        Returns: 
            calc_val_dict (dictionary): maps '<data_type> <name>' to the matching 
                calculated value, for every name in list_val 
    """
    
    # walk down to the dict that holds the calculated values 
    calculated = val_dict['total']['calculated']
    
    # one entry per requested name, keyed by the data type plus the name 
    return {f'{data_type} {name}': calculated[name] for name in list_val}

In [5]:
def clean_covid_df(covid_df): 
    """ clean the cases column to retain data of interest 
    
        Args: 
            covid_df (DataFrame): the dataframe returned by covid_dict_to_df(); 
                needs a 'cases' column of nested dicts and a 'date' column of 
                date objects 
            
        Returns: 
            cleaned_df (DataFrame): indexed by 'Date', with the columns 
                'cases change_from_prior_day', 'cases population_percent', 
                'total cases' and 'Date string'; days with any missing value 
                are dropped. An empty input gives an empty dataframe with the 
                same index name and columns.
    """
    
    # fixed column layout, so that an empty input still produces a 'Date' column 
    # to index on instead of raising KeyError 
    columns = ['cases change_from_prior_day', 'cases population_percent', 
               'total cases', 'Date', 'Date string']
    
    # initialize a list of dicts for covid data 
    covid_list = []
    
    # access data for each day in the dataframe
    for _, row in covid_df.iterrows():        
        
        daily_cases = row['cases']
        
        # Access the percent and change from prior day case data 
        # (copied so the helper's return value is never modified)
        daily_dict = dict(get_calc_val(['change_from_prior_day', 'population_percent'], daily_cases, 'cases'))
        
        # Access the total number of cases 
        daily_dict['total cases'] = daily_cases['total']['value']
        
        # keep the date both as an object (future index) and as a string 
        daily_dict['Date'] = row['date']
        daily_dict['Date string'] = row['date'].strftime('%Y-%m-%d')
        
        # add the day's dict to the list of dicts 
        covid_list.append(daily_dict)
        
    # create a new dataframe with the cleaned data 
    cleaned_df = pd.DataFrame(covid_list, columns = columns)
    
    # drop any rows with NaN (returns a new frame rather than mutating in place)
    cleaned_df = cleaned_df.dropna(axis = 0, how = 'any')
    
    # set index to date 
    return cleaned_df.set_index('Date')

In [6]:
import numpy as np

# list of unneeded columns 
list_col_drop = ['Date Type', 'Program', 'Total Doses Administered Cumulative', 'People Receiving 1 or More Doses Cumulative',
            'People Receiving 2 Doses Cumulative', '7-Day Avg Daily Count Dose 1', 'Total Doses Administered Daily Change', 
            '7-Day Avg Total Doses Administered Daily Change', 'People with at least One Dose Cumulative', 
            'People Fully Vaccinated Cumulative', '7-Day Avg Daily Count of People Fully Vaccinated', 
            '7-Day Avg Total Doses Daily', '7-Day Avg Daily Count Dose 2', 'Daily Count People Receiving Dose 1', 
            'Daily Count People Receiving Dose 2']


def clean_vax_df(vax_df,  drop_col = list_col_drop, pop_ct = 328200000): 
    """ drop unneeded rows and columns from a vaccine data frame
    
        Args: 
            vax_df (DataFrame): an uncleaned dataframe containing us vaccination data; 
                needs the columns 'Program', 'Date Type', 'Date' (formatted as 
                MM/DD/YYYY) and 'Daily Count of People Fully Vaccinated' 
            drop_col (List): the names of columns that are not needed and can be dropped; 
                every name must exist in vax_df. The list is only read, never modified 
            pop_ct (int): the number of people in the area the dataset is collected for (eg, ~328 million in the US)
            
        Return: 
            new_vax_df (DataFrame): the rows with Program 'US' and Date Type 'Admin', 
                indexed by 'Date' (datetime.date objects, ascending), with two added 
                columns: 'Total People Fully Vaccinated' (running sum of the daily 
                count) and 'Percent People Fully Vaccinated'. vax_df is left unchanged 
    """

    # keep only rows with program = US (rather than LTC) that are also admin rows. 
    # Both conditions are evaluated on the same frame, so the mask lines up with 
    # the rows it selects without relying on index alignment 
    us_admin_bool = (vax_df['Program'] == 'US') & (vax_df['Date Type'] == 'Admin')
    us_admin_only = vax_df.loc[us_admin_bool, :]

    # drop unneeded columns and rows with no date
    us_admin_only = us_admin_only.drop(drop_col, axis = 1)
    us_admin_only = us_admin_only.dropna(subset = ['Date'])

    # format the date as datetime.date. The whole column is converted in one pass; 
    # appending row by row is quadratic and DataFrame.append no longer exists in 
    # pandas 2.0+ 
    parsed_dates = [datetime.strptime(str(raw_date), '%m/%d/%Y').date() 
                    for raw_date in us_admin_only['Date']]
    new_vax_df = us_admin_only.assign(Date = parsed_dates)
        
    # set the index to the date 
    new_vax_df = new_vax_df.set_index('Date').sort_index()
    
    # add columns for the cumulative people fully vaccinated and cumulative percent fully vaccinated 
    new_vax_df['Total People Fully Vaccinated'] = new_vax_df['Daily Count of People Fully Vaccinated'].cumsum()
    new_vax_df['Percent People Fully Vaccinated'] = 100*(new_vax_df['Total People Fully Vaccinated']/pop_ct)
    
    return new_vax_df

In [7]:
# default cut-off date used when combining case and vaccination data
mar_7 = datetime(2021, 3, 7).date()

def combine_case_vax_df(case_df, vax_df, end_date = mar_7): 
    """ join the case and vaccination dataframes side by side on their index 
    
        Args: 
            case_df (DataFrame): the case data, indexed by date 
            vax_df (DataFrame): the vaccination data, indexed by date 
            end_date (date): the last index label to keep 
                (typically 3/7/21, the last day of case data)
            
        Returns: 
            df_covid_info (DataFrame): vaccination columns followed by case columns, 
                sorted by index, cut off after end_date, with missing values 
                replaced by 0 and the index named 'Date' 
    """
    
    # place the vaccination and case/test columns next to each other; 
    # verify_integrity makes concat raise if the two frames share a column name 
    side_by_side = pd.concat([vax_df, case_df], axis=1, verify_integrity = True)
    
    return (
        side_by_side
        .sort_index()
        # keep nothing after the end date 
        .loc[:end_date, :]
        # vaccination data does not begin until December 2020, which leaves 
        # many NaN values; turn them into 0 
        .fillna(0)
        # an unnamed index comes back as a column called 'index'; name it 'Date' 
        .reset_index()
        .rename(columns = {'index': 'Date'})
        # and use the date as the index again 
        .set_index('Date')
    )

In [8]:
# retrieve covid case data 
us_dict = covid_data()
df_covid_cases_dirty = covid_dict_to_df(us_dict)
df_covid_cases = clean_covid_df(df_covid_cases_dirty)

# load and clean vaccination data (the csv is read from the working directory)
df_vax_dirty = pd.read_csv('trends_in_number_of_covid19_vaccinations_in_the_us.csv')
df_covid_vax = clean_vax_df(df_vax_dirty)

# combine the dataframes; combine_case_vax_df() already returns a frame indexed 
# by 'Date' with no 'index' column, so no further renaming is needed here 
df_covid_info = combine_case_vax_df(df_covid_cases, df_covid_vax)

# save the dataframe to a csv
df_covid_info.to_csv('covid vaccine and case data.csv')

In [9]:
# display the combined vaccination and case dataframe
df_covid_info

Unnamed: 0_level_0,Daily Count of People Fully Vaccinated,Total Doses Administered Daily,Total People Fully Vaccinated,Percent People Fully Vaccinated,cases change_from_prior_day,cases population_percent,total cases,Date string
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-15,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0,2020-01-15
2020-01-16,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0,2020-01-16
2020-01-17,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0,2020-01-17
2020-01-18,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0,2020-01-18
2020-01-19,0.0,0.0,0.0,0.000000,1.0,0.0000,1.0,2020-01-19
...,...,...,...,...,...,...,...,...
2021-03-03,1018068.0,2638302.0,30033845.0,9.151080,66836.0,8.6218,28520365.0,2021-03-03
2021-03-04,1099418.0,2881742.0,31133263.0,9.486064,65487.0,8.6416,28585852.0,2021-03-04
2021-03-05,1159374.0,2907159.0,32292637.0,9.839317,68787.0,8.6624,28654639.0,2021-03-05
2021-03-06,777301.0,1849612.0,33069938.0,10.076154,60015.0,8.6806,28714654.0,2021-03-06


In [12]:
# inspect the first index label of the combined dataframe
df_covid_info.index[0]

datetime.date(2020, 1, 15)