In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

# Daily infection counts

In [2]:
# Rolling-average county files: one per listed year plus the "recent" file.
# The comprehension expands to the same four URLs, in the same order.
county_daily_urls = [
    f'https://raw.githubusercontent.com/nytimes/covid-19-data/master/rolling-averages/us-counties-{period}.csv'
    for period in ('2020', '2021', '2022', 'recent')
]

In [3]:
def get_county_daily_infections_nyt(urls):
    """Load rolling-average county files and return a gap-free daily table.

    The files are concatenated, ``geoid`` is renamed to ``fips`` with its
    first four characters stripped (assumed to be the ``USA-`` prefix),
    the four averaged columns and all rows of the "Unknown" county are
    dropped, and every (date, fips) pair that is missing between the first
    and the last observed date is added with ``cases = 0`` and
    ``deaths = 0``.

    Parameters
    ----------
    urls : iterable
        Sources accepted by ``pandas.read_csv`` (URL, path or file-like
        object). Each must provide ``date`` (``YYYY-MM-DD`` strings),
        ``geoid``, ``county``, ``state``, ``cases``, ``deaths``,
        ``cases_avg``, ``cases_avg_per_100k``, ``deaths_avg`` and
        ``deaths_avg_per_100k``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns (``date``, ``fips``, ``county``, ``state``,
        ``cases``, ``deaths``), sorted by ``fips`` then ``date``, with
        exact duplicate rows removed. If nothing is left after filtering,
        the empty frame is returned as is.
    """
    frames = [pd.read_csv(url) for url in urls]
    df = pd.concat(frames, ignore_index=True)

    df = df.rename(columns={'geoid': 'fips'})
    # Keep only the code part of the geoid (drop its 4-character prefix).
    df['fips'] = df['fips'].str[4:]
    # Only the raw daily counts are kept; the four averaged columns go.
    df = df.drop(columns=['cases_avg_per_100k', 'deaths_avg_per_100k', 'cases_avg', 'deaths_avg'])
    # Chaining keeps every step on a fresh frame, so nothing is assigned
    # on a filtered slice.
    df = (
        df.query('county != "Unknown"')
          .sort_values(by=['fips', 'date'])
          .drop_duplicates()
    )
    if df.empty:
        # No dates to span, hence no grid to fill.
        return df

    # Every calendar day between the first and last observation, formatted
    # like the 'date' column so the values compare equal.
    first_day = pd.to_datetime(df['date'].min(), format='%Y-%m-%d')
    last_day = pd.to_datetime(df['date'].max(), format='%Y-%m-%d')
    all_dates = pd.date_range(first_day, last_day).strftime('%Y-%m-%d')
    # Rows without a code cannot be placed on the grid, so they are not
    # used to build it (they still stay in the result).
    fips_codes = df['fips'].dropna().unique()

    # fips -> county / state lookups (the last occurrence wins).
    county_by_fips = df.set_index('fips')['county'].dropna().to_dict()
    state_by_fips = df.set_index('fips')['state'].dropna().to_dict()

    # Missing pairs = full date x fips grid minus the observed pairs.
    # Codes are compared at full length, whatever their size.
    full_grid = pd.MultiIndex.from_product([all_dates, fips_codes], names=['date', 'fips'])
    observed = pd.MultiIndex.from_frame(df[['date', 'fips']].drop_duplicates())
    missing = full_grid.difference(observed).to_frame(index=False, name=['date', 'fips'])

    pieces = [df]
    if not missing.empty:
        # An empty filler frame is skipped so it cannot affect column dtypes.
        pieces.append(
            missing.assign(
                county=missing['fips'].map(county_by_fips),
                state=missing['fips'].map(state_by_fips),
                cases=0,
                deaths=0,
            )
        )

    return (
        pd.concat(pieces, ignore_index=True)
          .sort_values(by=['fips', 'date'])
          .drop_duplicates()
    )

In [None]:
# Read the rolling-average files and build the gap-free daily per-county table.
county_daily_df = get_county_daily_infections_nyt(county_daily_urls)

In [None]:
# Show the daily per-county table as the cell output.
county_daily_df

In [None]:
# Group the daily table by FIPS (kept for reference, currently disabled)
# county_grouped = county_daily_df.groupby(county_daily_df.fips)

# Cumulative infections

In [None]:
# Cumulative county files: one per listed year plus the "recent" file.
# The comprehension expands to the same four URLs, in the same order.
county_cum_urls = [
    f'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties-{period}.csv'
    for period in ('2020', '2021', '2022', 'recent')
]

In [None]:
def get_county_cum_infections_nyt(urls):
    """Load cumulative county files and return a cleaned per-county table.

    The files are concatenated, rows of the "Unknown" county are removed,
    missing ``cases`` / ``deaths`` are replaced by 0, the city-level
    entries that come without a FIPS code receive a substitute code, exact
    duplicate rows are dropped and ``fips``, ``cases`` and ``deaths`` are
    cast to ``int``.

    Parameters
    ----------
    urls : iterable
        Sources accepted by ``pandas.read_csv`` (URL, path or file-like
        object). Each must provide at least ``county``, ``fips``,
        ``cases`` and ``deaths``.

    Returns
    -------
    pandas.DataFrame
        Same columns as the inputs, with integer ``fips``, ``cases`` and
        ``deaths``.

    Raises
    ------
    ValueError
        If a row still has no FIPS code after the substitutions below.
    """
    frames = [pd.read_csv(url) for url in urls]
    df = pd.concat(frames, ignore_index=True)

    # query/fillna each return a new frame, so the .loc assignments below
    # do not write to a filtered slice.
    df = (
        df.query('county != "Unknown"')
          .fillna({'deaths': 0, 'cases': 0})
    )

    # Entries reported without a FIPS code, and the code assigned to each.
    substitute_fips = {
        # New York City
        # All cases for the five boroughs of New York City (New York, Kings,
        # Queens, Bronx and Richmond counties) are assigned to a single area
        # called New York City. The number of deaths in New York City also
        # includes probable deaths reported by the New York City health
        # department. Code used: FIPS of New York State.
        'New York City': 36,
        # Kansas City
        # Four counties (Cass, Clay, Jackson and Platte) overlap the
        # municipality of Kansas City, Mo. The figures for these four
        # counties cover only the portions exclusive of Kansas City, which
        # is reported as its own line. The city is in Missouri, so it gets
        # Missouri's state FIPS (29); 20 would be Kansas.
        'Kansas City': 29,
        # City of Joplin: FIPS code for the city.
        'Joplin': 2937592,
    }
    for county_name, code in substitute_fips.items():
        needs_code = (df['county'] == county_name) & df['fips'].isna()
        df.loc[needs_code, 'fips'] = code

    # Fail with a readable message instead of an opaque NaN-to-int cast error.
    unresolved = df.loc[df['fips'].isna(), 'county'].unique()
    if len(unresolved):
        raise ValueError(
            f"No FIPS code available for: {sorted(map(str, unresolved))}"
        )

    # Input files may repeat the same rows; keep each row once, as the
    # daily loader does.
    df = df.drop_duplicates()
    # FIPS and counts arrive as floats when the column had gaps (1001.0 -> 1001).
    return df.astype({"fips": int, "cases": int, "deaths": int})

In [None]:
# Read the cumulative files and build the cleaned per-county cumulative table.
county_cum_df = get_county_cum_infections_nyt(county_cum_urls)

In [None]:
# Show the cumulative per-county table as the cell output.
county_cum_df