In [13]:
import random
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.ensemble import RandomForestRegressor

<h1>Global Dataframes & Variables</h1>

In [14]:
# We import the sample_submission.csv file as a way of determining
# the order of the rows in our output file
sample_submission = pd.read_csv("sample_submission.csv")

# The fips_key.csv file contains standard information about each county
key = pd.read_csv("data/us/processing_data/fips_key.csv", encoding='latin-1')

# Daily deaths contains the death count per day for each county.
# Cumulative deaths contains the total death count for each county
# by day.
daily_deaths = pd.read_csv("data/us/covid/nyt_us_counties_daily.csv")
cumulative_deaths = pd.read_csv("data/us/covid/deaths.csv")

# List of all counties (every value of the FIPS column of the key table)
all_fips = key["FIPS"].tolist()

# Relevant dates, taken from the column labels of cumulative_deaths.
# NOTE(review): the negative offsets assume the trailing columns are
# consecutive daily dates with the most recent one last -- confirm.
today = cumulative_deaths.columns[-1]
yesterday = cumulative_deaths.columns[-2]
one_week_ago = cumulative_deaths.columns[-8]
two_weeks_ago = cumulative_deaths.columns[-15]
# Presumably the first date column, i.e. the one right after the four
# identifier columns that get_date_of_first_death drops -- verify.
beginning = cumulative_deaths.columns[4]

<h1>Global Helper Functions</h1>

In [34]:
def date_from_yyyy_mm_dd(date_string):
    """Parse a ``yyyy-mm-dd`` string into a ``datetime.date``.

    Args:
        date_string: Text with year, month and day separated by ``-``.

    Returns:
        The corresponding ``datetime.date``.

    Raises:
        ValueError: If a component is not an integer or the values do not
            form a valid calendar date.
        IndexError: If fewer than three components are present.
    """
    parts = [int(piece) for piece in date_string.split('-')]
    # Components arrive as year, month, day -- pass them to date() in that
    # same order (month before day).
    return date(parts[0], parts[1], parts[2])

def date_from_mm_dd_yy(date_string):
    """Parse a ``mm/dd/yy`` string into a ``datetime.date``.

    Args:
        date_string: Text with month, day and year separated by ``/``.
            A year below 100 is taken to be in the 2000s; a larger year
            (for example a four-digit one) is used unchanged.

    Returns:
        The corresponding ``datetime.date``.

    Raises:
        ValueError: If a component is not an integer or the values do not
            form a valid calendar date.
        IndexError: If fewer than three components are present.
    """
    parts = [int(piece) for piece in date_string.split('/')]
    year = parts[2]
    # Add the century arithmetically: gluing "20" onto the already-parsed
    # integer would lose a leading zero ("05" -> 5 -> "205").
    if year < 100:
        year += 2000
    return date(year, parts[0], parts[1])

# Assume date is in format mm/dd/yy, convert to yyyy-mm-dd
def convert_date_to_yyyy_mm_dd(date):
    """Convert a ``mm/dd/yy`` string to a zero-padded ``yyyy-mm-dd`` string.

    Args:
        date: Text with month, day and year separated by ``/``. Note that
            this parameter name hides the ``datetime.date`` class inside
            the function body.

    Returns:
        The date as ``yyyy-mm-dd``. The year comes from the input: a
        four-character year is used as is, a shorter one is prefixed with
        ``20``. If the input has no year component, ``2020`` is used.
    """
    parts = date.split('/')

    # Ensure leading zeros if necessary
    month = parts[0].zfill(2)
    day = parts[1].zfill(2)

    # Take the year from the input instead of assuming 2020, so dates from
    # other years convert correctly.
    if len(parts) > 2 and parts[2]:
        year = parts[2] if len(parts[2]) == 4 else "20" + parts[2].zfill(2)
    else:
        year = "2020"

    return year + "-" + month + "-" + day

# Assume date is in format yyyy-mm-dd, convert to mm/dd/yy
def convert_date_to_mm_dd_yy(date):
    """Convert a ``yyyy-mm-dd`` string to an unpadded ``m/d/yy`` string.

    Args:
        date: Text with year, month and day separated by ``-``. Note that
            this parameter name hides the ``datetime.date`` class inside
            the function body.

    Returns:
        The date as ``month/day/yy``, where one leading zero is removed from
        the month and the day, and ``yy`` is the last two characters of the
        year component.
    """
    parts = date.split('-')

    # Remove leading zeros if necessary
    month = parts[1][1:] if parts[1].startswith("0") else parts[1]
    day = parts[2][1:] if parts[2].startswith("0") else parts[2]

    # Use the year from the input instead of assuming "20", so dates from
    # other years convert correctly.
    return month + "/" + day + "/" + parts[0][-2:]

# Get the name of a county from a given FIPS code
def get_name_from_fips(FIPS):
    """Return the COUNTY value of the first row of ``key`` whose FIPS matches.

    Reads the module-level ``key`` dataframe. Raises ``IndexError`` when no
    row carries the requested FIPS code.
    """
    is_match = key["FIPS"] == FIPS
    county_names = key.loc[is_match, "COUNTY"]
    return county_names.values[0]

# Get the date of the first death of a given county. If the
# county has no deaths, return "N/A"
def get_date_of_first_death(FIPS):
    """Return the label of the first non-zero death column for a county.

    Reads the module-level ``cumulative_deaths`` dataframe, selects the rows
    whose ``countyFIPS`` equals ``FIPS`` and scans the first such row, left
    to right, after dropping the four identifier columns.

    Args:
        FIPS: County code to look up in the ``countyFIPS`` column.

    Returns:
        The column label of the first non-zero entry, or the string
        ``"N/A"`` when every entry is zero or when the table holds no row
        for this FIPS code.
    """
    county = cumulative_deaths.loc[cumulative_deaths["countyFIPS"] == FIPS]

    # A county that is missing from the table has no recorded deaths; report
    # the same sentinel rather than failing on the empty selection.
    if county.empty:
        return "N/A"

    deaths_dates = county.drop(columns=['countyFIPS', 'County Name', 'State', 'stateFIPS'])
    first_row = deaths_dates.values[0]

    for column, count in zip(deaths_dates.columns, first_row):
        if count != 0:
            return column

    return "N/A"

# Get a list of all the deaths by date for a given county,
# starting from the date of the first case
def get_deaths_list(FIPS, endDate=None):
    """Return a county's ``deaths`` values up to and including ``endDate``.

    Reads the module-level ``daily_deaths`` dataframe and keeps the rows
    whose ``fips`` equals ``FIPS``, in the order they appear in that table
    (presumably chronological -- verify against the data file).

    Args:
        FIPS: County code to look up in the ``fips`` column.
        endDate: Value to look for in the ``date`` column (the default is
            built in ``yyyy-mm-dd`` form). When omitted, the module-level
            ``today`` label is converted with ``convert_date_to_yyyy_mm_dd``.

    Returns:
        The slice of the county's ``deaths`` values from its first row
        through the first row whose ``date`` equals ``endDate``, or an empty
        list when the county has no row with that date.
    """
    # Resolve the default at call time. The converter is named
    # convert_date_to_yyyy_mm_dd; evaluating a call to it in the signature
    # would also freeze the value when the cell is run.
    if endDate is None:
        endDate = convert_date_to_yyyy_mm_dd(today)

    # Extract only the rows for this county in order by date
    rows = daily_deaths.loc[daily_deaths["fips"] == FIPS]
    deaths_list = rows["deaths"].values
    dates_list = list(rows["date"].values)

    if endDate not in dates_list:
        return []

    return deaths_list[0:dates_list.index(endDate) + 1]

# Returns true if there exists deaths data for the county,
# and false otherwise. We need this because some FIPS are
# not included in the ny times data at all.
def deaths_data_exists(FIPS):
    """Tell whether ``daily_deaths`` has at least one row for this FIPS code.

    Reads the module-level ``daily_deaths`` dataframe and compares its
    ``fips`` column with ``FIPS``; the result is a plain ``bool``.
    """
    matching_rows = daily_deaths[daily_deaths["fips"] == FIPS]
    row_count = matching_rows.shape[0]
    return row_count > 0

# Generate the quantiles for a given value and standard error
# according to a normal distribution.
def generate_quantiles(value, err):
    """Return the 10%..90% quantiles of a normal distribution.

    Args:
        value: Mean (``loc``) of the distribution.
        err: Standard error, used as the distribution's ``scale``.

    Returns:
        A list with one entry per level 0.1, 0.2, ..., 0.9, in that order.
        When ``err`` is 0 the distribution is degenerate and every entry is
        ``value`` itself.
    """
    levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # A zero scale is not a usable normal distribution; all of its mass sits
    # on the mean, so every quantile equals the value.
    if err == 0:
        return [value] * len(levels)

    # One vectorised inverse-CDF evaluation covers all levels at once.
    return list(norm.ppf(levels, loc=value, scale=err))

# Generate quantiles for a given list of values and errors
def generate_list_quantiles(lst, err_lst):
    """Apply ``generate_quantiles`` to each value and its matching error.

    Position ``i`` of the result is ``generate_quantiles(lst[i], err_lst[i])``
    for every ``i`` in ``range(len(lst))``.
    """
    return [
        generate_quantiles(lst[position], err_lst[position])
        for position in range(len(lst))
    ]