In [1]:
# Import libraries

import requests
import pandas as pd
from datetime import timedelta, date

In [2]:
def daterange(start_date, end_date):
    """Yield every day from start_date up to, but not including, end_date.

    Each day is produced as a string in 'MM-DD-YYYY' form. Nothing is yielded
    when end_date is not later than start_date.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield (start_date + timedelta(offset)).strftime("%m-%d-%Y")
        offset += 1

In [3]:
def _read_daily_report(url):
    """Read one daily-report CSV, skipping malformed lines on any pandas version."""
    try:
        # `on_bad_lines` is the pandas >= 1.3 spelling; the older
        # `error_bad_lines` keyword was removed in pandas 2.0.
        return pd.read_csv(url, on_bad_lines='skip')
    except TypeError:
        # Older pandas rejects the unknown keyword, so fall back to the
        # legacy spelling.
        return pd.read_csv(url, error_bad_lines=False)


def request_data(start_date, end_date):
    """Download the US daily reports for a date range and combine them.

    One CSV is fetched per date produced by `daterange(start_date, end_date)`,
    and each row is tagged with the date of the report it came from.

    Args:
        start_date: first day to fetch (a `datetime.date`).
        end_date: bound passed to `daterange` together with start_date.

    Returns:
        A DataFrame holding the rows of every report, with an added 'Date'
        column containing the report date as an 'MM-DD-YYYY' string. The
        per-file row index is kept as-is. An empty DataFrame is returned when
        the range contains no dates.
    """
    base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/'

    # Collect the frames and concatenate once; growing a frame with
    # pd.concat inside the loop copies all earlier rows on every iteration.
    reports = []
    for single_date in daterange(start_date, end_date):
        report = _read_daily_report(base_url + single_date + '.csv')
        report['Date'] = single_date
        reports.append(report)

    if not reports:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(reports)

# Inputs: fetch reports from the first date below up to (but excluding)
# yesterday, since daterange() stops before its end date.
today = date.today()
one_day = timedelta(days=1)
start_date = date(year=2020, month=4, day=12)
end_date = today - one_day

# Download and combine the daily reports
data = request_data(start_date, end_date)

In [4]:
 def clean_df(df):
    """Function that cleans the dataframe of unwanted data, drops outliers, and defines indexes"""
    # Set multi-index for dataframe
    df['State'] = df['Province_State']
    df.set_index(['State','Date'], inplace=True)
    df.sort_index(inplace=True)

    # Remove irrelavent rows
    df.drop(index=["American Samoa","Diamond Princess","Recovered","Guam"], level=0, inplace=True)
    df = df[df.Country_Region == 'US']
    df.drop(["Province_State","Country_Region","Last_Update", "Lat", "Long_", "Recovered", "Active", "FIPS", "Incident_Rate", "People_Hospitalized", "Mortality_Rate", "UID", "ISO3", "Testing_Rate", "Hospitalization_Rate"], axis=1, inplace=True)
    return df
    
# Rebind `data` to the cleaned frame. Re-running this cell on the already
# cleaned frame fails, because clean_df reads the 'Province_State' column
# that it also drops.
data = clean_df(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
def daily_confirmed(df):
    """Add a 'Daily_Confirmed' column: the day-over-day change in 'Confirmed'.

    When the index has a 'State' level, the difference is taken within each
    state so one state's totals are never subtracted from another's. Without
    that level, the difference is taken over the whole frame in row order.

    The first row of each state (or of the frame) has no earlier total to
    compare against and is left as NaN rather than reporting the cumulative
    total as a single day's count.

    Args:
        df: DataFrame with a 'Confirmed' column, rows in chronological order
            within each state.

    Returns:
        The same DataFrame object, with 'Daily_Confirmed' added in place.
    """
    cumulative = df['Confirmed']
    if 'State' in df.index.names:
        daily = cumulative.groupby(level='State').diff()
    else:
        daily = cumulative.diff()
    df['Daily_Confirmed'] = daily
    return df

# Add the 'Daily_Confirmed' column to the cleaned data
data = daily_confirmed(data)

In [6]:
def avg_daily_confirmed(df, window=5):
    """Add 'Avg_Daily_Confirmed': a rolling mean of 'Daily_Confirmed'.

    When the index has a 'State' level, the window is applied within each
    state so an average never mixes rows from two different states. Without
    that level, the window runs over the whole frame in row order. Rows that
    do not yet have `window` observations are NaN.

    Args:
        df: DataFrame with a 'Daily_Confirmed' column.
        window: number of consecutive rows to average (default 5).

    Returns:
        The same DataFrame object, with 'Avg_Daily_Confirmed' added in place.
    """
    daily = df['Daily_Confirmed']
    if 'State' in df.index.names:
        # transform keeps the original index, unlike groupby().rolling(),
        # which prepends the group key as an extra index level.
        averaged = daily.groupby(level='State').transform(
            lambda values: values.rolling(window=window).mean()
        )
    else:
        averaged = daily.rolling(window=window).mean()
    df['Avg_Daily_Confirmed'] = averaged
    return df

# Add the 'Avg_Daily_Confirmed' rolling-mean column
data = avg_daily_confirmed(data)

In [7]:
def change_avg_daily_confirmed(df):
    """Add 'Change_Avg_Daily_Confirmed': percent change of the rolling average.

    Each row is compared with the previous row's 'Avg_Daily_Confirmed'. When
    the index has a 'State' level, "previous" means the previous row of the
    same state, so the first row of a state is never compared with another
    state's last row. Without that level, the whole frame is used in row order.

    The result is NaN where there is no previous row, where the previous
    value is NaN, or where the previous value is 0 (the percentage would be
    undefined). The column is always created, even if every value is NaN.

    Args:
        df: DataFrame with an 'Avg_Daily_Confirmed' column.

    Returns:
        The same DataFrame object, with the new column added in place.
    """
    current = df['Avg_Daily_Confirmed']
    if 'State' in df.index.names:
        previous = current.groupby(level='State').shift()
    else:
        previous = current.shift()

    change = ((current - previous) / previous) * 100
    # Blank out rows whose baseline is zero instead of reporting inf/NaN noise.
    df['Change_Avg_Daily_Confirmed'] = change.where(previous != 0)
    return df

# Add the 'Change_Avg_Daily_Confirmed' percent-change column
data = change_avg_daily_confirmed(data)

In [8]:
def daily_tests(df):
    """Add a 'Daily_Tests' column: the day-over-day change in 'People_Tested'.

    When the index has a 'State' level, the difference is taken within each
    state so one state's totals are never subtracted from another's. Without
    that level, the difference is taken over the whole frame in row order.

    The first row of each state (or of the frame) has no earlier total to
    compare against and is left as NaN rather than reporting the cumulative
    total as a single day's count.

    Args:
        df: DataFrame with a 'People_Tested' column, rows in chronological
            order within each state.

    Returns:
        The same DataFrame object, with 'Daily_Tests' added in place.
    """
    cumulative = df['People_Tested']
    if 'State' in df.index.names:
        daily = cumulative.groupby(level='State').diff()
    else:
        daily = cumulative.diff()
    df['Daily_Tests'] = daily
    return df

# Add the 'Daily_Tests' column
data = daily_tests(data)

In [9]:
def daily_tests_positive(df):
    """Add 'Daily_Tests_Positive': daily confirmed cases as a percent of daily tests.

    The value is 'Daily_Confirmed' / 'Daily_Tests' * 100 for each row. Rows
    where 'Daily_Tests' is 0 are left as NaN because the percentage is
    undefined. The column is always created, even when the frame is empty or
    every row has zero tests.

    Args:
        df: DataFrame with 'Daily_Confirmed' and 'Daily_Tests' columns.

    Returns:
        The same DataFrame object, with 'Daily_Tests_Positive' added in place.
    """
    tests = df['Daily_Tests']
    percent_positive = df['Daily_Confirmed'] / tests * 100
    # Mask zero-test rows rather than keeping the inf/NaN the division yields.
    df['Daily_Tests_Positive'] = percent_positive.where(tests != 0)
    return df

# Add the 'Daily_Tests_Positive' percentage column
data = daily_tests_positive(data)

In [10]:
# Determine the rolling average (5 rows by default) of percent positive tests
def avg_daily_tests_positive(df, window=5):
    """Add 'Avg_Daily_Tests_Positive': a rolling mean of 'Daily_Tests_Positive'.

    When the index has a 'State' level, the window is applied within each
    state so an average never mixes rows from two different states. Without
    that level, the window runs over the whole frame in row order. Rows that
    do not yet have `window` non-NaN observations are NaN.

    Args:
        df: DataFrame with a 'Daily_Tests_Positive' column.
        window: number of consecutive rows to average (default 5).

    Returns:
        The same DataFrame object, with 'Avg_Daily_Tests_Positive' added in place.
    """
    percent_positive = df['Daily_Tests_Positive']
    if 'State' in df.index.names:
        # transform keeps the original index, unlike groupby().rolling(),
        # which prepends the group key as an extra index level.
        averaged = percent_positive.groupby(level='State').transform(
            lambda values: values.rolling(window=window).mean()
        )
    else:
        averaged = percent_positive.rolling(window=window).mean()
    df['Avg_Daily_Tests_Positive'] = averaged
    return df

# Add the 'Avg_Daily_Tests_Positive' rolling-mean column
data = avg_daily_tests_positive(data)

In [11]:
def change_avg_daily_tests_positive(df):
    """Add 'Change_Avg_Daily_Tests_Positive': percent change of the averaged positive rate.

    Each row is compared with the previous row's 'Avg_Daily_Tests_Positive'.
    When the index has a 'State' level, "previous" means the previous row of
    the same state, so the first row of a state is never compared with another
    state's last row. Without that level, the whole frame is used in row order.

    The result is NaN where there is no previous row, where the previous
    value is NaN, or where the previous value is 0 (the percentage would be
    undefined). The column is always created, even if every value is NaN.

    Args:
        df: DataFrame with an 'Avg_Daily_Tests_Positive' column.

    Returns:
        The same DataFrame object, with the new column added in place.
    """
    current = df['Avg_Daily_Tests_Positive']
    if 'State' in df.index.names:
        previous = current.groupby(level='State').shift()
    else:
        previous = current.shift()

    change = ((current - previous) / previous) * 100
    # Blank out rows whose baseline is zero instead of reporting inf/NaN noise.
    df['Change_Avg_Daily_Tests_Positive'] = change.where(previous != 0)
    return df

# Add the 'Change_Avg_Daily_Tests_Positive' percent-change column
data = change_avg_daily_tests_positive(data)

In [12]:
def generate_excel_output(df, path="output.xlsx"):
    """Write the report columns of `df` to an Excel workbook.

    Only the columns listed below are exported, in that order; the frame's
    index is written as well (pandas' default for `to_excel`).

    Args:
        df: DataFrame containing every column named in `report_columns`.
        path: destination file; defaults to "output.xlsx" in the current
            working directory.

    Raises:
        KeyError: if any of the report columns is missing from `df`.
    """
    report_columns = [
        'Confirmed',
        'Daily_Confirmed',
        'Avg_Daily_Confirmed',
        'Change_Avg_Daily_Confirmed',
        'Daily_Tests',
        'Daily_Tests_Positive',
        'Avg_Daily_Tests_Positive',
        'Change_Avg_Daily_Tests_Positive',
        'Deaths',
    ]
    df[report_columns].to_excel(path)

# Export the selected columns to "output.xlsx" in the working directory
generate_excel_output(data)