# Open NYC COVID-19 Analysis 

This notebook is meant to serve as a starter for anyone interested in diving into the most recent data surrounding the global 2019 novel coronavirus (COVID-19) outbreak.

Datasets:
- Epidemiology Data: https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases
- Population Data: https://population.un.org/wpp/Download/Standard/Population/

In [1]:
import pandas as pd
import cufflinks as cf
import plotly.graph_objs as go
import plotly.express as px
import numpy as np
from scipy.optimize import curve_fit
from typing import List
from IPython.display import display
cf.go_offline()

## Fetch data from JHU
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports

In [2]:
# URL template for one JHU CSSE daily report; %s is the report date as MM-DD-YYYY.
daily_jhu_report_url_base = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/%s.csv"
def gather_daily_jhu_reports(date_rng):
    """Download the JHU daily report for each date and stack them into one frame.

    Args:
        date_rng: iterable of datetime-like objects (anything with ``strftime``).

    Returns:
        A DataFrame holding the rows of every report that could be read, with a
        leading "Date" column set to the requested date and a fresh 0..n-1
        index. Dates whose report cannot be read are reported on stdout and
        skipped. An empty DataFrame is returned when nothing could be read.
    """
    # Map the alternate header spellings onto one set of names so that every
    # daily file lands in the same columns when stacked.
    column_aliases = {
        'Country_Region': 'Country/Region',
        "Province_State": "Province/State",
        "Last_Update": "Last Update",
        "Lat": "Latitude",
        "Long_": "Longitude",
    }
    daily_reports = []
    for date in date_rng:
        date_str = date.strftime("%m-%d-%Y")
        print("Collecting data from: %s" % date_str)
        try:
            daily_report = pd.read_csv(daily_jhu_report_url_base % date_str)
            daily_report.insert(0, "Date", date)
        except Exception as err:
            # Best effort: a missing or malformed day must not abort the whole
            # download, but Ctrl-C (KeyboardInterrupt) still propagates.
            print("FAILED TO FETCH DATA FOR: %s (%s: %s)" % (date_str, type(err).__name__, err))
            continue
        daily_reports.append(daily_report.rename(columns=column_aliases))
    if not daily_reports:
        # pd.concat refuses an empty list; keep the "empty frame" contract.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every day.
    return pd.concat(daily_reports, axis=0, ignore_index=True)
# Fetch every daily report from 22 Jan 2020 through 25 Mar 2020 and cache the
# stacked result on disk (without the row index) for the cells that follow.
report_data = gather_daily_jhu_reports(
    pd.date_range(start="1-22-2020", end="03-25-2020").to_pydatetime()
)
report_data.to_csv("daily_jhu_report_data.csv", index=False)


Collecting data from: 01-22-2020
Collecting data from: 01-23-2020
Collecting data from: 01-24-2020
Collecting data from: 01-25-2020
Collecting data from: 01-26-2020
Collecting data from: 01-27-2020
Collecting data from: 01-28-2020
Collecting data from: 01-29-2020
Collecting data from: 01-30-2020
Collecting data from: 01-31-2020
Collecting data from: 02-01-2020
Collecting data from: 02-02-2020
Collecting data from: 02-03-2020
Collecting data from: 02-04-2020
Collecting data from: 02-05-2020
Collecting data from: 02-06-2020
Collecting data from: 02-07-2020
Collecting data from: 02-08-2020
Collecting data from: 02-09-2020
Collecting data from: 02-10-2020
Collecting data from: 02-11-2020
Collecting data from: 02-12-2020
Collecting data from: 02-13-2020
Collecting data from: 02-14-2020
Collecting data from: 02-15-2020
Collecting data from: 02-16-2020
Collecting data from: 02-17-2020
Collecting data from: 02-18-2020
Collecting data from: 02-19-2020
Collecting data from: 02-20-2020
Collecting

In [3]:
jhu_data = pd.read_csv("daily_jhu_report_data.csv")
# New York City appears under several labels in the stacked reports, so a row
# is kept if any of the known spellings matches.
selector = (
    (jhu_data["Combined_Key"] == "New York City, New York, US")
    | jhu_data["Province/State"].isin(["New York City, NY", "New York County, NY"])
    | (jhu_data["Admin2"] == "New York City")
)
jhu_nyc_data = jhu_data[selector]
jhu_nyc_data.tail(5)

Unnamed: 0,Date,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,FIPS,Admin2,Active,Combined_Key
4327,2020-03-09,"New York County, NY",US,2020-03-09T17:13:16,19.0,0.0,0.0,40.7128,-74.006,,,,
7617,2020-03-22,New York,US,3/22/20 23:45,9654.0,63.0,0.0,40.767273,-73.971526,36061.0,New York City,0.0,"New York City, New York, US"
13101,2020-03-23,New York,US,2020-03-23 23:19:34,12305.0,99.0,0.0,40.767273,-73.971526,36061.0,New York City,0.0,"New York City, New York, US"
16516,2020-03-24,New York,US,2020-03-24 23:37:31,14904.0,131.0,0.0,40.767273,-73.971526,36061.0,New York City,0.0,"New York City, New York, US"
19933,2020-03-25,New York,US,2020-03-25 23:33:19,17856.0,199.0,0.0,40.767273,-73.971526,36061.0,New York City,0.0,"New York City, New York, US"


In [4]:
def _province_by_date(reports, province):
    """Return the rows of ``reports`` for one "Province/State", indexed by date.

    The "Date" column is parsed with ``pd.to_datetime`` and becomes the index.
    A new frame is built rather than assigning into a filtered slice, so
    ``reports`` is never modified and no SettingWithCopyWarning is raised.
    """
    return (
        reports.loc[reports["Province/State"] == province]
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
        .set_index("Date")
    )

jhu_hubei_data = _province_by_date(jhu_data, "Hubei")
jhu_hunan_data = _province_by_date(jhu_data, "Hunan")

In [5]:
# Highest "Confirmed" value recorded for each Province/State, top 20 first.
(
    jhu_data
    .groupby("Province/State")["Confirmed"]
    .max()
    .sort_values(ascending=False)
    .head(20)
)

Province/State
Hubei               67801.0
French Polynesia    19874.0
New York            17856.0
France              14282.0
United Kingdom       5018.0
Netherlands          3631.0
Washington           1793.0
Guangdong            1433.0
Illinois             1418.0
California           1364.0
Quebec               1342.0
New Jersey           1327.0
Denmark              1326.0
Henan                1274.0
Zhejiang             1241.0
Michigan             1122.0
New South Wales      1029.0
Hunan                1018.0
Anhui                 990.0
Jiangxi               936.0
Name: Confirmed, dtype: float64

## Extract data from covid-19.direct
https://covid-19.direct/county/NY/New%20York%20City

In [6]:
nyc_pop = 8623000  # population used below to express counts per 100,000 people
raw_data = pd.read_json("covid_direct_3-25-20.json")
# Build the cleaned frame in one non-mutating chain: keep the four columns of
# interest, drop incomplete rows, parse the date and use it as the index.
# (Mutating a column subset in place triggers pandas' SettingWithCopyWarning.)
clean_nyc_data = (
    raw_data[["confirmed", "death", "fulldate", "newcase"]]
    .dropna()
    .assign(fulldate=lambda frame: pd.to_datetime(frame["fulldate"]))
    .set_index("fulldate")
)
clean_nyc_data[["confirmed", "death"]].iplot(
    title="NYC COVID-19: cumulative confirmed cases and deaths",
    yTitle="count"
)
(clean_nyc_data[["confirmed", "death"]]*100000/nyc_pop).iplot(
    title="NYC COVID-19: cumulative confirmed cases and deaths per 100,000 people",
    yTitle="count per 100,000 people"
)

## Fetch data from csbs.org

https://github.com/tomquisel/covid19-data/tree/master/data/csv

In [7]:
# URL template for one daily CSV in the covid19-data mirror; %s is YYYY-MM-DD.
daily_csbs_report_url_base = "https://raw.githubusercontent.com/tomquisel/covid19-data/master/data/csv/%s.csv"
def gather_daily_csbs_reports(date_rng):
    """Download the daily CSV for each date and stack them into one frame.

    Args:
        date_rng: iterable of datetime-like objects (anything with ``strftime``).

    Returns:
        A DataFrame holding the rows of every file that could be read, with a
        leading "Date" column set to the requested date and a fresh 0..n-1
        index. Dates whose file cannot be read are reported on stdout and
        skipped. An empty DataFrame is returned when nothing could be read.
    """
    daily_reports = []
    for date in date_rng:
        date_str = date.strftime("%Y-%m-%d")
        print("Collecting data from: %s" % date_str)
        try:
            daily_report = pd.read_csv(daily_csbs_report_url_base % date_str)
            daily_report.insert(0, "Date", date)
        except Exception as err:
            # Best effort: a missing or malformed day must not abort the whole
            # download, but Ctrl-C (KeyboardInterrupt) still propagates.
            print("FAILED TO FETCH DATA FOR: %s (%s: %s)" % (date_str, type(err).__name__, err))
            continue
        daily_reports.append(daily_report)
    if not daily_reports:
        # pd.concat refuses an empty list; keep the "empty frame" contract.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every day.
    return pd.concat(daily_reports, axis=0, ignore_index=True)
# Fetch the daily files for 14-25 Mar 2020 and cache the stacked result on disk
# (without the row index) for the next cell.
report_data = gather_daily_csbs_reports(
    pd.date_range(start="03-14-2020", end="03-25-2020").to_pydatetime()
)
report_data.to_csv("daily_csbs_report_data.csv", index=False)


Collecting data from: 2020-03-14
Collecting data from: 2020-03-15
Collecting data from: 2020-03-16
Collecting data from: 2020-03-17
Collecting data from: 2020-03-18
Collecting data from: 2020-03-19
Collecting data from: 2020-03-20
Collecting data from: 2020-03-21
Collecting data from: 2020-03-22
Collecting data from: 2020-03-23
Collecting data from: 2020-03-24
Collecting data from: 2020-03-25


In [8]:
csbs_data = pd.read_csv("daily_csbs_report_data.csv")
# Build the filtered, date-indexed frame in one non-mutating chain; assigning a
# column on a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
nyc_csbs_data = (
    csbs_data.loc[csbs_data["County_Name"] == "New York"]
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .set_index("Date")
)
nyc_csbs_data[["Confirmed", "Death"]].iplot()
nyc_csbs_data

Unnamed: 0_level_0,County_Name,State_Name,Confirmed,New,Death,Fatality_Rate,Last_Update,Latitude,Longitude,New_Death
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-03-14,New York,New York,154,,1,0.006494,2020-03-14 12:04 EDT,40.71455,-74.00714,
2020-03-15,New York,New York,269,115.0,3,0.011152,2020-03-15 16:23 EDT,40.71455,-74.00714,
2020-03-16,New York,New York,463,194.0,9,0.019438,2020-03-16 21:26 UTC,40.71455,-74.00714,
2020-03-17,New York,New York,814,351.0,7,0.0086,2020-03-17 21:54 EDT,40.71455,-74.00714,
2020-03-18,New York,New York,1871,1057.0,11,0.005879,2020-03-18 19:42 EDT,40.71455,-74.00714,
2020-03-19,New York,New York,3954,2083.0,26,0.006576,2020-03-19 21:57 EDT,40.71455,-74.00714,
2020-03-20,New York,New York,5683,1729.0,43,0.007566,2020-03-21 00:57 EDT,40.71455,-74.00714,
2020-03-21,New York,New York,8115,2432.0,60,0.007394,2020-03-22 00:28 EDT,40.71455,-74.00714,
2020-03-22,New York,New York,10764,2649.0,99,0.009197,2020/03/22 22:06 EDT,40.71455,-74.00714,99.0
2020-03-23,New York,New York,12305,1541.0,99,0.008046,2020-03-23 21:39 EDT,40.71455,-74.00714,99.0


# Regression Analysis

## Logistic Modeling of Outbreaks

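The following models fit a logistic curve to the most recent cumulative confirmed-case data (per 100,000 people) for each location: New York City, Hubei and Hunan. Approximate 95% confidence intervals, computed as ±2 standard deviations of the fitted parameters, are provided (if not shown, the intervals are too large to depict, and models may be highly inaccurate).  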

<img src="http://andymath.com/wp-content/uploads/2019/08/Logistic-Function.jpg" width="240px" style="float: left">

In [9]:
def logist_reg(cum_confirmed_series, city_pop: int, city_name: str, forecast_days: int = 5, show_ci: bool = True):
    """Fit a three-parameter logistic curve to cases per 100,000 people and plot it.

    Only observations above 10 cumulative cases are used. They are rescaled to
    cases per 100,000 people and re-indexed 0, 1, 2, ... so that x counts
    observations since that threshold was crossed.

    Args:
        cum_confirmed_series: pandas Series of cumulative confirmed counts
            (assumed to be in chronological order, one value per day).
        city_pop: population used for the per-100,000 rescaling.
        city_name: label used in the plot title and in the failure message.
        forecast_days: number of steps past the last observation to extrapolate.
        show_ci: when True, also plot curves evaluated at the parameters
            shifted by two standard deviations ("Upper"/"Lower").

    Returns:
        ``(param_est, param_stdev)``: the fitted ``(a, b, c)`` of
        ``a / (1 + exp(-b * (x - c)))`` and their one-standard-deviation errors.
        ``(None, None)`` when there is no usable data or the optimiser fails.
    """
    def logist_fn(x, a, b, c):
        return a / (1 + np.exp(-b*(x-c)))

    reg_data = cum_confirmed_series[cum_confirmed_series > 10]
    reg_data = (100000 * reg_data / city_pop).reset_index(drop=True)
    if reg_data.empty:
        # curve_fit raises ValueError on empty input; report it like any other
        # failed fit instead of crashing the caller's loop.
        print("Failed to generate adequate regression for: " + city_name)
        return (None, None)

    # Only the fit itself is guarded, so a plotting error is never reported as
    # a regression failure.
    try:
        param_est, param_cov = curve_fit(logist_fn, reg_data.index.values, reg_data.values, bounds=(0, [400, 0.6, 200]))
    except RuntimeError:
        print("Failed to generate adequate regression for: " + city_name)
        # Same arity as the success path so callers can always unpack two values.
        return (None, None)

    #compute 1x StDev errors for params
    param_stdev = np.sqrt(np.diag(param_cov))
    forecast_x = np.arange(len(reg_data.index) + forecast_days)
    formula_str = "%0.2f / (1 + exp(-%0.2f(x-%0.2f)))" % (param_est[0], param_est[1], param_est[2])

    # Columns are aligned on the 0..n-1 index; "Actual" is NaN over the forecast.
    curves = {
        formula_str: pd.Series(logist_fn(forecast_x, *param_est)),
        "Actual": reg_data,
    }
    dash_styles = ["dash", "solid"]
    if show_ci:
        # An earlier midpoint gives a higher curve, so its sign is flipped to
        # make +2 sigma the upper envelope for all three parameters.
        plot_stdev = param_stdev * [1, 1, -1]
        curves["Upper"] = pd.Series(logist_fn(forecast_x, *np.maximum(0, param_est + 2*plot_stdev)))
        curves["Lower"] = pd.Series(logist_fn(forecast_x, *np.maximum(0, param_est - 2*plot_stdev)))
        dash_styles = ["solid", "solid", "dash", "dash"]

    pd.DataFrame(curves).iplot(
        title="Forecasting COVID-19 Confirmed Cases: " + city_name,
        # x starts where the cumulative count first exceeds 10 (see filter above).
        xTitle="Days since cumulative confirmed cases exceeded 10",
        yTitle="# of confirmed cases per 100,000 people",
        dash=dash_styles,
        legend="bottom"
    )
    return (param_est, param_stdev)
logist_param_est = []
logist_param_stdev = []
pop_counts = {"NYC": nyc_pop, "Hubei": 58500000, "Hunan": 67370000}
loc_dict = {"NYC": clean_nyc_data["confirmed"], "Hubei": jhu_hubei_data["Confirmed"], "Hunan": jhu_hunan_data["Confirmed"]}
for (loc, locdf) in loc_dict.items():
    res = logist_reg(locdf, pop_counts[loc], loc)
    logist_param_est.append(res[0])
    logist_param_stdev.append(res[1])

param_names = ["Max Value", "Growth Rate", "Midpoint"]
loc_names = list(loc_dict.keys())

# logist_reg yields None for a failed fit; show such a location as an empty
# row instead of crashing on arithmetic with None.
est_rows = [[np.nan] * len(param_names) if est is None else est for est in logist_param_est]
print("Logistic Regression Parameter Estimates")
display(pd.DataFrame(est_rows, index=loc_names, columns=param_names))

# Approximate 95% interval per parameter: estimate +/- 2 standard deviations,
# floored at 0. Stored as (lower, upper) tuples of plain Python floats.
logist_param_95ci = []
for est, stdev in zip(logist_param_est, logist_param_stdev):
    if est is None or stdev is None:
        logist_param_95ci.append([None] * len(param_names))
        continue
    lower = np.maximum(0, est - 2*stdev).round(5).tolist()
    upper = np.maximum(0, est + 2*stdev).round(5).tolist()
    logist_param_95ci.append(list(zip(lower, upper)))
print("")
print("Logistic Regression Parameter 95% Confidence Intervals")
display(pd.DataFrame(logist_param_95ci, index=loc_names, columns=param_names))

Logistic Regression Parameter Estimates


Unnamed: 0,Max Value,Growth Rate,Midpoint
NYC,258.519854,0.519443,16.488006
Hubei,115.781145,0.234776,18.692394
Hunan,1.511187,0.282525,9.895721



Logistic Regression Parameter 95% Confidence Intervals


Unnamed: 0,Max Value,Growth Rate,Midpoint
NYC,"(189.59863, 327.44108)","(0.41344, 0.62545)","(15.34489, 17.63112)"
Hubei,"(114.53532, 117.02696)","(0.21929, 0.25026)","(18.36796, 19.01683)"
Hunan,"(1.50551, 1.51687)","(0.27427, 0.29078)","(9.78097, 10.01047)"


### Retrospective Logistic Regression

The following code goes back in time and re-fits the logistic model once per "prediction day", each time using only the data that was available before that day. See how the predictions evolve over time as we gain more information on the trajectory of each location's outbreak!

In [10]:
#Generate animated plots showing the change in forecast as each day's stats are added
def logist_reg_retro(cum_confirmed_series, city_pop: int, city_name: str, retrospect_days: int = 15, forecast_days: int = 10, show_ci: bool = True):
    """Animate how the logistic forecast evolves as observations accumulate.

    Only observations above 10 cumulative cases are used, rescaled to cases per
    100,000 people and re-indexed 0, 1, 2, .... For each of the last
    ``retrospect_days`` positions a logistic curve is fitted to the
    observations before that position; each fit becomes one animation frame.

    Args:
        cum_confirmed_series: pandas Series of cumulative confirmed counts
            (assumed to be in chronological order, one value per day).
        city_pop: population used for the per-100,000 rescaling.
        city_name: label used in the plot title and in failure messages.
        retrospect_days: how many of the most recent prediction days to replay.
            Clamped so that every fit sees at least 4 observations.
        forecast_days: number of steps past the last observation to extrapolate.
        show_ci: when True, add "Upper"/"Lower" curves (parameters shifted by
            two standard deviations) for fits whose parameter errors are small.

    Returns:
        None. The figure is shown as a side effect; if no fit could be produced
        a message is printed and nothing is plotted.
    """
    def logist_fn(x, a, b, c):
        return a / (1 + np.exp(-b*(x-c)))

    reg_data = cum_confirmed_series[cum_confirmed_series > 10]
    reg_data = (100000 * reg_data / city_pop).reset_index(drop=True)
    n_obs = len(reg_data.index)

    # One more observation than the three fitted parameters, so curve_fit can
    # estimate a finite covariance matrix.
    min_fit_points = 4
    first_forecast_day = max(n_obs - retrospect_days, min_fit_points)

    outbreak_days = np.arange(n_obs + forecast_days)
    # Observed values padded with NaN over the forecast horizon.
    actual = reg_data.reindex(outbreak_days).values
    no_ci = np.full(len(outbreak_days), np.nan)

    frames = []
    for forecast_day in range(first_forecast_day, n_obs):
        limited_reg_data = reg_data.iloc[:forecast_day]
        try:
            # NOTE(review): these upper bounds (200, 1, 100) differ from the
            # (400, 0.6, 200) used by logist_reg in this file — confirm intent.
            param_est, param_cov = curve_fit(logist_fn, limited_reg_data.index.values, limited_reg_data.values, bounds=(0, [200, 1, 100]))
        except RuntimeError:
            print("Failed to generate adequate regression for: " + city_name)
            continue
        #compute 1x StDev errors for params
        param_stdev = np.sqrt(np.diag(param_cov))
        formula_str = "%0.2f / (1 + exp(-%0.2f(x-%0.2f)))" % (param_est[0], param_est[1], param_est[2])
        pred_data_upper = no_ci
        pred_data_lower = no_ci
        # Only draw the envelope when the parameter errors are small enough
        # for it to be meaningful.
        if show_ci and np.all(np.less(param_stdev, [100, 50, 25])):
            # An earlier midpoint gives a higher curve, hence the sign flip.
            plot_stdev = param_stdev * [1, 1, -1]
            pred_data_upper = logist_fn(outbreak_days, *np.maximum(0, param_est + 2*plot_stdev))
            pred_data_lower = logist_fn(outbreak_days, *np.maximum(0, param_est - 2*plot_stdev))
        frames.append(pd.DataFrame({
            "Outbreak Day": outbreak_days,
            "Predicted": logist_fn(outbreak_days, *param_est),
            "Actual": actual,
            "Upper": pred_data_upper,
            "Lower": pred_data_lower,
            "Prediction Day": forecast_day,
            "Formula": formula_str,
        }))

    if not frames:
        # Too little data, or every fit failed: there is nothing to animate.
        print("No retrospective forecasts could be generated for: " + city_name)
        return

    # Concatenate once; DataFrame.append was removed in pandas 2.0.
    plot_data = pd.concat(frames, ignore_index=True)
    melt_plot_data = pd.melt(
        plot_data,
        id_vars=["Outbreak Day", "Prediction Day", "Formula"],
        value_vars=["Predicted", "Actual", "Upper", "Lower"],
        var_name="Trend Line",
        value_name="# of confirmed cases per 100k people",
    ).rename(columns={"Formula": "Regression Eq"})
    fig = px.line(melt_plot_data, x="Outbreak Day", y="# of confirmed cases per 100k people", title="Forecasting Retrospective: " + city_name,
            hover_data=["Regression Eq"], line_group="Trend Line", color="Trend Line", animation_frame="Prediction Day")
    # The play button is tuned only if the figure actually has animation controls.
    if len(fig["layout"]["updatemenus"]) > 0:
        fig["layout"]["updatemenus"][0]["buttons"][0]["args"][1]["transition"]["duration"] = 0
        fig["layout"]["updatemenus"][0]["buttons"][0]["args"][1]["frame"]["duration"] = 250
    fig.show()
    
# Render the retrospective animation for every configured location.
for loc in loc_dict:
    locdf = loc_dict[loc]
    res = logist_reg_retro(locdf, pop_counts[loc], loc)
