Dylan Han (djhan2)

In [1]:
import numpy as np
import pandas as pd
import altair as alt
from vega import VegaLite

In [2]:
# Dataset landing page:
# https://catalog.data.gov/dataset/covid-19-daily-counts-of-cases-hospitalizations-and-deaths
# The CSV itself is read straight from the NYC Health GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/nychealth/coronavirus-data/master/trends/data-by-day.csv"
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows to check the column names and the date_of_interest format.
df.head()

Unnamed: 0,date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,...,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
0,02/29/2020,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,03/01/2020,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,03/02/2020,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,03/03/2020,1,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,03/04/2020,5,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# date_of_interest is an MM/DD/YYYY string; parse it into timestamps so the
# charts below can bin on month(time) / year(time).
parsed_dates = pd.to_datetime(df['date_of_interest'], format='%m/%d/%Y')
df['time'] = parsed_dates

In [5]:
# Independent two-column copy (timestamp + daily case count) for the first heatmap.
df2 = df.loc[:, ['time', 'CASE_COUNT']].copy()

In [6]:
# Heatmap of total reported cases for each (month, year) cell.
# CASE_COUNT is summed explicitly: df2 holds one row per day, so without an
# aggregate every day is drawn as its own rectangle and the rectangles of a
# month simply overlap instead of showing the monthly total the title promises.
monthly_cases = 'sum(CASE_COUNT):Q'
chart2 = alt.Chart(df2, title="Total Count of Covid Cases Per Month in New York City").mark_rect().encode(
    alt.X('month(time):O').title('by month'),
    alt.Y('year(time):O').title('by year'),
    # No fixed colour domain: the previous [0, 30000] cap was presumably sized
    # for daily counts and would saturate on monthly totals.
    alt.Color(monthly_cases, scale=alt.Scale(scheme='reds')).title('cases'),
    tooltip=['month(time)', 'year(time)', alt.Tooltip(monthly_cases, title='cases')]
).properties(
    width=500,
    height=400
).interactive()

In [7]:
# Bare expression as the last line of the cell so the notebook renders the heatmap.
chart2

In [8]:
import panel as pn
from panel.interact import interact

In [9]:
# Load Panel's Vega extension; needed before Altair/Vega charts are shown through Panel.
pn.extension('vega')

def build_chart(var):
    """Build a month-by-year heatmap of the monthly total of one column.

    Parameters
    ----------
    var : str
        Name of a numeric column of the notebook-level ``df``. The daily
        values of this column are summed within each (month, year) cell.
        Note that for the ``*_7DAY_AVG`` columns a sum of daily averages is
        shown, which is not itself an average.

    Returns
    -------
    alt.Chart
        Rect-mark heatmap with month on x, year on y and the summed value of
        ``var`` as colour and tooltip.
    """
    # Aggregate explicitly: ``df`` has one row per day, so without sum() each
    # day becomes its own rectangle and the rectangles of a month just overlap.
    # The keyword form (field=...) avoids shorthand parsing of the column name.
    total_color = alt.Color(
        field=var,
        aggregate='sum',
        type='quantitative',
        scale=alt.Scale(scheme='reds'),
        title=f'sum of {var}',
    )
    total_tooltip = alt.Tooltip(
        field=var,
        aggregate='sum',
        type='quantitative',
        title=f'sum of {var}',
    )

    # Only ship the two columns the chart needs instead of the full wide frame.
    chart = alt.Chart(
        df[['time', var]],
        title=f"Monthly Total of {var} in New York City",
    ).mark_rect().encode(
        alt.X('month(time):O').title('by month'),
        alt.Y('year(time):O').title('by year'),
        total_color,
        tooltip=['month(time)', 'year(time)', total_tooltip],
    ).properties(
        width=500,
        height=400,
    ).interactive()
    return chart

# Offer every numeric column of df as a choice for the heatmap variable.
numeric_columns = df.select_dtypes('number').columns
interact(build_chart, var=numeric_columns)

In [10]:
# Month number (as a string) -> English month name, listed in calendar order.
month_map = {'1':'January', '2':'February', '3':'March', '4':'April', '5':'May', '6':'June',
             '7':'July', '8':'August', '9':'September', '10':'October', '11':'November', '12':'December'}

# Label every row with the name of its month.
df['month'] = df['time'].dt.month.astype(str).map(month_map)

# Months that actually occur in the data, in calendar order. Going through a
# set alone would give an arbitrary order that can differ between kernel
# runs, which shuffles the dropdown built from this list; dicts keep their
# insertion order, so iterating month_map restores January..December.
months_present = set(df['month'])
months = [name for name in month_map.values() if name in months_present]

In [11]:
# Month dropdown: one entry per month plus a trailing None option, shown as
# 'All', which clears the month filter.
labels = [f'{name} ' for name in months]
input_dropdown = alt.binding_select(
    options=[*months, None],
    labels=[*labels, 'All'],
    name='Month: ',
)
selection = alt.selection_point(bind=input_dropdown, fields=['month'])

# Interval brush; it is named 'brush' so it can be looked up by that name
# on the Panel Vega pane's selection object.
brush = alt.selection_interval(name='brush')

# Brushed points are coloured by DEATH_COUNT, all other points are light gray.
point_color = alt.condition(brush, 'DEATH_COUNT:Q', alt.value('lightgray'))

scatter = alt.Chart(df, title='Counts of Covid19 Cases in NYC').mark_point()
scatter = scatter.encode(
    x=alt.X('CASE_COUNT:Q'),
    y=alt.Y('HOSPITALIZED_COUNT:Q'),
    color=point_color,
)
scatter = scatter.properties(width=400, height=500)

# Deliberately no .interactive() here: pan/zoom dragging would compete with
# dragging out the brush rectangle.
chart3 = scatter.add_params(brush, selection).transform_filter(selection)

vega_panel = pn.pane.Vega(chart3, debounce=10)

vega_panel

In [12]:
def filtering(selection):
    """Show the rows of ``df`` that fall inside the current brush selection.

    Parameters
    ----------
    selection : mapping or None
        Brush state mapping each brushed column name to a pair of bounds
        (indexed as ``[0]`` for the lower and ``[1]`` for the upper bound).
        A falsy value means nothing is brushed.

    Returns
    -------
    str or pn.Column
        The text ``'nothing selected'`` for an empty selection; otherwise a
        column holding the generated query string and a table of the rows of
        ``df`` matching it.
    """
    if selection:
        # One inclusive range test per brushed column; backticks let
        # DataFrame.query accept the column name verbatim.
        clauses = []
        for col, crange in selection.items():
            lower, upper = crange[0], crange[1]
            clauses.append(f'{lower:.3f} <= `{col}` <= {upper:.3f}')
        query = ' & '.join(clauses)
        matching_rows = df.query(query)
        table = pn.pane.DataFrame(matching_rows, width=15000, height=500)
        return pn.Column(f'Query: {query}', table)
    return 'nothing selected'

# Re-run filtering() whenever the brush changes and show its result next to the chart.
brushed_rows_view = pn.bind(filtering, vega_panel.selection.param.brush)
pn.Row(vega_panel, brushed_rows_view)

This dashboard shows a scatterplot of daily case counts versus daily hospitalization counts for COVID-19 in New York City. The user can brush a rectangular region to take a closer look at the individual points and at all the numbers associated with each one. The user can also filter by month before brushing, so that the data points are less clustered and easier to pick out.
I originally tried to make the graph interactive, which would let the user hover over data points and see a quick summary of the date and certain count values. However, this also makes the graph draggable, which makes brush selection very difficult, so I removed it.

A possible contextual dataset to look at is https://catalog.data.gov/dataset/covid-19-daily-cases-deaths-and-hospitalizations, which is a similar dataset for the city of Chicago instead of New York City. It may be useful to compare the two and see where the peaks lie, i.e. when the pandemic was at its worst in each city.

Regarding dataset size, the NYC dataset can be pulled directly from GitHub. I may save a daily snapshot of the file in case of an error, and that file is also within GitHub's upload limits. The Chicago dataset is also of adequate size. Should I pick a dataset of COVID statistics for another city and its size is too large, I can remove some of the columns. As we can see, there are over 50 columns here; many are not currently being used and could be removed for upload purposes.