In [1]:
#turn the read csvs into a dataframe

import pandas as pd

nytimes_data = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"

df_nytimes_data = pd.read_csv(nytimes_data)

nytimes_state_data = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

df_nytimes_state_data = pd.read_csv(nytimes_state_data)

In [2]:
len(df_nytimes_state_data)

8429

In [3]:
countycrosswalk_data = 'cbsatocountycrosswalk2017.csv'

df_countrycrosswalk_data = pd.read_csv(countycrosswalk_data, engine='python')

In [4]:
#export the state data to a csv

df_nytimes_state_data.to_csv('nytimes_state_coviddata.csv',index = False, header=True)

In [5]:
#read in a crosswalk file that's in the same directory as this script

countycrosswalk_data = 'cbsatocountycrosswalk2017.csv'

df_countrycrosswalk_data = pd.read_csv(countycrosswalk_data, engine='python')

df_countrycrosswalk_data = df_countrycrosswalk_data.dropna(subset=['cbsaname'])

In [6]:
df_countrycrosswalk_data = df_countrycrosswalk_data.filter(items=['countyname','fipscounty','cbsaname'])

df_countrycrosswalk_data = df_countrycrosswalk_data.rename(columns={"fipscounty":"fips"})

In [7]:
#join the crosswalk to ny times data to pick up cbsa

df_nytimes_data = pd.merge(df_nytimes_data,df_countrycrosswalk_data, how='left',on =['fips'])


In [8]:
#This is how to select a subset of values using like on a string
#df_nytimes_data[df_nytimes_data['cbsaname'].str.contains('New York', na=False)].groupby('cbsaname').count()
#This is how you pull data where a column is null
#df_nytimes_data[df_nytimes_data['fips'].isnull()].groupby(['county']).count()
#This is how you select a subset of the dataset using loc
#df_nytimes_data.loc[df_nytimes_data['county']=="New York City",'cbsaname']

#Below, I set the values for NYC to be the CBSA Name for Jersey City
df_nytimes_data.loc[df_nytimes_data['county']=="New York City",'cbsaname']="New York-Jersey City-White Plains, NY-NJ"

In [9]:
#get the data from the prior day for each country to assess new cases

df_nytimes_data["datem1"] = pd.to_datetime(df_nytimes_data["date"]) + pd.DateOffset(1)

In [10]:
df_nytimes_prior = df_nytimes_data.filter(items=['datem1','county',\
                                              'state','fips','cases','deaths'])

In [11]:
df_nytimes_prior = df_nytimes_prior.rename(columns={"datem1":"date"})

In [12]:
#convert the date on the original covid dataset from an object to a date

df_nytimes_data["date"] = pd.to_datetime(df_nytimes_data["date"])

In [13]:
df_nytimes_wprior = pd.merge(df_nytimes_data,df_nytimes_prior, how='left',\
                             on =['date','county', 'state','fips'])

In [14]:
df_nytimes_wprior = df_nytimes_wprior.rename(columns={"cases_x":"total_cases",\
                                                  "deaths_x":"total_deaths",\
                                                  "cases_y":"prior_cases",\
                                                  "deaths_y":"prior_deaths"})

In [15]:
df_nytimes_wprior['new_cases'] = df_nytimes_wprior['total_cases'] - df_nytimes_wprior['prior_cases']
df_nytimes_wprior['new_deaths'] = df_nytimes_wprior['total_deaths'] - df_nytimes_wprior['prior_deaths']

In [16]:
#sort the dataframe

df_nytimes_wprior = df_nytimes_wprior.sort_values(by=['county', 'state','date'])

In [17]:
df_nytimes_wprior['newcases_7dayavg'] = df_nytimes_wprior.new_cases.rolling(7).mean()
df_nytimes_wprior['newdeaths_7dayavg'] = df_nytimes_wprior.new_deaths.rolling(7).mean()
df_nytimes_wprior['newcases_3dayavg'] = df_nytimes_wprior.new_cases.rolling(3).mean()
df_nytimes_wprior['newdeaths_3dayavg'] = df_nytimes_wprior.new_deaths.rolling(3).mean()
df_nytimes_wprior['death_ratio'] = df_nytimes_wprior['total_deaths'] / df_nytimes_wprior['total_cases']

In [18]:
min_date = '2020-03-15'

In [19]:
nyc_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="New York City")&\
                                       (df_nytimes_wprior['date']>=min_date)]
pheonix_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Maricopa")&\
                                       (df_nytimes_wprior['date']>=min_date)]
westchester_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Westchester")&\
                                       (df_nytimes_wprior['state']=="New York")&\
                                       (df_nytimes_wprior['date']>=min_date)]
chicago_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Cook")&\
                                   (df_nytimes_wprior['state']=="Illinois")&\
                                       (df_nytimes_wprior['date']>=min_date)]
detroit_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Wayne")&\
                                   (df_nytimes_wprior['state']=="Michigan")&\
                                       (df_nytimes_wprior['date']>=min_date)]
neworleans_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Orleans")&\
                                   (df_nytimes_wprior['state']=="Louisiana")&\
                                       (df_nytimes_wprior['date']>=min_date)]
seattle_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="King")&\
                                   (df_nytimes_wprior['state']=="Washington")&\
                                       (df_nytimes_wprior['date']>=min_date)]
la_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Los Angeles")&\
                                   (df_nytimes_wprior['state']=="California")&\
                                       (df_nytimes_wprior['date']>=min_date)]
nassau_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Nassau")&\
                                   (df_nytimes_wprior['state']=="New York")&\
                                       (df_nytimes_wprior['date']>=min_date)]
suffolk_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Suffolk")&\
                                   (df_nytimes_wprior['state']=="New York")&\
                                       (df_nytimes_wprior['date']>=min_date)]
austin_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Travis")&\
                                   (df_nytimes_wprior['state']=="Texas")&\
                                       (df_nytimes_wprior['date']>=min_date)]
houston_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Harris")&\
                                   (df_nytimes_wprior['state']=="Texas")&\
                                       (df_nytimes_wprior['date']>=min_date)]
miami_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Miami-Dade")&\
                                   (df_nytimes_wprior['state']=="Florida")&\
                                       (df_nytimes_wprior['date']>=min_date)]
camden_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Camden")&\
                                   (df_nytimes_wprior['state']=="New Jersey")&\
                                       (df_nytimes_wprior['date']>=min_date)]
bergen_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="Bergen")&\
                                   (df_nytimes_wprior['state']=="New Jersey")&\
                                       (df_nytimes_wprior['date']>=min_date)]
dc_sample = df_nytimes_wprior[(df_nytimes_wprior['county']=="District of Columbia")&\
                                   (df_nytimes_wprior['state']=="District of Columbia")&\
                                       (df_nytimes_wprior['date']>=min_date)]

In [20]:
df_nytimes_wprior.to_csv('nytimes_coviddata.csv',index = False, header=True)

In [22]:
#graph the plot variable

plot_variable = 'newdeaths_7dayavg'

import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Line(name='New York City', x=nyc_sample['date'], y=nyc_sample[plot_variable]),
    #go.Line(name='Westchester', x=westchester_sample['date'], y=westchester_sample[plot_variable]),
    go.Line(name='Houston', x=houston_sample['date'], y=houston_sample[plot_variable]),
    go.Line(name='Pheonix', x=pheonix_sample['date'], y=pheonix_sample[plot_variable]),    
    #go.Line(name='DC', x=dc_sample['date'], y=dc_sample[plot_variable]),    
    #go.Line(name='Chicago', x=chicago_sample['date'], y=chicago_sample[plot_variable]),
    #go.Line(name='Detroit', x=detroit_sample['date'], y=detroit_sample[plot_variable]),
    #go.Line(name='Suffolk', x=suffolk_sample['date'], y=suffolk_sample[plot_variable]),
    #go.Line(name='Nassau', x=nassau_sample['date'], y=nassau_sample[plot_variable]),
    #go.Line(name='Camden', x=bergen_sample['date'], y=camden_sample[plot_variable]),
    #go.Line(name='Bergen', x=camden_sample['date'], y=bergen_sample[plot_variable]), 
    go.Line(name='Miami', x=miami_sample['date'], y=miami_sample[plot_variable]),
    go.Line(name='Austin', x=austin_sample['date'], y=austin_sample[plot_variable]),
    #go.Line(name='New Orleans', x=neworleans_sample['date'], y=neworleans_sample[plot_variable]),
    #go.Line(name='Seattle', x=seattle_sample['date'], y=seattle_sample[plot_variable]),
    go.Line(name='Los Angeles', x=la_sample['date'], y=la_sample[plot_variable]),
])
# Change the bar mode
#fig.update_layout(barmode='group')
fig.update_layout(title_text=plot_variable)
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [23]:
#graph the plot variable

variable = 'deaths'
average_length = '7'
data_selection = westchester_sample

import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Bar(name='new_'+variable, x=data_selection['date'], y=data_selection['new_'+variable]),
    #go.Line(name= average_length+' day avg', x=data_selection['date'], 
    #        y=data_selection['new'+variable+'_'+average_length+'dayavg'])
])
# Change the bar mode
#fig.update_layout(barmode='group')
fig.update_layout(title_text="New "+variable+" in Westchester")
fig.show()

In [24]:
#create a dual axis line chart

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(name='NYC cases', x=nyc_sample['date'], y=nyc_sample['total_cases']),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(name='NYC deaths', x=nyc_sample['date'], y=nyc_sample['total_deaths']),
    secondary_y=True,
)

# Set x-axis title
fig.update_xaxes(title_text="date")

# Set y-axes titles
fig.update_yaxes(title_text="total cases", secondary_y=False)
fig.update_yaxes(title_text="total deaths", secondary_y=True)

fig.update_layout(title_text="total cases and deaths over time for NYC")

fig.show()

In [25]:
#create a grouping of all counties in the tri state area to compare to the rest of the country

import numpy as np

tristate = ((df_nytimes_wprior['county']=="New York City")&(df_nytimes_wprior['state']=="New York"))|\
           ((df_nytimes_wprior['county']=="Westchester")&(df_nytimes_wprior['state']=="New York")) |\
           ((df_nytimes_wprior['county']=="Nassau")&(df_nytimes_wprior['state']=="New York")) |\
           ((df_nytimes_wprior['county']=="Suffolk")&(df_nytimes_wprior['state']=="New York")) |\
           ((df_nytimes_wprior['county']=="Rockland")&(df_nytimes_wprior['state']=="New York")) |\
           ((df_nytimes_wprior['county']=="Fairfeild")&(df_nytimes_wprior['state']=="Connecticut")) |\
           ((df_nytimes_wprior['county']=="Bergen")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Passaic")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Hudson")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Essex")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Morris")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Union")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Sussex")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Somerset")&(df_nytimes_wprior['state']=="New Jersey")) |\
           ((df_nytimes_wprior['county']=="Monmouth")&(df_nytimes_wprior['state']=="New Jersey"))

df_nytimes_wprior['region'] = np.where(tristate,"tri state area", "other")

In [26]:
#create a grouping of all counties in new york state

nystate = (df_nytimes_wprior['state']=="New York")

df_nytimes_wprior['is_nystate'] = np.where(nystate,"new york state", "other")

In [27]:
#group by the region field

#first limit the amount of data in a new dataframe

df_nytimes_region = df_nytimes_wprior.filter(items=[
                                                'date','region','total_cases', 'total_deaths',
                                                'datem1', 'prior_cases', 'prior_deaths', 'new_cases', 'new_deaths',
                                                'newcases_7dayavg', 'newdeaths_7dayavg','is_nystate'])

In [28]:
#then group by the region and date fields

df_nytimes_region_grouped = df_nytimes_region.groupby(['date','region'], as_index=False).sum()

In [29]:
#add death ratio

df_nytimes_region_grouped['death_ratio'] = df_nytimes_region_grouped['total_deaths'] \
                                            / df_nytimes_region_grouped['total_cases']

In [30]:
#create samples for tristate and other

tristate_sample = df_nytimes_region_grouped[(df_nytimes_region_grouped['region']=="tri state area")&\
                                       (df_nytimes_region_grouped['date']>=min_date)]
other_sample = df_nytimes_region_grouped[(df_nytimes_region_grouped['region']=="other")&\
                                       (df_nytimes_region_grouped['date']>=min_date)]
#nystate_sample = df_nytimes_region_grouped[(df_nytimes_region_grouped['is_nystate']=="new york state")&\
#                                       (df_nytimes_region_grouped['date']>=min_date)]
#otherstate_sample = df_nytimes_region_grouped[(df_nytimes_region_grouped['is_nystate']=="other")&\
#                                       (df_nytimes_region_grouped['date']>=min_date)]

In [31]:
#graph the plot variable

plot_variable = 'newdeaths_7dayavg'

import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Line(name='tristate area', x=tristate_sample['date'], y=tristate_sample[plot_variable]),
    go.Line(name='outside tristate area', x=other_sample['date'], y=other_sample[plot_variable])
])
# Change the bar mode
#fig.update_layout(barmode='group')
fig.update_layout(title_text=plot_variable)
fig.show()