In [2]:
%matplotlib inline
# Load in some common libraries to do our analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, date2num
import geopandas
import seaborn as sns

### Lets go get some data

The new york times has a dataset which they share on their github page. We will look at the [historical data](https://github.com/nytimes/covid-19-data#historical-data) and we're going to select the [county level data](https://github.com/nytimes/covid-19-data#county-level-data) as it will let us breakdown based on seperate states and counties. They have links for the raw data which will let us easily download the data through pandas.


In [29]:
# Load data from nytimes
# We also tell pandas to treat the column names date as a special varialbe type 
# called a datetime which will make analysis easier

df = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv', 
                 parse_dates=['date'])

Now lets look at the data using the [head](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) command which shows off the first few rows of the dataframe.

In [18]:
#df.head(10) - shows top 10
#df - shows all elements, might be annotated
#df.head() - shows top 5
#df['county'] - select for data under specific header
#df.county - select for data under specific header
#df['cases'] *2 - data frames allows for the numpy operations
#df.state.unique() - list of all unique members of a column
#len(df.state.unique()) - number of unique elements
df.state.unique().shape #- gives you rows / columns of dataset

(55,)

Looks great except we're in South Carolina so it might be nice to just look at the data from here.

We can select items from a dataframe by using boolean operators (`>, <, >=, <=, ==, !=`) for a specific column. It looks like we should be able to select on the state column and see which ones are equal to (`==`) "South Carolina"

In [36]:
#df.state == 'South Carolina' - gives you false values
#(df.state == 'South Carolina').unique() - array values
(df.state == "South Carolina") | (df.state == "North Carolina")

0         False
1         False
2         False
3         False
4         False
          ...  
359973    False
359974    False
359975    False
359976    False
359977    False
Name: state, Length: 359978, dtype: bool

In [39]:
# Select South Carolina
SC = df[df.state == "South Carolina"]
# Select both Carolinas
#Car = df[(df.state == "South Carolina") | (df.state == "North Carolina")]
# Tail works similar to head but for the bottom of the list
SC.tail(10)

Unnamed: 0,date,county,state,fips,cases,deaths
359189,2020-07-22,Oconee,South Carolina,45073.0,585,2
359190,2020-07-22,Orangeburg,South Carolina,45075.0,1697,39
359191,2020-07-22,Pickens,South Carolina,45077.0,1484,16
359192,2020-07-22,Richland,South Carolina,45079.0,6581,116
359193,2020-07-22,Saluda,South Carolina,45081.0,366,4
359194,2020-07-22,Spartanburg,South Carolina,45083.0,3252,68
359195,2020-07-22,Sumter,South Carolina,45085.0,1877,35
359196,2020-07-22,Union,South Carolina,45087.0,238,0
359197,2020-07-22,Williamsburg,South Carolina,45089.0,671,18
359198,2020-07-22,York,South Carolina,45091.0,2571,17


Now we have data for South Carolina but it's still not in the best format. We have each row as values for each date in each county. It might be more useful to use a [pivot](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html) function similar to pivots in excel in order to sum our cases per-day, per-county. We'll use our date as an index and have our county information as the columns so that we can easily plot them later.

In [43]:
# Group cases by county for each date
#cases_sc = SC.pivot(index='date', columns='county', values='cases')
#cases_sc = cases_sc.fillna(0.0)

# Lets look at the last 10 days
#cases_sc.head(10)

# Treat data as data frame
df2 = pd.DataFrame.pivot(SC, index='date', columns='county', values='cases')
df2.head()

county,Abbeville,Aiken,Allendale,Anderson,Bamberg,Barnwell,Beaufort,Berkeley,Calhoun,Charleston,...,Oconee,Orangeburg,Pickens,Richland,Saluda,Spartanburg,Sumter,Union,Williamsburg,York
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-06,,,,,,,,,,1.0,...,,,,,,,,,,
2020-03-07,,,,,,,,,,1.0,...,,,,,,,,,,
2020-03-08,,,,,,,,,,1.0,...,,,,,,1.0,,,,
2020-03-09,,,,,,,,,,1.0,...,,,,,,1.0,,,,
2020-03-10,,,,,,,,,,1.0,...,,,,,,1.0,,,,


### Plotting

Let's see if we can get some more information from the data from plotting the data. Pandas has a built in plot method which calls matplotlib to plot the data. Let's see what we get when we just try to plot the data.

In [None]:
plot = cases_sc.plot(figsize=(12,8), linewidth=5, cmap='PuBu')
plot.legend(ncol=2, bbox_to_anchor=(1, 1), loc='upper left')
plt.show()

In [None]:
# Show plotting inside of pandas
plot = cases_sc.Richland.plot(figsize=(12,8), linewidth=5, legend=True)

# You can also access a column by a string name
plot = cases_sc["Lexington"].plot(linewidth=5, legend=True)

### TODO: Add a few more counties to this graph

### Now let's also look at deaths instead of cases.
TODO: re-do the last few steps from the pivot but this time for deaths. Label the new dataframe "deaths_sc"


In [None]:
# deaths_sc = ...

In [None]:
# Copy data
new_per_day = cases_sc.copy(deep=True)

# Get difference per day instead of total cases
for county in cases_sc.columns:
    new_per_day[county] = new_per_day[county].diff()
    
new_per_day = new_per_day.fillna(0.0)
new_per_day.tail(5) 


In [None]:
# Plot difference
plot = new_per_day.Richland.plot(figsize=(12,8), linewidth=3, legend=True)
# Looks jumpy....

### Let's make this look better

We're going to use [matplotlib](https://matplotlib.org/3.1.1/index.html) to plot these graphs instead of the builtin plotting function. Sometimes when we want to add more features to our plots it is easier to use matplotlib instead of pandas built in plotting.

In [None]:
# Setup Figure (fig) and Axes (ax)
fig, ax = plt.subplots(figsize=(12,8))

# This function converts our dates into numbers so matplotlib can plot them
dates = date2num(cases_sc.index)
# We setup how the date format and put them onto the axes
date_form = DateFormatter("%b-%d")
ax.xaxis.set_major_formatter(date_form)

# Lets get the new cases per day for Richland county
npd_richland = new_per_day.Richland

# Then we'll plot it with plot_dates functon
ax.plot_date(dates, npd_richland, fmt='-', linewidth=3)

# Show our plot at the end
plt.show()

We've repoduced our plot from before let's customize it some more. To get rid of the large jumps in our data we can do a rolling mean over the data we have.

In [None]:
# Setup Figure (fig) and Axes (ax)
fig, ax = plt.subplots(figsize=(12,8))

# This function converts our dates into numbers so matplotlib can plot them
dates = date2num(cases_sc.index)
# We setup how the date format and put them onto the axes
date_form = DateFormatter("%b-%d")
ax.xaxis.set_major_formatter(date_form)

# Lets get the new cases per day for Richland county
npd_richland = new_per_day.Richland

#### Let's average over the last two weeks and plot it
rolling_days = 14
npd_richland_avg = new_per_day.Richland.rolling(rolling_days).mean()
####


# Then we'll plot it with plot_dates functon
ax.plot_date(dates, npd_richland_avg, fmt='-', linewidth=3)

# Show our plot at the end
plt.show()

## Great our plot is looking a bit better now

Now let's add a bar graph to show the original new per day data.

In [None]:
# Setup Figure (fig) and Axes (ax)
fig, ax = plt.subplots(figsize=(12,8))

# This function converts our dates into numbers so matplotlib can plot them
dates = date2num(cases_sc.index)
# We setup how the date format and put them onto the axes
date_form = DateFormatter("%b-%d")
ax.xaxis.set_major_formatter(date_form)

# Lets get the new cases per day for Richland county
npd_richland = new_per_day.Richland

#### We'll add a bar graph to our axes now that is a little lighter in color (alpha)
ax.bar(dates, npd_richland, alpha=0.2)
###

# Let's average over the last two weeks and plot it
rolling_days = 14
npd_richland_avg = new_per_day.Richland.rolling(rolling_days).mean()

# Then we'll plot it with plot_dates functon
ax.plot_date(dates, npd_richland_avg, fmt='-', linewidth=3)

# Show our plot at the end
plt.show()

## Let's finish by putting a trendline for the last two weeks

In [None]:
# Setup Figure (fig) and Axes (ax)
fig, ax = plt.subplots(figsize=(12,8))

# This function converts our dates into numbers so matplotlib can plot them
dates = date2num(cases_sc.index)
# We setup how the date format and put them onto the axes
date_form = DateFormatter("%b-%d")
ax.xaxis.set_major_formatter(date_form)

# Lets get the new cases per day for Richland county
npd_richland = new_per_day.Richland

# We'll add a bar graph to our axes now that is a little lighter in color (alpha)
ax.bar(dates, npd_richland, alpha=0.2)

# Let's average over the last two weeks and plot it
rolling_days = 14
npd_richland_avg = new_per_day.Richland.rolling(rolling_days).mean()

# Then we'll plot it with plot_dates functon
ax.plot_date(dates, npd_richland_avg, fmt='-', linewidth=3)

### Plot trend line for the last set of days averaged over
z = np.polyfit(dates[-rolling_days:], npd_richland_avg[-rolling_days:], 1)
p = np.poly1d(z)
ax.plot(dates[-rolling_days:],p(dates[-rolling_days:]),'-', linewidth=4, label=f'Trend for last {rolling_days} days')
###


# Show our plot at the end
plt.legend()
plt.show()

## Let's make a Map

We're going to use the geopandas to plot our data. There are a ton of different maps availible that work with geopandas on this [github page](https://github.com/deldersveld/topojson).

In [None]:
# Get geometry data
# https://github.com/deldersveld/topojson

JSON_url = "https://raw.githubusercontent.com/deldersveld/topojson/master/countries/us-states"
SC_json = "SC-45-south-carolina-counties"

geodata = geopandas.read_file(JSON_url+"/"+SC_json+".json")

geodata.head()

In [None]:
# Make a new figure and axis
fig, ax = plt.subplots(figsize=(16,9))
# Plot the outlines of SC
geodata.plot(color="white", edgecolor='black', ax=ax)

plt.show()

## To add color we can add a new column based on our current cases

Let's look at our cases dataframe and see how we'll match it to the plot.

In [None]:
cases_sc.tail(3)

In [None]:
# Make a new figure and axis
fig, ax = plt.subplots(figsize=(16,9))
# Plot the outlines of SC
geodata.plot(color="white", edgecolor='black', ax=ax)

#### First we add a new column for our colors
geodata["color"] = np.zeros_like(geodata.NAME)

#### Now we can match our data from our cases_sc dataframe, 
#### to our plotting dataframe using the name of the county

# iteritems will return the  and the column name (In this case our county names)
# And the entire column of data (Our numbers)
for county_name, county_nums in cases_sc.iteritems():
    # To get the most recent we can get the -1 index
    most_recent_num = county_nums[-1]
    # We want to put the value for our most recent cases in the color part
    # Only for the location of the row of the geodata that is the same as our county name
    geodata.loc[geodata.NAME == county_name, 'color'] = most_recent_num

# And plot based on the new color cloumn
geodata.plot(column='color', ax=ax, cmap='PuBu')
####

ax.set_title("Current Cases in SC")
ax.set_yticklabels([])
ax.set_xticklabels([])
ax.get_yaxis().set_visible(False)
ax.get_xaxis().set_visible(False)
plt.show()

### Try to make some more plots on your own