<a href="https://colab.research.google.com/github/jacobsgomez/covid-19-analysis/blob/master/covid_19_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from calendar import monthrange
from datetime import date
from datetime import datetime
from pytz import timezone

# covid-19 Analysis
This is a personal exploratory analysis of covid-19. The objective of this notebook is to gain insight into where the virus is growing fastest and to build strong visualizations of the communities/areas/countries impacted the most by it.

Start by defining the root paths. There are 2 parent folders holding all of the data:
1. covid-19 daily reports
2. covid-19 time series

In [0]:
### ROOT_URL - 'https://github.com/CSSEGISandData/COVID-19/tree/master' ###

# Base URL of the raw CSSE data; the daily_reports and time_series folders both hang off it.
RAW_DATA_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data'

# These are URLs, not filesystem paths, so join with '/' explicitly.
# os.path.join would insert '\\' on Windows and produce broken links.
daily_reports_path = '/'.join([RAW_DATA_URL, 'csse_covid_19_daily_reports'])  # one CSV per day, named MM-DD-YYYY.csv
time_series_root = '/'.join([RAW_DATA_URL, 'csse_covid_19_time_series'])

In [0]:
def get_days_in_months(year, current_month):
    '''Return the number of days in each month of `year`, January through `current_month`.

    Parameters
    ----------
    year : int
        Calendar year; needed so that February reflects leap years.
    current_month : int
        Last month to include (1-12).

    Returns
    -------
    list of int
        One entry per month, January first. Empty when `current_month` < 1.
    '''
    # monthrange returns (weekday of the 1st, days in month); only the day count is kept.
    # Built-in range is used instead of np.arange so that plain Python ints (not NumPy
    # scalars) are fed into monthrange and come back out.
    return [monthrange(year, month)[1] for month in range(1, current_month + 1)]

def format_date(month, day, year, delim='-'):
    '''Build a zero-padded MM<delim>DD<delim>YYYY string from numeric date parts.'''
    mm = str(month).zfill(2)
    dd = str(day).zfill(2)
    yyyy = str(year).zfill(4)
    return f'{mm}{delim}{dd}{delim}{yyyy}'

In [0]:
# Quick check of format_date: once with a custom delimiter, once with the default '-'.
format_date(4, 5, 2020, delim='/'), format_date(4, 5, 2020) 

In [0]:
# Quick check: 2020 is a leap year, so the February entry should be 29.
get_days_in_months(2020, 5)

# Script for importing all separate .csv files into 1 dataframe

In [0]:
# "Now" in US Pacific time; the individual parts are pulled out for the report loop below.
current_date = datetime.now(timezone('US/Pacific'))
current_month, current_day, current_year = (
    current_date.month,
    current_date.day,
    current_date.year,
)

# MM-DD-YYYY string for today and the per-month day counts up to the current month.
days_in_months = get_days_in_months(current_year, current_month)
current_date_str_formatted = format_date(current_month, current_day, current_year)

In [0]:
# Date of the first daily report (1/22/2020 -- the first day of recorded cases).
# Pinned to 2020 so the loader still starts at the beginning when run in a later year.
FIRST_REPORT_DATE = '2020-01-22'

# The current day's report is not loaded (no information for it yet), so stop one day earlier.
last_report_date = pd.Timestamp(current_date.date()) - pd.Timedelta(days=1)
report_dates = pd.date_range(start=FIRST_REPORT_DATE, end=last_report_date, freq='D')

daily_frames = []
for report_date in report_dates:
    day_month_year_str = format_date(report_date.month, report_date.day, report_date.year)
    report_url = daily_reports_path + '/{}.csv'.format(day_month_year_str)
    # on_bad_lines='warn' skips malformed rows with a warning. It replaces
    # error_bad_lines=False, which was removed in pandas 2.0.
    daily_frames.append(pd.read_csv(report_url, on_bad_lines='warn'))
print('Reached current day. No more information. Exiting...')

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and growing the
# frame file-by-file re-copies all previous rows each time. Each file keeps its own row
# labels here, so the combined index contains repeats.
reports_df = pd.concat(daily_frames) if daily_frames else pd.DataFrame([])

## Explore dataset for daily reports

In [0]:
# List every column present in the combined daily-reports frame.
reports_df.columns

In [0]:
# Inspect the Confirmed column of the combined daily reports.
reports_df['Confirmed']

`Last Update` column has multiple formats, which prevents proper graphing. We will use `pd.to_datetime` to consolidate formats.

In [0]:
# Parse the 'Last Update' strings into pandas datetimes so every row uses one representation.
reports_df['Last Update'] = pd.to_datetime(reports_df['Last Update'])

In [0]:
# Check the converted timestamps side by side with the confirmed counts.
reports_df[['Last Update', 'Confirmed']]

Each CSV file is read with its own zero-based row index, so after stacking the files the combined DataFrame contains repeated index values. The index needs to be reset so that every row gets a unique label.

In [0]:
# Give the combined frame a fresh 0..n-1 index. drop=True discards the old (repeated) labels
# instead of inserting them as a column, and reassigning rather than using inplace=True keeps
# this cell idempotent: re-running it no longer piles up 'index' / 'level_0' columns.
reports_df = reports_df.reset_index(drop=True)

In [0]:
# NaN-free copies of the coordinate columns. dropna() returns a new Series, which avoids
# mutating (in place) a Series that was pulled straight out of reports_df.
latitude = reports_df["Lat"].dropna()
longitude = reports_df["Long_"].dropna()
latitude.shape, longitude.shape

In [0]:
# Preview the first few non-null latitude values.
latitude.head()

In [0]:
fig, ax = plt.subplots(figsize=(12, 8))
# Draw onto the axes created above. Without ax=ax pandas opens a second figure
# and the one from plt.subplots() is left blank.
reports_df.plot(kind="scatter", x="Lat", y="Long_", alpha=0.005, ax=ax)
plt.show()

# Exploring Time Series Data

In [0]:
# --- TIME SERIES PATHS ---
# These are URLs, so the separator is always '/'. os.path.join would use '\\' on Windows.

# global time series paths
time_series_confirmed_global = time_series_root + '/time_series_covid19_confirmed_global.csv'
time_series_deaths_global = time_series_root + '/time_series_covid19_deaths_global.csv'
time_series_recovered_global = time_series_root + '/time_series_covid19_recovered_global.csv'
# US-only time series paths (yes, we're a special breed)
time_series_confirmed_us = time_series_root + '/time_series_covid19_confirmed_US.csv'
time_series_deaths_us = time_series_root + '/time_series_covid19_deaths_US.csv'

In [0]:
# Load the US county-level time series. on_bad_lines='warn' skips malformed rows with a
# warning. It replaces error_bad_lines=False, which was removed in pandas 2.0.
US_confirmed = pd.read_csv(time_series_confirmed_us, on_bad_lines='warn')
US_deaths = pd.read_csv(time_series_deaths_us, on_bad_lines='warn')

In [0]:
# List the columns of the US confirmed-cases time series.
US_confirmed.columns

Taken from Github Repo readme:
> * FIPS: US only. Federal Information Processing Standards code that uniquely identifies counties within the USA.
> * Admin2: County name. US only.
> * Active: Active cases = total confirmed - total recovered - total deaths.
> * [UID Lookup Logic](https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_daily_reports/03-21-2020.csv)

In [0]:
# Preview the first 5 rows of the US confirmed-cases time series.
US_confirmed.head(5)

### Creating functions to automate confirmed/death numbers by state

In [0]:
def get_confirmed_by_state(state):
    '''Return the rows of US_confirmed whose Province_State equals str(state).'''
    in_state = US_confirmed["Province_State"] == str(state)
    return US_confirmed.loc[in_state]

def get_deaths_by_state(state):
    '''Return the rows of US_deaths whose Province_State equals str(state).'''
    in_state = US_deaths['Province_State'] == str(state)
    return US_deaths.loc[in_state]


# California subsets of the US confirmed / deaths time series, used by the cells that follow.
CA_confirmed = get_confirmed_by_state("California")
CA_deaths = get_deaths_by_state("California")

In [0]:
# Keep only rows whose latitude is non-zero; zero-coordinate rows are removed before plotting.
CA_confirmed = CA_confirmed.loc[CA_confirmed["Lat"].ne(0)]
CA_deaths = CA_deaths.loc[CA_deaths["Lat"].ne(0)]

In [0]:
# Scatter of the California rows: colour and marker size both come from the last column
# of the frame (marker size is that value divided by 80).
CA_confirmed.plot(
    kind='scatter',
    x='Lat',
    y='Long_',
    c=CA_confirmed.iloc[:, -1],
    s=CA_confirmed.iloc[:, -1] / 80,
    cmap=plt.get_cmap('jet'),
    figsize=(14, 6),
    label="County",
)
plt.legend()


In [0]:
def get_confirmed_by_date_and_state(state, date):
    '''Sum the confirmed-case column `date` over all rows of `state`.

    `date` is the column label, formatted as xx/yy/zz (e.g. '5/21/20').
    '''
    state_rows = get_confirmed_by_state(str(state))
    return state_rows[str(date)].sum()

In [0]:
# Example: confirmed cases summed over all California rows for the '5/21/20' column.
get_confirmed_by_date_and_state('California', '5/21/20')

In [0]:
# Statewide totals per date: axis=0 adds up all rows (counties) within each date column,
# giving one value per column from "1/22/20" onward.
total_daily_confirmed = CA_confirmed.loc[:, "1/22/20":].sum(axis=0)
total_daily_deaths = CA_deaths.loc[:, "1/22/20":].sum(axis=0)

# Date labels for the x-axis, taken from the summed Series itself so they always match the
# "1/22/20": slice above instead of relying on a hard-coded column offset.
col_dates = total_daily_confirmed.index.values

In [0]:
# Grid styling goes into rcParams before the figure is created, so it applies to this
# figure on a fresh kernel rather than only on a re-run of the cell.
plt.rc('grid', linestyle="-", color=(0.8, 0.8, 0.8))

fig, ax = plt.subplots(figsize=(14, 7), dpi=300)
# NOTE(review): white tick labels and title are presumably meant for a dark notebook theme -- confirm.
ax.tick_params(axis='y', colors='white')
ax.tick_params(axis='x', colors='white')
ax.yaxis.grid(True)
ax.set_title(label="California covid-19", color='white')  # title is set once, here
ax.set_facecolor((0.98, 0.98, 0.98))

# NOTE(review): the annotated total is pinned to the '5/21/20' column rather than the
# latest available date -- confirm this is intended.
confirmed_to_date = get_confirmed_by_date_and_state('California', '5/21/20')
ax.plot(col_dates, total_daily_confirmed, c=(212/255, 160/255, 65/255), label='Confirmed Cases')
ax.plot(col_dates, total_daily_deaths, label='Confirmed Deaths')
ax.legend(frameon=False)
# Annotation position is given in data coordinates (x=0.3, y=78000).
ax.text(0.3, 78000, 'Confirmed Cases: ' + str(confirmed_to_date))
plt.show()