# DataSet Creation
This notebook downloads each of the time-series files from [covid19-mx-time-series](https://github.com/mariorz/covid19-mx-time-series), created by [@mariorz](https://twitter.com/mariorz), filters them by state (see the code comments for how to change the state) and combines them into a single DataFrame to simplify the analysis by state.

The time-series selected for this project are: 

| Indicator | Description | File |
|:------------------------------------------------|:-------------------------------------------------|:-------------------------------------------------|
| Confirmed cases | Confirmed by confirmation date | [covid19_confirmed_mx.csv](https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_confirmed_mx.csv) |
| Suspects | Suspect cases by date of official publication | [covid19_suspects_mx.csv](https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_suspects_mx.csv) |
| Hospitalized | Confirmed hospitalized by admission date | [hospitalized_confirmed_by_admission_date_mx.csv](https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/full/by_hospital_state/hospitalized_confirmed_by_admission_date_mx.csv) |
| Deaths | Deaths confirmed by death date | [deaths_confirmed_by_death_date_mx.csv](https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/full/by_hospital_state/deaths_confirmed_by_death_date_mx.csv) |

In [1]:
import json
from datetime import datetime, timedelta
from pathlib import Path
from urllib import request

import altair as alt
import pandas as pd
from altair_saver import save

In [2]:
# Download the Spanish (es-ES) d3 time-format locale and register it with
# Altair's embed options.
locale_url = 'https://raw.githubusercontent.com/d3/d3-time-format/master/locale/es-ES.json'
with request.urlopen(locale_url) as response:
    es_time_format = json.load(response)
alt.renderers.set_embed_options(timeFormatLocale=es_time_format)

RendererRegistry.enable('default')

## Confirmed cases
- Confirmed by confirmation date
- Colima's first confirmed case: `17-03-2020`.

In [3]:
confirmed_url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_confirmed_mx.csv'
# To analyse another state or starting date, edit the .loc[] selection below.
confirmed = (
    pd.read_csv(confirmed_url, index_col=0)
    .loc['Colima', '15-03-2020':]
    .to_frame()
)
# The selected row is indexed by date strings (dd-mm-YYYY); parse them into timestamps.
confirmed.index = pd.to_datetime(confirmed.index, format='%d-%m-%Y')
# Derive three extra columns: the day-over-day difference of cases and the
# moving averages of that difference over 7- and 14-day windows.
confirmed_daily = confirmed['Colima'].diff()
confirmed = confirmed.assign(
    confirmed_daily=confirmed_daily,
    confirmed_ma_7=confirmed_daily.rolling(window=7).mean(),
    confirmed_ma_14=confirmed_daily.rolling(window=14).mean(),
).rename(columns={'Colima': 'confirmed'})

## Suspects
- Suspect cases aggregated by date of official publication.
- First suspect in Colima: `15-03-2020`

In [4]:
suspects_url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_suspects_mx.csv'
# Keep only the Colima row from 15-03-2020 onwards and turn it into a one-column frame.
suspects = pd.read_csv(suspects_url, index_col=0).loc['Colima', '15-03-2020':].to_frame()
suspects.index = pd.to_datetime(suspects.index, format='%d-%m-%Y')
# The moving averages are computed on the reported values themselves (no diff is applied here).
reported_suspects = suspects['Colima']
suspects['suspects_ma_7'] = reported_suspects.rolling(window=7).mean()
suspects['suspects_ma_14'] = reported_suspects.rolling(window=14).mean()
suspects = suspects.rename(columns={'Colima': 'suspects'})


## Hospitalized
-  Confirmed by admission date

In [5]:
hospitalized_url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/full/by_hospital_state/hospitalized_confirmed_by_admission_date_mx.csv'
# Keep only the Colima row from 15-03-2020 onwards and turn it into a one-column frame.
hospitalized = (
    pd.read_csv(hospitalized_url, index_col=0)
    .loc['Colima', '15-03-2020':]
    .to_frame()
)
hospitalized.index = pd.to_datetime(hospitalized.index, format='%d-%m-%Y')
# Day-over-day difference, plus its 7- and 14-day moving averages.
hospitalized_daily = hospitalized['Colima'].diff()
hospitalized['hospitalized_daily'] = hospitalized_daily
hospitalized['hospitalized_ma_7'] = hospitalized_daily.rolling(window=7).mean()
hospitalized['hospitalized_ma_14'] = hospitalized_daily.rolling(window=14).mean()
hospitalized = hospitalized.rename(columns={'Colima': 'hospitalized'})

## Deaths
- Confirmed by death date

In [6]:
deaths_url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/full/by_hospital_state/deaths_confirmed_by_death_date_mx.csv'
deaths = pd.read_csv(deaths_url, index_col=0)
# Change the state and the initial date in the following line.
deaths = deaths.loc['Colima', '15-03-2020':].to_frame()
deaths.index = pd.to_datetime(deaths.index, format='%d-%m-%Y')
# Day-over-day difference, plus its 7- and 14-day moving averages.
# Columns are created under their final names ('deaths_*') and the state column is
# renamed by mapping rather than by position, so a misspelled or stray-whitespace
# label (previously 'deaths_daily ' with a trailing space) cannot slip into the
# merged DataFrame and the exported CSV.
deaths_daily = deaths['deaths_daily'] = deaths['Colima'].diff()
deaths['deaths_ma_7'] = deaths_daily.rolling(window=7).mean()
deaths['deaths_ma_14'] = deaths_daily.rolling(window=14).mean()
deaths = deaths.rename(columns={'Colima': 'deaths'})

In [7]:
# Inner-join the four per-indicator frames on their shared date index, one at a time.
merge_1 = pd.merge(confirmed, suspects, how='inner', left_index=True, right_index=True)
merge_2 = pd.merge(merge_1, hospitalized, how='inner', left_index=True, right_index=True)
df = pd.merge(merge_2, deaths, how='inner', left_index=True, right_index=True)

df

Unnamed: 0,confirmed,confirmed_daily,confirmed_ma_7,confirmed_ma_14,suspects,suspects_ma_7,suspects_ma_14,hospitalized,hospitalized_daily,hospitalized_ma_7,hospitalized_ma_14,deaths,deaths_daily,deaths_ma_7,deaths_ma_14
2020-03-15,0,,,,1,,,1,,,,0,,,
2020-03-16,0,0.0,,,3,,,2,1.0,,,0,0.0,,
2020-03-17,0,0.0,,,2,,,2,0.0,,,0,0.0,,
2020-03-18,1,1.0,,,2,,,2,0.0,,,0,0.0,,
2020-03-19,1,0.0,,,3,,,2,0.0,,,0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-02,7391,17.0,27.285714,24.642857,2004,2096.428571,2068.285714,7463,5.0,17.428571,17.928571,810,0.0,1.428571,1.357143
2020-12-03,7420,29.0,29.857143,26.142857,2032,2079.142857,2071.857143,7477,14.0,16.285714,17.285714,810,0.0,0.714286,1.214286
2020-12-04,7428,8.0,26.857143,25.571429,2096,2075.285714,2079.928571,7485,8.0,14.428571,16.857143,810,0.0,0.428571,1.071429
2020-12-05,7471,43.0,31.000000,25.928571,2041,2057.142857,2083.642857,7485,0.0,12.285714,15.785714,811,1.0,0.428571,1.071429


In [8]:
# Saving the DataFrame as a CSV file.
# The output folder is created first (no-op if it already exists) because
# DataFrame.to_csv does not create missing parent directories.
output_path = Path('data_output/colima_df.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=True)