<a href="https://colab.research.google.com/github/ldejuan/covid/blob/master/descriptive_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import zipfile

In [0]:
# Read the train, test and sample-submission CSVs directly from the competition zip archive.
PATH_DATA= "./drive/My Drive/covid/"
fileNameZip = 'covid19-global-forecasting-week-1.zip'
archiveName = "%s%s"%(PATH_DATA,fileNameZip)
trainName= 'train.csv'
testName = 'test.csv'
submissionNane = 'submission.csv'

# The raw headers contain a slash; rename them so the columns can be used
# in attribute access and in DataFrame.query expressions.
columnRenames = {"Country/Region": "Country_Region",
                 "Province/State": "Province_State"}


def _readCsvFromArchive(archive, memberName):
  """Load one CSV member of an open ZipFile into a DataFrame.

  The member handle is closed as soon as the CSV has been parsed.
  """
  with archive.open(memberName, 'r') as member:
    return pd.read_csv(member)


# Fail here with a clear message instead of silently skipping the load and
# letting the following cells die with a NameError on dfTrain / dfTest.
if not zipfile.is_zipfile(archiveName):
  raise FileNotFoundError(
      "%s is missing or is not a valid zip archive" % archiveName)

with zipfile.ZipFile(archiveName) as archive:
  dfTrain = _readCsvFromArchive(archive, trainName).rename(columns=columnRenames)
  dfTest = _readCsvFromArchive(archive, testName).rename(columns=columnRenames)
  # Variable spelling kept as-is: a later cell displays dfSoubmission.
  dfSoubmission = _readCsvFromArchive(archive, submissionNane)
  

In [0]:
# Basic transformations: turn the Date column of the training set into datetimes
dfTrain['Date'] = dfTrain['Date'].pipe(pd.to_datetime)

In [0]:
# Preview the first rows of the test set
dfTest.head(n=5)

In [0]:
# Preview the first rows of the sample submission file
dfSoubmission.head(n=5)

In [0]:
# Summarise, for every Country_Region / Province_State pair, the first and last
# value and the number of observations of Date and Fatalities.
# groupby drops rows whose key is NaN, so rows without a Province_State would
# vanish from the summary; fill the missing key with an empty string first.
(
    dfTrain[['Country_Region', 'Province_State', 'Date', 'Fatalities']]
    .fillna({'Province_State': ''})
    .groupby(['Country_Region', 'Province_State'])
    .agg(['min', 'max', 'count'])
)

In [0]:
# Add two columns holding the sum of Fatalities and ConfirmedCases over all
# rows that share the same Country_Region and Date.
countryTotalColumns = {'Fatalities': 'Country_Fatalities',
                       'ConfirmedCases': 'Country_ConfirmedCases'}

# transform() keeps the original row index, so the totals line up row by row
# with dfTrain.
dfCountryAgg = (
    dfTrain
    .groupby(["Country_Region", "Date"])[list(countryTotalColumns)]
    .transform("sum")
    .rename(columns=countryTotalColumns)
)

# Drop totals left over from an earlier run of this cell before joining, so
# re-running it does not produce duplicated *_x / *_y columns.
dfTrain = (
    dfTrain
    .drop(columns=list(countryTotalColumns.values()), errors="ignore")
    .join(dfCountryAgg)
)



In [0]:
# Study only at country level: one row per (Country_Region, Date), with
# Fatalities and ConfirmedCases summed over all rows of that country and day.
# The built-in groupby sum replaces the per-group Python lambda.
dfCountry = (
    dfTrain
    .groupby(["Country_Region", "Date"])[["Fatalities", "ConfirmedCases"]]
    .sum()
    .reset_index()
)

In [0]:
# Preview the first rows of the country-level table
dfCountry.head(n=5)

In [0]:
# Plot confirmed cases and fatalities over time for one Country_Region
selectedCountry = 'US'
dfSelectedCountry = dfCountry.query("Country_Region == @selectedCountry")
# Name the x axis explicitly: without x=, DataFrame.plot() plots against the
# row index, so Date would not be used as the time axis.
dfSelectedCountry.plot(x='Date', y=['ConfirmedCases', 'Fatalities'],
                       title=selectedCountry)

In [0]:
# Earliest date on which the selected country has at least FatalitiesLevel fatalities
FatalitiesLevel = 5
reachedLevel = dfSelectedCountry["Fatalities"] >= FatalitiesLevel
dfSelectedCountry.loc[reachedLevel, 'Date'].min()


In [0]:
# Analyze only countries whose fatalities reach FatalitiesLevel
FatalitiesLevel = 5
def aggFunction(x):
  """Annotate one country's rows with the offset from its threshold date.

  Parameters
  ----------
  x : DataFrame with at least the columns ``Date`` (datetime) and
      ``Fatalities``. It is not modified.

  Returns
  -------
  A copy of ``x`` with two extra columns:
    * ``firstDate``: earliest ``Date`` whose ``Fatalities`` is at least the
      module-level ``FatalitiesLevel`` (NaT when no row reaches it);
    * ``Fatality_Period``: days between ``Date`` and ``firstDate``, as a
      float (NaN when ``firstDate`` is NaT).
  """
  firstDate = x.loc[x.Fatalities >= FatalitiesLevel, 'Date'].min()
  # assign() builds a new frame; writing columns onto the group object that
  # groupby.apply passes in would mutate data owned by the groupby.
  return x.assign(
      firstDate=firstDate,
      Fatality_Period=(x.Date - firstDate) / np.timedelta64(1, 'D'),
  )
# Apply aggFunction per country, then keep only the rows on or after the date
# the country reached the fatality threshold.
# group_keys=False keeps the original row index: from pandas 2.0 the default
# would prepend Country_Region as an index level although it is still a column.
# dropna() removes countries that never reach the threshold (their
# Fatality_Period is NaN), along with any other row containing a NaN.
dfCountryNoZero = (
    dfCountry
    .groupby(["Country_Region"], group_keys=False)
    .apply(aggFunction)
    .dropna()
)
dfCountryNoZero = dfCountryNoZero[dfCountryNoZero.Fatality_Period >= 0]

In [0]:
# Preview the first rows of the threshold-aligned country table
dfCountryNoZero.head(n=5)

In [0]:
# Distinct countries that remain after the threshold filter
dfCountryNoZero["Country_Region"].unique()