# Analisando os dados da COVID-19
> Uma análise exploratória de séries temporais

- toc: true 
- badges: true
- hide_binder_badge: true
- comments: true
- categories: [colab]
- image: images/chart-preview.png

In [0]:
#hide 
import datetime

import pandas as pd
import plotly.express as px

# base variables
initial_date = datetime.date(2020,1,22)
days_elapsed = (datetime.date.today() - initial_date).days
base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports"

# dicts for data collection
dates = (f"{(initial_date + datetime.timedelta(days=days)).strftime('%m-%d-%Y')}" for days in range(0, days_elapsed))
csvs = {f"{day}": f"{base_url}/{day}.csv" for day in dates}

# collecting data
dfs = pd.concat(pd.read_csv(url).assign(date=day) for day, url in csvs.items())
dfs.index = range(0, len(dfs))
dfs.columns = ["confirmed", "country", "deaths", "update", "lat", "long", "ps", "recovered", "date"]

# fixing inconsistencies
dfs.loc[dfs["country"] == 'Iran (Islamic Republic of)', "country"] = "Iran"
dfs.loc[dfs["country"] == 'Republic of Korea', "country"] = "Korea, South"

# removing empty entries
df_imputed = dfs.drop(["update", "lat", "long", "ps"], axis=1)
all_na = df_imputed["confirmed"].isna() & df_imputed["deaths"].isna() & df_imputed["recovered"].isna()
df_imputed = df_imputed[~all_na]


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





## Usando contagem de ocorrências em valores absolutos

### Casos confirmados por dia

In [0]:
#hide 
# aggregating occurrences
df_aggregated = df_imputed.pivot_table(index=["date", "country"], 
                                       values=["confirmed", "deaths", "recovered"], 
                                       aggfunc='sum').reset_index()

# filtering countries
countries = ['Italy', 'Iran', 'Spain', 'Germany', 'France', 'US', 
             'Switzerland', 'United Kingdom', 'Japan', 'Portugal', 'Brazil']
# df_countries = df_aggregated.query("date == '03-22-2020'").sort_values("confirmed", ascending=False).set_index("country")
# countries = list(df_countries.loc[:"Brazil"].index)
df_brazil = df_aggregated.query(f"country in {countries} and country != 'China'")

# resampling time series
df_brazil["date"] = pd.to_datetime(df_brazil["date"])
ts_brazil = df_brazil.groupby("country").resample("D", on="date").sum().reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
#hide_input
po = px.line(ts_brazil, x="date", y="confirmed", color="country", height=750, width=1000)
po.update_layout(
    title='Evolução dos casos confirmados de COVID-19 nos países que atualmente tem mais casos que o Brasil',
    xaxis_title="Dia",
    yaxis_title="Confirmados"
)

### Casos confirmados em dias desde o primeiro caso

In [0]:
#hide 

# computing day zero
start_date = df_imputed.groupby("country")["date"].min().reset_index()
start_date.columns = ["country", "start_date"]

# adding day zero to df
df_extended = pd.merge(start_date, df_imputed)
days_elapsed = pd.to_datetime(df_extended["date"]) - pd.to_datetime(df_extended["start_date"])
df_extended["days"] = days_elapsed.astype('timedelta64[D]')
df_extended[["confirmed","deaths","recovered"]] = df_extended[["confirmed","deaths","recovered"]].fillna(0)

# aggregating occurrences
df_aggregated_day0 = df_extended.pivot_table(index=["date","days","country"],
                                             values=["confirmed", "deaths", "recovered"], 
                                             aggfunc='sum').reset_index()

# filtering countries
df_brazil_day0 = df_aggregated_day0.query(f"country in {countries} and country != 'China'")

# resampling time series
df_brazil_day0["date"] = pd.to_datetime(df_brazil_day0["date"])
ts_brazil_day0 = df_brazil_day0.groupby("country").resample("D", on="date").sum().reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [0]:
#hide_input
po = px.line(ts_brazil_day0, x="days", y="confirmed", color="country", height=750, width=1000)
po.update_layout(
    title='Evolução dos casos confirmados de COVID-19 nos países que atualmente tem mais casos que o Brasil',
    xaxis_title="Dias desde a confirmação do 1o caso",
    yaxis_title="Confirmados"
)

### Casos confirmados desde o primeiro dia com 100 casos confirmados

In [0]:
#hide

# removing entries with less than 100 ocurrences
ts_100_confirmed = ts_brazil_day0.query("confirmed >= 100").drop("date", axis=1)

# computing start day
start_day = ts_100_confirmed.groupby("country")["days"].min().reset_index()
start_day.columns = ["country", "start_day"]

# updating start day
df_100_confirmed = pd.merge(start_day, ts_100_confirmed)
df_100_confirmed["days100"] = df_100_confirmed["days"] - df_100_confirmed["start_day"]
df_100_confirmed = df_100_confirmed.drop(["start_day","days"], axis = 1)

In [0]:
#hide_input
po = px.line(df_100_confirmed, x="days100", y="confirmed", color="country", height=750, width=1000)
po.update_layout(
    title='Evolução dos casos confirmados de COVID-19 nos países que atualmente tem mais casos que o Brasil',
    xaxis_title="Dias desde a confirmação do 100o caso",
    yaxis_title="Confirmados"
)

## Normalizando os dados em relação ao tamanho da população

In [0]:
#hide 
pwt = pd.read_excel("https://www.rug.nl/ggdc/docs/pwt91.xlsx", sheet_name="Data")

data_pop = pwt.loc[pwt["year"] == 2017, ["countrycode", "country", "pop"]]
data_pop["pop"] = data_pop["pop"] * 1000000
data_pop = data_pop.set_index("countrycode")

data_pop.loc["USA", "countrycode"] = "US"
data_pop.loc["IRN", "countrycode"] = "Iran"
data_pop.loc["KOR", "countrycode"] = "Korea, South"

df_relative = pd.merge(ts_brazil, data_pop)
df_relative["confirmed"] = (df_relative["confirmed"] / df_relative["pop"]) * 100000

### Casos confirmados por 100 mil habitantes por dia

In [0]:
#hide_input
po = px.line(df_relative, x="date", y="confirmed", color="country", height=750, width=1000)

po.update_layout(
    title='Evolução dos casos confirmados de COVID-19 nos países que atualmente tem mais casos que o Brasil',
    xaxis_title="Dia",
    yaxis_title="Confirmados por 100 mil habitantes"
)

### Casos confirmados  por 100 mil habitantes em dias desde o primeiro caso

In [0]:
#hide 
df_relative_day0 = pd.merge(ts_brazil_day0, data_pop)
df_relative["confirmed"] = (df_relative["confirmed"] / df_relative["pop"]) * 100000

In [0]:
#hide_input
po = px.line(df_relative_day0, x="days", y="confirmed", color="country", height=750, width=1000)

po.update_layout(
    title='Evolução dos casos confirmados de COVID-19 nos países que atualmente tem mais casos que o Brasil',
    xaxis_title="Dias desde a confirmação do 1o caso",
    yaxis_title="Confirmados por 100 mil habitantes"
)

### Casos confirmados  por 100 mil habitantes desde o primeiro dia com 100 casos confirmados

In [0]:
#hide
df_relative_100_confirmed = pd.merge(df_100_confirmed, data_pop)
df_relative_100_confirmed["confirmed"] = (df_relative_100_confirmed["confirmed"] / df_relative_100_confirmed["pop"]) * 100000

In [0]:
#hide_input
po = px.line(df_relative_100_confirmed, x="days100", y="confirmed", color="country", height=750, width=1000)
po.update_layout(
    title='Evolução dos casos confirmados de COVID-19 nos países que atualmente tem mais casos que o Brasil',
    xaxis_title="Dias desde a confirmação do 100o caso",
    yaxis_title="Confirmados por 100 mil habitantes"
)