# Análise estatística do dataset "World COVID-19 Data"
https://www.kaggle.com/datasets/abhishek14398/world-covid19-data

Arquivos utilizados:
- CONVENIENT_global_confirmed_cases.csv
    - Primeira linha: lista de países/regiões
    - Segunda linha: lista de províncias/estados para alguns países apenas
    - Primeira coluna: datas em dias
    - Colunas seguintes: quantidade de casos confirmados
- CONVENIENT_global_deaths.csv
    - Primeira linha: lista de países/regiões
    - Segunda linha: lista de províncias/estados para alguns países apenas
    - Primeira coluna: datas em dias
    - Colunas seguintes: quantidade de mortes
- CONVENIENT_global_metadata.csv
    - Primeira linha: header
    - Primeira coluna: índice
    - Segunda coluna: países/regiões
    - Terceira coluna: províncias/estados
    - Quarta coluna: latitude
    - Quinta coluna: longitude

In [258]:
# instalação de ffmpeg:
# !pip install ffmpeg-python
# from IPython.display import HTML
# Necessita do ffmpeg, então optamos por não utilizar

# importando pacotes
import os
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import plotly.express as px

# Silence every warning so the notebook output stays clean.
# NOTE(review): this filter is global, so deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

# Constants
# Folder that holds the input CSV files.
DATA_FOLDER = './data'

# File names of the COVID-19 tables, followed by the Gapminder tables.
CONFIRMED_CASES_DATA = 'CONVENIENT_global_confirmed_cases.csv'
DEATH_CASES_DATA = 'CONVENIENT_global_deaths.csv'
LOCATION_DATA = 'CONVENIENT_global_metadata.csv'
GAPMINDER_POP_AREA = 'gapminder_en_pop_dnst.csv'
GAPMINDER_GDP_TOTAL = 'gapminder_gdp_total_yearly_growth.csv'
GAPMINDER_GDPPERCAPTA = 'gapminder_gdppercapita_us_inflation_adjusted.csv'
GAPMINDER_MEDIAN_AGE = 'gapminder_median_age_years.csv'
GAPMINDER_POP_TOTAL = 'gapminder_population_total.csv'

In [259]:
# Read the deaths and confirmed-cases tables.
# Both CSVs have a two-row header: the first row is the country name and the
# second row is a province/state of that country when one exists, otherwise NaN.
def _load_daily_by_country(filename):
    """Load one two-row-header CSV and aggregate it to one column per country.

    Args:
        filename: Name of a CSV file inside DATA_FOLDER whose first header row
            holds 'Country/Region' and whose second holds 'Province/State'.

    Returns:
        A DataFrame with one row per date: a datetime column named 'data'
        followed by one column per country holding the sum over all of that
        country's province/state columns.
    """
    path_ = os.path.join(DATA_FOLDER, filename)
    frame = pd.read_csv(path_, sep=',', header=[0, 1])

    # Countries are column labels, so transpose to be able to group by country;
    # resetting the index turns the two header levels into regular columns.
    frame = frame.T.reset_index()
    # The first row now carries 'Country/Region', 'Province/State' and the dates:
    # promote it to column labels and drop it from the data.
    frame.columns = frame.loc[0]
    frame = frame.drop([0], axis=0)
    # The province is not needed because everything is summed per country.
    frame = frame.drop(['Province/State'], axis=1)
    frame = frame.groupby("Country/Region").sum()

    # Transpose back to the original orientation (one row per date) and move
    # the dates from the index into a column.
    frame = frame.T.reset_index()
    # The former index is labelled 0; rename it to 'data'.
    frame = frame.rename(columns={0: 'data'})
    # Clear the columns-axis name that would otherwise show above the index.
    frame.columns.names = ['']
    frame['data'] = pd.to_datetime(frame['data'])
    return frame


# Same pipeline for both tables, so their shape and column order stay comparable.
death = _load_daily_by_country(DEATH_CASES_DATA)
confirmed = _load_daily_by_country(CONFIRMED_CASES_DATA)

# Latitude/longitude metadata; the first CSV column is used as the row index.
path_ = os.path.join(DATA_FOLDER, LOCATION_DATA)
location = pd.read_csv(path_, index_col=0, sep=',')
# Disabled: build a "Region" column (country + "_" + province) to match the
# column headers of the death and confirmed frames.
#location["Region"] = location["Country/Region"].str.cat(location["Province/State"], sep="_", na_rep="")
# Disabled: strip the trailing "_" from cells where Province/State was NaN.
#location["Region"] = location["Region"].str.removesuffix("_")


In [192]:
# Display the per-country deaths table (one row per date).
death

Unnamed: 0,data,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,Armenia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe
0,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,2023-01-08,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
1082,2023-01-09,1.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0,0.0,...,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1083,2023-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1084,2023-01-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [193]:
# Display the per-country confirmed-cases table (one row per date).
confirmed

Unnamed: 0,data,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,Armenia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe
0,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,2023-01-08,53.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,24.0,52.0,0.0,0.0,0.0,0.0,0.0
1082,2023-01-09,22.0,1.0,1.0,0.0,0.0,0.0,0.0,40982.0,0.0,...,5649.0,172.0,0.0,14.0,71.0,0.0,0.0,0.0,177.0,0.0
1083,2023-01-10,25.0,47.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,27.0,0.0,25.0,77.0,0.0,0.0,0.0,258.0,0.0
1084,2023-01-11,34.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,36.0,0.0,23.0,35.0,0.0,0.0,0.0,349.0,0.0


In [194]:
location
# Although there are many NaN values, the rows containing NaN cannot simply be removed;
# they could be replaced, but that is not needed because it is handled later on.
# TODO: check the number of NaN values and their fraction.

Unnamed: 0,Country/Region,Province/State,Lat,Long,Region
0,Afghanistan,,34.039110,67.709953,Afghanistan
1,Albania,,41.253300,20.168300,Albania
2,Algeria,,28.133900,1.659600,Algeria
3,Andorra,,42.606300,1.521800,Andorra
4,Angola,,-11.102700,17.873900,Angola
...,...,...,...,...,...
284,West Bank and Gaza,,32.052200,35.233200,West Bank and Gaza
285,Winter Olympics 2022,,40.004200,116.407400,Winter Olympics 2022
286,Yemen,,15.652727,48.516388,Yemen
287,Zambia,,-13.033897,27.849332,Zambia


In [231]:
# Null check: total NaN count for the two case tables, per-column count for location.
print(death.isna().sum().sum())  # no null values
print(confirmed.isna().sum().sum())  # no null values
print(location.isna().sum())  # there are null values

0
0
Country/Region    0
Lat               0
Long              0
Region            0
dtype: int64


In [260]:
# Show, coordinate by coordinate, the rows whose value is missing
# (two country entries with regions that are not used).
display(location.loc[location['Lat'].isna()])
display(location.loc[location['Long'].isna()])
# Discard rows lacking either coordinate.
location.dropna(subset=['Long', 'Lat'], inplace=True)
# The Province/State column is not used from here on.
location.drop(columns=['Province/State'], inplace=True)


Unnamed: 0,Country/Region,Province/State,Lat,Long
53,Canada,Repatriated Travellers,,
89,China,Unknown,,


Unnamed: 0,Country/Region,Province/State,Lat,Long
53,Canada,Repatriated Travellers,,
89,China,Unknown,,


In [261]:
# Collapse location to one row per country, keeping the first value of each
# column within the group; the country becomes the index.
location = location.groupby(by='Country/Region').first()
#location[location['Country/Region'] == "Australia"]

In [264]:
# Sanity checks: death, confirmed and location must line up before being combined.
# `.equals` is used instead of an element-wise `==` because the element-wise
# comparison raises when lengths or labels differ, whereas `.equals` returns False.

# Same shape (same number of rows and columns).
print(death.shape == confirmed.shape)
# Same dates, row by row.
print(death['data'].equals(confirmed['data']))
# Same column labels, in the same order.
print(death.columns.equals(confirmed.columns))
# The location index (countries) matches the country columns of the case tables.
print(location.index.equals(death.columns[1:]))

True
True
True
True


In [None]:
# Print a summary of the frame (column dtypes, non-null counts, memory usage).
# `info` is a method: without the call only the bound-method repr is shown.
death.info()

In [None]:
# Check for NaN values (both totals are expected to be zero).
print(death.isna().sum().sum())
print(confirmed.isna().sum().sum())
print("="*20)
# Check for negative values: count them per country and show only the countries
# that have at least one. A plain column sum is used so the result does not
# depend on a hard-coded last row label.
negativos_death = (death.iloc[:, 1:] < 0).sum()
print(negativos_death[negativos_death > 0])
print("="*20)
negativos_confirmed = (confirmed.iloc[:, 1:] < 0).sum()
print(negativos_confirmed[negativos_confirmed > 0])
# Negative values here may be corrections of earlier, wrongly reported figures.


0
0
Andorra                                                        1
Angola                                                         1
Armenia                                                        1
Australia_Australian Capital Territory                         4
Australia_New South Wales                                      2
                                                              ..
United Kingdom_Cayman Islands                                  1
United Kingdom_Montserrat                                      1
United Kingdom_Saint Helena, Ascension and Tristan da Cunha    1
Venezuela                                                      1
Vietnam                                                        2
Name: 1085, Length: 106, dtype: int32
Afghanistan                                2
Antigua and Barbuda                        2
Australia_Australian Capital Territory     7
Australia_New South Wales                  8
Australia_Northern Territory              23
                  

In [None]:
# Descriptive statistics per country (one row per country after the transpose).
# The date column is dropped explicitly: newer pandas versions include datetime
# columns in describe(), which would add a non-numeric 'data' row to the result.
stats_death = death.drop(columns='data').describe().T
stats_death

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Afghanistan,1086.0,7.232044,15.972543,0.0,0.0,2.0,6.0,159.0
Albania,1086.0,3.311234,4.524262,0.0,0.0,1.0,5.0,21.0
Algeria,1086.0,6.336096,7.518476,0.0,0.0,5.0,9.0,49.0
Andorra,1086.0,0.151934,0.544711,-2.0,0.0,0.0,0.0,7.0
Angola,1086.0,1.777164,3.100776,-3.0,0.0,0.0,3.0,30.0
...,...,...,...,...,...,...,...,...
West Bank and Gaza,1086.0,5.255985,11.210053,0.0,0.0,1.0,7.0,268.0
Winter Olympics 2022,1086.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Yemen,1086.0,1.988029,4.476960,0.0,0.0,0.0,2.0,60.0
Zambia,1086.0,3.714549,9.796901,0.0,0.0,0.0,3.0,72.0


In [None]:
# Descriptive statistics per country (one row per country after the transpose).
# The date column is dropped explicitly: newer pandas versions include datetime
# columns in describe(), which would add a non-numeric 'data' row to the result.
stats_confirmed = confirmed.drop(columns='data').describe().T
stats_confirmed

In [None]:
# eliminar variáveis de variância nula (abaixo de um limiar)
# stats_death['CV'] = abs(stats_death['std'] * 100 / stats_death['mean'])
# stats_confirmed['CV'] = abs(stats_confirmed['std'] * 100 / stats_confirmed['mean'])
# print(stats_death)
# print(stats_confirmed)
# nao faz sentido este tratamento para este dataset.

In [None]:
# Countries whose maximum value in the deaths statistics is below two.
stats_death[stats_death['max'] < 2]

In [265]:
# Monthly deaths table: sum the rows of each (year, month) pair for every
# country column, then accumulate those monthly totals over time.
death_monthly = (
    death.groupby([death["data"].dt.year, death["data"].dt.month])[death.columns[1:]]
    .sum()
    .cumsum()
)
# Name the two index levels and turn them into regular columns.
death_monthly = death_monthly.rename_axis(['ano', 'mes']).reset_index()
# Build a 'data' label by joining the year with the zero-padded month.
death_monthly["data"] = death_monthly["ano"].astype(str) + death_monthly["mes"].astype(str).str.zfill(2)
death_monthly

Unnamed: 0,ano,mes,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,...,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe,data
0,2020,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,202001
1,2020,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,202002
2,2020,3,4.0,15.0,44.0,12.0,2.0,0.0,0.0,27.0,...,2.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,202003
3,2020,4,60.0,31.0,450.0,42.0,2.0,0.0,3.0,218.0,...,9.0,0.0,16.0,0.0,2.0,0.0,2.0,3.0,4.0,202004
4,2020,5,254.0,33.0,653.0,51.0,4.0,0.0,3.0,539.0,...,15.0,0.0,14.0,0.0,3.0,0.0,80.0,7.0,4.0,202005
5,2020,6,739.0,62.0,912.0,52.0,13.0,0.0,3.0,1307.0,...,26.0,0.0,51.0,0.0,8.0,0.0,312.0,24.0,7.0,202006
6,2020,7,1275.0,157.0,1210.0,52.0,52.0,0.0,3.0,3543.0,...,141.0,0.0,164.0,3.0,82.0,0.0,493.0,151.0,67.0,202007
7,2020,8,1406.0,284.0,1510.0,53.0,108.0,0.0,3.0,8660.0,...,320.0,0.0,386.0,34.0,152.0,0.0,566.0,288.0,202.0,202008
8,2020,9,1462.0,387.0,1736.0,53.0,183.0,0.0,3.0,16937.0,...,470.0,0.0,628.0,35.0,311.0,0.0,587.0,332.0,228.0,202009
9,2020,10,1533.0,509.0,1964.0,75.0,284.0,0.0,3.0,31002.0,...,566.0,0.0,798.0,35.0,483.0,0.0,599.0,349.0,243.0,202010


In [266]:
# Monthly confirmed-cases table: sum the rows of each (year, month) pair for
# every country column, then accumulate those monthly totals over time.
confirmed_monthly = (
    confirmed.groupby([confirmed["data"].dt.year, confirmed["data"].dt.month])[confirmed.columns[1:]]
    .sum()
    .cumsum()
)
# Name the two index levels and turn them into regular columns.
confirmed_monthly = confirmed_monthly.rename_axis(['ano', 'mes']).reset_index()
# Build a 'data' label by joining the year with the zero-padded month.
confirmed_monthly["data"] = confirmed_monthly["ano"].astype(str) + confirmed_monthly["mes"].astype(str).str.zfill(2)
confirmed_monthly

Unnamed: 0,ano,mes,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,...,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe,data
0,2020,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,202001
1,2020,2,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,202002
2,2020,3,166.0,243.0,716.0,376.0,7.0,0.0,7.0,1054.0,...,172.0,0.0,135.0,212.0,119.0,0.0,0.0,35.0,8.0,202003
3,2020,4,1827.0,773.0,4006.0,745.0,27.0,0.0,24.0,4428.0,...,2039.0,0.0,333.0,270.0,344.0,0.0,6.0,106.0,40.0,202004
4,2020,5,15180.0,1137.0,9394.0,764.0,86.0,0.0,26.0,16851.0,...,3623.0,0.0,1510.0,328.0,448.0,0.0,323.0,1057.0,178.0,202005
5,2020,6,31445.0,2535.0,13907.0,855.0,284.0,0.0,69.0,64530.0,...,8503.0,0.0,5832.0,355.0,2428.0,0.0,1158.0,1594.0,591.0,202006
6,2020,7,36628.0,5276.0,30394.0,925.0,1148.0,0.0,91.0,191302.0,...,24009.0,0.0,18574.0,558.0,11837.0,0.0,1728.0,5963.0,3169.0,202007
7,2020,8,38248.0,9513.0,44494.0,1176.0,2654.0,0.0,94.0,417735.0,...,41893.0,0.0,46728.0,1044.0,22729.0,0.0,1958.0,12097.0,6497.0,202008
8,2020,9,39354.0,13649.0,51530.0,2050.0,4972.0,0.0,101.0,751001.0,...,56717.0,0.0,75122.0,1094.0,39899.0,0.0,2034.0,14759.0,7838.0,202009
9,2020,10,41334.0,20875.0,57942.0,4756.0,10805.0,0.0,128.0,1166924.0,...,66932.0,0.0,92013.0,1180.0,53520.0,0.0,2063.0,16432.0,8367.0,202010


In [270]:
# Build a long-format frame for plotly.express.scatter_geo: one row per
# (month, country) with the columns data - confirmed - death - pais - lat - lon.
colunas_novo_df = ['data', 'confirmed', 'death', 'pais', 'lat', 'lon']
# One small frame per country, built from fresh columns so nothing is written
# into a slice of confirmed_monthly. The scalar values (pais, lat, lon) are
# broadcast to every row.
partes = [
    pd.DataFrame({
        'data': confirmed_monthly['data'],
        'confirmed': confirmed_monthly[coluna],
        'death': death_monthly[coluna],
        'pais': coluna,
        'lat': location.loc[coluna, 'Lat'],
        'lon': location.loc[coluna, 'Long'],
    })
    # columns[2:-1] skips the leading 'ano'/'mes' columns and the trailing 'data' column.
    for coluna in confirmed_monthly.columns[2:-1]
]
# Concatenate once at the end instead of growing the frame inside the loop;
# fall back to an empty frame with the same columns when there is no country.
if partes:
    novo_df = pd.concat(partes, ignore_index=True)
else:
    novo_df = pd.DataFrame(columns=colunas_novo_df)
novo_df

Unnamed: 0,data,confirmed,death,pais,lat,lon
0,202001,0.0,0.0,Afghanistan,34.039110,67.709953
1,202002,5.0,0.0,Afghanistan,34.039110,67.709953
2,202003,166.0,4.0,Afghanistan,34.039110,67.709953
3,202004,1827.0,60.0,Afghanistan,34.039110,67.709953
4,202005,15180.0,254.0,Afghanistan,34.039110,67.709953
...,...,...,...,...,...,...
7432,202209,257465.0,5602.0,Zimbabwe,-18.915438,29.154857
7433,202210,257893.0,5606.0,Zimbabwe,-18.915438,29.154857
7434,202211,259164.0,5620.0,Zimbabwe,-18.915438,29.154857
7435,202212,259981.0,5637.0,Zimbabwe,-18.915438,29.154857


In [272]:
# Animated bubble map of the accumulated confirmed cases per country, one frame
# per month, to spot countries with exaggerated growth during the pandemic.
# Every data argument is given as a column name of novo_df so that axis, legend
# and hover labels are taken from the column names.
fig = px.scatter_geo(
    novo_df,
    lat="lat",
    lon="lon",
    color="pais",  # column used to set the color of the markers
    hover_name="pais",  # column added to the hover information
    size="confirmed",  # column used for the size of the markers
    projection="natural earth",
    animation_frame="data",
    animation_group="pais",
    width=1600,
    height=800,
)
fig.show()

In [None]:
# Animated bubble map of the accumulated deaths per country, one frame per
# month, to spot countries with exaggerated growth during the pandemic.
# Every data argument is given as a column name of novo_df so that axis, legend
# and hover labels are taken from the column names.
fig = px.scatter_geo(
    novo_df,
    lat="lat",
    lon="lon",
    color="pais",  # column used to set the color of the markers
    hover_name="pais",  # column added to the hover information
    size="death",  # column used for the size of the markers
    projection="natural earth",
    animation_frame="data",
    animation_group="pais",
    width=1600,
    height=800,
)
fig.show()

In [None]:
# Line chart of the accumulated confirmed cases, one line per country.
fig = px.line(
    novo_df,
    x="data",
    y="confirmed",
    color='pais',
    width=1600,
    height=800,
)
fig.show()

In [None]:
# Line chart of the accumulated deaths, one line per country.
fig = px.line(
    novo_df,
    x="data",
    y="death",
    color='pais',
    width=1600,
    height=800,
)
fig.show()

# PAREI AQUI
# Analisando dados do COVID-19 em conjunto com dados sócio-econômicos

In [None]:
# Read the Gapminder table stored under GAPMINDER_POP_AREA.
# This file has a single header row: a 'country' column followed by one column per year.
path_ = os.path.join(DATA_FOLDER, GAPMINDER_POP_AREA)
pop_area = pd.read_csv(filepath_or_buffer=path_, sep=',')
pop_area

Unnamed: 0,country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,308,312,315,317,319,321,322,324,326,...,567,570,573,577,580,583,585,588,591,593
1,Afghanistan,14,14.3,14.6,14.9,15.3,15.6,15.9,16.3,16.7,...,46.1,47.7,49.4,51.1,52.7,54.2,55.6,56.9,58.3,59.6
2,Angola,4.44,4.5,4.56,4.6,4.63,4.64,4.63,4.63,4.66,...,19.4,20.1,20.9,21.6,22.4,23.1,23.9,24.7,25.5,26.4
3,Albania,60.6,62.5,64.3,66.2,68.1,69.9,71.7,73.8,76,...,106,106,106,105,105,105,105,105,104,104
4,Andorra,30.6,32.7,34.9,37.2,39.5,41.8,44.2,46.6,49,...,178,175,172,169,166,164,164,164,164,164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,Samoa,39.6,40.9,42.2,43.6,44.9,46.2,47.4,48.6,49.6,...,66.2,66.8,67.4,67.9,68.4,68.7,69,69.3,69.6,70.1
213,Yemen,10.2,10.4,10.5,10.7,10.8,11,11.2,11.4,11.5,...,45.1,46.4,47.6,48.9,50.2,51.5,52.7,54,55.2,56.5
214,South Africa,14.4,14.8,15.2,15.6,16,16.4,16.8,17.3,17.7,...,42.9,43.6,44.3,45,45.7,46.3,47,47.6,48.3,48.9
215,Zambia,4.26,4.39,4.52,4.66,4.8,4.95,5.11,5.27,5.44,...,18.9,19.5,20.1,20.7,21.4,22,22.7,23.3,24,24.7


In [None]:
# 2019 value for Brazil as a plain float. The single matching element is picked
# explicitly with .iloc[0] before converting, because calling float() directly
# on a one-element Series is deprecated in recent pandas versions.
float(pop_area.loc[pop_area["country"] == "Brazil", "2019"].iloc[0])

25.4

In [None]:
# For every country column of the monthly table, print the matching 2019 entry
# of pop_area; an empty Series means the name has no match in pop_area.
for pais in confirmed_monthly.columns[2:-1]:
    print(pais, " - ", pop_area.loc[pop_area["country"] == pais, "2019"])

Afghanistan  -  1    59.6
Name: 2019, dtype: object
Albania  -  3    104
Name: 2019, dtype: object
Algeria  -  54    18.4
Name: 2019, dtype: object
Andorra  -  4    164
Name: 2019, dtype: object
Angola  -  2    26.4
Name: 2019, dtype: object
Antarctica  -  Series([], Name: 2019, dtype: object)
Antigua and Barbuda  -  9    223
Name: 2019, dtype: object
Argentina  -  6    16.6
Name: 2019, dtype: object
Armenia  -  7    104
Name: 2019, dtype: object
Australia_Australian Capital Territory  -  Series([], Name: 2019, dtype: object)
Australia_New South Wales  -  Series([], Name: 2019, dtype: object)
Australia_Northern Territory  -  Series([], Name: 2019, dtype: object)
Australia_Queensland  -  Series([], Name: 2019, dtype: object)
Australia_South Australia  -  Series([], Name: 2019, dtype: object)
Australia_Tasmania  -  Series([], Name: 2019, dtype: object)
Australia_Victoria  -  Series([], Name: 2019, dtype: object)
Australia_Western Australia  -  Series([], Name: 2019, dtype: object)
Austri

In [None]:
# Pergunta 1: Há relação entre população e quantidade de casos?


In [None]:
# Focus on Brazil: dates plus the confirmed and death columns for the country.
# The date column is named 'data' (lowercase), so it is selected by that label.
brasil = pd.DataFrame({
    "data": confirmed["data"],
    "confirmed": confirmed["Brazil"],
    "death": death["Brazil"],
})
# Running totals of both series.
brasil["confirmed_ac"] = brasil["confirmed"].cumsum()
brasil["death_ac"] = brasil["death"].cumsum()
brasil

In [None]:
# Look for negative values in the Brazil table.
print(brasil.loc[brasil["confirmed"].lt(0)])
print(brasil.loc[brasil["death"].lt(0)])
# Negative values are possibly due to re-evaluations or retractions
# of earlier, wrongly reported data.

In [None]:
# Plotly's bundled gapminder sample, restricted to the rows where year == 2007.
df = px.data.gapminder().query("year == 2007")
df

In [None]:
# Confirmed cases for Brazil over time.
plt.figure(figsize=(10, 7))
plt.plot(brasil["data"], brasil["confirmed"])

In [None]:
# Confirmed cases for Brazil, only for dates strictly between 2022-09-19 and 2022-09-22.
plt.figure(figsize=(10, 7))
_janela = brasil[(brasil["data"] > "2022-09-19") & (brasil["data"] < "2022-09-22")]
plt.plot(_janela["data"], _janela["confirmed"])

In [None]:
# Rows of the Brazil table with dates strictly between 2022-09-19 and 2022-09-22.
brasil[(brasil["data"] > "2022-09-19") & (brasil["data"] < "2022-09-22")]

In [None]:
# Accumulated confirmed cases for Brazil over time.
plt.plot(brasil["data"], brasil["confirmed_ac"])

In [None]:
# Demo bubble map built from plotly's gapminder sample for the year 2007.
df = px.data.gapminder().query("year == 2007")
fig = px.scatter_geo(
    df,
    locations="iso_alpha",
    color="continent",  # column used to set the color of the markers
    hover_name="country",  # column added to the hover information
    size="pop",  # column used for the size of the markers
    projection="natural earth",
)
# Disabled alternative that plots explicit coordinates from a geo_df frame:
# fig = px.scatter_geo(geo_df,
#                     lat=geo_df.geometry.y,
#                     lon=geo_df.geometry.x,
#                     hover_name="name")
fig.show()

In [None]:
# Display the current gapminder frame.
df

In [None]:
# Load plotly's bundled gapminder sample without any filtering and display it.
df = px.data.gapminder()
df

In [None]:
# Copy of the confirmed table without a 'Total' column.
# drop() targets row labels by default and raises KeyError when the label is
# missing; the column is addressed explicitly and a missing 'Total' is ignored.
# NOTE(review): assumes 'Total' was meant as a column, not a row - confirm.
novo_confirmed_2 = confirmed.drop(columns='Total', errors='ignore')
novo_confirmed_2

In [None]:
# Latitude stored for Brazil. After the groupby on 'Country/Region' the country
# is the index of location, so the lookup goes through .loc on the index label
# rather than through a 'Country/Region' column.
location.loc["Brazil", "Lat"]

In [None]:
# Check whether any 'confirmed' value is below zero.
# NOTE(review): `empty_df` is not defined in any cell visible here - confirm
# which frame this was meant to inspect.
empty_df[empty_df['confirmed'] < 0]

In [None]:

# Minimal animation demo: a single scatter point moving along the x axis,
# saved as a GIF through Pillow.
fig, ax = plt.subplots()
ax.set_xlim([0, 10])

scat = ax.scatter(1, 0)
x = np.linspace(0, 10, num=50)


def animate(i):
    """Move the scatter point to (x[i], 0) and return the updated artist."""
    scat.set_offsets((x[i], 0))
    return (scat,)


ani = animation.FuncAnimation(
    fig,
    animate,
    repeat=True,
    frames=len(x) - 1,
    interval=50,
)

# Save the animation as a GIF using the Pillow writer.
writer = animation.PillowWriter(
    fps=15,
    metadata={'artist': 'Me'},
    bitrate=1800,
)
ani.save('scatter.gif', writer=writer)
# plt.show()