# Análise estatística do dataset "World COVID-19 Data"
https://www.kaggle.com/datasets/abhishek14398/world-covid19-data

Arquivos utilizados:
- CONVENIENT_global_confirmed_cases.csv
    - Primeira linha: lista de países/regiões
    - Segunda linha: lista de províncias/estados para alguns países apenas
    - Primeira coluna: datas em dias
    - Colunas seguintes, quantidade de casos confirmados
- CONVENIENT_global_deaths.csv
    - Primeira linha: lista de países/regiões
    - Segunda linha: lista de províncias/estados para alguns países apenas
    - Primeira coluna: datas em dias
    - Colunas seguintes, quantidade de casos de mortes
- CONVENIENT_global_metadata.csv
    - Primeira linha: header
    - Primeira coluna: índice
    - Segunda coluna: países/regiões
    - Terceira coluna: províncias/estados
    - Quarta coluna: latitude
    - Quinta coluna: longitude

In [1]:
# instalação de ffmpeg:
# !pip install ffmpeg-python
# from IPython.display import HTML
# Necessita do ffmpeg, então optamos por não utilizar

# importando pacotes
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import plotly.express as px
import plotly.graph_objects as go

# ignorar warnings (limpar notebook)
warnings.filterwarnings('ignore')

# constantes
DATA_FOLDER = './data'

CONFIRMED_CASES_DATA = 'CONVENIENT_global_confirmed_cases.csv'
DEATH_CASES_DATA = 'CONVENIENT_global_deaths.csv'
LOCATION_DATA = 'CONVENIENT_global_metadata.csv'
GAPMINDER_POP_AREA = 'gapminder_en_pop_dnst.csv'
GAPMINDER_GDP_TOTAL = 'gapminder_gdp_total_yearly_growth.csv'
GAPMINDER_GDPPERCAPTA = 'gapminder_gdppercapita_us_inflation_adjusted.csv'
GAPMINDER_MEDIAN_AGE = 'gapminder_median_age_years.csv'
GAPMINDER_POP_TOTAL = 'gapminder_population_total.csv'

In [2]:
# realizando a leitura dos dados
# esses arquivos tem duas linhas de header, onde a primeira é o nome do país
# a segunda é uma região do país, caso exista, senão, vazio (NaN)
path_ = os.path.join(DATA_FOLDER, DEATH_CASES_DATA)
death = pd.read_csv(path_, sep=',', header=[0,1])

# agregando os valores para o mesmo país (somando)
# dado que os países eram os nomes das colunas,
# fazer transposição para poder agregar por nome do país
death = death.T
# reseta o index para que os index virem colunas
death.reset_index(inplace=True)
# modifica o nome das colunas para as datas
death.columns = death.loc[0]
# dropa os nomes das colunas usadas acima
death.drop([0], axis=0, inplace=True)
# dropa a coluna da província, dado que vamos agrupar posteriormente por país
death.drop(['Province/State'], inplace=True, axis=1)
# agrupa por país somando os valores de casos
death = death.groupby("Country/Region").sum()
# faz a transposta novamente para retornar ao formato original
death = death.T
# joga os index como colunas
death.reset_index(inplace=True)
# renomeia a coluna 0 antigo índice para data
death.rename(columns={0: 'data'}, inplace=True)
# remove o nome das colunas que fica aparecendo em cima do novo index
death.columns.names = ['']
# transformando a coluna Data no tipo Data
death.data = pd.to_datetime(death.data)

# idem para confirmed
path_ = os.path.join(DATA_FOLDER, CONFIRMED_CASES_DATA)
confirmed = pd.read_csv(path_, sep=',', header=[0,1])
confirmed = confirmed.T
confirmed.reset_index(inplace=True)
confirmed.columns = confirmed.loc[0]
confirmed.drop([0], axis=0, inplace=True)
confirmed.drop(['Province/State'], inplace=True, axis=1)
confirmed = confirmed.groupby("Country/Region").sum()
confirmed = confirmed.T
confirmed.reset_index(inplace=True)
confirmed.rename(columns={0: 'data'}, inplace=True)
confirmed.columns.names = ['']
confirmed.data = pd.to_datetime(confirmed.data)

# arquivo de latitude e longitude
# primeira coluna de indice
path_ = os.path.join(DATA_FOLDER, LOCATION_DATA)
location = pd.read_csv(path_, sep=',', index_col=0)
# no caso do location tem que criar uma nova coluna para match os nomes dos headers dos dfs death e confirmed
#location["Region"] = location["Country/Region"].str.cat(location["Province/State"], sep="_", na_rep="")
# remover _ de células onde o Province/State era NaN
#location["Region"] = location["Region"].str.removesuffix("_")


In [3]:
death

Unnamed: 0,data,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,Armenia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe
0,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,2023-01-08,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
1082,2023-01-09,1.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0,0.0,...,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
1083,2023-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1084,2023-01-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
confirmed

Unnamed: 0,data,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,Armenia,...,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe
0,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,2023-01-08,53.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,24.0,52.0,0.0,0.0,0.0,0.0,0.0
1082,2023-01-09,22.0,1.0,1.0,0.0,0.0,0.0,0.0,40982.0,0.0,...,5649.0,172.0,0.0,14.0,71.0,0.0,0.0,0.0,177.0,0.0
1083,2023-01-10,25.0,47.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,27.0,0.0,25.0,77.0,0.0,0.0,0.0,258.0,0.0
1084,2023-01-11,34.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,36.0,0.0,23.0,35.0,0.0,0.0,0.0,349.0,0.0


In [5]:
location
# embora haja muitos valores NaN, nao podemos remover as linhas com NaN, podemos substituir, mas nao sera necessario
# pois sera feito posteriormente
#checar qtd de NaN e qtd em fração de Nan

Unnamed: 0,Country/Region,Province/State,Lat,Long
0,Afghanistan,,34.039110,67.709953
1,Albania,,41.253300,20.168300
2,Algeria,,28.133900,1.659600
3,Andorra,,42.606300,1.521800
4,Angola,,-11.102700,17.873900
...,...,...,...,...
284,West Bank and Gaza,,32.052200,35.233200
285,Winter Olympics 2022,,40.004200,116.407400
286,Yemen,,15.652727,48.516388
287,Zambia,,-13.033897,27.849332


In [6]:
# checando valores nulos
print(sum(death.isna().sum())) #não há valores nulos
print(sum(confirmed.isna().sum())) #não há valores nulos
print(location.isna().sum()) #há valores nulos

0
0
Country/Region      0
Province/State    198
Lat                 2
Long                2
dtype: int64


In [7]:
# verificando coluna a coluna o que são os valores nulos
display(location[location['Lat'].isna()]) #dois países com regiões não utilizados
display(location[location['Long'].isna()]) #dois países com regiões não utilizados
location.dropna(subset=['Long', 'Lat'], inplace=True)
# não usaremos a coluna Province/States
location.drop('Province/State', axis=1, inplace=True)


Unnamed: 0,Country/Region,Province/State,Lat,Long
53,Canada,Repatriated Travellers,,
89,China,Unknown,,


Unnamed: 0,Country/Region,Province/State,Lat,Long
53,Canada,Repatriated Travellers,,
89,China,Unknown,,


In [8]:
# agrupando location pelo pais:
location = location.groupby('Country/Region').first()
#location[location['Country/Region'] == "Australia"]

In [9]:
# confirmando se o shape de death e confirmed são iguais (mesma qtd de linhas e colunas)
print(death.shape == confirmed.shape)
# confirmando se as datas são iguais analisando uma a uma, somando os trues e comparando a quantidade de datas
print(sum(death.data == confirmed.data) == death.shape[0])
# confirmando se os nomes das colunas são iguais
print(sum(death.columns == confirmed.columns) == (death.columns.size))
# confirmando se o location["Regions"] bate com os nomes das colunas dos outros dois dfs:
print(sum(location.index == death.columns[1:]) == death.columns[1:].size)

True
True
True
True


In [10]:
# verificando informações do DF
death.info

<bound method DataFrame.info of            data  Afghanistan  Albania  Algeria  Andorra  Angola  Antarctica  \
0    2020-01-23          0.0      0.0      0.0      0.0     0.0         0.0   
1    2020-01-24          0.0      0.0      0.0      0.0     0.0         0.0   
2    2020-01-25          0.0      0.0      0.0      0.0     0.0         0.0   
3    2020-01-26          0.0      0.0      0.0      0.0     0.0         0.0   
4    2020-01-27          0.0      0.0      0.0      0.0     0.0         0.0   
...         ...          ...      ...      ...      ...     ...         ...   
1081 2023-01-08          3.0      0.0      0.0      0.0     0.0         0.0   
1082 2023-01-09          1.0      0.0      0.0      0.0     0.0         0.0   
1083 2023-01-10          0.0      0.0      0.0      0.0     0.0         0.0   
1084 2023-01-11          0.0      0.0      0.0      0.0     0.0         0.0   
1085 2023-01-12          0.0      0.0      0.0      0.0     0.0         0.0   

      Antigua and B

In [11]:
# analise estatistica
stats_death = death.describe().T
stats_death

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
,,,,,,,,
Afghanistan,1086.0,7.232044,15.972543,0.0,0.0,2.0,6.0,159.0
Albania,1086.0,3.311234,4.524262,0.0,0.0,1.0,5.0,21.0
Algeria,1086.0,6.336096,7.518476,0.0,0.0,5.0,9.0,49.0
Andorra,1086.0,0.151934,0.544711,-2.0,0.0,0.0,0.0,7.0
Angola,1086.0,1.777164,3.100776,-3.0,0.0,0.0,3.0,30.0
...,...,...,...,...,...,...,...,...
West Bank and Gaza,1086.0,5.255985,11.210053,0.0,0.0,1.0,7.0,268.0
Winter Olympics 2022,1086.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Yemen,1086.0,1.988029,4.476960,0.0,0.0,0.0,2.0,60.0


In [12]:
# análise estatística
stats_confirmed = confirmed.describe().T
stats_confirmed

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
,,,,,,,,
Afghanistan,1086.0,191.436464,335.302905,-6.0,27.0,76.0,193.75,3243.0
Albania,1086.0,307.567219,449.182574,0.0,18.0,107.0,492.25,4789.0
Algeria,1086.0,249.794659,337.602214,0.0,12.0,148.0,320.00,2521.0
Andorra,1086.0,43.997238,139.883490,0.0,0.0,0.0,36.00,2313.0
Angola,1086.0,96.772560,243.838095,0.0,0.0,26.0,110.00,5035.0
...,...,...,...,...,...,...,...,...
West Bank and Gaza,1086.0,647.539595,1727.721221,0.0,0.0,137.5,651.75,30356.0
Winter Olympics 2022,1086.0,0.492634,4.123378,0.0,0.0,0.0,0.00,72.0
Yemen,1086.0,10.999079,23.055073,-1.0,0.0,1.0,11.00,287.0


In [13]:
# eliminar variáveis de variância nula (abaixo de um limiar)
stats_death['CV'] = abs(stats_death['std'] * 100 / stats_death['mean'])
stats_confirmed['CV'] = abs(stats_confirmed['std'] * 100 / stats_confirmed['mean'])
display(stats_death)
display(stats_confirmed)
# nao faz sentido este tratamento para este dataset.

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,CV
,,,,,,,,,
Afghanistan,1086.0,7.232044,15.972543,0.0,0.0,2.0,6.0,159.0,220.857934
Albania,1086.0,3.311234,4.524262,0.0,0.0,1.0,5.0,21.0,136.633706
Algeria,1086.0,6.336096,7.518476,0.0,0.0,5.0,9.0,49.0,118.661030
Andorra,1086.0,0.151934,0.544711,-2.0,0.0,0.0,0.0,7.0,358.519160
Angola,1086.0,1.777164,3.100776,-3.0,0.0,0.0,3.0,30.0,174.478922
...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,1086.0,5.255985,11.210053,0.0,0.0,1.0,7.0,268.0,213.281666
Winter Olympics 2022,1086.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,
Yemen,1086.0,1.988029,4.476960,0.0,0.0,0.0,2.0,60.0,225.195878


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,CV
,,,,,,,,,
Afghanistan,1086.0,191.436464,335.302905,-6.0,27.0,76.0,193.75,3243.0,175.151012
Albania,1086.0,307.567219,449.182574,0.0,18.0,107.0,492.25,4789.0,146.043709
Algeria,1086.0,249.794659,337.602214,0.0,12.0,148.0,320.00,2521.0,135.151894
Andorra,1086.0,43.997238,139.883490,0.0,0.0,0.0,36.00,2313.0,317.936984
Angola,1086.0,96.772560,243.838095,0.0,0.0,26.0,110.00,5035.0,251.970285
...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,1086.0,647.539595,1727.721221,0.0,0.0,137.5,651.75,30356.0,266.813216
Winter Olympics 2022,1086.0,0.492634,4.123378,0.0,0.0,0.0,0.00,72.0,837.007289
Yemen,1086.0,10.999079,23.055073,-1.0,0.0,1.0,11.00,287.0,209.609122


In [14]:
# checando paises com numero de mortes menores que dois
stats_death[stats_death['max'] < 2]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,CV
,,,,,,,,,
Antarctica,1086.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Holy See,1086.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Nauru,1086.0,0.000921,0.030345,0.0,0.0,0.0,0.0,1.0,3295.451411
Summer Olympics 2020,1086.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Tuvalu,1086.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Winter Olympics 2022,1086.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [15]:
# criando novo DF agrupando por ano e mes e somando os casos de mortes
death_monthly = death.groupby([death.data.dt.year, death.data.dt.month])[death.columns[1:]].sum()
# fazendo a soma cumulativa dos casos
death_monthly = death_monthly.cumsum()
# renomeando o indice
death_monthly.index.rename(['ano', 'mes'], inplace=True)
# resetando o indice para transformar em colunas do DF
death_monthly.reset_index(inplace=True)
# criando uma nova coluna de data unindo ano e mes e adicionando leading zero
death_monthly["data"] = death_monthly["ano"].astype(str) + death_monthly["mes"].astype(str).str.zfill(2)
death_monthly

Unnamed: 0,ano,mes,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,...,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe,data
0,2020,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,202001
1,2020,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,202002
2,2020,3,4.0,15.0,44.0,12.0,2.0,0.0,0.0,27.0,...,2.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,202003
3,2020,4,60.0,31.0,450.0,42.0,2.0,0.0,3.0,218.0,...,9.0,0.0,16.0,0.0,2.0,0.0,2.0,3.0,4.0,202004
4,2020,5,254.0,33.0,653.0,51.0,4.0,0.0,3.0,539.0,...,15.0,0.0,14.0,0.0,3.0,0.0,80.0,7.0,4.0,202005
5,2020,6,739.0,62.0,912.0,52.0,13.0,0.0,3.0,1307.0,...,26.0,0.0,51.0,0.0,8.0,0.0,312.0,24.0,7.0,202006
6,2020,7,1275.0,157.0,1210.0,52.0,52.0,0.0,3.0,3543.0,...,141.0,0.0,164.0,3.0,82.0,0.0,493.0,151.0,67.0,202007
7,2020,8,1406.0,284.0,1510.0,53.0,108.0,0.0,3.0,8660.0,...,320.0,0.0,386.0,34.0,152.0,0.0,566.0,288.0,202.0,202008
8,2020,9,1462.0,387.0,1736.0,53.0,183.0,0.0,3.0,16937.0,...,470.0,0.0,628.0,35.0,311.0,0.0,587.0,332.0,228.0,202009
9,2020,10,1533.0,509.0,1964.0,75.0,284.0,0.0,3.0,31002.0,...,566.0,0.0,798.0,35.0,483.0,0.0,599.0,349.0,243.0,202010


In [16]:
# idem para confirmed
confirmed_monthly = confirmed.groupby([confirmed.data.dt.year, confirmed.data.dt.month])[confirmed.columns[1:]].sum()
confirmed_monthly = confirmed_monthly.cumsum()
confirmed_monthly.index.rename(['ano', 'mes'], inplace=True)
confirmed_monthly.reset_index(inplace=True)
confirmed_monthly["data"] = confirmed_monthly["ano"].astype(str) + confirmed_monthly["mes"].astype(str).str.zfill(2)
confirmed_monthly

Unnamed: 0,ano,mes,Afghanistan,Albania,Algeria,Andorra,Angola,Antarctica,Antigua and Barbuda,Argentina,...,Uzbekistan,Vanuatu,Venezuela,Vietnam,West Bank and Gaza,Winter Olympics 2022,Yemen,Zambia,Zimbabwe,data
0,2020,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,202001
1,2020,2,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,202002
2,2020,3,166.0,243.0,716.0,376.0,7.0,0.0,7.0,1054.0,...,172.0,0.0,135.0,212.0,119.0,0.0,0.0,35.0,8.0,202003
3,2020,4,1827.0,773.0,4006.0,745.0,27.0,0.0,24.0,4428.0,...,2039.0,0.0,333.0,270.0,344.0,0.0,6.0,106.0,40.0,202004
4,2020,5,15180.0,1137.0,9394.0,764.0,86.0,0.0,26.0,16851.0,...,3623.0,0.0,1510.0,328.0,448.0,0.0,323.0,1057.0,178.0,202005
5,2020,6,31445.0,2535.0,13907.0,855.0,284.0,0.0,69.0,64530.0,...,8503.0,0.0,5832.0,355.0,2428.0,0.0,1158.0,1594.0,591.0,202006
6,2020,7,36628.0,5276.0,30394.0,925.0,1148.0,0.0,91.0,191302.0,...,24009.0,0.0,18574.0,558.0,11837.0,0.0,1728.0,5963.0,3169.0,202007
7,2020,8,38248.0,9513.0,44494.0,1176.0,2654.0,0.0,94.0,417735.0,...,41893.0,0.0,46728.0,1044.0,22729.0,0.0,1958.0,12097.0,6497.0,202008
8,2020,9,39354.0,13649.0,51530.0,2050.0,4972.0,0.0,101.0,751001.0,...,56717.0,0.0,75122.0,1094.0,39899.0,0.0,2034.0,14759.0,7838.0,202009
9,2020,10,41334.0,20875.0,57942.0,4756.0,10805.0,0.0,128.0,1166924.0,...,66932.0,0.0,92013.0,1180.0,53520.0,0.0,2063.0,16432.0,8367.0,202010


In [17]:
# criando novo dataframe com o formato apropriado para utilizar plotly.express.scatter_geo:
# data - confirmed - pais - lat - lon
# para utilizar com scatter_geo
novo_df = pd.DataFrame(columns=['data', 'confirmed', 'death', 'pais', 'lat', 'lon'])
# itera pelas colunas com nomes dos países
for coluna in confirmed_monthly.columns[2:-1]:
    # cria um dataframe temporário com a data e o país
    temp = confirmed_monthly[['data', coluna]]
    # cria a coluna de mortes buscando no dataframe das mortes
    temp["death"] = death_monthly[[coluna]]
    temp['pais'] = temp.columns[1]
    temp['lat'] = location.loc[coluna]['Lat']
    temp['lon'] = location.loc[coluna]['Long']
    temp.columns = ['data', 'confirmed', 'death', 'pais', 'lat', 'lon']
    novo_df = pd.concat([novo_df, temp], ignore_index=True)
novo_df

Unnamed: 0,data,confirmed,death,pais,lat,lon
0,202001,0.0,0.0,Afghanistan,34.039110,67.709953
1,202002,5.0,0.0,Afghanistan,34.039110,67.709953
2,202003,166.0,4.0,Afghanistan,34.039110,67.709953
3,202004,1827.0,60.0,Afghanistan,34.039110,67.709953
4,202005,15180.0,254.0,Afghanistan,34.039110,67.709953
...,...,...,...,...,...,...
7432,202209,257465.0,5602.0,Zimbabwe,-18.915438,29.154857
7433,202210,257893.0,5606.0,Zimbabwe,-18.915438,29.154857
7434,202211,259164.0,5620.0,Zimbabwe,-18.915438,29.154857
7435,202212,259981.0,5637.0,Zimbabwe,-18.915438,29.154857


In [18]:
fig = px.scatter_geo(novo_df, locations="pais", locationmode='country names', color="confirmed",
                        size="confirmed", hover_name="pais", range_color= [0, 50000000],
                        projection="natural earth", animation_frame="data",
                        color_continuous_scale="portland", opacity=0.7,
                        title='Casos confirmados de COVID-19 por país', width=1000, height=500)
fig.show()

In [19]:
fig = px.scatter_geo(novo_df, locations="pais", locationmode='country names', color="death",
                        size="death", hover_name="pais", range_color= [0, 500000],
                        projection="natural earth", animation_frame="data",
                        color_continuous_scale="portland", opacity=0.7,
                        title='Mortes de COVID-19 por país', width=1000, height=500)
fig.show()

In [20]:
# plotando grafico de linhas com os valores acumulados por pais
fig = px.line(novo_df, x="data", y="confirmed", color='pais', width=1000, height=500)
fig.show()

In [21]:
# plotando grafico de linhas com os valores acumulados por pais
fig = px.line(novo_df, x="data", y="death", color='pais', width=1000, height=500)
fig.show()

In [22]:
# Pergunta: Qual o máximo de casos confirmados diariamente de covid? Qual o país?
#            e para mortes?
print(confirmed.max(numeric_only=True).sort_values(ascending=False)) #US foi o maior caso
print("="*20)
print(death.max(numeric_only=True).sort_values(ascending=False))




US                      1354456.0
United Kingdom           848169.0
Turkey                   823225.0
Korea, South             621317.0
France                   503349.0
                          ...    
Winter Olympics 2022         72.0
Antarctica                   11.0
MS Zaandam                    7.0
Holy See                      7.0
Korea, North                  1.0
Length: 201, dtype: float64

Chile                   11447.0
Ecuador                  8786.0
India                    7374.0
US                       4389.0
Mexico                   4272.0
                         ...   
Winter Olympics 2022        0.0
Tuvalu                      0.0
Antarctica                  0.0
Summer Olympics 2020        0.0
Holy See                    0.0
Length: 201, dtype: float64


In [23]:
# Pergunta: Quando houve este pico máximo?
print(confirmed[confirmed["US"] == confirmed.max(numeric_only=True).max()]["data"]) #2022-01-10
print("="*20)
print(death[death["Chile"] == death.max(numeric_only=True).max()]["data"]) #2022-03-21

718   2022-01-10
Name: data, dtype: datetime64[ns]
788   2022-03-21
Name: data, dtype: datetime64[ns]


In [24]:
# Pergunta: Será que houve algum "erro" de informação onde acumulou muito caso em um dia?
display(confirmed[(confirmed["data"] > "2022-01-05") & (confirmed["data"] < "2022-01-15")][["data", "US"]])
print("="*20)
display(death[(death["data"] > "2022-03-16") & (death["data"] < "2022-03-26")][["data", "Chile"]])


Unnamed: 0,data,US
714,2022-01-06,844346.0
715,2022-01-07,878799.0
716,2022-01-08,408575.0
717,2022-01-09,493357.0
718,2022-01-10,1354456.0
719,2022-01-11,785891.0
720,2022-01-12,853584.0
721,2022-01-13,865030.0
722,2022-01-14,878894.0




Unnamed: 0,data,Chile
784,2022-03-17,184.0
785,2022-03-18,93.0
786,2022-03-19,101.0
787,2022-03-20,78.0
788,2022-03-21,11447.0
789,2022-03-22,11.0
790,2022-03-23,16.0
791,2022-03-24,155.0
792,2022-03-25,75.0


In [25]:
# Consideração: O dado do Chile aparenta estar com algum erro, o dado está destoante do restante
fig = px.line(death, x="data", y="Chile", width=1000, height=500)
fig.show()
# Confirmando online, verificou-se que nesta data teve esta quantidade de mortes no Chile.

In [26]:
# Pergunta: Qual país teve a menor variância no número de casos confirmados e de morte?
display(stats_confirmed[stats_confirmed["CV"] == stats_confirmed["CV"].min()])
display(stats_death[stats_death["CV"] == stats_death["CV"].min()])

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,CV
,,,,,,,,,
United Arab Emirates,1086.0,964.862799,903.007555,0.0,264.25,623.5,1520.75,4471.0,93.589219


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,CV
,,,,,,,,,
US,1086.0,1012.41989,937.863306,-254.0,338.25,728.5,1448.5,4389.0,92.635804


In [27]:
# criando um novo dataframe concatenando as séries com a somatória das mortes e casos confirmados para cada país
#utilize apply para criar uma nova coluna com a taxa de mortalidade
mortalidade = pd.concat([death.sum(), confirmed.sum()], axis=1)
mortalidade.reset_index(inplace=True)
mortalidade.columns = ['country', 'death', 'confirmed']
mortalidade['taxa_mortalidade'] = mortalidade.apply(lambda x: x['death']/x['confirmed'], axis=1)
mortalidade

Unnamed: 0,country,death,confirmed,taxa_mortalidade
0,Afghanistan,7854.0,207900.0,0.037778
1,Albania,3596.0,334018.0,0.010766
2,Algeria,6881.0,271277.0,0.025365
3,Andorra,165.0,47781.0,0.003453
4,Angola,1930.0,105095.0,0.018364
...,...,...,...,...
196,West Bank and Gaza,5708.0,703228.0,0.008117
197,Winter Olympics 2022,0.0,535.0,0.000000
198,Yemen,2159.0,11945.0,0.180745
199,Zambia,4034.0,336340.0,0.011994


In [28]:
# Pergunta: Qual o país que teve maior taxa de mortalidade para casos confirmados?
mortalidade.sort_values(by='taxa_mortalidade', ascending=False, inplace=True)
mortalidade.head(60)
# Verifica-se que os dados da Korea do Norte não fazem sentidos, tem um total de 6 mortes e um total de 1 caso confirmado
# Verifica-se também que a maioria dos países que tiveram uma maior taxa de mortalidade são países pobres, com exceção do Peru, México e Equador

Unnamed: 0,country,death,confirmed,taxa_mortalidade
93,"Korea, North",6.0,1.0,6.0
107,MS Zaandam,2.0,9.0,0.222222
198,Yemen,2159.0,11945.0,0.180745
169,Sudan,4995.0,63702.0,0.078412
174,Syria,3163.0,57423.0,0.055082
164,Somalia,1361.0,27310.0,0.049835
141,Peru,218455.0,4471726.0,0.048853
54,Egypt,24803.0,515645.0,0.048101
117,Mexico,331407.0,7297914.0,0.045411
22,Bosnia and Herzegovina,16241.0,401168.0,0.040484


In [29]:
# Pergunta: Qual o país que teve menor taxa de mortalidade para casos confirmados?
fig = px.scatter(mortalidade, x="confirmed", y="death", hover_name="country", width=800, height=400, log_x=True, log_y=True)
# Adicionando uma linha de tendência
fig.add_traces(go.Scatter(x=mortalidade["confirmed"], y=mortalidade["confirmed"]*0.01, mode='lines', name='Tendência'))
fig.show()

In [218]:
#fazendo copia do dataframe para não alterar o original
mortalidade2 = mortalidade.copy()
# removendo os países com poucos casos confirmados
mortalidade2 = mortalidade2[mortalidade2["confirmed"] > 1000]
# removendo os países com poucas mortes
mortalidade2 = mortalidade2[mortalidade2["death"] > 100]
# plotando mortes por casos confirmados com grafico scatter logaritmico
fig = px.scatter(mortalidade2, x="confirmed", y="death", hover_name="country", width=800, height=400, log_x=True, log_y=True)
# tracar uma reta de tendência
fig.add_traces(go.Scatter(x=mortalidade2["confirmed"], y=mortalidade2["confirmed"]*0.01, mode='lines', name='Tendência'))
fig.show()