In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go

In [5]:
# Loading data
df = pd.read_csv('/Users/henrique_oliveira/Projetos/projects/covid19/data/covid_19_data.csv', parse_dates=['ObservationDate', 'Last Update'])

In [7]:
# Verifying data types
df.dtypes

SNo                         int64
ObservationDate    datetime64[ns]
Province/State             object
Country/Region             object
Last Update        datetime64[ns]
Confirmed                 float64
Deaths                    float64
Recovered                 float64
dtype: object

Column names must haven't capital letters nor special characters. Let's build a function for correct teh columns names.

In [12]:
import re

def correct_columns(col_name):
    return re.sub(r"[/| ]", "", col_name).lower()

In [15]:
# Let's correct all the columns from the dataframe
df.columns = [correct_columns(col) for col in df.columns]

In [17]:
df

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
306424,306425,2021-05-29,Zaporizhia Oblast,Ukraine,2021-05-30 04:20:55,102641.0,2335.0,95289.0
306425,306426,2021-05-29,Zeeland,Netherlands,2021-05-30 04:20:55,29147.0,245.0,0.0
306426,306427,2021-05-29,Zhejiang,Mainland China,2021-05-30 04:20:55,1364.0,1.0,1324.0
306427,306428,2021-05-29,Zhytomyr Oblast,Ukraine,2021-05-30 04:20:55,87550.0,1738.0,83790.0


# Brazil

Let's select only the data from Brazil to investigate

In [19]:
[df.countryregion == "Brazil"]df.loc

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
84,85,2020-01-23,,Brazil,2020-01-23 17:00:00,0.0,0.0,0.0
2525,2526,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2631,2632,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2742,2743,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2852,2853,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
306272,306273,2021-05-29,Roraima,Brazil,2021-05-30 04:20:55,103222.0,1635.0,96188.0
306290,306291,2021-05-29,Santa Catarina,Brazil,2021-05-30 04:20:55,965277.0,15174.0,921496.0
306292,306293,2021-05-29,Sao Paulo,Brazil,2021-05-30 04:20:55,3254893.0,111123.0,2895697.0
306298,306299,2021-05-29,Sergipe,Brazil,2021-05-30 04:20:55,233932.0,5054.0,208146.0


In [21]:
brazil = df.loc[
    (df.countryregion == "Brazil") &
    (df.confirmed > 0)
]


In [22]:
brazil

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
2525,2526,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2631,2632,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2742,2743,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2852,2853,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
2981,2982,2020-03-01,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
306272,306273,2021-05-29,Roraima,Brazil,2021-05-30 04:20:55,103222.0,1635.0,96188.0
306290,306291,2021-05-29,Santa Catarina,Brazil,2021-05-30 04:20:55,965277.0,15174.0,921496.0
306292,306293,2021-05-29,Sao Paulo,Brazil,2021-05-30 04:20:55,3254893.0,111123.0,2895697.0
306298,306299,2021-05-29,Sergipe,Brazil,2021-05-30 04:20:55,233932.0,5054.0,208146.0


# Confirmed cases

In [23]:
# Plot of the evolution of confirmed cases
px.line(brazil, 'observationdate', 'confirmed', title = 'Confirmed cases from Brazil') 

# New cases by day

In [24]:
brazil['new_cases'] = list(map(
    lambda x: 0 if (x==0) else brazil['confirmed'].iloc[x] - brazil['confirmed'].iloc[x-1],
    np.arange(brazil.shape[0])
))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
#Visualizing
px.line(brazil, 'observationdate', 'new_cases', title = 'New cases by day')