# Prediction of COVID-19 project

Import libraries

In [18]:
from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

print("Libraries imported")

Libraries imported


In [7]:
# Load the COVID-19 dataset straight from GitHub. The URL references a specific
# commit hash of the repository rather than a branch name.
url = "https://github.com/hadirga/dio-creating-ml-models-to-predict-covid19-evolution/blob/309b6d99c41cff772d1ca03283f2114082d7546d/datasets/covid_19_data.csv?raw=true"
# Parse the two date columns as datetimes while reading instead of leaving them as strings.
df = pd.read_csv(url, parse_dates=["ObservationDate", "Last Update"])
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,2020-01-22,Anhui,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
1,2,2020-01-22,Beijing,Mainland China,2020-01-22 17:00:00,14.0,0.0,0.0
2,3,2020-01-22,Chongqing,Mainland China,2020-01-22 17:00:00,6.0,0.0,0.0
3,4,2020-01-22,Fujian,Mainland China,2020-01-22 17:00:00,1.0,0.0,0.0
4,5,2020-01-22,Gansu,Mainland China,2020-01-22 17:00:00,0.0,0.0,0.0


In [3]:
# Check the dtype of each column, in particular that the two columns passed to
# `parse_dates` were read as datetimes.
df.dtypes

SNo                         int64
ObservationDate    datetime64[ns]
Province/State             object
Country/Region             object
Last Update        datetime64[ns]
Confirmed                 float64
Deaths                    float64
Recovered                 float64
dtype: object

Clean the column names by converting them to lowercase and removing slashes, pipes, and spaces.

In [8]:
# Create a function to clean the column names
import re

def clean_column_name(col_name):
    """Return ``col_name`` in lowercase with "/", "|" and space characters removed.

    Example: ``"Covid/19 DataSet"`` becomes ``"covid19dataset"``.
    """
    # Drop the unwanted separator characters first, then normalise the case.
    without_separators = re.sub(r"[/| ]", "", col_name)
    return without_separators.lower()

clean_column_name("Covid/19 DataSet") # testing function

'covid19dataset'

In [9]:
# Apply the cleaning function to every column label of the DataFrame
df.columns = df.columns.map(clean_column_name)
df.columns

Index(['sno', 'observationdate', 'provincestate', 'countryregion',
       'lastupdate', 'confirmed', 'deaths', 'recovered'],
      dtype='object')

# Brazil

Select only the data from Brazil

In [14]:
# List the distinct country/region labels to find the exact spelling used for Brazil
df.countryregion.unique()

array(['Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'US', 'Japan',
       'Thailand', 'South Korea', 'China', 'Kiribati', 'Singapore',
       'Philippines', 'Malaysia', 'Vietnam', 'Australia', 'Mexico',
       'Brazil', 'Colombia', 'France', 'Nepal', 'Canada', 'Cambodia',
       'Sri Lanka', 'Ivory Coast', 'Germany', 'Finland',
       'United Arab Emirates', 'India', 'Italy', 'UK', 'Russia', 'Sweden',
       'Spain', 'Belgium', 'Others', 'Egypt', 'Iran', 'Israel', 'Lebanon',
       'Iraq', 'Oman', 'Afghanistan', 'Bahrain', 'Kuwait', 'Austria',
       'Algeria', 'Croatia', 'Switzerland', 'Pakistan', 'Georgia',
       'Greece', 'North Macedonia', 'Norway', 'Romania', 'Denmark',
       'Estonia', 'Netherlands', 'San Marino', ' Azerbaijan', 'Belarus',
       'Iceland', 'Lithuania', 'New Zealand', 'Nigeria', 'North Ireland',
       'Ireland', 'Luxembourg', 'Monaco', 'Qatar', 'Ecuador',
       'Azerbaijan', 'Czech Republic', 'Armenia', 'Dominican Republic',
       'Indonesia', 'Portugal

In [13]:
# Preview every row whose country/region is Brazil before filtering and aggregating
df.loc[df.countryregion == "Brazil"]

Unnamed: 0,sno,observationdate,provincestate,countryregion,lastupdate,confirmed,deaths,recovered
84,85,2020-01-23,,Brazil,2020-01-23 17:00:00,0.0,0.0,0.0
2525,2526,2020-02-26,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2631,2632,2020-02-27,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2742,2743,2020-02-28,,Brazil,2020-02-26 23:53:02,1.0,0.0,0.0
2852,2853,2020-02-29,,Brazil,2020-02-29 21:03:05,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...
306272,306273,2021-05-29,Roraima,Brazil,2021-05-30 04:20:55,103222.0,1635.0,96188.0
306290,306291,2021-05-29,Santa Catarina,Brazil,2021-05-30 04:20:55,965277.0,15174.0,921496.0
306292,306293,2021-05-29,Sao Paulo,Brazil,2021-05-30 04:20:55,3254893.0,111123.0,2895697.0
306298,306299,2021-05-29,Sergipe,Brazil,2021-05-30 04:20:55,233932.0,5054.0,208146.0


In [56]:
# Build the Brazil time series:
#   1. keep only Brazilian rows that already report at least one confirmed case;
#   2. sum the rows that share an observation date into one row per date.
# The columns are selected with a *list* (double brackets). Indexing a groupby
# with bare multiple keys is implicitly a tuple, which pandas deprecated and
# which raises an error from pandas 2.0 onwards.
brazil_df = (
    df.loc[(df.countryregion == "Brazil") & (df.confirmed > 0)]
    .groupby(by=["observationdate"], as_index=False)[["confirmed", "deaths", "recovered"]]
    .sum()
)
brazil_df.head(10)


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,observationdate,confirmed,deaths,recovered
0,2020-02-26,1.0,0.0,0.0
1,2020-02-27,1.0,0.0,0.0
2,2020-02-28,1.0,0.0,0.0
3,2020-02-29,2.0,0.0,0.0
4,2020-03-01,2.0,0.0,0.0
5,2020-03-02,2.0,0.0,0.0
6,2020-03-03,2.0,0.0,0.0
7,2020-03-04,4.0,0.0,0.0
8,2020-03-05,4.0,0.0,0.0
9,2020-03-06,13.0,0.0,0.0


# Confirmed cases

In [55]:
# Line chart of Brazil's confirmed cases by observation date
px.line(
    brazil_df,
    x="observationdate",
    y="confirmed",
    title="Confirmed cases in Brazil",
)