# Libraries

In [1]:
# to get web contents
import requests 
# scrap and clean web contents
from bs4 import BeautifulSoup

# numerical opeations
import numpy as np
# storing and processing in a dataframe
import pandas as pd

# Data

In [2]:
# read data
df = pd.read_csv('sars_2003_complete_dataset_raw.csv', 
                 parse_dates=['Date'])
# first few rows
df.head()

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered
0,2003-03-17,Germany,1,0,0
1,2003-03-17,Canada,8,2,0
2,2003-03-17,Singapore,20,0,0
3,2003-03-17,Hong Kong Special Administrative Region of Ch...,95,1**,0
4,2003-03-17,Switzerland,2,0,0


In [3]:
# info of the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2539 entries, 0 to 2538
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Date                          2539 non-null   datetime64[ns]
 1   Country                       2539 non-null   object        
 2   Cumulative number of case(s)  2539 non-null   object        
 3   Number of deaths              2539 non-null   object        
 4   Number recovered              2539 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 99.3+ KB


# Preprocessing

In [4]:
# extract number and fill na with 0
# =================================

# columns that need processing
num_cols = ['Cumulative number of case(s)', 'Number of deaths', 
            'Number recovered']

# loop through column
for col in num_cols:
    # extract number from string
    df[col] = df[col].str.extract('(\d+)', expand=False)
    # fill na with 0
    df[col] = df[col].fillna('0').astype(int)

In [5]:
# strip extra white space
df['Country'] = df['Country'].str.strip()
df['Country'].value_counts()

Germany                                            96
Singapore                                          96
Thailand                                           96
United Kingdom                                     95
Viet Nam                                           95
                                                   ..
China, Hong Kong Special Administrative Region5     1
China \n    +                                       1
Viet Nam 5                                          1
Total                                               1
China, Guangdong Province+                          1
Name: Country, Length: 62, dtype: int64

In [6]:
# all the rows that contains 'China'
df[df['Country'].str.contains('China')]

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered
3,2003-03-17,Hong Kong Special Administrative Region of China,95,1,0
9,2003-03-18,China \n +,0,0,0
11,2003-03-18,Hong Kong Special Administrative Region of China,123,1,0
12,2003-03-18,"Taiwan, China",3,0,0
18,2003-03-19,China+,0,0,0
...,...,...,...,...,...
2482,2003-07-10,"China, Taiwan",671,84,506
2510,2003-07-11,China^5,5327,348,4941
2511,2003-07-11,"China, Hong Kong Special Administrative Region^6",1755,298,1433
2512,2003-07-11,"China, Macao Special Administrative Region",1,0,1


In [7]:
# value_counts of the country of rows that contains 'China'
df[df['Country'].str.contains('China')]['Country'].value_counts()

China, Taiwan                                            88
China                                                    68
China, Macao Special Administrative Region               58
China, Hong Kong Special Administrative Region^4         32
China, Hong Kong Special Administrative Region^6         14
China, Hong Kong Special Administrative Region           14
China^5                                                  14
China, Hong Kong Special Administrative Region^5         13
Hong Kong Special Administrative Region of China          8
Taiwan, China                                             7
China, Hong Kong Special Administrative Region4           5
China, Hong Kong Special Administrative Region 5          5
China3                                                    4
China +                                                   3
China +                                                   3
China, Hong Kong Special Administrative Region 4          3
China, Guangdong Province+              

In [8]:
# replace strings
df['Country'] = df['Country'].replace(r'China, Hong Kong Special Administrative Region', 'Hong Kong SAR, China', regex=True)
df['Country'] = df['Country'].replace(r'^.*Hong Kong.*$', 'Hong Kong SAR, China', regex=True)
df['Country'] = df['Country'].replace(r'^.*Macao.*$', 'Macao SAR, China', regex=True)
df['Country'] = df['Country'].replace(r'^.*Taiwan.*$', 'Taiwan, China', regex=True)
df['Country'] = df['Country'].replace(r'China\s+', 'China', regex=True)
df['Country'] = df['Country'].replace(r'Viet Nam\s+', 'China', regex=True)
df['Country'] = df['Country'].replace(r'China, Guangdong Province', 'China', regex=True)

df['Country'] = df['Country'].replace({'\^\d':''}, regex=True)
df['Country'] = df['Country'].replace({'\d':''}, regex=True)
df['Country'] = df['Country'].replace({'\+':''}, regex=True)
df['Country'] = df['Country'].replace({'\n':''}, regex=True)

df = df[df['Country']!='Total']

In [9]:
# sort according to country name
df['Country'].value_counts().sort_index()

Australia               17
Belgium                  5
Brazil                  81
Bulgaria                20
Canada                  96
China                   96
Colombia                53
Finland                 51
France                  90
Germany                 96
Hong Kong SAR, China    96
India                   64
Indonesia               72
Italy                   92
Japan                   19
Kuwait                  75
Macao SAR, China        58
Malaysia                79
Mongolia                69
New Zealand             56
Philippines             72
Poland                  10
Republic of Ireland     92
Republic of Korea       59
Romania                 87
Russian Federation      31
Singapore               96
Slovenia                 5
South Africa            74
Spain                   86
Sweden                  72
Switzerland             94
Taiwan, China           95
Thailand                96
United Kingdom          95
United States           94
Viet Nam                95
N

In [10]:
# sort accoring to values
df['Country'].value_counts().sort_values()

Belgium                  5
Slovenia                 5
Poland                  10
Australia               17
Japan                   19
Bulgaria                20
Russian Federation      31
Finland                 51
Colombia                53
New Zealand             56
Macao SAR, China        58
Republic of Korea       59
India                   64
Mongolia                69
Philippines             72
Sweden                  72
Indonesia               72
South Africa            74
Kuwait                  75
Malaysia                79
Brazil                  81
Spain                   86
Romania                 87
France                  90
Republic of Ireland     92
Italy                   92
United States           94
Switzerland             94
Viet Nam                95
Taiwan, China           95
United Kingdom          95
Germany                 96
Thailand                96
China                   96
Hong Kong SAR, China    96
Canada                  96
Singapore               96
N

In [11]:
# rows that contains commulative count
df[df['Country']=='Total']

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered


In [12]:
# first few row of the final data
df.head()

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered
0,2003-03-17,Germany,1,0,0
1,2003-03-17,Canada,8,2,0
2,2003-03-17,Singapore,20,0,0
3,2003-03-17,"Hong Kong SAR, China",95,1,0
4,2003-03-17,Switzerland,2,0,0


# Save data

In [13]:
# save data in csv file
df.to_csv('sars_2003_complete_dataset_clean.csv', index=False)