# Source

https://www.who.int/csr/sars/country/en/

# Libraries

In [1]:
# to get web contents
import requests 
# scrape and clean web contents
from bs4 import BeautifulSoup

# numerical operations
import numpy as np
# storing and processing in a dataframe
import pandas as pd

# Extracting the link and date of each day's report

In [2]:
# get data from main page
# ======================

# url of the main page
who_url = "https://www.who.int/csr/sars/country/en/"
# get contents of the page; without a timeout a stalled connection would
# block the kernel indefinitely
who_req = requests.get(who_url, timeout=30)
# stop here on an HTTP error instead of parsing an error page
who_req.raise_for_status()
# convert the contents into soup object using html parser
who_soup = BeautifulSoup(who_req.content, "html.parser")
# find all 'div' elements with class 'col_2-1_1'
# (find_all is the current spelling of the deprecated findAll alias)
who_data = who_soup.find_all('div', attrs={'class': 'col_2-1_1'})
# no. of who_data
print(len(who_data))

1


In [3]:
# collect every anchor inside the report listing
# ==============================================

# take the first 'div' matched on the main page
report_listing = who_data[0]
# every <a> element it contains
links = report_listing.find_all('a')
# no. of links
print(len(links))

99


In [4]:
# get each day's link and date
# ============================

# position in `links` of the anchor whose page has no table to scrape
no_table_index = 96

# container for situation reports links
situation_reports_links = []
# container for situation reports dates
situation_reports_dates = []

# enumerate through links and get and store address and date
for ind, a in enumerate(links):
    # no tables at that index
    if ind == no_table_index:
        continue

    # get address (hrefs on the page are site-relative)
    address = 'https://www.who.int' + a.get('href')
    # append to situation_reports_links
    situation_reports_links.append(address)

    # get date: the anchor's first child is its text. str() converts the
    # NavigableString into a plain string so the list (and the dataframes
    # that later store these dates) does not keep a reference to the whole
    # parsed page
    date = str(a.contents[0])
    # append to situation_reports_dates
    situation_reports_dates.append(date)

# no. of items in the lists
print(len(situation_reports_links))
print(len(situation_reports_dates))

98
98


# Scrape tables

In [5]:
# scraping html tables to dataframes

# column names used for reports whose table header spans two rows; the two
# header rows are skipped and these names are applied instead
six_column_names = ['Country', 'Cumulative number of case(s)',
                    'Number of new cases since last WHO update',
                    'Number of deaths', 'Number recovered',
                    'Local chain(s) of transmission']
seven_column_names = six_column_names + ['Date of last report']

# seconds to wait on a stalled connection before giving up
sr_timeout = 30

# container for situation report
sr_df = []

# one session reuses the connection across all report requests
with requests.Session() as sr_session:

    # go through each day's report link together with its date
    for ind, (sr_link, sr_date) in enumerate(zip(situation_reports_links,
                                                 situation_reports_dates)):

        # print index
        print('Index :', '\t', ind)
        # print situation report link
        print('Link :', '\t', sr_link)
        # print situation report date
        print('Date :', '\t', sr_date)

        # getting html contents; fail on an HTTP error rather than parsing
        # an error page
        sr_req = sr_session.get(sr_link, timeout=sr_timeout)
        sr_req.raise_for_status()
        # create a soup object
        sr_soup = BeautifulSoup(sr_req.content, "html.parser")

        # find the first table
        sr_table = sr_soup.find('table')
        if sr_table is None:
            raise ValueError('no <table> found at ' + sr_link)
        # find all the rows in the table
        sr_rows = sr_table.find_all('tr')

        # storing table as list of rows
        # =============================

        # one list of cell texts per table row
        # NOTE(review): cell text is kept verbatim, so values can carry
        # leading newlines/spaces (e.g. '\n 97') - confirm where they are
        # cleaned downstream
        row_list = [[cell.text for cell in tr.find_all('td')] for tr in sr_rows]

        # saving as a dataframes
        # ======================

        # different dates data has different table format; in every format
        # the last table row is left out

        if ind <= 20:
            # header is the first row of the table
            df_bs = pd.DataFrame(row_list[1:-1], columns=row_list[0])

        elif ind <= 26:
            # two header rows replaced by the six fixed names
            df_bs = pd.DataFrame(row_list[2:-1], columns=six_column_names)

        elif ind <= 95:
            # two header rows replaced by the seven fixed names
            df_bs = pd.DataFrame(row_list[2:-1], columns=seven_column_names)

        else:
            # header is the second row of the table
            df_bs = pd.DataFrame(row_list[2:-1], columns=row_list[1])

        # append to situation report container
        sr_df.append(df_bs)

        # no. of columns
        print('No. of columns in the dataframe :', '\t', len(df_bs.columns))
        # list of columns
        print('Columns :', '\t', df_bs.columns)

        # create a horizontal row
        print('\n' + '='*80 + '\n')

print(len(sr_df))

Index : 	 0
Link : 	 https://www.who.int/csr/sars/country/table/en/
Date : 	 17 March 2003
No. of columns in the dataframe : 	 4
Columns : 	 Index([' Country', ' Total number of case(s)', ' Number of deaths ',
       ' Local transmission'],
      dtype='object')


Index : 	 1
Link : 	 https://www.who.int/csr/sars/country/tablemarch18/en/
Date : 	 18 March 2003
No. of columns in the dataframe : 	 4
Columns : 	 Index(['Country', 'Cumulative number of case(s) §', 'Number of deaths',
       'Local transmission'],
      dtype='object')


Index : 	 2
Link : 	 https://www.who.int/csr/sars/country/2003_19_03/en/
Date : 	 19 March 2003
No. of columns in the dataframe : 	 4
Columns : 	 Index(['Country', 'Cumulative number of case(s) §', 'Number of deaths',
       'Local transmission'],
      dtype='object')


Index : 	 3
Link : 	 https://www.who.int/csr/sars/country/2003_03_20/en/
Date : 	 20 March 2003
No. of columns in the dataframe : 	 4
Columns : 	 Index(['Country ', 'Cumulative number of ca

No. of columns in the dataframe : 	 6
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission'],
      dtype='object')


Index : 	 24
Link : 	 https://www.who.int/csr/sars/country/2003_04_14/en/
Date : 	 14 April 2003
No. of columns in the dataframe : 	 6
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission'],
      dtype='object')


Index : 	 25
Link : 	 https://www.who.int/csr/sars/country/2003_04_15/en/
Date : 	 15 April 2003
No. of columns in the dataframe : 	 6
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission'],
      dtype='object')


Index : 	 26
Link : 	 https://ww

No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 42
Link : 	 https://www.who.int/csr/sars/country/2003_05_05/en/
Date : 	 5 May 2003
No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 43
Link : 	 https://www.who.int/csr/sars/country/2003_05_06/en/
Date : 	 6 May 2003
No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
   

No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 60
Link : 	 https://www.who.int/csr/sars/country/2003_05_26/en/
Date : 	 26 May 2003
No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 61
Link : 	 https://www.who.int/csr/sars/country/2003_05_27/en/
Date : 	 27 May 2003
No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
 

No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 78
Link : 	 https://www.who.int/csr/sars/country/2003_06_18/en/
Date : 	 18 June 2003
No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 79
Link : 	 https://www.who.int/csr/sars/country/2003_06_19/en/
Date : 	 19 June 2003
No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',

No. of columns in the dataframe : 	 7
Columns : 	 Index(['Country', 'Cumulative number of case(s)',
       'Number of new cases since last WHO update', 'Number of deaths',
       'Number recovered', 'Local chain(s) of transmission',
       'Date of last report'],
      dtype='object')


Index : 	 96
Link : 	 https://www.who.int/csr/sars/country/table2003_09_23/en/
Date : 	 26 September 2003
No. of columns in the dataframe : 	 15
Columns : 	 Index(['Areas', 'Female', 'Male', 'Total', 'Median age (range)',
       'Number of deaths^a ', 'Case fatality ratio  (%) ',
       'Number of imported cases (%)', 'Number of HCW affected (%)',
       'Date onset first probable case', 'Date onset last probable case', '',
       '', '', ''],
      dtype='object')


Index : 	 97
Link : 	 https://www.who.int/csr/sars/country/table2004_04_21/en/
Date : 	 21 April 2004
No. of columns in the dataframe : 	 11
Columns : 	 Index(['Areas', 'Female', 'Male', 'Total', 'Median age (range)',
       'Number of deat

In [6]:
# preview the first five rows of the first scraped table
sr_df[0].head(n=5)

Unnamed: 0,Country,Total number of case(s),Number of deaths,Local transmission
0,Germany,1,0,None*
1,Canada,8,2,Yes
2,Singapore,20,0,Yes
3,Hong Kong Special Administrative Region of Ch...,95,1**,Yes
4,Switzerland,2,0,To be determined


### Data from March 17th - April 9th

In [7]:
# the first 21 reports use the four-column table layout
mar_17_to_apr_9_dates = situation_reports_dates[:21]

cols = ['Country', 'Cumulative number of case(s)', 'Number of deaths', 'Local chain(s) of transmission']

# Each report is copied before it is modified. Copying only the list
# (`sr_df.copy()`) would leave the same DataFrame objects inside it, so the
# renaming below would alter the frames held in `sr_df`, and running this
# cell a second time would fail because those frames would then have six
# columns instead of four.
mar_17_to_apr_9_dfs = []
for df, date in zip(sr_df[:21], mar_17_to_apr_9_dates):
    df = df.copy()
    df.columns = cols
    df['Date'] = date
    # these reports have no recovered count
    df['Number recovered'] = np.nan
    mar_17_to_apr_9_dfs.append(df)

mar_17_to_apr_9 = pd.concat(mar_17_to_apr_9_dfs)

# every row of every report must survive the concatenation
assert len(mar_17_to_apr_9) == sum(len(i) for i in mar_17_to_apr_9_dfs)

mar_17_to_apr_9.head()

Unnamed: 0,Country,Cumulative number of case(s),Number of deaths,Local chain(s) of transmission,Date,Number recovered
0,Germany,1,0,None*,17 March 2003,
1,Canada,8,2,Yes,17 March 2003,
2,Singapore,20,0,Yes,17 March 2003,
3,Hong Kong Special Administrative Region of Ch...,95,1**,Yes,17 March 2003,
4,Switzerland,2,0,To be determined,17 March 2003,


### Data from April 10th - April 16th

In [8]:
# reports 21-26 use the six-column table layout
apr_10_to_apr_16_dates = situation_reports_dates[21:27]

cols = ['Country', 'Cumulative number of case(s)',
        'Number of new cases since last WHO update',
        'Number of deaths', 'Number recovered',
        'Local chain(s) of transmission']

# Each report is copied before it is modified. Copying only the list
# (`sr_df.copy()`) would leave the same DataFrame objects inside it, so the
# changes below would alter the frames held in `sr_df`, and running this
# cell a second time would fail because those frames would then have seven
# columns instead of six.
apr_10_to_apr_16_dfs = []
for df, date in zip(sr_df[21:27], apr_10_to_apr_16_dates):
    df = df.copy()
    df.columns = cols
    df['Date'] = date
    apr_10_to_apr_16_dfs.append(df)

apr_10_to_apr_16 = pd.concat(apr_10_to_apr_16_dfs)

# every row of every report must survive the concatenation
assert len(apr_10_to_apr_16) == sum(len(i) for i in apr_10_to_apr_16_dfs)

apr_10_to_apr_16.head()

Unnamed: 0,Country,Cumulative number of case(s),Number of new cases since last WHO update,Number of deaths,Number recovered,Local chain(s) of transmission,Date
0,Brazil,\n 2,1,0,0,\nNone,10 April 2003
1,Canada,\n 97,3,10,22,Yes,10 April 2003
2,China,\n 1290,10,55,1025,Yes,10 April 2003
3,"China, Hong Kong Special Administrative Region...",998,28,30,154,Yes,10 April 2003
4,"China, Taiwan",19,0,0,5,Yes,10 April 2003


### Data from April 17th - July 11th

In [9]:
# reports 27-95 use the seven-column table layout
apr_17_to_july_11_dates = situation_reports_dates[27:96]

cols = ['Country', 'Cumulative number of case(s)',
        'Number of new cases since last WHO update',
        'Number of deaths', 'Number recovered',
        'Local chain(s) of transmission', 'Date of last report']

# Each report is copied before it is modified. Copying only the list
# (`sr_df.copy()`) would leave the same DataFrame objects inside it, so the
# changes below would alter the frames held in `sr_df`, and running this
# cell a second time would fail because those frames would then have eight
# columns instead of seven.
apr_17_to_july_11_dfs = []
for df, date in zip(sr_df[27:96], apr_17_to_july_11_dates):
    df = df.copy()
    df.columns = cols
    df['Date'] = date
    apr_17_to_july_11_dfs.append(df)

apr_17_to_july_11 = pd.concat(apr_17_to_july_11_dfs)

# every row of every report must survive the concatenation
assert len(apr_17_to_july_11) == sum(len(i) for i in apr_17_to_july_11_dfs)

apr_17_to_july_11.head()

Unnamed: 0,Country,Cumulative number of case(s),Number of new cases since last WHO update,Number of deaths,Number recovered,Local chain(s) of transmission,Date of last report,Date
0,Australia,3,3,0,3,,17 Apr 2003,17 April 2003
1,Brazil,2,0,0,0,,16 Apr 2003,17 April 2003
2,Canada,126,23,12,46,Yes,17 Apr 2003,17 April 2003
3,China,1457,25,65,1107,Yes,17 Apr 2003,17 April 2003
4,"China, Hong Kong Special Administrative Region 5",1297,29,65,272,Yes,17 Apr 2003,17 April 2003


# Save data

In [10]:
# columns kept from every period for the final dataset
required_cols = ['Date', 'Country', 'Cumulative number of case(s)',
                 'Number of deaths', 'Number recovered']

# keep only the required columns of each period
df1, df2, df3 = (
    period[required_cols]
    for period in (mar_17_to_apr_9, apr_10_to_apr_16, apr_17_to_july_11)
)

# concatenate dataframes
final_df = pd.concat([df1, df2, df3])

# save dataframe as csv
final_df.to_csv('sars_2003_complete_dataset.csv', index=False)