## METHOD 1 : Using Beautiful Soup

In [1]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
#Lets define the URL

url="https://www.worldometers.info/coronavirus/"

# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html

In [3]:
#we pick the id of the table we want to scrape and extract HTML code for that particular table only
covid_table = soup.find("table", attrs={"id": "main_table_countries_today"})

In [1]:
#the head will form our columns
head = covid_table.thead.find_all("tr") 
head #the headers are contained in this HTML code

NameError: name 'covid_table' is not defined

In [5]:
headings = []
for th in head[0].find_all("th"):
    # remove any newlines and extra spaces from left and right
    print(th.text)
    #headings.append(td.b.text.replace('\n', ' ').strip())
    headings.append(th.text.replace("\n","").strip())
print(headings)

#
Country,Other
TotalCases
NewCases
TotalDeaths
NewDeaths
TotalRecovered
NewRecovered
ActiveCases
Serious,Critical
Tot Cases/1M pop
Deaths/1M pop
TotalTests
Tests/
1M pop

Population
Continent
1 Caseevery X ppl
1 Deathevery X ppl
1 Testevery X ppl
['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot\xa0Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/1M pop', 'Population', 'Continent', '1 Caseevery X ppl', '1 Deathevery X ppl', '1 Testevery X ppl']


In [7]:
body = covid_table.tbody.find_all("tr") 
body[0] #here is one example of HTML snippet for one row

<tr class="total_row_world row_continent" data-continent="North America" style="display: none">
<td></td>
<td style="text-align:left;">
<nobr>North America</nobr>
</td>
<td>3,217,018</td>
<td>+38,937</td>
<td>170,062</td>
<td>+1,050</td>
<td>1,400,097</td>
<td>+8,110</td>
<td>1,646,859</td>
<td>18,867</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td data-continent="North America" style="display:none;">North America</td>
<td>
</td>
<td></td>
<td></td>
</tr>

In [8]:
#lets declare empty list data that will hold all rows data
data = []
for r in range(1,len(body)):
    row = [] # empty lsit to hold one row data
    for tr in body[r].find_all("td"):
        row.append(tr.text.replace("\n","").strip())
        #append row data to row after removing newlines escape and triming unnecesary spaces
    data.append(row)
    
# data contains all the rows excluding header
# row contains data for one row

In [11]:
#We can now pass data into a pandas dataframe
#with headings as the columns
df = pd.DataFrame(data,columns=headings)
df.head(10)

Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl
0,1,USA,2758395,30542.0,130446,324.0,1146785.0,3451.0,1481164.0,15801,8333,394,34473682,104149,331002277,North America,120,2537,10
1,2,Brazil,1426913,18428.0,60194,538.0,790040.0,,576679.0,8318,6713,283,3070447,14445,212558178,South America,149,3531,69
2,3,Russia,654405,6556.0,9536,216.0,422931.0,10281.0,221938.0,2300,4484,65,19852167,136035,145934619,Europe,223,15304,7
3,4,India,604808,19016.0,17848,438.0,359891.0,12055.0,227069.0,8944,438,13,8826585,6396,1379974505,Asia,2282,77318,156
4,5,UK,313483,829.0,43906,176.0,,,,238,4618,647,9662051,142327,67886052,Europe,217,1546,7
5,6,Spain,296739,388.0,28363,8.0,,,,617,6347,607,5448984,116544,46754824,Europe,158,1648,9
6,7,Peru,285213,,9677,,174535.0,,101001.0,1185,8651,294,1679386,50937,32969875,South America,116,3407,20
7,8,Chile,282043,2650.0,5753,65.0,245443.0,4214.0,30847.0,2106,14754,301,1109792,58056,19115944,South America,68,3323,17
8,9,Italy,240760,182.0,34788,21.0,190717.0,469.0,15255.0,87,3982,575,5445476,90065,60461520,Europe,251,1738,11
9,10,Iran,230211,2549.0,10958,141.0,191487.0,2729.0,27766.0,3081,2741,130,1693242,20160,83988944,Asia,365,7665,50


In [10]:
data = df[df["#"]!=""].reset_index(drop=True)
# Data points with # value are the countries of the world while the data points with
# null values for # columns are features like continents totals etc
data = data.drop_duplicates(subset = ["Country,Other"])
#Reason to drop duplicates : Worldometer reports data for 3 days: today and 2 days back
#I found out that removing duplicates removes the values for the bast two days and keep today's

In [237]:
#We can drop the following columns - Opinion
cols = ['#',
 'Tot\xa0Cases/1M pop',
 'Deaths/1M pop',
 'Tests/1M pop',
 'Population',
 '1 Caseevery X ppl',
 '1 Deathevery X ppl',
 '1 Testevery X ppl']

In [238]:
data_final = data.drop(cols,axis=1)
data_final.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Continent
0,USA,2711956,30145,129977,556,1126736.0,9559.0,1455243.0,15930,33582461,North America
1,Brazil,1383678,13190,58927,542,757462.0,,567289.0,8318,3070447,South America
2,Russia,647849,6693,9320,154,412650.0,9220.0,225879.0,2300,19562440,Europe
3,India,585792,18256,17410,506,347836.0,12565.0,220546.0,8944,8608654,Asia
4,UK,312654,689,43730,155,,,,276,9426631,Europe


In [243]:
#Once you are happy then you can save the dataframe as csv or xlsx
#data_final.to_csv("covid.csv",index=False)

In [242]:
#Here is a list of continents
list(data_final["Continent"].unique())

['North America',
 'South America',
 'Europe',
 'Asia',
 'Africa',
 'Australia/Oceania',
 '']

In [244]:
#Then you can filter by continent
data_final[data_final["Continent"]=="Africa"];

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Continent
16,South Africa,151209,6945.0,2657.0,128.0,73543,2929.0,75009,539.0,1630008.0,Africa
22,Egypt,68311,1557.0,2953.0,81.0,18460,509.0,46898,41.0,135000.0,Africa
48,Nigeria,25133,,573.0,,9402,,15158,7.0,132304.0,Africa
53,Ghana,17741,390.0,112.0,,13268,274.0,4361,6.0,294867.0,Africa
58,Algeria,13907,336.0,912.0,7.0,9897,223.0,3098,42.0,,Africa
62,Cameroon,12592,,313.0,,10100,,2179,52.0,,Africa
63,Morocco,12533,243.0,228.0,3.0,8920,87.0,3385,7.0,681191.0,Africa
65,Ivory Coast,9499,285.0,68.0,2.0,4273,277.0,5158,,55950.0,Africa
66,Sudan,9257,,572.0,,4014,,4671,,401.0,Africa
72,DRC,7039,100.0,170.0,3.0,1426,376.0,5443,,,Africa


## METHOD 2 : USING XPath

In [21]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

In [23]:
#Rememeber that rows on HTML are stored between <tr>..</tr> tags
url = "https://www.worldometers.info/coronavirus/"
page = requests.get(url)#Store the contents of the website under doc
doc = lh.fromstring(page.content)#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr') #note that we are only using tr because we have only one table on the site
#if we had many then we could have needed to have the XPath so that we are specific.
#XPath can be accessed by inspecting elements of a site. 
print(len(list(tr_elements))) #number of all rows including the header row

696


In [24]:
#Lets deal with the header row first
col=[]
i=0#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d:"%s"'%(i,name))
    col.append((name,[]))


1:"#"
2:"Country,Other"
3:"TotalCases"
4:"NewCases"
5:"TotalDeaths"
6:"NewDeaths"
7:"TotalRecovered"
8:"NewRecovered"
9:"ActiveCases"
10:"Serious,Critical"
11:"Tot Cases/1M pop"
12:"Deaths/1M pop"
13:"TotalTests"
14:"Tests/
1M pop
"
15:"Population"
16:"Continent"
17:"1 Caseevery X ppl"
18:"1 Deathevery X ppl"
19:"1 Testevery X ppl"


In [25]:
# Since out first row is the header, data is stored on the second row onwards
for j in range(1, len(tr_elements)):
    # T is our j'th row
    T = tr_elements[j]

    # If row is not of size 10, the //tr data is not from our table
    if len(T) != 19:
        break

    # i is the index of our column
    i = 0

    # Iterate through each element of the row
    for t in T.iterchildren():
        data = t.text_content()
        # Check if row is empty
        if i > 0:
            # Convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
        # Append the data to the empty list of the i'th column
        col[i][1].append(data)
        # Increment i for the next column
        i += 1

#ideally all columns must have the same number of rows
print([len(C) for (title,C) in col]) #for this case we have 695

[695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695, 695]


In [26]:
#From here the same explanation holds like in method 1 above
Dict = {title:column for (title,column) in col}
df = pd.DataFrame(Dict)
data = df[df["#"]!=""].reset_index(drop=True)
data = data.drop_duplicates(subset = ["Country,Other"])

In [27]:
cols = ['#',
 'Tot\xa0Cases/1M pop',
 'Deaths/1M pop',
 'Tests/\n1M pop\n',
 'Population',
 'Continent',
 '1 Caseevery X ppl',
 '1 Deathevery X ppl',
 '1 Testevery X ppl']

data_final = data.drop(cols,axis=1)
#Assume we want to pick only these 5 countries
c = ["South Africa" , "Rwanda","Senegal","Ghana","Cameroon"]
data = data_final[data_final["Country,Other"].isin(c)].reset_index(drop=True)
data.head(10)

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests
0,South Africa,159333,8124.0,2749,92.0,76025,2482.0,80559,539.0,1666939.0
1,Ghana,17741,,112,,13268,,4361,6.0,297591.0
2,Cameroon,12592,,313,,10100,,2179,52.0,
3,Senegal,6925,132.0,116,4.0,4545,114.0,2264,35.0,80619.0
4,Rwanda,1025,,2,,447,,576,,143943.0
