## METHOD 1 : Using Beautiful Soup

In [17]:
# importing the libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [18]:
#Lets define the URL

url="https://www.worldometers.info/coronavirus/"

# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html

In [19]:
#we pick the id of the table we want to scrape and extract HTML code for that particular table only
covid_table = soup.find("table", attrs={"id": "main_table_countries_today"})

In [20]:
#the head will form our columns
head = covid_table.thead.find_all("tr") 
head #the headers are contained in this HTML code

[<tr>
 <th width="1%">#</th>
 <th width="100">Country,<br/>Other</th>
 <th width="20">Total<br/>Cases</th>
 <th width="30">New<br/>Cases</th>
 <th width="30">Total<br/>Deaths</th>
 <th width="30">New<br/>Deaths</th>
 <th width="30">Total<br/>Recovered</th>
 <th width="30">New<br/>Recovered</th>
 <th width="30">Active<br/>Cases</th>
 <th width="30">Serious,<br/>Critical</th>
 <th width="30">Tot Cases/<br/>1M pop</th>
 <th width="30">Deaths/<br/>1M pop</th>
 <th width="30">Total<br/>Tests</th>
 <th width="30">Tests/<br/>
 <nobr>1M pop</nobr>
 </th>
 <th width="30">Population</th>
 <th style="display:none" width="30">Continent</th>
 <th width="30">1 Case<br/>every X ppl</th><th width="30">1 Death<br/>every X ppl</th><th width="30">1 Test<br/>every X ppl</th>
 <th width="30">New Cases/1M pop</th>
 <th width="30">New Deaths/1M pop</th>
 <th width="30">Active Cases/1M pop</th>
 </tr>]

In [21]:
headings = []
for th in head[0].find_all("th"):
    # remove any newlines and extra spaces from left and right
    print(th.text)
    #headings.append(td.b.text.replace('\n', ' ').strip())
    headings.append(th.text.replace("\n","").strip())
print(headings)

#
Country,Other
TotalCases
NewCases
TotalDeaths
NewDeaths
TotalRecovered
NewRecovered
ActiveCases
Serious,Critical
Tot Cases/1M pop
Deaths/1M pop
TotalTests
Tests/
1M pop

Population
Continent
1 Caseevery X ppl
1 Deathevery X ppl
1 Testevery X ppl
New Cases/1M pop
New Deaths/1M pop
Active Cases/1M pop
['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases', 'Serious,Critical', 'Tot\xa0Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/1M pop', 'Population', 'Continent', '1 Caseevery X ppl', '1 Deathevery X ppl', '1 Testevery X ppl', 'New Cases/1M pop', 'New Deaths/1M pop', 'Active Cases/1M pop']


In [22]:
body = covid_table.tbody.find_all("tr") 
body[0] #here is one example of HTML snippet for one row

<tr class="total_row_world row_continent" data-continent="North America" style="display: none">
<td></td>
<td style="text-align:left;">
<nobr>North America</nobr>
</td>
<td>40,309,824</td>
<td>+4,915</td>
<td>911,109</td>
<td>+202</td>
<td>33,625,554</td>
<td>+2,681</td>
<td>5,773,161</td>
<td>11,461</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td data-continent="North America" style="display:none;">North America</td>
<td>
</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>

In [23]:
#lets declare empty list data that will hold all rows data
data = []
for r in range(1,len(body)):
    row = [] # empty lsit to hold one row data
    for tr in body[r].find_all("td"):
        row.append(tr.text.replace("\n","").strip())
        #append row data to row after removing newlines escape and triming unnecesary spaces
    data.append(row)
    
# data contains all the rows excluding header
# row contains data for one row

In [24]:
#We can now pass data into a pandas dataframe
#with headings as the columns
df = pd.DataFrame(data,columns=headings)
df.head(10)

Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/1M pop,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
0,,Asia,54242089,21014.0,762222,334.0,51540359,31044.0,1939508,26834,...,,,,Asia,,,,,,
1,,South America,31403277,2118.0,964614,85.0,28428687,2623.0,2009976,31771,...,,,,South America,,,,,,
2,,Europe,47387981,252.0,1091039,5.0,44867729,3678.0,1429213,9027,...,,,,Europe,,,,,,
3,,Africa,5201976,,137032,,4614411,,450533,2973,...,,,,Africa,,,,,,
4,,Oceania,71051,9.0,1259,,67835,,1957,10,...,,,,Australia/Oceania,,,,,,
5,,,721,,15,,706,,0,0,...,,,,,,,,,,
6,,World,178616919,28308.0,3867290,626.0,163145281,40026.0,11604348,82076,...,,,,All,,,,,,
7,1.0,USA,34393269,,616920,,28675929,,5100420,4160,...,498230075.0,1496767.0,332870823.0,North America,10.0,540.0,1.0,,,15323.0
8,2.0,India,29823546,782.0,385167,,28678390,7511.0,759989,8944,...,389207637.0,279392.0,1393049733.0,Asia,47.0,3617.0,4.0,0.6,,546.0
9,3.0,Brazil,17802176,,498621,,16136968,,1166587,8318,...,52714701.0,246315.0,214013244.0,South America,12.0,429.0,4.0,,,5451.0


In [25]:
data = df[df["#"]!=""].reset_index(drop=True)
# Data points with # value are the countries of the world while the data points with
# null values for # columns are features like continents totals etc
data = data.drop_duplicates(subset = ["Country,Other"])
#Reason to drop duplicates : Worldometer reports data for 3 days: today and 2 days back
#I found out that removing duplicates removes the values for the bast two days and keep today's

In [26]:
data.columns

Index(['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases',
       'Serious,Critical', 'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests',
       'Tests/1M pop', 'Population', 'Continent', '1 Caseevery X ppl',
       '1 Deathevery X ppl', '1 Testevery X ppl', 'New Cases/1M pop',
       'New Deaths/1M pop', 'Active Cases/1M pop'],
      dtype='object')

In [27]:
# Columns to keep
cols = ['Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases',
       'Serious,Critical', 'TotalTests',"Continent"]

In [28]:
data_final = data[cols]
data_final.head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Continent
0,USA,34393269,,616920,,28675929,,5100420,4160,498230075,North America
1,India,29823546,782.0,385167,,28678390,7511.0,759989,8944,389207637,Asia
2,Brazil,17802176,,498621,,16136968,,1166587,8318,52714701,South America
3,France,5752872,,110702,,5546870,,95300,1740,90215804,Europe
4,Turkey,5359728,,49071,,5224224,,86433,856,58123990,Asia


In [29]:
#Once you are happy then you can save the dataframe as csv or xlsx
#data_final.to_csv("covid.csv",index=False)

In [32]:
#Here is a list of continents
list(data_final["Continent"].unique())

['North America',
 'Asia',
 'South America',
 'Europe',
 'Africa',
 'Australia/Oceania',
 '']

In [36]:
#Then you can filter by continent
data_final[data_final["Continent"]=="Africa"].head()

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Continent
18,South Africa,1796589,,58441,,1632182,,105966,546,12383955,Africa
42,Morocco,525924,,9233,,512937,,3754,101,6690836,Africa
53,Tunisia,378982,,13874,,330331,,34777,424,1597918,Africa
65,Egypt,276190,,15791,,204701,,55698,90,2869589,Africa
66,Ethiopia,274899,,4276,,253634,,16989,228,2813953,Africa


## METHOD 2 : USING XPath

In [37]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

In [38]:
#Rememeber that rows on HTML are stored between <tr>..</tr> tags
url = "https://www.worldometers.info/coronavirus/"
page = requests.get(url)#Store the contents of the website under doc
doc = lh.fromstring(page.content)#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr') #note that we are only using tr because we have only one table on the site
#if we had many then we could have needed to have the XPath so that we are specific.
#XPath can be accessed by inspecting elements of a site. 
print(len(list(tr_elements))) #number of all rows including the header row

717


In [39]:
#Lets deal with the header row first
col=[]
i=0#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d:"%s"'%(i,name))
    col.append((name,[]))


1:"#"
2:"Country,Other"
3:"TotalCases"
4:"NewCases"
5:"TotalDeaths"
6:"NewDeaths"
7:"TotalRecovered"
8:"NewRecovered"
9:"ActiveCases"
10:"Serious,Critical"
11:"Tot Cases/1M pop"
12:"Deaths/1M pop"
13:"TotalTests"
14:"Tests/
1M pop
"
15:"Population"
16:"Continent"
17:"1 Caseevery X ppl"
18:"1 Deathevery X ppl"
19:"1 Testevery X ppl"
20:"New Cases/1M pop"
21:"New Deaths/1M pop"
22:"Active Cases/1M pop"


In [44]:
# Since out first row is the header, data is stored on the second row onwards
for j in range(1, len(tr_elements)):
    # T is our j'th row
    T = tr_elements[j]

    # If row is not of size 19, the //tr data is not from our table
#     if len(T) != 19:
#         break

    # i is the index of our column
    i = 0

    # Iterate through each element of the row
    for t in T.iterchildren():
        data = t.text_content()
        # Check if row is empty
        if i > 0:
            # Convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
        # Append the data to the empty list of the i'th column
        col[i][1].append(data)
        # Increment i for the next column
        i += 1

#ideally all columns must have the same number of rows
print([len(C) for (title,C) in col]) #for this case we have 695

[716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716, 716]


In [45]:
#From here the same explanation holds like in method 1 above
Dict = {title:column for (title,column) in col}
df = pd.DataFrame(Dict)
data = df[df["#"]!=""].reset_index(drop=True)
data = data.drop_duplicates(subset = ["Country,Other"])

In [46]:
# Columns to keep
cols = ['Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'NewRecovered', 'ActiveCases',
       'Serious,Critical', 'TotalTests',"Continent"]

data_final = data[cols]
#Assume we want to pick only these 5 countries
c = ["South Africa" , "Rwanda","Senegal","Ghana","Cameroon"]
data = data_final[data_final["Country,Other"].isin(c)].reset_index(drop=True)
data.head(10)

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",TotalTests,Continent
0,South Africa,1796589,,58441,,1632182,,105966,546,12383955,Africa
1,Ghana,94913,,793,,92881,,1239,9,1230125,Africa
2,Cameroon,80328,,1313,,78162,,853,152,1718937,Africa
3,Senegal,42259,,1158,,40767,,334,6,545258,Africa
4,Rwanda,30048,,378,,26393,,3277,16,1535396,Africa
