#### In this exercise, we want to extract the countries and sites listed and their categories from this website: https://whc.unesco.org/en/list/. However, this data is not in presented in a table format hence we do some webscraping. This is meant purely for a practice because there is much more data/ statistics put together by the organisation relating to the heritage sites and can be downloaded in XML, KML or XLS formats on the right hand side of the page.

In [1]:
from urllib.request import Request, urlopen
import requests
from bs4 import BeautifulSoup
import csv

In [2]:
import pandas as pd
import numpy as np

In [3]:
import time

In [4]:
site = "https://whc.unesco.org/en/list/"
hdr = {'User-Agent': 'Mozilla/5.0'}
bookpage = requests.get(site)
soup = BeautifulSoup(bookpage.text, "html.parser")
print(soup.prettify())

<!DOCTYPE html>
<!-- 
Building Peace in the minds of men and women
Construire la paix dans l'esprit des hommes et des femmes
Construir la paz en la mente de los hombres y de las mujeres
Нести мир в сознание мужчин и женщин
بناء السلام في عقول الرجال والنساء
于人之思想中构建和平
-->
<html id="htmlEl" lang="en" style="height:100%;">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=10, user-scalable=yes" name="viewport"/>
  <meta content="UNESCO World Heritage Centre" name="author"/>
  <meta content="UNESCO World Heritage Centre" name="og:site_name"/>
  <link href="https://whc.unesco.org/apple-touch-icon.png?v=wAXNOnQoNn" rel="apple-touch-icon" sizes="180x180"/>
  <link href="https://whc.unesco.org/favicon-32x32.png?v=wAXNOnQoNn" rel="icon" sizes="32x32" type="image/png"/>
  <link href="https://whc.unesco.org/favicon-16x16.png?v=wAXNOnQoNn

In [5]:
soup.find_all('h4')

[<h4 id="alphaA"><a href="/en/statesparties/af">Afghanistan</a></h4>,
 <h4><a href="/en/statesparties/al">Albania</a></h4>,
 <h4><a href="/en/statesparties/dz">Algeria</a></h4>,
 <h4><a href="/en/statesparties/ad">Andorra</a></h4>,
 <h4><a href="/en/statesparties/ao">Angola</a></h4>,
 <h4><a href="/en/statesparties/ag">Antigua and Barbuda</a></h4>,
 <h4><a href="/en/statesparties/ar">Argentina</a></h4>,
 <h4><a href="/en/statesparties/am">Armenia</a></h4>,
 <h4><a href="/en/statesparties/au">Australia</a></h4>,
 <h4><a href="/en/statesparties/at">Austria</a></h4>,
 <h4><a href="/en/statesparties/az">Azerbaijan</a></h4>,
 <h4 id="alphaB"><a href="/en/statesparties/bh">Bahrain</a></h4>,
 <h4><a href="/en/statesparties/bd">Bangladesh</a></h4>,
 <h4><a href="/en/statesparties/bb">Barbados</a></h4>,
 <h4><a href="/en/statesparties/by">Belarus</a></h4>,
 <h4><a href="/en/statesparties/be">Belgium</a></h4>,
 <h4><a href="/en/statesparties/bz">Belize</a></h4>,
 <h4><a href="/en/statesparties/b

In [6]:
#getting list of all countries
country = soup.find_all('h4')

In [7]:
country[0].text.strip()

'Afghanistan'

In [8]:
#note that the last two elements in the list are not countries but text (so in total 168 countries)
len(country)

170

In [77]:
#list of sites for each country embedded within div head
soup.find_all('div', class_="list_site")

[<div class="list_site"> <ul> <li class="cultural_danger"> <a href="/en/list/211">Minaret and Archaeological Remains of Jam</a> </li> <li class="cultural_danger"> <a href="/en/list/208">Cultural Landscape and Archaeological Remains of the Bamiyan Valley</a> </li> </ul> </div>,
 <div class="list_site"> <ul> <li class="mixed"> <a href="/en/list/99">Natural and Cultural Heritage of the Ohrid region</a> <a href="#transboundary" title="*: transboundary property">*</a> <sup><a href="#note1" title="Note">1</a></sup> </li> <li class="cultural"> <a href="/en/list/570">Butrint</a> </li> <li class="cultural"> <a href="/en/list/569">Historic Centres of Berat and Gjirokastra </a> </li> <li class="natural"> <a href="/en/list/1133">Ancient and Primeval Beech Forests of the Carpathians and Other Regions of Europe</a> <a href="#transboundary" title="*: transboundary property">*</a> </li> </ul> </div>,
 <div class="list_site"> <ul> <li class="cultural"> <a href="/en/list/102">Al Qal'a of Beni Hammad</a>

In [78]:
len(soup.find_all('div', class_="list_site"))

168

In [9]:
#first country
#note that you can use two find.all 
soup.find_all('div', class_="list_site")[0].find_all('li')

[<li class="cultural_danger"> <a href="/en/list/211">Minaret and Archaeological Remains of Jam</a> </li>,
 <li class="cultural_danger"> <a href="/en/list/208">Cultural Landscape and Archaeological Remains of the Bamiyan Valley</a> </li>]

In [10]:
#site count for first country
len(soup.find_all('div', class_="list_site")[0].find_all('li'))

2

In [11]:
#second country
soup.find_all('div', class_="list_site")[1].find_all('li')

[<li class="mixed"> <a href="/en/list/99">Natural and Cultural Heritage of the Ohrid region</a> <a href="#transboundary" title="*: transboundary property">*</a> <sup><a href="#note1" title="Note">1</a></sup> </li>,
 <li class="cultural"> <a href="/en/list/570">Butrint</a> </li>,
 <li class="cultural"> <a href="/en/list/569">Historic Centres of Berat and Gjirokastra </a> </li>,
 <li class="natural"> <a href="/en/list/1133">Ancient and Primeval Beech Forests of the Carpathians and Other Regions of Europe</a> <a href="#transboundary" title="*: transboundary property">*</a> </li>]

In [12]:
sites_all = soup.find_all('div', class_="list_site")
sites_all[0]

<div class="list_site"> <ul> <li class="cultural_danger"> <a href="/en/list/211">Minaret and Archaeological Remains of Jam</a> </li> <li class="cultural_danger"> <a href="/en/list/208">Cultural Landscape and Archaeological Remains of the Bamiyan Valley</a> </li> </ul> </div>

In [13]:
#getting the text for first site listed
sites_all[0].find_all('li')[0].text.strip()

'Minaret and Archaeological Remains of Jam'

In [80]:
#putting everything together
country_db = []
country = soup.find_all('h4')
sites_all = soup.find_all('div', class_="list_site")
for i in range(0,len(sites_all)):
    name = country[i].text.strip()
    sites_count = len(sites_all[i].find_all('li'))
    for j in range(0,sites_count):
        site = sites_all[i].find_all('li')[j].text.strip()
        site_type = sites_all[i].find_all('li')[j]['class'][0]
        country_db.append((name,site,site_type))
    time.sleep(1)
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167


In [72]:
country_db

[('Afghanistan',
  'Minaret and Archaeological Remains of Jam',
  'cultural_danger'),
 ('Afghanistan',
  'Cultural Landscape and Archaeological Remains of the Bamiyan Valley',
  'cultural_danger'),
 ('Albania', 'Natural and Cultural Heritage of the Ohrid region * 1', 'mixed'),
 ('Albania', 'Historic Centres of Berat and Gjirokastra', 'cultural'),
 ('Albania',
  'Ancient and Primeval Beech Forests of the Carpathians and Other Regions of Europe *',
  'natural'),
 ('Algeria', "Al Qal'a of Beni Hammad", 'cultural'),
 ('Algeria', 'Djémila', 'cultural'),
 ('Algeria', "M'Zab Valley", 'cultural'),
 ('Algeria', "Tassili n'Ajjer #", 'mixed'),
 ('Algeria', 'Timgad', 'cultural'),
 ('Algeria', 'Tipasa', 'cultural'),
 ('Algeria', 'Kasbah of Algiers', 'cultural'),
 ('Andorra', 'Madriu-Perafita-Claror Valley', 'cultural'),
 ('Angola',
  'Mbanza Kongo, Vestiges of the Capital of the former Kingdom of Kongo',
  'cultural'),
 ('Antigua and Barbuda',
  'Antigua Naval Dockyard and Related Archaeological Si

In [81]:
df = pd.DataFrame(np.array(country_db))

In [82]:
df.columns = ['country','site','type']

In [88]:
df.head(20)

Unnamed: 0,country,site,type
0,Afghanistan,Minaret and Archaeological Remains of Jam,cultural_danger
1,Afghanistan,Cultural Landscape and Archaeological Remains ...,cultural_danger
2,Albania,Natural and Cultural Heritage of the Ohrid reg...,mixed
3,Albania,Butrint,cultural
4,Albania,Historic Centres of Berat and Gjirokastra,cultural
5,Albania,Ancient and Primeval Beech Forests of the Carp...,natural
6,Algeria,Al Qal'a of Beni Hammad,cultural
7,Algeria,Djémila,cultural
8,Algeria,M'Zab Valley,cultural
9,Algeria,Tassili n'Ajjer #,mixed


In [84]:
df.shape

(1201, 3)

In [85]:
df['type'].value_counts()

cultural           879
natural            226
mixed               42
cultural_danger     36
natural_danger      18
Name: type, dtype: int64

In [91]:
df.to_csv("unesco_sites.csv", encoding = 'utf_32')

##### An alternative way of getting the list of sites is through parsing the 'li' header but this doesn't allow us to know the corresponding countries.

In [8]:
soup.find_all('li')

[<li>
 <a href="/en/35/" id="menu35-link">News &amp; Events<span class="menu-down" id="menu35-link-button"></span></a>
 </li>, <li class="active">
 <a href="/en/list/" id="menu335-link">The List<span class="menu-down" id="menu335-link-button"></span></a>
 </li>, <li>
 <a href="/en/about/" id="menu160-link">About World Heritage<span class="menu-down" id="menu160-link-button"></span></a>
 </li>, <li>
 <a href="/en/activities/" id="menu39-link">Activities<span class="menu-down" id="menu39-link-button"></span></a>
 </li>, <li>
 <a href="/en/publications/" id="menu155-link">Publications<span class="menu-down" id="menu155-link-button"></span></a>
 </li>, <li>
 <a href="/en/partnerships/" id="menu69-link">Partnerships<span class="menu-down" id="menu69-link-button"></span></a>
 </li>, <li>
 <a href="/en/resources/" id="menu68-link">Resources<span class="menu-down" id="menu68-link-button"></span></a>
 </li>, <li class="tab-show" id="tabres">Result</li>, <li id="tabresviews">Views</li>, <li clas

In [23]:
len(soup.find_all('li'))

1329

In [41]:
#position of the last site listed 
soup.find_all('li')[1208]

<li class="cultural"> <a href="/en/list/306">Matobo Hills</a> </li>

In [9]:
soup.find_all('li')[10]

<li class="cultural_danger"> <a href="/en/list/208">Cultural Landscape and Archaeological Remains of the Bamiyan Valley</a> </li>

In [10]:
#position of the last site listed 
soup.find_all('li')[9]

<li class="cultural_danger"> <a href="/en/list/211">Minaret and Archaeological Remains of Jam</a> </li>

In [11]:
soup.find_all('li')[11]

<li class="mixed"> <a href="/en/list/99">Natural and Cultural Heritage of the Ohrid region</a> <a href="#transboundary" title="*: transboundary property">*</a> <sup><a href="#note1" title="Note">1</a></sup> </li>

In [64]:
soup.find_all('li')[11]

<li class="mixed"> <a href="/en/list/99">Natural and Cultural Heritage of the Ohrid region</a> <a href="#transboundary" title="*: transboundary property">*</a> <sup><a href="#note1" title="Note">1</a></sup> </li>

In [63]:
soup.find_all('li')[12]

<li class="cultural"> <a href="/en/list/569">Historic Centres of Berat and Gjirokastra </a> </li>

In [49]:
soup.find_all('li')[12].text.strip()

'Historic Centres of Berat and Gjirokastra'

In [47]:
#category of the site
soup.find_all('li')[12]['class'][0]

'cultural'

In [50]:
site_db = []
for i in range(9,1208):
    site = soup.find_all('li')[i].text.strip()
    site_type = soup.find_all('li')[i]['class'][0]
    site_db.append((site, site_type))