
# EMATM0048: Software Development Programming and Algorithms (SDPA)
# `Tutorial - Web Scraping`


In [1]:
import requests # library to handle requests
import pandas as pd
import requests
import folium # plotting library
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values


## Example 3 (advanced): 


### <font color= 'blue'> Question: </font>: Explore trending venues in your area

In [2]:
# define URL for trending vebues in london
address = 'Bristol'

In [3]:

url = "https://api.foursquare.com/v3/places/search"

params = {
  	"query": "coffee",
  	"near": "Bristol,UK",
    #adding popularity and other needed information, check all fields here 
    #https://location.foursquare.com/developer/reference/response-fields
    "fields":"name,geocodes,popularity"
}

headers = {
    "Accept": "application/json",
    #your API key in the above screenshot
    "Authorization": "fsq3AT4h2SjhPIGBYfMa8fk9g6Mj7oqpUs8Cwur/pq4VyKc="
}

response = requests.request("GET", url, params=params, headers=headers)
print(response.text)
#Display the results
results = response.json()["results"]

{"results":[{"geocodes":{"main":{"latitude":51.455181,"longitude":-2.593114},"roof":{"latitude":51.455181,"longitude":-2.593114}},"name":"Full Court Press","popularity":0.9919012289743444},{"geocodes":{"main":{"latitude":51.453748,"longitude":-2.596502},"roof":{"latitude":51.453748,"longitude":-2.596502}},"name":"The Hatter","popularity":0.9973381661663929},{"geocodes":{"drop_off":{"latitude":51.455446,"longitude":-2.600694},"main":{"latitude":51.45533,"longitude":-2.600682},"roof":{"latitude":51.45533,"longitude":-2.600682}},"name":"Sotiris Bakery","popularity":0.917341564252138},{"geocodes":{"drop_off":{"latitude":51.45001,"longitude":-2.597376},"main":{"latitude":51.450072,"longitude":-2.597371},"roof":{"latitude":51.450072,"longitude":-2.597371}},"name":"Society Café","popularity":0.9852749617715354},{"geocodes":{"main":{"latitude":51.451247,"longitude":-2.592931},"roof":{"latitude":51.451247,"longitude":-2.592931}},"name":"Spicer & Cole","popularity":0.9840289969983576},{"geocodes

In [4]:
#Convert Json to dataframe
dataframe = pd.json_normalize(results)

dataframe

Unnamed: 0,name,popularity,geocodes.main.latitude,geocodes.main.longitude,geocodes.roof.latitude,geocodes.roof.longitude,geocodes.drop_off.latitude,geocodes.drop_off.longitude
0,Full Court Press,0.991901,51.455181,-2.593114,51.455181,-2.593114,,
1,The Hatter,0.997338,51.453748,-2.596502,51.453748,-2.596502,,
2,Sotiris Bakery,0.917342,51.45533,-2.600682,51.45533,-2.600682,51.455446,-2.600694
3,Society Café,0.985275,51.450072,-2.597371,51.450072,-2.597371,51.45001,-2.597376
4,Spicer & Cole,0.984029,51.451247,-2.592931,51.451247,-2.592931,,
5,Little Victories,0.979442,51.446648,-2.59939,51.446648,-2.59939,51.446604,-2.599265
6,25A Old Market,0.986606,51.455748,-2.582192,51.455748,-2.582192,,
7,The Crafty Egg,0.973948,51.463721,-2.589967,51.463721,-2.589967,,
8,Primrose Cafe,0.968568,51.45529,-2.618408,51.45529,-2.618408,,
9,Bakers and Co,0.913519,51.475102,-2.591029,51.475102,-2.591029,51.475193,-2.591154


## We will assume that venues with popularity >0.8 is ternding

In [5]:
trending_venues_df= dataframe[dataframe["popularity"]>0.8]
trending_venues_df

Unnamed: 0,name,popularity,geocodes.main.latitude,geocodes.main.longitude,geocodes.roof.latitude,geocodes.roof.longitude,geocodes.drop_off.latitude,geocodes.drop_off.longitude
0,Full Court Press,0.991901,51.455181,-2.593114,51.455181,-2.593114,,
1,The Hatter,0.997338,51.453748,-2.596502,51.453748,-2.596502,,
2,Sotiris Bakery,0.917342,51.45533,-2.600682,51.45533,-2.600682,51.455446,-2.600694
3,Society Café,0.985275,51.450072,-2.597371,51.450072,-2.597371,51.45001,-2.597376
4,Spicer & Cole,0.984029,51.451247,-2.592931,51.451247,-2.592931,,
5,Little Victories,0.979442,51.446648,-2.59939,51.446648,-2.59939,51.446604,-2.599265
6,25A Old Market,0.986606,51.455748,-2.582192,51.455748,-2.582192,,
7,The Crafty Egg,0.973948,51.463721,-2.589967,51.463721,-2.589967,,
8,Primrose Cafe,0.968568,51.45529,-2.618408,51.45529,-2.618408,,
9,Bakers and Co,0.913519,51.475102,-2.591029,51.475102,-2.591029,51.475193,-2.591154


In [6]:
#Getting lat and long for any location, pass the name, postice and get log, lat
address = 'Bristol City Centre'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

51.4539545 -2.5972859


In [7]:

#Visualize trending venues
venues_map = folium.Map(location=[latitude, longitude], zoom_start=15) # generate map centred around Ecco

# add the trending venues as blue circle markers
for lat, lng, label in zip(trending_venues_df['geocodes.main.latitude'], trending_venues_df['geocodes.main.longitude'], trending_venues_df['name']):
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            poup=label,
            fill=True,
            color='blue',
            fill_color='blue',
            fill_opacity=0.6
        ).add_to(venues_map)

venues_map

# Part B: HTML based scraping


### <font color= 'Blue'> Question: </font>

Extract cities in the UK from information in tables on this website
https://en.wikipedia.org/wiki/List_of_cities_in_the_United_Kingdom

In [8]:
import pandas as pd
from bs4 import BeautifulSoup
url_link= 'https://en.wikipedia.org/wiki/List_of_cities_in_the_United_Kingdom'
r = requests.get(url_link,headers ={'User-Agent':'Mozilla/5.0'})
UKdata = pd.read_html(r.text)
print(f'Extracted {len(UKdata)} table/s')
UKdata[0]

Extracted 5 table/s


  UKdata = pd.read_html(r.text)


Unnamed: 0_level_0,De facto[e],De facto[e],De facto[e],De jure[f],De jure[f],De jure[f]
Unnamed: 0_level_1,City,Statistical region,Year granted or confirmed,City[3][1],City council status,Population
0,London,London,"""time immemorial""[g]",City of London[h],Sui generis and ceremonial county,"12,156 (2023)[19]"
1,Westminster,London,1540,City of Westminster[i],London borough,"213,119 (2023)[19]"
2,Birmingham,West Midlands,1889[21],City of Birmingham[j],Metropolitan borough,"1,171,467 (2023)[19]"
3,Leeds,Yorkshire and the Humber,1893,City of Leeds[k],Metropolitan borough,"829,417 (2023)[19]"
4,Glasgow32 (Scots: Glesga) (Scottish Gaelic: Gl...,Scotland,mid-18th century[12] (Burgh: 1492),Glasgow,Council area,"620,700 (2022)[24]"
...,...,...,...,...,...,...
71,Armagh[ab] (Irish: Ard Mhacha) (Ulster-Scots: ...,Northern Ireland,1994,,"Represented on Armagh City, Banbridge and Crai...","16,310 (2021)[106]"
72,Bangor,Wales,"""time immemorial""",Bangor community[k],Community,"15,060 (2021)[107]"
73,Wells,South West England,"""time immemorial""",Wells parish[k],Civil parish,"11,145 (2021)[108]"
74,St Asaph (Welsh: Llanelwy),Wales,2012,St Asaph community[s],Community,"3,485 (2021)[109]"


## <font color= "blue">Question:</font>
- Extract "Leave a Reply" title from this article
-  Extract all links on the page

## <font color= "Green"> Answer:</font>


In [9]:

web_url = 'https://www.luckytailsalpacafarm.co.uk/'
re = requests.get(web_url)
print('Status code\n', response.status_code)
print('\n--\n')
print('Content of the website\n', response.content[:2000])

Status code
 200

--

Content of the website
 b'{"results":[{"geocodes":{"main":{"latitude":51.455181,"longitude":-2.593114},"roof":{"latitude":51.455181,"longitude":-2.593114}},"name":"Full Court Press","popularity":0.9919012289743444},{"geocodes":{"main":{"latitude":51.453748,"longitude":-2.596502},"roof":{"latitude":51.453748,"longitude":-2.596502}},"name":"The Hatter","popularity":0.9973381661663929},{"geocodes":{"drop_off":{"latitude":51.455446,"longitude":-2.600694},"main":{"latitude":51.45533,"longitude":-2.600682},"roof":{"latitude":51.45533,"longitude":-2.600682}},"name":"Sotiris Bakery","popularity":0.917341564252138},{"geocodes":{"drop_off":{"latitude":51.45001,"longitude":-2.597376},"main":{"latitude":51.450072,"longitude":-2.597371},"roof":{"latitude":51.450072,"longitude":-2.597371}},"name":"Society Caf\xc3\xa9","popularity":0.9852749617715354},{"geocodes":{"main":{"latitude":51.451247,"longitude":-2.592931},"roof":{"latitude":51.451247,"longitude":-2.592931}},"name":"Spi

In [10]:
#Q1
soup_object = BeautifulSoup(re.content)
soup_object.find_all("h2")[0]

<h2 class="font_2 wixui-rich-text__text" style="font-size:24px; line-height:1.2em; text-align:center;"><span class="color_2 wixui-rich-text__text"><span class="wixui-rich-text__text" style="font-size:24px;"><span class="wixui-rich-text__text" style="letter-spacing:0em;"><span class="wixui-rich-text__text" style="font-family:lulo-clean-w01-one-bold,sans-serif;">Donkey Experiences</span></span></span></span></h2>

In [11]:
## Q2: 
for link in soup_object.find_all('a', href=True):
    print(link['href'])
    

https://fareharbor.com/embeds/book/luckytailsalpacafarm/items/576928/?full-items=yes&flow=470673
https://www.luckytailsalpacafarm.co.uk
https://www.luckytailsalpacafarm.co.uk/gift-vouchers
https://www.luckytailsalpacafarm.co.uk/walks
https://www.luckytailsalpacafarm.co.uk/adoptions
https://www.luckytailsalpacafarm.co.uk/meerkats
https://www.luckytailsalpacafarm.co.uk/our-boys
https://www.luckytailsalpacafarm.co.uk/about-5
https://www.luckytailsalpacafarm.co.uk/care-farm
https://www.luckytailsalpacafarm.co.uk/blog
https://www.luckytailsalpacafarm.co.uk/faq
https://www.luckytailsalpacafarm.co.uk/cafe
https://www.luckytailsalpacafarm.co.uk/care-home-visits
https://www.luckytailsalpacafarm.co.uk
https://www.luckytailsalpacafarm.co.uk/cart-page
https://www.facebook.com/sarahbooth722
https://www.instagram.com/luckytailsalpacas
https://www.tiktok.com/@lucky.tails.alpacas
https://fareharbor.com/embeds/book/luckytailsalpacafarm/?full-items=yes&flow=472922
https://fareharbor.com/embeds/book/luck