To identify the popular places in London, we will scrape the 50 top-rated attractions from a popular travel website, TripAdvisor, using the BeautifulSoup package.

In [1]:
# Import libraries
import re
from pathlib import Path

import pandas as pd
import requests # http library for python 
from bs4 import BeautifulSoup # library for pulling data out of HTML
from tqdm import tqdm

tqdm.pandas()

In [2]:
# Fetch the first listing page and report the HTTP status code.
# The timeout stops the cell from hanging indefinitely if the server never answers.
response = requests.get('https://www.tripadvisor.com.sg/Attractions-g186338-Activities-oa0-London_England.html', headers={'User-Agent': "Mozilla/5.0"}, timeout=30)
print('Status Code: ',response.status_code)

Status Code:  200


In [3]:
# Parse the downloaded HTML with the lxml parser
soup = BeautifulSoup(markup=response.text, features="lxml")

In [4]:
%%time
poi = [] # declare empty list to append names of places of interest

for num_results in tqdm(range(0, 61, 30), desc='1st loop'): # keeps running until "for" condition breaks
    url = f'https://www.tripadvisor.com.sg/Attractions-g186338-Activities-oa{num_results}-London_England.html'
    print(url)
    
    response = requests.get(url, headers={'User-Agent': "Mozilla/5.0"})
    print('Status Code: ',response.status_code)
    
    soup = BeautifulSoup(response.text, "lxml")
    for i in tqdm(range(0,30), desc='2nd loop'):
        name = soup.find_all('div', {'class':'XfVdV o AIbhI'})[i].text # get name of poi
        poi.append(name)

1st loop:   0%|                                           | 0/3 [00:00<?, ?it/s]

https://www.tripadvisor.com.sg/Attractions-g186338-Activities-oa0-London_England.html
Status Code:  200



2nd loop:   0%|                                          | 0/30 [00:00<?, ?it/s][A
2nd loop:  10%|███▍                              | 3/30 [00:00<00:00, 29.55it/s][A
2nd loop:  20%|██████▊                           | 6/30 [00:00<00:00, 28.64it/s][A
2nd loop:  33%|███████████                      | 10/30 [00:00<00:00, 30.84it/s][A
2nd loop:  47%|███████████████▍                 | 14/30 [00:00<00:00, 31.92it/s][A
2nd loop:  60%|███████████████████▊             | 18/30 [00:00<00:00, 32.82it/s][A
2nd loop:  73%|████████████████████████▏        | 22/30 [00:00<00:00, 33.25it/s][A
2nd loop:  87%|████████████████████████████▌    | 26/30 [00:00<00:00, 33.61it/s][A
2nd loop: 100%|█████████████████████████████████| 30/30 [00:00<00:00, 32.58it/s][A
1st loop:  33%|███████████▋                       | 1/3 [00:02<00:04,  2.18s/it]

https://www.tripadvisor.com.sg/Attractions-g186338-Activities-oa30-London_England.html
Status Code:  200



2nd loop:   0%|                                          | 0/30 [00:00<?, ?it/s][A
2nd loop:  10%|███▍                              | 3/30 [00:00<00:00, 29.72it/s][A
2nd loop:  23%|███████▉                          | 7/30 [00:00<00:00, 33.69it/s][A
2nd loop:  37%|████████████                     | 11/30 [00:00<00:00, 34.94it/s][A
2nd loop:  50%|████████████████▌                | 15/30 [00:00<00:00, 35.38it/s][A
2nd loop:  63%|████████████████████▉            | 19/30 [00:00<00:00, 35.73it/s][A
2nd loop:  77%|█████████████████████████▎       | 23/30 [00:00<00:00, 35.57it/s][A
2nd loop: 100%|█████████████████████████████████| 30/30 [00:00<00:00, 35.27it/s][A
1st loop:  67%|███████████████████████▎           | 2/3 [00:04<00:02,  2.45s/it]

https://www.tripadvisor.com.sg/Attractions-g186338-Activities-oa60-London_England.html
Status Code:  200



2nd loop:   0%|                                          | 0/30 [00:00<?, ?it/s][A
2nd loop:  13%|████▌                             | 4/30 [00:00<00:00, 34.83it/s][A
2nd loop:  27%|█████████                         | 8/30 [00:00<00:00, 35.59it/s][A
2nd loop:  40%|█████████████▏                   | 12/30 [00:00<00:00, 35.22it/s][A
2nd loop:  53%|█████████████████▌               | 16/30 [00:00<00:00, 35.69it/s][A
2nd loop:  67%|██████████████████████           | 20/30 [00:00<00:00, 35.87it/s][A
2nd loop:  80%|██████████████████████████▍      | 24/30 [00:00<00:00, 35.49it/s][A
2nd loop: 100%|█████████████████████████████████| 30/30 [00:00<00:00, 35.32it/s][A
1st loop: 100%|███████████████████████████████████| 3/3 [00:07<00:00,  2.54s/it]

CPU times: user 3.36 s, sys: 126 ms, total: 3.49 s
Wall time: 7.68 s





In [5]:
# Extract top 50 poi
poi = poi[0:50]
df = pd.DataFrame(poi, columns = ['poi_name'])
df

Unnamed: 0,poi_name
0,1. Tower of London
1,2. The British Museum
2,3. London Eye
3,4. National Gallery
4,5. Tower Bridge
5,6. Natural History Museum
6,7. Churchill War Rooms
7,8. V&A - Victoria and Albert Museum
8,9. Westminster Abbey
9,10. Hyde Park


In [6]:
# Remove numbered list from poi_name
df['poi_name'] =  [re.sub(r'(^|\s)[0-9]+\.','', str(x)) for x in df['poi_name']]

In [7]:
# Remove leading whitespace from poi_name
df['poi_name'] = [df['poi_name'][i].lstrip() for i in range(len(df))]

In [8]:
# Save and export
df.to_csv('./output/poi_name.csv', index=False)