#### Web Scraper Lab Solutions

##### Connecting to Data And Initializing the Web Scraper

In [2]:
# imports
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [3]:
# we'll initialize the request
req = requests.get('https://www.yelp.com/search?find_desc=Restaurants&find_loc=London%2C+United+Kingdom&ns=1')

In [4]:
# this is not json data, so instead we'll get the entire web page as text
req.text



In [5]:
# feed the text into a scraper
scraper = BeautifulSoup(req.text)

In [6]:
# use the find_all method to select every <a> tag, along with its accompanying classes
titles = scraper.find_all('a', {'class': 'lemon--a__373c0__IEZFH', 'class': 'link__373c0__1G70M',
                                'class': 'link-color--inherit__373c0__3dzpk',
                                'class':  'link-size--inherit__373c0__1VFlE'})

##### Finding the Titles

In [7]:
# if you look at the contents of this list, you'll see we have some cleaning to do!
titles[0]

<a class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE" href="/biz/the-mayfair-chippy-london-2?osq=Restaurants" name="The Mayfair Chippy" rel="" target="">The Mayfair Chippy</a>

In [8]:
# if you check the data type of the item, you'll notice it's NOT a string, but rather a specialized scraper object
type(titles[0])

bs4.element.Tag

In [9]:
# so let's first convert everything into a string
titles = [str(title) for title in titles]

In [10]:
# remove the </a> tag at the end
titles = [title.replace('</a>', '') for title in titles]

In [11]:
# and then split the items and grab the appropriate spot in the list to get the actual title
titles = [title.split('>')[1] for title in titles]

In [12]:
# you should see that we have MORE than what we need
titles

['The Mayfair Chippy',
 'more',
 'Dishoom',
 'more',
 'Ffiona’s Restaurant',
 'more',
 'Flat Iron',
 'more',
 'Restaurant Gordon Ramsay',
 'more',
 'The Queens Arms',
 'more',
 'Mother Mash',
 'more',
 'The Golden Chippy',
 'more',
 'The Grazing Goat',
 'more',
 'Padella',
 'more',
 'Gordon Ramsay Street Pizza',
 'more',
 'Duck &amp; Waffle',
 'more',
 'Dishoom',
 'more',
 'The Pig and Butcher',
 'more',
 'Sketch',
 'more',
 'The Churchill Arms',
 'more',
 'The Victoria',
 'London House by Gordon Ramsay',
 'more',
 'The Colonel Fawcett',
 'more',
 'Burger &amp; Lobster',
 'more',
 'BAO - Soho',
 'more',
 'Ye Olde Cheshire Cheese',
 'more',
 'Busaba Soho',
 'more',
 'Abeno',
 'more',
 'Wright Brothers - South Kensington',
 'more',
 'Dinner by Heston Blumenthal',
 'more',
 'Kazan',
 'more',
 'Yauatcha',
 'more',
 'Hawksmoor Seven Dials',
 'more',
 'Barrafina',
 'more',
 '<div aria-label="Page: 2" class="lemon--div__373c0__1mboc undefined display--inline-block__373c0__1ZKqC border-color--

In [13]:
# since we don't need the values that are 'more' and the tags that contain <div> and <span> let's remove them out
titles = [title for title in titles if title != 'more' and '<div' not in title and '<span' not in title]

In [14]:
# and let's double check what we have now:
titles

['The Mayfair Chippy',
 'Dishoom',
 'Ffiona’s Restaurant',
 'Flat Iron',
 'Restaurant Gordon Ramsay',
 'The Queens Arms',
 'Mother Mash',
 'The Golden Chippy',
 'The Grazing Goat',
 'Padella',
 'Gordon Ramsay Street Pizza',
 'Duck &amp; Waffle',
 'Dishoom',
 'The Pig and Butcher',
 'Sketch',
 'The Churchill Arms',
 'The Victoria',
 'London House by Gordon Ramsay',
 'The Colonel Fawcett',
 'Burger &amp; Lobster',
 'BAO - Soho',
 'Ye Olde Cheshire Cheese',
 'Busaba Soho',
 'Abeno',
 'Wright Brothers - South Kensington',
 'Dinner by Heston Blumenthal',
 'Kazan',
 'Yauatcha',
 'Hawksmoor Seven Dials',
 'Barrafina']

Perfect!!  

Now let's follow a similar process to get the number of ratings for each item.

#### Step 1 Solution:

In [15]:
num_reviews = scraper.find_all('span', {'class': 'lemon--span__373c0__3997G', 
                                        'class': 'text__373c0__2Kxyz',
                                        'class': 'reviewCount__373c0__2r4xT',
                                        'class': 'text-color--black-extra-light__373c0__2OyzO'})

In [99]:
# we'll convert everything into a string
num_reviews = [str(review) for review in num_reviews]

In [103]:
# and remove the </span> at the end
num_reviews = [review.replace('</span>', '') for review in num_reviews]

In [105]:
# and split on the > to get the last item
num_reviews = [review.split('>')[1] for review in num_reviews]

In [106]:
# if we see what we have, we have a mix of numeric items (the number of reviews) and other non-numeric data as well
num_reviews

['277',
 '££',
 '<a class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--default__373c0__7tls6" href="/search?cflt=fishnchips&amp;find_desc=Restaurants&amp;find_loc=London%2C+United+Kingdom" name="" rel="" role="link" target=""',
 '204',
 '££££',
 '<a class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--default__373c0__7tls6" href="/search?cflt=french&amp;find_desc=Restaurants&amp;find_loc=London%2C+United+Kingdom" name="" rel="" role="link" target=""',
 '<a class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--default__373c0__7tls6" href="/search?cflt=british&amp;find_desc=Restaurants&amp;find_loc=London%2C+United+Kingdom" name="" rel="" role="link" target=""',
 '1842',
 '££',
 '<a class="lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--default__373c0__7tls6" href="/search?cflt=indpak&amp;find_desc=Restaurants&amp;find_loc=Lond

In [107]:
# let's do a check and just select the items that are numeric
# the isdigit() string method will be helpful here
num_reviews = [review for review in num_reviews if review.isdigit()]

In [108]:
# this is perfect
num_reviews

['277',
 '204',
 '1842',
 '269',
 '378',
 '119',
 '701',
 '202',
 '239',
 '107',
 '468',
 '183',
 '109',
 '353',
 '830',
 '78',
 '212',
 '482',
 '271',
 '544',
 '26',
 '22',
 '101',
 '194',
 '342',
 '375',
 '22',
 '112',
 '30',
 '61']

#### Step 2 Solution:

Using the selection criteria that we had before we also had the price range included, so we'll grab that as well.

In [117]:
price_ranges = scraper.find_all('span', {'class': 'lemon--span__373c0__3997G', 
                                        'class': 'text__373c0__2Kxyz',
                                        'class': 'reviewCount__373c0__2r4xT',
                                        'class': 'text-color--black-extra-light__373c0__2OyzO'})

price_ranges = [str(range_) for range_ in price_ranges]
price_ranges = [range_.replace('</span>', '') for range_ in price_ranges]
price_ranges = [range_.split('>')[1] for range_ in price_ranges]
# \xA3 is unicode for the pound symbol
price_ranges = [range_ for range_ in price_ranges if '\xA3' in range_]

##### Step 3 Solution:  Turning Our Data into a Dataframe

Using a step similar to what we used in the previous lab, let's turn our results into a dataframe

In [121]:
df_dict = {
    'Name': titles,
    'NumReviews': num_reviews,
    'PriceRange': price_ranges
}

df = pd.DataFrame(df_dict)

In [122]:
# beautiful :)
df

Unnamed: 0,Name,NumReviews,PriceRange
0,The Mayfair Chippy,277,££
1,Restaurant Gordon Ramsay,204,££££
2,Dishoom,1842,££
3,Ffiona’s Restaurant,269,££
4,Flat Iron,378,££
5,The Queens Arms,119,££
6,Duck &amp; Waffle,701,£££
7,Padella,202,££
8,The Grazing Goat,239,££
9,The Golden Chippy,107,££
