# Webscraping lab

Practice your webscraping and parsing skills! 🎉

In [1]:
# Import libaries
import pandas as pd
import requests
from bs4 import BeautifulSoup


### Step 1: Create a soup object from the home page

In [2]:
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'
req = requests.get(url, timeout = 5)
req

<Response [200]>

In [3]:
req.status_code

200

In [4]:
req.content

b'<!DOCTYPE html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8"/>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>\n    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>\n    <title>Nutrition Information</title>\n    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">\n\n  </head>\n  <body>\n    <header>\n      <section class="container">\n        <nav role="navigation" class="navbar navbar-expand-lg navbar-light bg-light">\n<a class="navbar-brand" href="/">Nutrition Information</a>        </nav>\n      </section>\n    </header>\n    <main role="main" class="container">\n      <br>\n      <div class="alert alert-danger">\n        NOTE: This data is super old and rife with errors. It\'s meant for scraping practice only.\n      </div>\n<table id="restaurants" class="table">\n  <thead>\n   

In [5]:
soup = BeautifulSoup(req.content, 'lxml')

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [6]:
# https://stackoverflow.com/questions/11716380/beautifulsoup-extract-text-from-anchor-tag

restaurants = []
for a in soup.find_all('a', href=True)[1:]:
    restaurants.append({'name': a.contents[0], 'href': a['href']})
    
restaurants

[{'name': 'A&W Restaurants', 'href': 'restaurants/1.html'},
 {'name': "Applebee's", 'href': 'restaurants/2.html'},
 {'name': "Arby's", 'href': 'restaurants/3.html'},
 {'name': 'Atlanta Bread Company', 'href': 'restaurants/4.html'},
 {'name': "Bojangle's Famous Chicken 'n Biscuits",
  'href': 'restaurants/5.html'},
 {'name': 'Buffalo Wild Wings', 'href': 'restaurants/6.html'},
 {'name': 'Burger King', 'href': 'restaurants/7.html'},
 {'name': "Captain D's", 'href': 'restaurants/8.html'},
 {'name': "Carl's Jr.", 'href': 'restaurants/9.html'},
 {'name': "Charley's Grilled Subs", 'href': 'restaurants/10.html'},
 {'name': 'Chick-fil-A', 'href': 'restaurants/11.html'},
 {'name': "Chili's", 'href': 'restaurants/12.html'},
 {'name': 'Chipotle Mexican Grill', 'href': 'restaurants/13.html'},
 {'name': "Church's", 'href': 'restaurants/14.html'},
 {'name': 'Corner Bakery Cafe', 'href': 'restaurants/15.html'},
 {'name': 'Dairy Queen', 'href': 'restaurants/16.html'},
 {'name': "Denny's", 'href': 'res

### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [7]:
# https://www.pluralsight.com/guides/extracting-data-html-beautifulsoup

foods = []
for r in restaurants:

# loop through restaurant list to make beautiful soup object of each site
    req2 = requests.get(f"https://pages.git.generalassemb.ly/rldaggie/for-scraping/{r['href']}", timeout = 5)
    soup2 = BeautifulSoup(req2.content, 'lxml')

# find food table and identify body
    food_table = soup2.find('table', attrs={'class': 'table'})
    food_table_data = food_table.tbody.find_all('tr')

# loop through foods in table to pull each element into a dictionary then add to total foods list
# headings = ['calories', 'carbs', 'category', 'fat', 'name', 'restaurant']
    for food in food_table_data:
        elements = food.find_all('td')
        food_dict = {'restaurant': r['name'],
                     'category': elements[1].text,
                     'name': elements[0].text,
                     'calories': elements[2].text,
                     'carbs': elements[4].text,
                     'fat': elements[3].text,
                    }
        foods.append(food_dict)
foods[0]

{'restaurant': 'A&W Restaurants',
 'category': 'Burgers',
 'name': 'Original Bacon Double Cheeseburger',
 'calories': '760',
 'carbs': '45',
 'fat': '45'}

### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows

In [8]:
food_df = pd.DataFrame(foods)
food_df.set_index('restaurant', drop = True, inplace = True)
food_df

Unnamed: 0_level_0,category,name,calories,carbs,fat
restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A&W Restaurants,Burgers,Original Bacon Double Cheeseburger,760,45,45
A&W Restaurants,Entrees,Coney (Chili) Dog,340,26,20
A&W Restaurants,French Fries,Chili Fries,370,49,15
A&W Restaurants,Shakes,Strawberry Milkshake (small),670,90,29
A&W Restaurants,Shakes,A&W® Root Beer Freeze (large),820,150,18
...,...,...,...,...,...
Wendy's,Shakes,Jr. Original Chocolate Frosty™,200,32,5
Wendy's,Wraps,Grilled Chicken Go Wrap,260,25,10
Wendy's,Sandwiches,Asiago Ranch Chicken Club,670,57,32
Wendy's,Wraps,Spicy Chicken Go Wrap,330,30,16


### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [9]:
food_df.to_csv('foods.csv')

### Step 6 Do the same thing as above, but use `pd.read_html()` to scrape the table from each page instead of BS4.

In [10]:
foods = []
for r in restaurants:

# loop through restaurant list to make beautiful soup object of each site
    url = f"https://pages.git.generalassemb.ly/rldaggie/for-scraping/{r['href']}"
    table = pd.read_html(url)[0]
    table['restaurant'] = r['name']
    foods.append(table)

foods_df = pd.concat(foods)

In [11]:
foods_df.set_index('restaurant', inplace = True)
foods_df

Unnamed: 0_level_0,Name,Category,Calories,Fat,Carbs
restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A&W Restaurants,Original Bacon Double Cheeseburger,Burgers,760,45,45
A&W Restaurants,Coney (Chili) Dog,Entrees,340,20,26
A&W Restaurants,Chili Fries,French Fries,370,15,49
A&W Restaurants,Strawberry Milkshake (small),Shakes,670,29,90
A&W Restaurants,A&W® Root Beer Freeze (large),Shakes,820,18,150
...,...,...,...,...,...
Wendy's,Jr. Original Chocolate Frosty™,Shakes,200,5,32
Wendy's,Grilled Chicken Go Wrap,Wraps,260,10,25
Wendy's,Asiago Ranch Chicken Club,Sandwiches,670,32,57
Wendy's,Spicy Chicken Go Wrap,Wraps,330,16,30
