# Web Scraping with Beautiful Soup Lab

In [1]:
# Import libraries here
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Step 1: Create a soup object from the home page

In [2]:
# Home page that lists every restaurant; each entry links to its own page.
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'

# A timeout keeps this cell from hanging forever if the server never answers.
res = requests.get(url, timeout=30)

In [3]:
# Display the HTTP status of the home page request; 200 means it succeeded.
res.status_code

200

In [4]:
# Keep the full decoded page in `html` for parsing in the next step.
html = res.text
# Preview only the beginning of the document: printing the whole page floods
# the notebook output and buries the rest of the analysis.
print(html[:1000])

<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8"/>
    <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
    <title>Nutrition Information</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.4.1/css/bootstrap.min.css" integrity="sha384-Vkoo8x4CGsO3+Hhxv8T/Q5PaXtkKtu6ug5TOeNV6gBiFeWPGFN9MuhOf23Q9Ifjh" crossorigin="anonymous">

  </head>
  <body>
    <header>
      <section class="container">
        <nav role="navigation" class="navbar navbar-expand-lg navbar-light bg-light">
<a class="navbar-brand" href="/">Nutrition Information</a>        </nav>
      </section>
    </header>
    <main role="main" class="container">
      <br>
      <div class="alert alert-danger">
        NOTE: This data is super old and rife with errors. It's meant for scraping practice only.
      </div>
<table id="restaurants" class="table">
  <thead>
    <tr>
      <th>Name</th>



In [5]:
# Parse the home page HTML with the lxml parser so it can be searched by tag.
soup = BeautifulSoup(html, 'lxml')

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [6]:
results_list = []

# Each restaurant is an <a> inside a table cell: the link text is the
# restaurant's name and the href is the relative path to its own page.
for cell in soup.find_all('td'):
    link = cell.find('a', href=True)
    if link is None:
        # A cell without a link describes no restaurant. Recording an empty
        # dict for it would add blank rows and break any code that later
        # reads the 'name' or 'href' of every entry.
        continue
    results_list.append({'name': link.text, 'href': link['href']})

results_list[:10]

[{'name': 'A&W Restaurants', 'href': 'restaurants/1.html'},
 {'name': "Applebee's", 'href': 'restaurants/2.html'},
 {'name': "Arby's", 'href': 'restaurants/3.html'},
 {'name': 'Atlanta Bread Company', 'href': 'restaurants/4.html'},
 {'name': "Bojangle's Famous Chicken 'n Biscuits",
  'href': 'restaurants/5.html'},
 {'name': 'Buffalo Wild Wings', 'href': 'restaurants/6.html'},
 {'name': 'Burger King', 'href': 'restaurants/7.html'},
 {'name': "Captain D's", 'href': 'restaurants/8.html'},
 {'name': "Carl's Jr.", 'href': 'restaurants/9.html'},
 {'name': "Charley's Grilled Subs", 'href': 'restaurants/10.html'}]

In [7]:
# Tabulate the scraped restaurant links and show the first five rows.
pd.DataFrame(data=results_list).head(n=5)

Unnamed: 0,name,href
0,A&W Restaurants,restaurants/1.html
1,Applebee's,restaurants/2.html
2,Arby's,restaurants/3.html
3,Atlanta Bread Company,restaurants/4.html
4,Bojangle's Famous Chicken 'n Biscuits,restaurants/5.html


### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [11]:
# Visit every restaurant page and collect one dictionary per food item.
food = []

# Restaurant hrefs are relative paths, so they are joined onto the site root.
base_url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'

for restaurant in results_list:
    # Look values up by key rather than by position, and ignore entries that
    # carry no link (nothing to fetch for them).
    if 'href' not in restaurant:
        continue

    # Page-specific names keep the home page's `url`, `html` and `soup`
    # intact, so earlier cells still behave the same if they are re-run.
    page_response = requests.get(base_url + restaurant['href'], timeout=30)
    # Fail loudly on an error page instead of silently scraping zero rows.
    page_response.raise_for_status()

    # Hand Beautiful Soup the raw bytes so it picks the encoding from the
    # document itself; decoding with the wrong charset is what turns
    # characters such as the registered-trademark sign into 'Â®'.
    page_soup = BeautifulSoup(page_response.content, 'lxml')

    for row in page_soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 5:
            # Header rows contain <th> cells only; anything else with fewer
            # than five cells does not describe a complete food item.
            continue

        # Column order on the page: name, category, calories, fat, carbs.
        # Stripping removes the extra white space around each value.
        name, category, calories, fat, carbs = (
            cell.text.strip() for cell in cells[:5]
        )
        food.append({
            'calories': calories,
            'carbs': carbs,
            'category': category,
            'fat': fat,
            'name': name,
            'restaurant': restaurant['name'],
        })

# https://stackoverflow.com/questions/38570411/how-to-scrape-href-with-python-3-5-and-beautifulsoup

In [12]:
# Peek at the first scraped record to check its keys and values.
food[:1]

[{'calories': '760',
  'carbs': '45',
  'category': 'Burgers',
  'fat': '45',
  'name': 'Original Bacon Double Cheeseburger',
  'restaurant': 'A&W Restaurants'}]

### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows. Please output the number of rows in your DataFrame!

In [13]:
# Build the table from the scraped records, then put the identifying columns
# (restaurant, name, category) ahead of the nutrition values.
column_order = ['restaurant', 'name', 'category', 'calories', 'fat', 'carbs']
foods = pd.DataFrame(food).loc[:, column_order]
foods.head()

Unnamed: 0,restaurant,name,category,calories,fat,carbs
0,A&W Restaurants,Original Bacon Double Cheeseburger,Burgers,760,45,45
1,A&W Restaurants,Coney (Chili) Dog,Entrees,340,20,26
2,A&W Restaurants,Chili Fries,French Fries,370,15,49
3,A&W Restaurants,Strawberry Milkshake (small),Shakes,670,29,90
4,A&W Restaurants,A&WÂ® Root Beer Freeze (large),Shakes,820,18,150


In [14]:
# How many rows does your dataframe have?
# `.shape` is (rows, columns); the lab expects 5,131 rows.
foods.shape

(5131, 6)

### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame