# Exercise: Web Scraping with Beautiful Soup
*****
# Part 1
### Gather Book Prices and Ratings

In [1]:
from bs4 import BeautifulSoup
import requests

# Base URL of the site being scraped; page paths are appended to it.
site = "http://books.toscrape.com/"
# Path of the first page to request, relative to `site` ('' requests `site` itself).
page = ''

# URLs whose download raised an error.
failed_sites = list()

def get_page(path, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        path: Full URL of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``html.parser`` backend.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(path, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def get_products(soup):
    """Return every element in ``soup`` that carries the ``product_pod`` class."""
    return soup.find_all(class_='product_pod')

print(" Getting Books...")
books = []

# Upper bound on back-to-back download failures for one URL. Without it a
# permanently unreachable page would be retried forever.
MAX_FETCH_ATTEMPTS = 3

next_path = page  # path (relative to `site`) of the page to scrape next
attempts = 0
while next_path is not None:
    url = site + next_path

    try:
        soup = get_page(url)
    except requests.RequestException:
        # Only network-level errors are retried; anything else (including
        # Ctrl-C) propagates instead of being silently swallowed.
        attempts += 1
        print(url, 'failed')
        if attempts >= MAX_FETCH_ATTEMPTS:
            failed_sites.append(url)
            break
        continue
    attempts = 0

    for item in get_products(soup):
        # The second class on the star-rating element is the rating word.
        rating = item.find(attrs={'class': 'star-rating'}).attrs['class'][1]
        title = item.find_all('a')[1].get('title')
        # [1:] discards the first character of the price text.
        # NOTE(review): the captured output still shows the currency symbol,
        # so that first character is presumably a decoding artefact - confirm.
        price = item.find(attrs={'class': 'price_color'}).text[1:]
        # get_text(strip=True) returns the availability text without relying
        # on how many newlines surround it in the markup.
        status = item.find(
            attrs={'class': 'instock availability'}
        ).get_text(strip=True)

        books.append({
            'title': title,
            'price': price,
            'rating': rating,
            'status': status,
        })

    # Follow the pager's <li class="next"> link; when there is none this was
    # the last page and the loop ends.
    next_item = soup.find('li', attrs={'class': 'next'})
    next_anchor = next_item.find('a') if next_item is not None else None
    href = next_anchor.get('href') if next_anchor is not None else None
    if not href:
        next_path = None
    elif 'catalogue/' in href:
        next_path = href
    else:
        # Links found on catalogue pages are relative to catalogue/, while
        # requests are always built from the site root.
        next_path = 'catalogue/' + href

print(len(books), 'books collected')
print('\n Data Example')
if books:
    print(books[0])

 Getting Books...
1000 books collected

 Data Example
{'title': 'A Light in the Attic', 'price': '£51.77', 'rating': 'Three', 'status': 'In stock'}


### Export Data to a CSV

In [2]:
import csv

columns = ['title','price','rating','status']
# newline='' hands line-ending control to the csv module; without it every row
# is followed by an empty line on Windows. The with-block closes the file, so
# no explicit close() is needed.
with open('book_list.csv', 'w', encoding='utf-8', newline='') as file:
    # DictWriter picks each value by column name, so the output order always
    # matches the header regardless of the key order inside each book dict.
    csvwriter = csv.DictWriter(file, fieldnames=columns)
    csvwriter.writeheader()
    csvwriter.writerows(books)


## Part 3: Splinter Stretch Goal
 - What is Splinter and what does it do differently than the requests module?
   - Splinter is a browser automation tool: it drives a real web browser to load and interact with the pages you visit, whereas requests only downloads the raw HTTP response.
 - Why is the requests module insufficient for some modern websites?
   - Many modern websites build or modify their HTML with JavaScript after the page loads. A plain HTTP request made with requests<br>
       returns only the initial HTML and never runs that JavaScript, so the result can be incomplete and will not reflect
       changes made by later JavaScript interactions.
 - What happens if you turn JavaScript off in the browser?
   - Many websites stop working, or you get an extremely simplified user experience.
   
   

## Part 4: Splinter Stretch Coding Goal

In [3]:
from splinter import Browser
import time

# Raw string: the backslashes are path separators, not escape sequences.
executable_path = {'executable_path': r'C:\webdrivers\chromedriver.exe'}
browser = Browser('chrome', **executable_path)

events = []
try:
    # Setting a delay on page access to reduce chance of errors
    time.sleep(2)
    browser.visit('https://www.nasa.gov/')

    # Setting a delay on data access to allow page to load
    time.sleep(2)
    # The selector is CSS attribute syntax, so it is queried with find_by_css.
    event_titles = browser.find_by_css('div[class="title"]')

    for event in event_titles:
        link = event.find_by_tag('a')["href"]
        # The title text is split on commas; each piece becomes one column
        # after the link.
        fields = [part.strip() for part in event.text.split(",")]
        events.append([link] + fields)
finally:
    # Always shut the browser down so the driver process is not left running,
    # even when the page fails to load or an element lookup raises.
    browser.quit()

columns = ['link','weekday','date','event']
# newline='' hands line-ending control to the csv module (prevents blank rows
# on Windows); the with-block closes the file on exit.
with open('nasa_events.csv', 'w', encoding='utf-8', newline='') as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(columns)
    csvwriter.writerows(events)

