In [1]:
#!pip install dataset
import requests
import dataset
import re
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pywren

In [2]:
db = dataset.connect('sqlite:///books.db')
base_url = 'http://books.toscrape.com/'

def scrape_books(html_soup, url):
    for book in html_soup.select('article.product_pod'):
        # For now, we'll only store the books url
        book_url = book.find('h3').find('a').get('href')
        book_url = urljoin(url, book_url)
        path = urlparse(book_url).path
        book_id = path.split('/')[2]
        # Upsert tries to update first and then insert instead
        db['books'].upsert({'book_id' : book_id,
                            'last_seen' : datetime.now()
                            }, ['book_id'])

def scrape_book(book_id_list):
    for book_id in book_id_list:
        book_url = base_url + 'catalogue/{}'.format(book_id)
        print('Now scraping book:', book_url)
        r = requests.get(book_url)
        r.encoding = 'utf-8'
        main = BeautifulSoup(requests.get(book_url).text, 'html.parser').find(class_='product_main')
        book = {}
        book['book_id'] = book_id
        book['title'] = main.find('h1').get_text(strip=True)
        book['price'] = main.find(class_='price_color').get_text(strip=True)
        book['stock'] = main.find(class_='availability').get_text(strip=True)
        book['rating'] = ' '.join(main.find(class_='star-rating') \
                            .get('class')).replace('star-rating', '').strip()
        book['img'] = html_soup.find(class_='thumbnail').find('img').get('src')
        desc = html_soup.find(id='product_description')
        book['description'] = ''
        if desc:
            book['description'] = desc.find_next_sibling('p') \
                                      .get_text(strip=True)
        book_product_table = html_soup.find(text='Product Information').find_next('table')
        for row in book_product_table.find_all('tr'):
            header = row.find('th').get_text(strip=True)
            # Since we'll use the header as a column, clean it a bit
            # to make sure SQLite will accept it
            header = re.sub('[^a-zA-Z]+', '_', header)
            value = row.find('td').get_text(strip=True)
            book[header] = value
        db['book_info'].upsert(book, ['book_id'])
    db['books'].upsert({'book_id' : book_id_list,
                        'last_seen' : datetime.now()
                        }, ['book_id'])



In [None]:
# Scrape the pages in the catalogue
url = base_url
inp = input('Do you wish to re-scrape the catalogue (y/n)? ')
while True and inp == 'y':
    print('Now scraping page:', url)
    r = requests.get(url)
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_books(html_soup, url)
    # Is there a next page?
    next_a = html_soup.select('li.next > a')
    if not next_a or not next_a[0].get('href'):
        break
    url = urljoin(url, next_a[0].get('href'))

# Now scrape book by book, oldest first
books = db['books'].find(order_by=['last_seen'])
for book in books:
    book_id = book['book_id']
    book_url = base_url + 'catalogue/{}'.format(book_id)
    print('Now scraping book:', book_url)
    r = requests.get(book_url)
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    scrape_book(html_soup, book_id)
    # Update the last seen timestamp
    db['books'].upsert({'book_id' : book_id,
                        'last_seen' : datetime.now()
                        }, ['book_id'])

In [3]:
books = db['books'].find(order_by=['last_seen'])
book_all = []
book_id_list = []
for book in books:
    book_id = book['book_id']
    book_id_list.append(book_id)
    if len(book_id_list) ==3:
        book_all.append(book_id_list)
        book_id_list = []

In [5]:
#pwex = pywren.default_executor()
#pwex.map(func = scrape_book1, iterdata =['wall-and-piece_971'])
pywren.get_all_results(pwex.map(scrape_book1, book_all[1]))

AttributeError: 'NoneType' object has no attribute 'find'

In [4]:
def scrape_book1(book_id_list):
    book_list = []
    for book_id in book_id_list:
        book_url = base_url + 'catalogue/{}'.format(book_id)
        print('Now scraping book:', book_url)
        r = requests.get(book_url)
        r.encoding = 'utf-8'
        #html_soup = BeautifulSoup(r.text, 'html.parser')
        main = BeautifulSoup(r.text, 'html.parser').find(class_='product_main')
        book = {}
        book['book_id'] = book_id
        book['title'] = main.find('h1').get_text(strip=True)
        book['price'] = main.find(class_='price_color').get_text(strip=True)
        book['stock'] = main.find(class_='availability').get_text(strip=True)
        book['rating'] = ' '.join(main.find(class_='star-rating') \
                            .get('class')).replace('star-rating', '').strip()
        book['img'] = html_soup.find(class_='thumbnail').find('img').get('src')
        desc = html_soup.find(id='product_description')
        book['description'] = ''
        if desc:
            book['description'] = desc.find_next_sibling('p') \
                                      .get_text(strip=True)
        book_product_table = html_soup.find(text='Product Information').find_next('table')
        for row in book_product_table.find_all('tr'):
            header = row.find('th').get_text(strip=True)
            # Since we'll use the header as a column, clean it a bit
            # to make sure SQLite will accept it
            header = re.sub('[^a-zA-Z]+', '_', header)
            value = row.find('td').get_text(strip=True)
            book[header] = value
        book_list.append(book)
    return book_list
        #db['book_info'].upsert(book, ['book_id'])
    #db['books'].upsert({'book_id' : book_id_list,'last_seen' : datetime.now()}, ['book_id'])

In [35]:

def scrape_book2(soup_list):
    book_list = []
    for soup in soup_list:
        main = soup.find(class_='product_main')
        book = {}
        book['book_id'] = book_id
        book['title'] = main.find('h1').get_text(strip=True)
        book['price'] = main.find(class_='price_color').get_text(strip=True)
        book['stock'] = main.find(class_='availability').get_text(strip=True)
        book['rating'] = ' '.join(main.find(class_='star-rating') \
                            .get('class')).replace('star-rating', '').strip()
        book['img'] = html_soup.find(class_='thumbnail').find('img').get('src')
        desc = html_soup.find(id='product_description')
        book['description'] = ''
        if desc:
            book['description'] = desc.find_next_sibling('p') \
                                      .get_text(strip=True)
        book_product_table = html_soup.find(text='Product Information').find_next('table')
        for row in book_product_table.find_all('tr'):
            header = row.find('th').get_text(strip=True)
            # Since we'll use the header as a column, clean it a bit
            # to make sure SQLite will accept it
            header = re.sub('[^a-zA-Z]+', '_', header)
            value = row.find('td').get_text(strip=True)
            book[header] = value
        #book_list.append(book)
    return book

soup_list = []
for book_id in ['wall-and-piece_971']:
    book_url = base_url + 'catalogue/{}'.format(book_id)
    print('Now scraping book:', book_url)
    r = requests.get(book_url)
    r.encoding = 'utf-8'
    html_soup = BeautifulSoup(r.text, 'html.parser')
    soup_list.append(html_soup)
pywren.get_all_results(pwex.map(scrape_book2, soup_list))

Now scraping book: http://books.toscrape.com/catalogue/wall-and-piece_971


PicklingError: Could not pickle object as excessively deep recursion required.

In [36]:
scrape_book2(soup_list)

{'book_id': 'wall-and-piece_971',
 'title': 'Wall and Piece',
 'price': '£44.18',
 'stock': 'In stock (18 available)',
 'rating': 'Four',
 'img': '../../media/cache/df/34/df346322ddfdd3b4da0e34cad17f49dc.jpg',
 'description': 'Banksy, Britain\'s now-legendary "guerilla" street artist, has painted the walls, streets, and bridges of towns and cities throughout the world. Not only did he smuggle his pieces into four of New York City\'s major art museums, he\'s also "hung" his work at London\'s Tate Gallery and adorned Israel\'s West Bank barrier with satirical images. Banksy\'s identity remains unknown, Banksy, Britain\'s now-legendary "guerilla" street artist, has painted the walls, streets, and bridges of towns and cities throughout the world. Not only did he smuggle his pieces into four of\xa0 New York City\'s major art museums, he\'s also "hung" his work at London\'s Tate Gallery and adorned Israel\'s West Bank barrier with satirical images. Banksy\'s identity remains unknown, but his