In [157]:
!pip install word2number

Collecting word2number
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-cp36-none-any.whl size=5587 sha256=ac7f63e55aa8ebb80032a2fc72055cfe242a7942e1517c970146568bce8427d5
  Stored in directory: /root/.cache/pip/wheels/46/2f/53/5f5c1d275492f2fce1cdab9a9bb12d49286dead829a4078e0e
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [208]:
import pandas as pd 
import requests
import regex as rgx
from bs4 import BeautifulSoup
from word2number import w2n

# Raise pandas' max column display width so long titles and URLs are not truncated
pd.set_option('display.max_colwidth', 400)

In [221]:
def load_data(pages=5):
  """Download and parse the first `pages` catalogue listing pages.

  The parsed documents are stored, in page order, in the module-level
  `website` list, which the `extract_*` helpers read from. Any previous
  contents of `website` are discarded.

  Args:
    pages: Number of listing pages to fetch, counted from page 1.

  Returns:
    The fixed confirmation string "Data loaded and ready to go!".
  """
  global website
  url = 'http://books.toscrape.com/catalogue/page-'
  website = []
  # Listing pages are numbered from 1 (page-1.html is the first one), so the
  # loop must cover 1..pages; starting at 0 spent one request on a page that
  # holds no books and delivered one page less than asked for.
  for page_number in range(1, pages + 1):
    response = requests.get(url + str(page_number) + '.html')
    website.append(BeautifulSoup(response.content))
  return "Data loaded and ready to go!"

def extract_titles():
  """Collect the book titles from every page stored in `website`.

  Returns:
    A list holding the `title` attribute of the link inside each `<h3>`
    element, in page order.
  """
  return [
    heading.a['title']
    for page in website
    for heading in page.find_all('h3')
  ]

def extract_prices():
  """Collect the displayed price of every book stored in `website`.

  Returns:
    A list with the text of each `<p class="price_color">` element, in
    page order. The text is returned as found, with no numeric conversion.
  """
  collected = []
  for page in website:
    price_tags = page.find_all('p', {'class': 'price_color'})
    collected.extend(tag.text for tag in price_tags)
  return collected

def extract_instock():
  """Report, for every book stored in `website`, whether it is in stock.

  Each availability paragraph's text is compacted by removing all
  whitespace (so "In stock" becomes "Instock") and the resulting strings
  are handed to `mutate_instock`, which turns them into booleans.

  Returns:
    A list of booleans, one per availability paragraph, in page order.
  """
  instock = []
  for page in website:
    # Match on the single `availability` class rather than the exact string
    # "instock availability": the exact string can only ever select in-stock
    # rows, so any other row would be dropped and this column would end up
    # shorter than the others.
    # NOTE(review): assumes rows that are not in stock still carry the
    # `availability` class - confirm against the site markup.
    for p in page.find_all('p', {'class': 'availability'}):
      # Strip every run of whitespace instead of picking fixed positions out
      # of a space-split list; the positions depended on the exact
      # indentation of the page source.
      instock.append(''.join(p.text.split()))
  return mutate_instock(instock)

def mutate_instock(items):
  """Convert availability strings into booleans.

  Args:
    items: Iterable of strings to inspect.

  Returns:
    A list with True for each string that contains 'Instock' and False
    for every other string, in input order.
  """
  return ['Instock' in label for label in items]

def extract_ratings():
  """Collect the star rating of every book stored in `website`.

  The rating is spelled out as a word in the class attribute of each
  `<p class="star-rating ...">` element; that word is converted to a
  number with `w2n.word_to_num`.

  Returns:
    A list with one converted rating per rating paragraph, in page order.

  Raises:
    ValueError: If a rating paragraph carries no class besides
      'star-rating'.
  """
  ratings = []
  for page in website:
    for p in page.find_all('p', {'class': 'star-rating'}):
      # Read the class attribute directly instead of running a regex over
      # the serialized tag: the regex result depended on how the tag was
      # laid out as text and on 'star-rating' being the first class.
      classes = p.get('class') or []
      if isinstance(classes, str):
        # Accept a plain space-separated attribute value as well as a list.
        classes = classes.split()
      words = [name for name in classes if name != 'star-rating']
      if not words:
        raise ValueError('no rating word in class attribute: %r' % (classes,))
      ratings.append(w2n.word_to_num(words[0]))
  return ratings

def extract_covers():
  """Collect the absolute cover-image URL of every book stored in `website`.

  Returns:
    A list with one URL per `<img class="thumbnail">` element, in page
    order.
  """
  # Imported here so the function does not depend on a file-level import.
  from urllib.parse import urljoin

  # The listing pages are served from the /catalogue/ directory, so the
  # images' relative `src` values are resolved against it.
  page_base = 'http://books.toscrape.com/catalogue/'
  cover_urls = []
  for page in website:
    for img in page.find_all('img', {'class': 'thumbnail'}):
      # urljoin resolves leading '../' segments properly; deleting every
      # '..' from the string also altered file names containing two dots
      # and produced broken URLs for srcs that did not start with '../'.
      cover_urls.append(urljoin(page_base, str(img['src'])))
  return cover_urls

def get_dataframe(pages=5):
  """Scrape the catalogue and return the result as a DataFrame.

  Calls `load_data(pages)` first, then runs each `extract_*` helper and
  assembles their lists into columns.

  Args:
    pages: Passed through to `load_data`.

  Returns:
    A pandas DataFrame with the columns 'title', 'price', 'instock',
    'rating' and 'coverUrl'.
  """
  load_data(pages)
  columns = {
    'title': extract_titles(),
    'price': extract_prices(),
    'instock': extract_instock(),
    'rating': extract_ratings(),
    'coverUrl': extract_covers(),
  }
  return pd.DataFrame(columns)


In [222]:
# Run the scraper with its default `pages` value and preview the first ten rows.
get_dataframe().head(10)

Unnamed: 0,title,price,instock,rating,coverUrl
0,A Light in the Attic,£51.77,True,3,http://books.toscrape.com/media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg
1,Tipping the Velvet,£53.74,True,1,http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg
2,Soumission,£50.10,True,1,http://books.toscrape.com/media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg
3,Sharp Objects,£47.82,True,4,http://books.toscrape.com/media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg
4,Sapiens: A Brief History of Humankind,£54.23,True,5,http://books.toscrape.com/media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg
5,The Requiem Red,£22.65,True,1,http://books.toscrape.com/media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg
6,The Dirty Little Secrets of Getting Your Dream Job,£33.34,True,4,http://books.toscrape.com/media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg
7,"The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",£17.93,True,3,http://books.toscrape.com/media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg
8,The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics,£22.60,True,4,http://books.toscrape.com/media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg
9,The Black Maria,£52.15,True,1,http://books.toscrape.com/media/cache/58/46/5846057e28022268153beff6d352b06c.jpg
