In [157]:
!pip install word2number

Collecting word2number
  Downloading https://files.pythonhosted.org/packages/4a/29/a31940c848521f0725f0df6b25dca8917f13a2025b0e8fcbe5d0457e45e6/word2number-1.1.zip
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-cp36-none-any.whl size=5587 sha256=ac7f63e55aa8ebb80032a2fc72055cfe242a7942e1517c970146568bce8427d5
  Stored in directory: /root/.cache/pip/wheels/46/2f/53/5f5c1d275492f2fce1cdab9a9bb12d49286dead829a4078e0e
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [208]:
import pandas as pd 
import requests
import regex as rgx
from bs4 import BeautifulSoup
from word2number import w2n

# Let each displayed DataFrame cell show up to 400 characters before pandas truncates it
pd.options.display.max_colwidth = 400

In [218]:
def load_data(pages=5):
  """Download the first `pages` catalogue pages and parse them into `website`.

  The parsed pages are stored, in page order, in the module-level list
  `website`, which the extract_* functions read.

  Args:
    pages: Number of catalogue pages to fetch, starting from page 1.

  Returns:
    A fixed confirmation string.
  """
  global website
  url = 'http://books.toscrape.com/catalogue/page-'
  website = []
  # The catalogue's pages are numbered from 1 (page-1.html, page-2.html, ...),
  # so request 1..pages inclusive rather than 0..pages-1.
  for page_number in range(1, pages + 1):
    response = requests.get(url + str(page_number) + '.html')
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    website.append(BeautifulSoup(response.content, 'html.parser'))
  return "Data loaded and ready to go!"

def extract_titles():
  """Return the `title` attribute of the link inside every <h3>, in page order.

  Reads the parsed pages from the module-level `website` list.
  """
  return [
    heading.a['title']
    for page in website
    for heading in page.find_all('h3')
  ]

def extract_prices():
  """Return the text of every <p class="price_color"> element, in page order.

  Prices are returned as the raw strings found in the markup; reads the
  parsed pages from the module-level `website` list.
  """
  return [
    tag.text
    for page in website
    for tag in page.find_all('p',{'class':'price_color'})
  ]

def extract_instock():
  """Return one availability flag per listed book, in page order.

  Collects the text of every <p class="instock availability"> element from
  the module-level `website` pages, strips all whitespace from it, and hands
  the resulting tokens to mutate_instock.

  Returns:
    The list produced by mutate_instock: True where the whitespace-free
    text contains 'Instock', False otherwise.
  """
  instock = []
  for page in website:
    for p in page.find_all('p',{'class':'instock availability'}):
      # Remove every whitespace character so "In stock" collapses to
      # "Instock" regardless of how the markup pads or line-breaks it.
      # Picking fixed positions out of a split on single spaces breaks as
      # soon as the indentation in the page changes.
      instock.append(''.join(p.text.split()))
  return mutate_instock(instock)

def mutate_instock(items):
  """Map each entry of `items` to True if it contains 'Instock', else False.

  Args:
    items: Iterable of availability strings.

  Returns:
    A list of booleans, one per entry, in the same order.
  """
  return [True if 'Instock' in entry else False for entry in items]

def extract_ratings():
  """Return the star rating of every listed book as a number, in page order.

  Each <p> carrying the 'star-rating' class is expected to carry the rating
  as a second class name spelled as a word (e.g. class="star-rating Three");
  that word is converted with w2n.word_to_num. Reads the parsed pages from
  the module-level `website` list.

  Raises:
    ValueError: If a 'star-rating' element has no other class name.
  """
  ratings = []
  for page in website:
    for p in page.find_all('p',{'class':'star-rating'}):
      classes = p['class']
      # BeautifulSoup normally exposes `class` as a list of names; accept a
      # plain string too in case multi-valued attributes are disabled.
      if isinstance(classes, str):
        classes = classes.split()
      # Read the rating word from the parsed attribute rather than
      # regex-matching the serialised tag, so class order and surrounding
      # markup no longer matter.
      words = [name for name in classes if name != 'star-rating']
      if not words:
        raise ValueError('no rating word in class attribute: %r' % (classes,))
      ratings.append(w2n.word_to_num(words[0]))
  return ratings

def extract_covers():
  """Return an absolute cover-image URL for every thumbnail, in page order.

  For each <img class="thumbnail"> in the module-level `website` pages, every
  '..' in its `src` is removed and the remainder is appended to the site root.
  """
  base_url = 'http://books.toscrape.com'
  return [
    base_url + str(thumb['src'].replace('..',''))
    for page in website
    for thumb in page.find_all('img',{'class':'thumbnail'})
  ]

def get_dataframe(pages=5):
  """Scrape the catalogue and return the books as a rating-sorted DataFrame.

  Calls load_data(pages), then runs each extract_* function to build the
  columns 'title', 'price', 'instock', 'rating' and 'coverUrl'.

  Args:
    pages: Passed through to load_data.

  Returns:
    A DataFrame sorted by 'rating' from highest to lowest, with a fresh
    0..n-1 index.
  """
  load_data(pages)
  columns = {
    'title': extract_titles(),
    'price': extract_prices(),
    'instock': extract_instock(),
    'rating': extract_ratings(),
    'coverUrl': extract_covers(),
  }
  books = pd.DataFrame(columns)
  ranked = books.sort_values('rating', ascending=False)
  # Discard the pre-sort row labels and renumber from zero.
  return ranked.reset_index(drop=True)


In [220]:
# Build the table with the default pages=5 and preview its first ten rows (sorted by rating, highest first)
get_dataframe().head(10)

Unnamed: 0,title,price,instock,rating,coverUrl
0,#HigherSelfie: Wake Up Your Life. Free Your Soul. Find Your Tribe.,£23.11,True,5,http://books.toscrape.com/media/cache/9c/46/9c463c7631c82401160fd3b554b8f0e1.jpg
1,Set Me Free,£17.46,True,5,http://books.toscrape.com/media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg
2,The Four Agreements: A Practical Guide to Personal Freedom,£17.66,True,5,http://books.toscrape.com/media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg
3,Worlds Elsewhere: Journeys Around Shakespeare’s Globe,£40.30,True,5,http://books.toscrape.com/media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg
4,Thirst,£17.27,True,5,http://books.toscrape.com/media/cache/c4/0a/c40a64f59e7487b1a80a049f6ceb2ba5.jpg
5,Black Dust,£34.53,True,5,http://books.toscrape.com/media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg
6,Chase Me (Paris Nights #2),£25.27,True,5,http://books.toscrape.com/media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg
7,Sophie's World,£15.94,True,5,http://books.toscrape.com/media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg
8,Private Paris (Private #10),£47.61,True,5,http://books.toscrape.com/media/cache/9d/05/9d0533bae1578846d728a82913b95c26.jpg
9,The Elephant Tree,£23.82,True,5,http://books.toscrape.com/media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg
