In [58]:
import os
import time
import urllib.request
from io import BytesIO

import numpy as np
import PIL
import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageChops, ImageFilter

# Gather Data and Form Averages

In [8]:
def getGenres(genres, timeout = 3):
    """Download the Goodreads shelf page for each genre.

    Parameters
    ----------
    genres : iterable of str
        Shelf names; each is appended to
        ``https://www.goodreads.com/shelf/show/`` to form the request URL.
    timeout : int or float, optional
        Seconds to sleep after every request. This is a politeness delay
        between requests, not a network timeout.

    Returns
    -------
    dict
        Maps each genre to the body of its shelf page decoded as UTF-8.
    """
    webpages = {}
    for genre in genres:
        # The context manager closes the response even when read() or
        # decode() raises; a trailing close() call would be skipped then.
        with urllib.request.urlopen("https://www.goodreads.com/shelf/show/" + genre) as fp:
            webpages[genre] = fp.read().decode("utf8")
        time.sleep(timeout)
    return webpages

In [9]:
def getBookLinks(page):
    """Collect the book links listed on a shelf page.

    Parameters
    ----------
    page : str
        HTML of a shelf page.

    Returns
    -------
    set of str
        The ``href`` of the first ``<a>`` inside the first ``<div>`` of every
        tag whose class list is exactly ``['elementList']``, keeping only
        hrefs that start with ``/book``.
    """
    soup = BeautifulSoup(page, "html.parser")

    def isShelfEntry(tag):
        # Exact match on the whole class list, not "has this class among others".
        return tag.has_attr('class') and tag["class"] == ['elementList']

    bookLinks = set()
    for tag in soup.find_all(isShelfEntry):
        try:
            link = tag.div.a["href"]
        except (AttributeError, TypeError, KeyError):
            # No <div>, no <a> inside it, or an <a> without href. Skip the
            # entry: falling through would either hit an unbound `link` or
            # silently re-test the previous entry's link.
            continue

        if link.startswith("/book"):
            bookLinks.add(link)
    return bookLinks

In [10]:
def getImageLinks(bookLinks, timeout = 2):
    """Fetch each book page and collect the URL of its cover image.

    Parameters
    ----------
    bookLinks : iterable of str
        Site-relative book paths such as ``/book/show/...``.
    timeout : int or float, optional
        Seconds to sleep after every request. This is a politeness delay
        between requests, not a network timeout.

    Returns
    -------
    list of str
        The ``src`` of the element with ``id="coverImage"`` on each page, in
        iteration order. Pages without such an element (or without a ``src``
        on it) contribute nothing instead of aborting the whole run.
    """
    imageLinks = []
    for link in bookLinks:
        # Links normally start with "/", so strip it to avoid ".com//book/...".
        url = "https://www.goodreads.com/" + link.lstrip("/")
        # The context manager closes the response even if read()/decode() raises.
        with urllib.request.urlopen(url) as fp:
            bookPage = fp.read().decode("utf8")

        cover = BeautifulSoup(bookPage, "html.parser").find(id="coverImage")
        src = cover.get("src") if cover is not None else None
        if src:
            imageLinks.append(src)
        time.sleep(timeout)
    return imageLinks

In [11]:
def getImages(imageLinks, genre, timeout = 2):
    """Download cover images, normalise them and save them under ``raw/<genre>``.

    Parameters
    ----------
    imageLinks : sequence of str
        Image URLs to download.
    genre : str
        Name of the sub-directory of ``raw/`` that receives the files.
    timeout : int or float, optional
        Seconds to sleep after every download. This is a politeness delay
        between requests, not a network timeout.

    Returns
    -------
    list
        One RGB image of size 300x475 per link, in input order. Each one is
        also written to ``raw/<genre>/book<i>.jpg``.

    Raises
    ------
    requests.HTTPError
        If a download returns an error status.
    """
    directory = "raw/" + genre
    # Create the target directory once up front instead of checking per image.
    os.makedirs(directory, exist_ok=True)

    images = []
    for i, imageLink in enumerate(imageLinks):
        response = requests.get(imageLink)
        # Fail with the HTTP error itself rather than a confusing decode
        # error when the body is an error page.
        response.raise_for_status()

        # Convert to RGB first: palette/alpha/greyscale sources cannot be
        # saved as JPEG as-is and would not yield a 3-channel array later.
        img = Image.open(BytesIO(response.content)).convert("RGB").resize((300, 475))

        img.save(directory + "/book" + str(i) + ".jpg")
        images.append(img)
        time.sleep(timeout)
    return images
        

In [12]:
def createAverageImage(images, name):
    """Average a set of images pixel-wise and save the result as a JPEG.

    Parameters
    ----------
    images : iterable
        Images (anything ``np.array`` can convert). Only those whose array
        shape is exactly ``(475, 300, 3)`` take part in the average; others
        are skipped.
    name : str
        Output base name; the file is written to ``averages/<name>.jpg``.

    Raises
    ------
    ValueError
        If no image has the expected shape, so there is nothing to average.
    """
    expectedShape = (475, 300, 3)
    # np.float was removed from NumPy; np.float64 is the same type.
    total = np.zeros(expectedShape, np.float64)
    used = 0
    for image in images:
        imarr = np.array(image, dtype=np.float64)
        if imarr.shape != expectedShape:
            # e.g. greyscale or RGBA input: cannot be added to an RGB sum.
            continue
        total += imarr
        used += 1

    if used == 0:
        raise ValueError("no images of shape (475, 300, 3) to average for " + repr(name))

    # Divide by the number of images actually summed; dividing by
    # len(images) would darken the result whenever an image was skipped.
    arr = np.array(np.round(total / used), dtype=np.uint8)

    # Generate and save the final image (mode is inferred as RGB from the
    # uint8 (H, W, 3) array).
    os.makedirs("averages", exist_ok=True)
    out = Image.fromarray(arr)
    out.save("averages/" + name + ".jpg")

In [13]:
def createAverages(genres):
    """Run the whole scrape-and-average pipeline for every genre.

    All shelf pages are fetched first via ``getGenres``. Then, genre by
    genre, the book links are extracted, the cover URLs resolved, the covers
    downloaded, and the average image created under the genre's name.
    """
    pagesByGenre = getGenres(genres)
    for name in genres:
        coverUrls = getImageLinks(getBookLinks(pagesByGenre[name]))
        createAverageImage(getImages(coverUrls, name), name)

In [14]:
# Goodreads shelf names to process, one per line for easier editing.
genres = [
    "romance",
    "fantasy",
    "non-fiction",
    "classics",
    "contemporary",
    "science-fiction",
    "horror",
    "thriller",
    "poetry",
    "urban-fantasy",
    "sci-fi-fantasy",
    "ya-fantasy",
    "high-fantasy",
    "epic-fantasy",
]

In [None]:
# Runs the full pipeline for every genre: downloads shelf pages, book pages and
# covers from Goodreads (sleeping between requests, so this is slow) and saves
# one averaged image per genre under averages/.
createAverages(genres)

# Calculate Differences


In [1]:
def difference(genres):
    """Compare each genre's average cover against the average of all genres.

    For every genre, ``averages/<genre>.jpg`` is loaded. The pixel-wise mean
    of all loaded images is computed. Then, per genre, the mean channel
    intensity is printed next to the genre name, and
    ``ImageChops.subtract(image, mean, scale=0.5, offset=128)`` is saved to
    ``differences/<genre>.jpg``.

    Parameters
    ----------
    genres : sequence of str
        Genre names whose average images already exist on disk. An empty
        sequence is a no-op.

    Raises
    ------
    ValueError
        If the average images do not all share the same shape.
    """
    if not genres:
        # Nothing to compare; also avoids dividing by zero below.
        return

    images = [Image.open("averages/" + genre + ".jpg") for genre in genres]

    # np.float was removed from NumPy; np.float64 is the same type. Stacking
    # lets the image size come from the data instead of being hard-coded.
    stacked = np.stack([np.array(image, dtype=np.float64) for image in images])

    # Round values in array and cast as 8-bit integer
    avg = Image.fromarray(np.array(np.round(stacked.mean(axis=0)), dtype=np.uint8))

    os.makedirs("differences", exist_ok=True)
    for genre, image in zip(genres, images):
        h = image.histogram()
        # The histogram concatenates one 256-bin block per band, so j % 256
        # recovers the pixel value of bin j.
        weighted = sum((j % 256) * count for j, count in enumerate(h))
        print(genre, weighted / (image.width * image.height * 3))
        # Use the explicitly imported ImageChops; `import PIL` alone does not
        # make the PIL.ImageChops submodule available.
        ImageChops.subtract(image, avg, scale=0.5, offset=128).save("differences/" + genre + ".jpg")

In [2]:
# Needs `genres` to exist in the running kernel; the NameError recorded below
# shows this cell was run in a session where it had not been defined.
difference(genres)

NameError: name 'genres' is not defined