In [8]:
# The JSON dump is too large to load in one go, so we stream it instead;
# as a first step, read just the first line to inspect a single record.
import gzip
# gzip.open in 'r' mode reads binary data, so `line` is a bytes object
with gzip.open("goodreads_books.json.gz", 'r') as f:
    line = f.readline()
    

In [9]:
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [11]:
# Parse a single line of the file into a Python dictionary; the same call is
# applied to every line of the file later on.
import json
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [22]:
def parse_fields(line):
    """Decode one JSON book record and keep only the fields used downstream.

    Parameters
    ----------
    line : str or bytes
        A single JSON-encoded record.

    Returns
    -------
    dict
        The record's ``book_id``, ``title_without_series``, ``ratings_count``,
        ``url`` and ``image_url`` values, stored under the keys ``book_id``,
        ``title``, ``ratings``, ``url`` and ``cover_image`` respectively.

    Raises
    ------
    KeyError
        If the record lacks any of the source fields listed above.
    """
    record = json.loads(line)
    # (output key, key in the raw record), listed in the output order
    wanted = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

        
        

In [16]:
# Cut the data down to a workable size: stream the dump one line at a time
# and keep only the selected fields of books with more than MIN_RATINGS ratings.
MIN_RATINGS = 15
books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    # iterating over the file yields one line at a time and stops at end of file
    for line in f:
        # dictionary of the selected fields for this record
        fields = parse_fields(line)

        try:
            ratings = int(fields['ratings'])
        except ValueError:
            # the ratings value is not a valid integer string; skip this book
            continue

        # finally, keep only books with more than MIN_RATINGS ratings
        if ratings > MIN_RATINGS:
            books_titles.append(fields)
        
            
            
            
            
        
        

In [41]:
books_titles[0]

{'book_id': '7327624',
 'title': 'The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)',
 'ratings': '140',
 'url': 'https://www.goodreads.com/book/show/7327624-the-unschooled-wizard',
 'cover_image': 'https://images.gr-assets.com/books/1304100136m/7327624.jpg'}

In [48]:
# Build the dataframe from the list of records: each dictionary becomes one row.
# The plain constructor is the documented entry point for a list of dicts
# (DataFrame.from_dict is meant for dict input).
titles = pd.DataFrame(books_titles)

In [55]:
# convert the 'ratings' column of titles from strings to numbers
titles['ratings'] = pd.to_numeric(titles['ratings'])

In [64]:
# build a cleaned title column: strip every character that is not an
# ASCII letter, a digit, or a space
titles['mod_title'] = titles['title'].str.replace("[^a-zA-Z0-9 ]", "",regex=True)

In [70]:
# lowercase every cleaned title so that matching does not depend on case
titles['mod_title'] = titles['mod_title'].str.lower()

In [78]:
# collapse any run of whitespace into a single space
# (raw string so that \s reaches the regex engine without an invalid-escape warning)
titles['mod_title'] = titles['mod_title'].str.replace(r"\s+", " ", regex=True)

In [229]:
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46.0,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98.0,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986.0,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


In [230]:
# drop every row whose cleaned title is empty (zero characters long)
titles = titles[titles['mod_title'].str.len() > 0]

In [231]:
# save the cleaned titles to books_titles.json so we can reuse them in the future
titles.to_json('books_titles.json')

### Building Our Search Engine

Now that our dataset is in a pandas DataFrame, we can build a simple term frequency–inverse document frequency (TF-IDF) search engine.

In [277]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# fit the vectorizer on the cleaned titles column and transform it into
# its TF-IDF vector form; search() compares queries against this result
tfidf = vectorizer.fit_transform(titles['mod_title'])

In [298]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Return an HTML link labelled "Goodreads" that opens ``val`` in a new tab.

    Used as a column formatter so the book's Goodreads page can be opened
    to verify the title.
    """
    return f'<a target="_blank" href="{val}">Goodreads</a>'

def show_image(val):
    """Return an HTML ``<img>`` tag, 50 wide, whose source is ``val``.

    Used as a column formatter to display the book's cover image.
    """
    return f'<img src="{val}" width=50></img>'

def search(query, vectorizer, n_candidates=10, n_results=5):
    """Search the book titles for the best matches to a free-text query.

    The query is cleaned the same way as the titles (lowercased, every
    character other than ASCII letters, digits and spaces removed),
    vectorized, and compared by cosine similarity against the module-level
    ``tfidf`` matrix. The most similar rows of the module-level ``titles``
    dataframe are then ordered by their ``ratings`` column.

    Parameters
    ----------
    query : str
        The text to search for.
    vectorizer
        The fitted vectorizer used to build ``tfidf``; only its
        ``transform`` method is called.
    n_candidates : int, default 10
        How many of the most similar titles to consider. Clamped to the
        number of available titles.
    n_results : int, default 5
        How many of the candidates to return after sorting by ratings.

    Returns
    -------
    pandas Styler
        The top rows, with ``url`` rendered through ``make_clickable`` and
        ``cover_image`` through ``show_image``.
    """
    # process our query, removing extraneous characters and caps
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    # turn our query into a vector
    query_vec = vectorizer.transform([processed])
    # similarity of the query to every row of tfidf
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # argpartition raises when asked for more elements than exist, so clamp
    n_candidates = max(0, min(n_candidates, similarity.size))
    if n_candidates:
        # positions of the n_candidates largest similarities (in no particular order)
        indices = np.argpartition(similarity, -n_candidates)[-n_candidates:]
    else:
        # a [-0:] slice would select everything, so handle "no candidates" explicitly
        indices = np.array([], dtype=int)
    # positional lookup: row i of tfidf corresponds to row i of titles
    results = titles.iloc[indices]
    # order the candidates so the most-rated books come first
    results = results.sort_values(by='ratings', ascending=False)
    # return the top results with clickable links and cover thumbnails
    return results.head(n_results).style.format({'url': make_clickable, 'cover_image': show_image})
    

In [304]:
search('hands-on machine learning',vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
861957,213030,Machine Learning,446.0,Goodreads,,machine learning
941049,25545994,Python Machine Learning,188.0,Goodreads,,python machine learning
268418,32899495,Hands-On Machine Learning with Scikit-Learn and TensorFlow,81.0,Goodreads,,handson machine learning with scikitlearn and tensorflow
327118,18453708,Hands-on Justice,76.0,Goodreads,,handson justice
488557,18859629,Machine Learning with R,71.0,Goodreads,,machine learning with r


#### Creating a List of Liked Books Using `book_id`


Next, we will choose a series of books using `book_id` to feed to our recommender system. Here I chose 'East of Eden', 'The Road', 'The Physics of Consciousness', and 'Hands on Machine Learning', of course!

In [305]:
liked_books = ['8132407','850062','1325218','306982','32899495'] # east of eden, physics of consciousness, the road, etc

### Exploring Book Rating Data