# The objective of this project is to recommend books.
# "The datasets were collected in late 2017 from goodreads.com, where we only scraped users' public shelves, i.e. everyone can see it on web without login. User IDs and review IDs are anonymized. "
    source: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph

# We will use three datasets:
#### 1) goodreads_interactions.csv with 4GB and the following columns:
            - user id;
            - book id;
            - rating (can go from 0 to 5).
#### 2) goodreads_books.json.gz with 2GB, where each line holds the metadata of one specific book, for example:
            - title;
            - book id;
            - ratings count.
#### 3) Book id map that connects both datasets with the following columns:
            - book_id_csv;
            - book_id.

# Read a single line of the .gz file:

In [9]:
import gzip
# Open the compressed books file and keep only its first line:
with gzip.open("goodreads_books.json.gz", 'r') as books_file:
    line = books_file.readline()

In [10]:
# Show the raw line that was just read from the gzip file:
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [13]:
# Parse the raw JSON line into a Python dictionary:
import json
json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [18]:
# Keep only the fields that are relevant:
def parse_fields(line):
    """Decode one JSON record and return a dict with a subset of its fields.

    Args:
        line: A JSON document (``str`` or ``bytes``) describing one book.

    Returns:
        A dict with the keys ``book_id``, ``title`` (read from
        ``title_without_series``), ``ratings`` (read from ``ratings_count``),
        ``url`` and ``cover_image`` (read from ``image_url``). The values are
        copied unchanged from the decoded record.

    Raises:
        KeyError: If one of the source fields is missing from the record.
    """
    record = json.loads(line)
    # (output key, source key) pairs, listed in the order of the output dict.
    field_map = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),
    )
    return {target: record[source] for target, source in field_map}

In [23]:
# Build a list with the parsed fields of every book that has more than 15 ratings:
books_titles = []
with gzip.open("goodreads_books.json.gz", 'r') as books_file:
    # Iterating over the file yields one line (one book record) at a time.
    for raw_line in books_file:
        record = parse_fields(raw_line)

        try:
            n_ratings = int(record["ratings"])
        except ValueError:
            # The ratings value cannot be read as an integer: skip this book.
            continue
        if n_ratings > 15:
            books_titles.append(record)

In [25]:
# Turn the list of dictionaries into a DataFrame, one row per dictionary:
import pandas as pd

titles = pd.DataFrame(books_titles)

In [26]:
# Convert the ratings column to a numeric dtype so that it can be compared and sorted:
titles = titles.assign(ratings=pd.to_numeric(titles["ratings"]))

In [27]:
# Standardize the book titles so that variants such as "Harry Potter" and "Harry POTTER" are not treated as different.
# First step: delete every character that is not a-z, A-Z, 0-9 or a space:
titles = titles.assign(
    mod_title=titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
)

In [28]:
# Inspect the frame with the new mod_title column:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,The Unschooled Wizard Sun Wolf and Starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,Best Friends Forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,The Aeneid for Boys and Girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,Alls Fairy in Love and War Avalon Web of Magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,The Devils Notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,Ondine Ondine Quartet 05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,Jacqueline Kennedy Onassis Friend of the Arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,The Spaniards Blackmailed Bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,The Childrens Classic Poetry Collection


In [29]:
# Lower-case mod_title:
titles = titles.assign(mod_title=titles["mod_title"].str.lower())

In [30]:
# Collapse every run of consecutive whitespace into a single space.
# The pattern is written as a raw string so that the backslash reaches the
# regex engine unchanged instead of forming an invalid Python string escape:
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [31]:
# Inspect the frame after lower-casing and whitespace clean-up of mod_title:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondine ondine quartet 05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacqueline kennedy onassis friend of the arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the spaniards blackmailed bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection


In [33]:
# Keep only the rows whose mod_title is not an empty string:
has_title = titles["mod_title"].str.len() > 0
titles = titles[has_title]

In [34]:
# Write the cleaned titles to a JSON file:
titles.to_json(path_or_buf="books_titles.json")

In [35]:
# Inspect the frame after dropping the rows with an empty mod_title:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondine ondine quartet 05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacqueline kennedy onassis friend of the arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the spaniards blackmailed bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection


In [40]:
# Build a search engine on top of a TF-IDF (term frequency-inverse document frequency) matrix:
# the titles are searched using (A) a term frequency matrix and
# (B) an inverse document frequency matrix (which reduces the impact of very common words, for example "the").
# The final weight of a term is (A) * (B):
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])

In [51]:
# Do a comparision between the vectorized title and the title I want to search:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Style the "url" column:
def make_clickable(val):
    """Return an HTML link labelled "Goodreads" that opens ``val`` in a new tab.

    Args:
        val: The URL to link to; it is converted with ``str`` and HTML-escaped
            before being placed in the ``href`` attribute.

    Returns:
        A string containing a single ``<a>`` element.
    """
    from html import escape

    # Both attribute values are fully quoted: with unbalanced quotes the
    # browser treats the stray '"' as part of the href and the link breaks.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(escape(str(val), quote=True))

# Style the "cover_image" column:
def show_image(val):
    """Return an HTML ``<img>`` element, 50 pixels wide, that displays ``val``.

    Args:
        val: The image URL; it is converted with ``str`` and HTML-escaped
            before being placed in the ``src`` attribute.

    Returns:
        A string containing a single, properly closed ``<img>`` element.
    """
    from html import escape

    # <img> is a void element: it is closed by '>' and takes no </img> tag.
    return '<img src="{}" width="50">'.format(escape(str(val), quote=True))

# Compare the vectorized query with every vectorized title:
def search(query, vectorizer):
    """Return a styled table with the best-rated titles that match ``query``.

    The query is lower-cased and stripped of every character other than
    letters, digits and spaces, then vectorized with ``vectorizer`` and
    compared (cosine similarity) with the module-level ``tfidf`` matrix.
    The rows of the module-level ``titles`` frame belonging to the (up to)
    ten most similar titles are sorted by ``ratings`` in descending order and
    the first five are returned.

    Args:
        query: The text to search for.
        vectorizer: The fitted vectorizer used to transform the query.

    Returns:
        A pandas ``Styler`` in which ``url`` is formatted by
        ``make_clickable`` and ``cover_image`` by ``show_image``.
    """
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # Keep the ten most similar titles. argpartition raises when asked for
    # more elements than exist, so cap the count for corpora under ten titles.
    n_candidates = min(10, similarity.size)
    indices = np.argpartition(similarity, -n_candidates)[-n_candidates:]
    results = titles.iloc[indices]
    # Order the candidates from the highest to the lowest number of ratings:
    results = results.sort_values("ratings", ascending=False)
    # Keep the five top-rated rows and render the url and cover_image columns:
    return results.head(5).style.format({"url": make_clickable, 'cover_image': show_image})

In [58]:
# Try the search function with an example query:
search("the witcher", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1056636,8173788,"The Last Wish (The Witcher, #1)",2829,Goodreads,,the last wish the witcher 1
673490,2287468,"The Last Wish (The Witcher, #1)",2667,Goodreads,,the last wish the witcher 1
1079663,11570030,"The Last Wish (The Witcher, #1)",1610,Goodreads,,the last wish the witcher 1
174038,34102179,"The Last Wish (The Witcher, #1)",903,Goodreads,,the last wish the witcher 1
1117414,22789432,The World of the Witcher,224,Goodreads,,the world of the witcher


In [56]:
# Using the search function to look up other books I liked (for example "Pachinko", "Foundation", "The Witcher"),
# I picked the highest-rated result of each search; these are their book IDs:
liked_books = ["8132407", "31147619", "29983711", "5996629", "8173788"]