In [2]:
import gzip

# Read just the first line of the gzip-compressed file so its structure can be inspected.
with gzip.open("books.json.gz", mode="rb") as books_file:
    line = books_file.readline()

In [3]:
import json

# Decode the sampled line and display a handful of its fields as a tuple.
data = json.loads(line)
tuple(
    data[key]
    for key in ("book_id", "title", "ratings_count", "authors", "popular_shelves")
)

('5333265',
 'W.C. Fields: A Life on Film',
 '3',
 [{'author_id': '604031', 'role': ''}],
 [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}])

In [4]:
def parse_fields(line):
    """Decode one JSON-encoded line and keep only the fields used downstream.

    Args:
        line: A JSON document (str or bytes) describing a single book.

    Returns:
        A dict with the keys ``book_id``, ``title``, ``ratings`` and
        ``cover_image``, copied from the record's ``book_id``,
        ``title_without_series``, ``ratings_count`` and ``image_url`` entries.

    Raises:
        KeyError: If the decoded record lacks one of the source keys.
    """
    record = json.loads(line)
    # Output key -> key to read from the decoded record.
    source_keys = {
        "book_id": "book_id",
        "title": "title_without_series",
        "ratings": "ratings_count",
        "cover_image": "image_url",
    }
    return {out_key: record[src_key] for out_key, src_key in source_keys.items()}

In [5]:
# Collect the parsed fields of every book that has more than MIN_RATINGS ratings.
MIN_RATINGS = 15

books_titles = []
with gzip.open("books.json.gz") as f:
    # Iterating the file object yields one line at a time and stops at EOF,
    # replacing the manual readline()/break loop.
    for line in f:
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            # The ratings value is not an integer string; skip this record.
            continue
        if ratings > MIN_RATINGS:
            books_titles.append(fields)

In [6]:
import pandas as pd

# books_titles is a list of per-book dicts; each dict becomes one row.
titles = pd.DataFrame(books_titles)

In [7]:
# Convert the ratings column to a numeric dtype.
titles = titles.assign(ratings=pd.to_numeric(titles["ratings"]))

In [8]:
# Derive mod_title from title by dropping every character that is not an
# ASCII letter, a digit, or a space.
titles = titles.assign(
    mod_title=titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
)

In [9]:
# Lowercase the cleaned titles.
titles = titles.assign(mod_title=titles["mod_title"].str.lower())

In [10]:
# Collapse each run of whitespace into a single space. The pattern is a raw
# string so that "\s" reaches the regex engine as written instead of being
# parsed as an (invalid) Python string escape, which newer Python versions
# warn about.
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [11]:
# Keep only the rows whose cleaned title still contains at least one character.
titles = titles.loc[titles["mod_title"].str.len().gt(0)]

In [12]:
# Write the cleaned titles table to disk as JSON.
titles.to_json(path_or_buf="books_titles.json")

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings; it is also passed to search() to encode queries.
vectorizer = TfidfVectorizer()

# Fit on the cleaned titles and keep the resulting matrix (one row per title);
# search() compares query vectors against it.
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def show_image(val):
    """Render ``val`` as an HTML thumbnail that links to itself.

    Args:
        val: The value to place in both the link target and the image source.

    Returns:
        An HTML string containing a 50-pixel-wide ``<img>`` wrapped in an
        ``<a>`` element, both pointing at ``val``.
    """
    template = '<a href="{}"><img src="{}" width=50></img></a>'
    link_target = image_source = val
    return template.format(link_target, image_source)

def search(query, vectorizer):
    """Find titles similar to ``query`` and show the most-rated ones.

    The query is lowercased and stripped of every character that is not an
    ASCII letter, digit, or space (the same character filter used to build
    ``titles["mod_title"]``), then encoded with ``vectorizer`` and compared
    against the module-level ``tfidf`` matrix by cosine similarity.

    Args:
        query: Free-text search string.
        vectorizer: The fitted vectorizer that produced ``tfidf``.

    Returns:
        A pandas ``Styler`` over at most 5 rows of ``titles``: the (up to) 10
        most similar titles, ordered by ``ratings`` descending, with
        ``cover_image`` rendered through ``show_image``.
    """
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    # Vectorize the cleaned query. Previously the raw query was used and
    # `processed` was silently discarded.
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so cap the
    # candidate count at the number of indexed titles.
    top_k = min(10, similarity.size)
    indices = np.argpartition(similarity, -top_k)[-top_k:]

    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)

    return results.head(5).style.format({'cover_image': show_image})

In [18]:
# Example query against the fitted title index.
search(query="harry potter", vectorizer=vectorizer)

Unnamed: 0,book_id,title,ratings,cover_image,mod_title
129532,7809996,"هاري بوتر وحجرة الأسرار (Harry Potter, #2)",1117,,harry potter 2
1044938,49869,"هاري بوتر وسجين أزكابان (Harry Potter, #3)",1023,,harry potter 3
19737,49839,"هاري بوتر وكأس النار (Harry Potter, #4)",957,,harry potter 4
903847,70355,"هاري بوتر وجماعة العنقاء (Harry Potter, #5)",955,,harry potter 5
138743,14560521,"הארי פוטר והאסיר מאזקבאן (Harry Potter, #3)",36,,harry potter 3
