In [10]:
# Note: `wc -l` is run on the compressed file itself, so the figure below counts
# newline bytes in the .gz stream, not the number of JSON records it contains.
!wc -l 'goodreads_books.json.gz'

 7588375 goodreads_books.json.gz


In [11]:
# List the working directory with human-readable sizes and keep only the lines
# that mention the dataset file, to show its on-disk (compressed) size.
!ls -lh | grep 'goodreads_books.json.gz'

-rw-r--r--@ 1 khadijagardezi  staff   1.9G Feb 17 19:36 goodreads_books.json.gz


In [12]:
import gzip 
# Read only the first line of the compressed dataset so a single sample record
# can be inspected without loading the whole file.
with gzip.open("goodreads_books.json.gz", 'r') as f:
    line = f.readline()

In [13]:
import json
# json.loads(line) 

In [14]:
def parse_fields(line):
    """Decode one JSON book record and keep only the fields used downstream.

    Parameters
    ----------
    line : str or bytes
        A single JSON-encoded record.

    Returns
    -------
    dict
        Mapping with the keys ``book_id``, ``title``, ``ratings``, ``url`` and
        ``cover_image`` (in that order); values are copied unchanged from the
        record.

    Raises
    ------
    KeyError
        If the record lacks one of the source fields listed below.
    """
    record = json.loads(line)
    # (output key, key in the raw record)
    wanted = (
        ('book_id', 'book_id'),
        ('title', 'title_without_series'),
        ('ratings', 'ratings_count'),
        ('url', 'url'),
        ('cover_image', 'image_url'),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}


In [15]:
# A book is kept only when it has strictly more than this many ratings.
MIN_RATINGS = 15

books_titles = []

# Stream the compressed dataset one line at a time (each line is handed to
# parse_fields as one record) so the whole file never has to sit in memory.
with gzip.open('goodreads_books.json.gz', 'r') as f:
    for line in f:
        fields = parse_fields(line)

        try:
            ratings = int(fields['ratings'])
        except ValueError:
            # The ratings count is not a valid integer: skip this record.
            continue
        if ratings > MIN_RATINGS:
            books_titles.append(fields)


In [38]:
import pandas as pd

# One row per kept book; the columns are the keys produced by parse_fields.
titles = pd.DataFrame(books_titles)
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...


In [39]:
# The loader stores the raw JSON value for `ratings`; convert the column to a
# numeric dtype so that sorting by ratings later on compares numbers.
titles['ratings'] = pd.to_numeric(titles['ratings'])

In [40]:
# Build the search key `mod_title`: drop every character of the title that is
# not an ASCII letter, a digit or a space.
titles['mod_title'] = titles['title'].str.replace('[^a-zA-Z0-9 ]', '', regex=True)

In [41]:
# Lower-case the search key; search() lower-cases the query the same way.
titles['mod_title'] = titles['mod_title'].str.lower()

In [42]:
# Collapse every run of whitespace into a single space. The pattern is a raw
# string so that "\s" is not parsed as an (invalid) string escape sequence.
titles['mod_title'] = titles['mod_title'].str.replace(r'\s+', " ", regex=True)

In [43]:
# Keep only the rows whose normalised title still has at least one character.
titles = titles.loc[titles['mod_title'].str.len().gt(0)]
titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


In [44]:
# Persist the cleaned titles table next to the notebook.
titles.to_json(path_or_buf="books_titles.json")

### Search Engine for books

In [45]:
# !pip install gradio

In [46]:
import gradio as gr

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF model on the normalised titles. search() compares a query against
# `tfidf` and then looks the winning positions up in `titles` with .iloc, so
# `tfidf` and `titles` must stay in the same row order after this cell.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(titles["mod_title"])


In [48]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


In [52]:
def make_clickable(val):
    """Wrap a URL in an HTML anchor labelled "Goodreads".

    The link uses ``target="_blank"`` so it opens in a new tab (the previous
    value ``_black`` was a typo and is not a browsing-context keyword).

    Parameters
    ----------
    val : str
        The URL to link to.

    Returns
    -------
    str
        An ``<a>`` element pointing at ``val``.
    """
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    """Return an HTML ``<img>`` tag whose ``src`` is ``val``."""
    return f'<img src="{val}" />'

def search(query, vectorizer, n_candidates=10, n_results=5):
    """Return an HTML table of the most-rated books whose title matches ``query``.

    The query is normalised like the ``mod_title`` column (lower-cased, only
    ASCII letters, digits and spaces kept), vectorised with ``vectorizer`` and
    compared by cosine similarity against the notebook-level ``tfidf`` matrix.
    The ``n_candidates`` most similar rows of the notebook-level ``titles``
    frame are ordered by ``ratings`` (descending) and the first ``n_results``
    are rendered.

    Parameters
    ----------
    query : str
        Free-text book name typed by the user.
    vectorizer : fitted vectorizer
        The object whose ``fit_transform`` produced ``tfidf``.
    n_candidates : int, optional
        How many of the most similar titles to consider (default 10).
    n_results : int, optional
        How many rows to render after ordering by ratings (default 5).

    Returns
    -------
    str
        HTML markup: a table of results, or a short "no match" paragraph when
        no title shares any term with the query.
    """
    no_match = "<p>No matching books found.</p>"

    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises ValueError when kth is out of bounds, so never ask
    # for more candidates than there are titles.
    k = min(n_candidates, similarity.size)
    if k <= 0:
        return no_match
    indices = np.argpartition(similarity, -k)[-k:]

    # A similarity of zero means the title shares no term with the query; such
    # rows are not matches and must not be shown as results.
    indices = indices[similarity[indices] > 0]
    if indices.size == 0:
        return no_match

    # .copy() makes the column assignments below operate on an independent
    # frame instead of a slice of `titles` (avoids SettingWithCopyWarning).
    results = titles.iloc[indices].copy()
    results = results.sort_values("ratings", ascending=False).head(n_results)
    results['url'] = results['url'].apply(make_clickable)
    results['cover_image'] = results['cover_image'].apply(show_image)
    results['mod_title'] = results['mod_title'].apply(lambda x: f'<p style="font-weight: bold;">{x}</p>')

    # escape=False is needed so the <a>/<img>/<p> markup built above is rendered.
    # NOTE(review): this also emits the raw `title` column unescaped.
    return results.to_html(escape=False, index=False)

In [50]:
def recommend_books(book_name):
    """Gradio callback: turn the text typed by the user into HTML results.

    Parameters
    ----------
    book_name : str
        Content of the input textbox.

    Returns
    -------
    str
        The HTML produced by ``search``; a short prompt when the input is
        missing, empty or only whitespace; or an error message when the search
        raises.
    """
    # A missing or blank query contains no terms to match against the titles,
    # so ask for input instead of running a search.
    if book_name is None or (isinstance(book_name, str) and not book_name.strip()):
        return "<p>Please enter a book name.</p>"
    try:
        return search(book_name, vectorizer)
    except Exception as e:
        # UI boundary: report the failure in the page rather than raising.
        return f"An error occurred: {e}"

In [53]:
# Expose recommend_books through a small Gradio web UI: a one-line textbox as
# input and the returned markup rendered as HTML.
iface = gr.Interface(
    title="Book Recommendation System",
    description="Search A Book You Like",
    fn=recommend_books,
    inputs=gr.Textbox(placeholder="Enter Book Name Here...", lines=1),
    outputs="html",
)

# Start the local web server for the interface.
iface.launch()

Running on local URL:  http://127.0.0.1:7887

To create a public link, set `share=True` in `launch()`.


