In [28]:
import gzip 
with gzip.open("goodreads_books.json.gz", "r") as f:
    line = f.readline()

line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [29]:
import json

json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

### Parsing our Book Metadata

> This function essentially filters the information from the original JSON data, keeping only the fields specified in the return statement. It can be useful when you want to simplify or extract specific information from a larger dataset.

In [30]:
def parse_feilds(line):
    # Parse the JSON-formatted string into a Python dictionary
    data = json.loads(line)

    # Create a new dictionary with selected fields
    return {
        "book_id": data["book_id"],
        "title": data["title_without_series"],
        "ratings" : data["ratings_count"],
        "url" : data["url"],
        "cover_image" : data["image_url"]
    }

- This code reads lines from a gzipped JSON file ("goodreads_books.json.gz"), parsing each line using the parse_feilds function, and then filtering out books with ratings less than or equal to 15. The resulting book information (title, book_id, etc.) for books with ratings greater than 150 is stored in the books_titles list.

In [31]:
 # Initialize an empty list to store book information

books_titles = []
with gzip.open("goodreads_books.json.gz", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        feilds = parse_feilds(line)

        try: 
            ratings = int(feilds["ratings"])
        except ValueError:
            continue


        # Check if the ratings are greater than 150
        if ratings > 150:
            books_titles.append(feilds)



### Processing Book Metadata with Pandas

In [32]:
import pandas as pd

# creating DataFrame from a list of books with more than 150 ratings
titles = pd.DataFrame.from_dict(books_titles)


In [33]:
titles["ratings"] = pd.to_numeric(titles["ratings"])

#  "mod_title," which contains modified versions of the "title" column with non-alphanumeric characters removed. 
titles["mod_title"] = titles["title"].str.replace("[^a-zA-z0-9 ]", "", regex = True)
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,Best Friends Forever
1,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,The Devils Notebook
2,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,Crowner Royal Crowner John Mystery 13
3,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,The House of Memory Plutos Snitch 2
4,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 Minutes in Heaven A True Story of Death and...
...,...,...,...,...,...,...
381185,17280727,دولة البطاطس,151,https://www.goodreads.com/book/show/17280727,https://images.gr-assets.com/books/1358953155m...,
381186,31522515,Walking on My Grave,340,https://www.goodreads.com/book/show/31522515-w...,https://images.gr-assets.com/books/1490374478m...,Walking on My Grave
381187,15500943,"Not Quickly Broken (Chop, Chop, #7)",456,https://www.goodreads.com/book/show/15500943-n...,https://images.gr-assets.com/books/1381765124m...,Not Quickly Broken Chop Chop 7
381188,1370179,The Brazilian Boss's Innocent Mistress,240,https://www.goodreads.com/book/show/1370179.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,The Brazilian Bosss Innocent Mistress


In [34]:
titles["mod_title"] = titles["mod_title"].str.lower()

# Filter out all the trailing white spaces
titles["mod_title"] = titles["mod_title"].str.replace("\s+", " ", regex = True)
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
1,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
2,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
3,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2
4,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 minutes in heaven a true story of death and...
...,...,...,...,...,...,...
381185,17280727,دولة البطاطس,151,https://www.goodreads.com/book/show/17280727,https://images.gr-assets.com/books/1358953155m...,
381186,31522515,Walking on My Grave,340,https://www.goodreads.com/book/show/31522515-w...,https://images.gr-assets.com/books/1490374478m...,walking on my grave
381187,15500943,"Not Quickly Broken (Chop, Chop, #7)",456,https://www.goodreads.com/book/show/15500943-n...,https://images.gr-assets.com/books/1381765124m...,not quickly broken chop chop 7
381188,1370179,The Brazilian Boss's Innocent Mistress,240,https://www.goodreads.com/book/show/1370179.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the brazilian bosss innocent mistress


In [35]:
# Filter out empty files
titles = titles[titles["mod_title"].str.len() > 0]

# saving it all in a JSON file
titles.to_json("books_titles.json")
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
1,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
2,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
3,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2
4,89375,90 Minutes in Heaven: A True Story of Death an...,68157,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,90 minutes in heaven a true story of death and...
...,...,...,...,...,...,...
381185,17280727,دولة البطاطس,151,https://www.goodreads.com/book/show/17280727,https://images.gr-assets.com/books/1358953155m...,
381186,31522515,Walking on My Grave,340,https://www.goodreads.com/book/show/31522515-w...,https://images.gr-assets.com/books/1490374478m...,walking on my grave
381187,15500943,"Not Quickly Broken (Chop, Chop, #7)",456,https://www.goodreads.com/book/show/15500943-n...,https://images.gr-assets.com/books/1381765124m...,not quickly broken chop chop 7
381188,1370179,The Brazilian Boss's Innocent Mistress,240,https://www.goodreads.com/book/show/1370179.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the brazilian bosss innocent mistress


### Building A Search Engine


- TfidfVectorizer is a tool for converting a collection of raw documents (in this case, titles) into a matrix of TF-IDF features.
- Each row of the matrix corresponds to a title, and each column corresponds to a unique word in the titles. The values in the matrix represent the TF-IDF scores for each word in each title.

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])

tfidf


<379878x92395 sparse matrix of type '<class 'numpy.float64'>'
	with 1809158 stored elements in Compressed Sparse Row format>

In [37]:
liked_books = []

>  displays the top 5 results in a styled Pandas DataFrame with clickable URLs and images.

- Cosine similarity is a measure of similarity between two non-zero vectors in an inner product space. cosine similarity is often used to measure the similarity between two documents or text passages. It calculates the cosine of the angle between two vectors and ranges from -1 (completely dissimilar) to 1 (completely similar).

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import re

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

def search(query, vectorizer):
    # cleaning the query
    query = query.lower()
    pro = re.sub("[^a-zA-Z0-9]", "", query)
    # turning the query into a tf - idf vector
    query_vec = vectorizer.transform([query])

    similarity = cosine_similarity(query_vec, tfidf).flatten()

    indices = np.argpartition(similarity, -10)[-10:]
    result = titles.iloc[indices]

    # show the book with the highest rating first
    result = result.sort_values("ratings", ascending=False)
    
    liked_books.append(result.iloc[0]["book_id"])


    
    return result.head(5).style.format({'cover_image' : show_image, 'url': make_clickable})

In [39]:
search("northanger abbey", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
262657,398199,Northanger Abbey,2436,Goodreads,,northanger abbey
119649,6234197,Northanger Abbey,2248,Goodreads,,northanger abbey
203631,8144727,Northanger Abbey,745,Goodreads,,northanger abbey
6052,11758567,Northanger Abbey,622,Goodreads,,northanger abbey
131364,827523,Northanger Abbey,594,Goodreads,,northanger abbey


In [40]:
search("jane eyre", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
276375,50165,Jane Eyre,9694,Goodreads,,jane eyre
85301,161106,Jane Eyre,1066,Goodreads,,jane eyre
79425,11020,Jane Eyre,613,Goodreads,,jane eyre
276197,1344463,Jane Eyre,601,Goodreads,,jane eyre
250444,2947880,Jane Eyre,529,Goodreads,,jane eyre


In [41]:
search("far from the madding crowd", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
370721,31463,Far from the Madding Crowd,89085,Goodreads,,far from the madding crowd
28705,14800528,Far from the Madding Crowd,521,Goodreads,,far from the madding crowd
370722,31462,Far from the Madding Crowd,334,Goodreads,,far from the madding crowd
202580,929547,Far From the Madding Crowd,283,Goodreads,,far from the madding crowd
231019,23492139,Far from the Madding Crowd,278,Goodreads,,far from the madding crowd


In [42]:
search("the red and the black", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
322757,14662,The Red and the Black,34172,Goodreads,,the red and the black
139905,7709933,Red,1490,Goodreads,,red
19389,30755704,The Red,1226,Goodreads,,the red
85383,29223516,"Red (Black, #2)",794,Goodreads,,red black 2
293840,23234844,The Black,766,Goodreads,,the black


In [43]:
liked_books

['398199', '50165', '31463', '14662']

### Exploring the Book Rating Data

In [44]:
#This line initializes an empty dictionary to store the mapping between CSV IDs and book IDs.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id


### Finding users who like the same books as us

> This code is essentially counting how many times each user has interacted with books that are in the liked_books list, and it stores this information in the overlap_users dictionary. The dictionary keys are user IDs, and the values are the corresponding counts of interactions.


In [63]:
overlap_users = {}

with open("goodreads_interactions.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break

        user_id, csv_id, _, rating, _ = line.strip().split(",")

        # Check if csv_id is in the dictionary; if not, set book_id to None
        book_id = csv_book_mapping.get(csv_id)

        if book_id is not None and book_id in liked_books:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1



In [64]:
len(overlap_users)

28187

- only selects those users whose taste matches with 20% of our liked books

In [65]:
filtered_overlap_users = set(user_id for user_id, count in overlap_users.items() if count > len(liked_books)/5)

### Finding What those users liked

If the user is in the filtered set, it retrieves the corresponding book_id from the csv_book_mapping dictionary and appends a list containing [user_id, book_id, rating] to the interactions_list

In [66]:
interactions_list = []

with open("goodreads_interactions.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [67]:
len(interactions_list)

23388763

In [68]:
import pandas as pd 
recs = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])
recs["book_id"] = recs["book_id"].astype(str)
recs["user_id"] = recs["user_id"].astype(str)
recs["rating"] = pd.to_numeric(recs["rating"])

> These steps are often done when working with collaborative filtering or recommendation systems. The categorical codes are used to represent users and books with numerical indices, which can be more efficient for certain algorithms and analyses.

In [69]:
# This line creates a new column "user_index" in the recs DataFrame, which represents the categorical codes assigned to each unique user ID.
recs["user_index"] = recs["user_id"].astype("category").cat.codes


In [70]:
recs["book_index"] = recs["book_id"].astype("category").cat.codes

In [71]:
len(recs["user_index"].unique())
len(recs["book_index"].unique())

1368559

> sparse matrix where rows correspond to users, columns correspond to books, and the matrix elements represent the ratings given by users to books. Sparse matrices are particularly useful when dealing with large datasets where most entries are zero, as they save memory by only storing the non-zero elements.

In [72]:
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((recs["rating"], (recs["user_index"], recs["book_index"])))

ratings_mat_coo

<28187x1368559 sparse matrix of type '<class 'numpy.int64'>'
	with 23388763 stored elements in COOrdinate format>

In [73]:
# Conversion to coordinate format
# CSR format is efficient for arithmetic operations and slicing along rows, which are common tasks in recommendation systems and collaborative filtering algorithms.
ratings_mat = ratings_mat_coo.tocsr()

In [74]:
top_recs = recs["book_id"].value_counts().head(10)
top_recs = top_recs.index.values

In [75]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)


In [76]:
books_titles[books_titles["book_id"].isin(top_recs)]
all_recs = recs["book_id"].value_counts()
all_recs

book_id
31463       19415
4671        18805
2657        18627
1885        18238
5470        17092
            ...  
9743169         1
1277950         1
936909          1
5986621         1
17138312        1
Name: count, Length: 1368559, dtype: int64

In [77]:
all_recs = all_recs.to_frame().reset_index()
all_recs.columns = ["book_id", "book_count"]
all_recs

Unnamed: 0,book_id,book_count
0,31463,19415
1,4671,18805
2,2657,18627
3,1885,18238
4,5470,17092
...,...,...
1368554,9743169,1
1368555,1277950,1
1368556,936909,1
1368557,5986621,1


In [78]:
all_recs = all_recs.merge(books_titles, how = "inner", on = "book_id")
all_recs

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title
0,31463,19415,Far from the Madding Crowd,89085,https://www.goodreads.com/book/show/31463.Far_...,https://images.gr-assets.com/books/1388279695m...,far from the madding crowd
1,4671,18805,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
2,2657,18627,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
3,1885,18238,Pride and Prejudice,2078406,https://www.goodreads.com/book/show/1885.Pride...,https://images.gr-assets.com/books/1320399351m...,pride and prejudice
4,5470,17092,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
...,...,...,...,...,...,...,...
366962,18756007,1,The Betrayer,591,https://www.goodreads.com/book/show/18756007-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the betrayer
366963,15720601,1,More than a Governess (The Wetherby Brides #2),285,https://www.goodreads.com/book/show/15720601-m...,https://images.gr-assets.com/books/1340562868m...,more than a governess the wetherby brides 2
366964,29913659,1,"With Deadly Intent (North East Police, #1)",248,https://www.goodreads.com/book/show/29913659-w...,https://s.gr-assets.com/assets/nophoto/book/11...,with deadly intent north east police 1
366965,30061066,1,Forgotten,196,https://www.goodreads.com/book/show/30061066-f...,https://images.gr-assets.com/books/1462017262m...,forgotten


In [79]:
all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])
all_recs.sort_values("score", ascending=False).head(10)
popular_recs = all_recs[all_recs["book_count"] > 75].sort_values("score", ascending=False)

In [80]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

popular_recs[~popular_recs["book_id"].isin(liked_books)].head(10).style.format({'cover_image' : show_image, 'url': make_clickable})

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
96,19089,7708,Middlemarch,98029,Goodreads,,middlemarch,606.078446
3333,6382055,738,"A Dream of Spring (A Song of Ice and Fire, #7)",914,Goodreads,,a dream of spring a song of ice and fire 7,595.890591
108,5797,7230,Vanity Fair,92941,Goodreads,,vanity fair,562.431005
55,32261,9917,Tess of the D'Urbervilles,175403,Goodreads,,tess of the durbervilles,560.691032
6488,24493732,423,Solutions and Other Problems,334,Goodreads,,solutions and other problems,535.715569
183,337113,5704,The Tenant of Wildfell Hall,61479,Goodreads,,the tenant of wildfell hall,529.215114
5919,26032825,458,"The Cruel Prince (The Folk of the Air, #1)",400,Goodreads,,the cruel prince the folk of the air 1,524.41
118,5890,7007,The Woman in White,94556,Goodreads,,the woman in white,519.248371
260,31173,4665,Villette,42535,Goodreads,,villette,511.63101
3086,22294061,789,Romantic Outlaws: The Extraordinary Lives of Mary Wollstonecraft and Her Daughter Mary Shelley,1238,Goodreads,,romantic outlaws the extraordinary lives of mary wollstonecraft and her daughter mary shelley,502.844103
