In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the mount point is
# passed as a literal path to drive.mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gzip
import os

import numpy as np
import pandas as pd
import json
import csv
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
indir = '/content/drive/MyDrive/Capstone/dataset/'

# --- Interactions -----------------------------------------------------------
# The file is gzip-compressed JSON Lines (one JSON object per line). Only the
# three fields needed for the rating matrix are kept from each record.
filepath = os.path.join(indir, 'goodreads_interactions_comics_graphic.json.gz')
with gzip.open(filepath, 'rt', encoding='utf-8') as f:
    lines = [
        {field: record[field] for field in ('user_id', 'book_id', 'rating')}
        for record in map(json.loads, f)
    ]
interactions = pd.DataFrame.from_records(lines)

# --- Book metadata ----------------------------------------------------------
# Same on-disk format; every field of every record is kept.
filepath = os.path.join(indir, 'goodreads_books_comics_graphic.json.gz')
with gzip.open(filepath, 'rt', encoding='utf-8') as f:
    lines = list(map(json.loads, f))
books = pd.DataFrame.from_records(lines)

ratings = interactions

# Build a book x user rating matrix from a frame that has `user_id`, `book_id`
# and `rating` columns. Sorted, ordered categories give every id a stable
# integer code that can be mapped back to the id later via `.categories`.
user_ids = CategoricalDtype(categories=sorted(ratings['user_id'].unique()), ordered=True)
book_ids = CategoricalDtype(categories=sorted(ratings['book_id'].unique()), ordered=True)

# The category codes double as matrix coordinates.
book_idx = ratings['book_id'].astype(book_ids).cat.codes
user_idx = ratings['user_id'].astype(user_ids).cat.codes
shape = (len(book_ids.categories), len(user_ids.categories))

# Rows are books and columns are users, so the pairwise cosine similarity of
# the rows is a book-to-book similarity matrix (kept sparse).
# NOTE(review): values are stored as int8 - assumes every rating fits; confirm.
# NOTE(review): ratings equal to 0 are stored as explicit entries - confirm
# whether 0 means "unrated" in this dataset and should be dropped instead.
sparse_matrix = csr_matrix(
    (ratings['rating'], (book_idx, user_idx)),
    shape=shape,
    dtype=np.int8,
)
similarities = cosine_similarity(sparse_matrix, dense_output=False)


In [None]:
_RECOMMENDATION_COLUMNS = ["input_book_id", "recommended_book_id", "similarity"]


def _top_k_neighbors(row_idx, k):
    """Pick the strongest neighbours stored in one row of ``similarities``.

    Reads the module-level sparse matrix ``similarities``.

    Args:
        row_idx: Integer row (book code) to look up.
        k: Maximum number of neighbours to return. Values <= 0 yield nothing.

    Returns:
        A pair ``(neighbor_idx, scores)`` of equal-length NumPy arrays holding
        the column indices and similarity values of at most ``k`` stored
        entries of the row, excluding the row's own index, ordered from the
        highest similarity to the lowest.
    """
    row = similarities.getrow(row_idx)
    not_self = row.indices != row_idx  # drop the book's similarity to itself
    neighbor_idx = row.indices[not_self]
    scores = row.data[not_self]

    if k <= 0 or scores.size == 0:
        return neighbor_idx[:0], scores[:0]

    if scores.size > k:
        # argpartition selects the k largest in linear time but leaves them
        # in arbitrary order; the explicit sort below fixes the ordering.
        chosen = np.argpartition(-scores, kth=k)[:k]
    else:
        chosen = np.arange(scores.size)

    chosen = chosen[np.argsort(-scores[chosen], kind="stable")]
    return neighbor_idx[chosen], scores[chosen]


def get_similar_books_in_genre_fast(book_id, k=10):
    """Return the ``k`` books most similar to ``book_id``.

    Reads the module-level ``book_ids`` (categorical dtype whose categories
    are the known book ids) and ``similarities`` (sparse similarity matrix).

    Args:
        book_id: Id of the query book; must be one of ``book_ids.categories``.
        k: Maximum number of recommendations (default 10).

    Returns:
        A DataFrame with columns ``input_book_id``, ``recommended_book_id``
        and ``similarity``, sorted by descending similarity. It is empty when
        the book has no stored neighbours or ``k`` is not positive.
        If ``book_id`` is unknown, a plain message string is returned instead
        (kept for backward compatibility with existing callers).
    """
    if book_id not in book_ids.categories:
        return f"Book ID {book_id} not found."

    # Position of the id in the category index == its row in the matrix.
    row_idx = book_ids.categories.get_loc(book_id)
    neighbor_idx, scores = _top_k_neighbors(row_idx, k)

    if scores.size == 0:
        return pd.DataFrame(columns=_RECOMMENDATION_COLUMNS)

    return pd.DataFrame({
        "input_book_id": [book_id] * scores.size,
        "recommended_book_id": book_ids.categories[neighbor_idx].tolist(),
        "similarity": scores,
    })


def build_collaborative_recommender_fast(k=10):
    """Compute the top-``k`` recommendations for every book in ``similarities``.

    Reads the module-level ``book_ids`` and ``similarities``.

    Args:
        k: Maximum number of recommendations per book (default 10).

    Returns:
        A long-format DataFrame with columns ``input_book_id``,
        ``recommended_book_id`` and ``similarity``. Rows are grouped by input
        book in matrix-row order and sorted by descending similarity within
        each group. Books without any stored neighbour contribute no rows.
        The three columns are present even when the result is empty.
    """
    categories = book_ids.categories
    input_ids, recommended_ids, similarity_values = [], [], []

    for row_idx in range(similarities.shape[0]):
        neighbor_idx, scores = _top_k_neighbors(row_idx, k)
        if scores.size == 0:
            continue

        input_ids.extend([categories[row_idx]] * scores.size)
        recommended_ids.extend(categories[neighbor_idx].tolist())
        similarity_values.extend(scores.tolist())

    return pd.DataFrame(
        {
            "input_book_id": input_ids,
            "recommended_book_id": recommended_ids,
            "similarity": similarity_values,
        },
        columns=_RECOMMENDATION_COLUMNS,
    )

In [None]:
# Spot-check: request up to 10 neighbours for one book id and print the result.
# The call returns a plain message string instead of a DataFrame when the id
# is not among the known book ids.
df_one = get_similar_books_in_genre_fast("24494", k=10)
print(df_one)

  input_book_id recommended_book_id  similarity
0         24494               24813    0.467992
1         24494               24814    0.440795
2         24494               24815    0.439526
3         24494               59715    0.436621
4         24494               24816    0.435958
5         24494               70489    0.429853
6         24494               70488    0.423315
7         24494               70487    0.417438
8         24494               24818    0.411657
9         24494               70491    0.409295


In [None]:
# Build the top-10 recommendation table for every book and persist it next to
# the input datasets. The target path is derived from `indir` (the dataset
# directory defined in the loading cell) instead of repeating the absolute
# path, so relocating the data only requires changing one place.
collab_df = build_collaborative_recommender_fast(k=10)
collab_df.to_csv(os.path.join(indir, "recommendercollab.csv"), index=False, encoding="utf-8")