# Imports

In [10]:
# import files
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import requests
from IPython.display import HTML
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Data Processing

In [11]:
# read the data
books_df = pd.read_csv("data/books_data.csv")
reviews_df = pd.read_csv("data/books_rating.csv")

In [12]:
def preprocess_reviews_df(reviews_df):
    """
    This function preprocesses the reviews dataframe

        Args:
            reviews_df (pd.DataFrame): dataframe of the original reviews

        Returns:
            reviews_df (pd.DataFrame): preprocessed dataframe
    """

    # rename columns for consistency
    reviews_df.rename(
        columns={
            "Id": "book_id",
            "Title": "title",
            "Price": "price",
            "User_id": "user_id",
            "profileName": "profile_name",
            "review/helpfulness": "helpfulness",
            "review/score": "score",
            "review/time": "review_date",
            "review/summary": "summary",
            "review/text": "text",
        },
        inplace=True,
    )

    # get the year of the review from the date (review/time)
    reviews_df["year"] = reviews_df["review_date"].apply(
        lambda x: datetime.utcfromtimestamp(x).year
    )

    # drop unnecessary cols
    reviews_df.drop(columns=["price"], inplace=True)

    # drop the null titles and users
    reviews_df = reviews_df.dropna(subset=["title", "user_id"])

    # Preprocess helpfulness
    # handle 0/0
    reviews_df["helpfulness"] = reviews_df["helpfulness"].replace("0/0", 0)

    # convert each helpfulness string to float
    reviews_df["helpfulness"] = reviews_df["helpfulness"].apply(
        lambda x: eval(x) if isinstance(x, str) and "/" in x else x
    )
    reviews_df["helpfulness"] = reviews_df["helpfulness"].astype(float)
    reviews_df["helpfulness"]

    # drop duplicate reviews
    reviews_df.drop_duplicates(
        subset=["user_id", "score", "review_date", "summary", "text"], inplace=True
    )

    return reviews_df


def preprocess_books_df(books_df):
    # drop irrelevant cols
    books_df.drop(
        columns=["previewLink", "infoLink", "ratingsCount", "publisher"], inplace=True
    )

    # rename cols
    books_df.rename(
        columns={
            "Title": "title",
            "publishedDate": "published_date",
        },
        inplace=True,
    )

    # drop null values in Title
    books_df.dropna(subset=["title"], inplace=True)

    # fix the dates, extract the year of the book
    books_df["published_date"] = books_df["published_date"].replace("1963*", 1963)
    books_df["published_date"] = (
        books_df["published_date"].astype(str).str.extract(r"(\d{4})")
    )
    books_df["published_date"] = books_df["published_date"].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdigit() else x
    )

    # calculate age (recency feature) - possibly for content based filtering if combined with category for ex
    books_df["age"] = datetime.today().year - books_df["published_date"]

    return books_df

In [13]:
reviews_df = preprocess_reviews_df(reviews_df)
books_df = preprocess_books_df(books_df)

  lambda x: datetime.utcfromtimestamp(x).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] = reviews_df["helpfulness"].replace("0/0", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] = reviews_df["helpfulness"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] 

In [14]:
def calc_review_counts(books_df):
    # calculate review count for each book
    review_counts_dict = (
        reviews_df.groupby("title").agg(count=("book_id", "count")).to_dict()["count"]
    )

    # add the count column to books_df
    books_df["count"] = books_df["title"].apply(lambda x: review_counts_dict.get(x))

    books_df.sort_values(by="count", ascending=False)

    return books_df


books_df = calc_review_counts(books_df)

In [15]:
reviews_df.head()

Unnamed: 0,book_id,title,user_id,profile_name,helpfulness,score,review_date,summary,text,year
0,1882931173,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",1.0,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,1999
1,826414346,Dr. Seuss: American Icon,A30TK6U7DNS82R,Kevin Killian,1.0,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,2004
2,826414346,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,John Granger,0.909091,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t...",2004
3,826414346,Dr. Seuss: American Icon,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",1.0,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",2004
4,826414346,Dr. Seuss: American Icon,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",1.0,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...,2005


In [16]:
books_df.head()

Unnamed: 0,title,description,authors,image,published_date,categories,age,count
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,1996.0,['Comics & Graphic Novels'],29.0,1.0
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,2005.0,['Biography & Autobiography'],20.0,9.0
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,2000.0,['Religion'],25.0,4.0
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,2005.0,['Fiction'],20.0,32.0
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,2003.0,,22.0,1.0


# Non-personalized Recommendations

## Weighted Scoring

## Bayesian Scoring

Note: function used from "Non_Personalized_Recommendations_Trending_Now" notebook from iLearn

In [17]:
def bayesian_scoring(reviews_df):
    # step 1: create array with title, average rating of each book, and # of ratings
    books_avg_ratings = (
        reviews_df.groupby(["title"])
        .agg(avg_rating=("score", "mean"), num_ratings=("score", "count"))
        .reset_index()
    )

    # step 2: find m= Global average rating across all books
    m = reviews_df["score"].mean()
    # step 3: find C= confidence factor
    C = reviews_df["title"].value_counts().mean()
    # step 4: bayesian score
    books_avg_ratings["bayesian_score"] = (
        C * m + books_avg_ratings["num_ratings"] * books_avg_ratings["avg_rating"]
    ) / (C + books_avg_ratings["num_ratings"])
    # step 5: rank the movies based on their bayesian score
    books_avg_ratings_ranked = books_avg_ratings.sort_values(
        by=["bayesian_score"], ascending=False
    )
    return books_avg_ratings_ranked

In [18]:
# Top 5 Books based on Bayesian Scoring Ranking
top5 = bayesian_scoring(reviews_df).iloc[:5]

In [19]:
def show_top_books_with_covers(df, title_col="title", n=5):
    """
    show the top books with cover

        Args:
            df (pd.DataFrame): dataframe of the top books
            title_col (str, optional): column name that contain book titles. Default = 'title'.
            n (int, optional): no. of books to display. Default = 5.

        Returns:
            IPython.display.HTML: An HTML table with book covers and titles


    Note: function and HTML is a refined version of -> https://github.com/masao/google_books_api_wrapper

    """

    def get_google_books_cover(title):
        url = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{title}"
        try:
            response = requests.get(url, timeout=5).json()
            return response["items"][0]["volumeInfo"]["imageLinks"]["thumbnail"]
        except Exception:
            return "https://via.placeholder.com/60x90?text=No+Cover"

    top_books = df.head(n).copy()
    top_books["cover_url"] = top_books[title_col].apply(get_google_books_cover)

    # Build HTML table
    html_table = [
        "<table style='border-collapse: collapse; text-align: left;'>",
        "<tr><th>Cover</th><th>Title</th></tr>",
    ]
    for _, row in top_books.iterrows():
        html_table.append(
            f"<tr style='border: 1px solid #ccc;'>"
            f"<td><img src='{row['cover_url']}' width='60'></td>"
            f"<td style='padding: 8px;'>{row[title_col]}</td>"
            f"</tr>"
        )
    html_table.append("</table>")

    return HTML("".join(html_table))


In [20]:
show_top_books_with_covers(top5)

Cover,Title
,Lilla Belle: The First Stages
,the lion's paw
,"The Ferret Calendar 2005, Ferret Music"
,With the Old Breed: At Peleliu and Okinawa
,The Wealthy Spirit: Daily Affirmations for Financial Stress Reduction


## Wilson Scoring

## Apriori Algorithm

## FP Growth

# Personalized Recommendations

## User-based collaborative filtering

## Item-based collaborative filtering

In [21]:
relevant_cols = ["title", "user_id", "score", "review_date"]
user_book_reviews = reviews_df[relevant_cols]
user_book_reviews.shape

(1693626, 4)

In [22]:
# select the 100k most active users and 100k most reviewed items
most_active_users = user_book_reviews.user_id.value_counts().keys()[:100000]
most_reviewed_books = user_book_reviews.title.value_counts().keys()[:100000]

subset_reviews = user_book_reviews[
    (user_book_reviews.user_id.isin(most_active_users))
    & (user_book_reviews.title.isin(most_reviewed_books))
]

print(subset_reviews.shape)
subset_reviews.head()


(642291, 4)


Unnamed: 0,title,user_id,score,review_date
1,Dr. Seuss: American Icon,A30TK6U7DNS82R,5.0,1095724800
2,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,5.0,1078790400
3,Dr. Seuss: American Icon,A2MVUWT453QH61,4.0,1090713600
4,Dr. Seuss: American Icon,A22X4XUPKF66MR,4.0,1107993600
5,Dr. Seuss: American Icon,A2F6NONFUDB6UK,4.0,1127174400


In [None]:
# Using the entire dataset
subset_reviews = user_book_reviews


rows = subset_reviews["user_id"].astype("category")
cols = subset_reviews["title"].astype("category")

user_mapping = rows.cat.categories
book_mapping = cols.cat.categories

sparse_matrix = csr_matrix((subset_reviews["score"], (rows.cat.codes, cols.cat.codes)))
# Renaming sparse_matrix to user_item_rating
user_item_rating = sparse_matrix

In [24]:
print(user_item_rating.tocoo())

  (0, 176927)	5.0
  (1, 163001)	5.0
  (2, 7687)	3.0
  (2, 163845)	5.0
  (3, 135363)	5.0
  (4, 141878)	5.0
  (5, 70522)	5.0
  (6, 139754)	3.0
  (7, 28314)	5.0
  (8, 143398)	5.0
  (9, 41399)	5.0
  (9, 130366)	5.0
  (9, 191001)	4.0
  (10, 170546)	5.0
  (11, 39433)	4.0
  (12, 20951)	5.0
  (13, 22203)	4.0
  (14, 36023)	3.0
  (15, 64763)	5.0
  (16, 153281)	5.0
  (17, 72640)	5.0
  (18, 135871)	5.0
  (19, 57044)	3.0
  (20, 141314)	3.0
  (21, 167697)	5.0
  :	:
  (1008942, 23216)	5.0
  (1008942, 143168)	5.0
  (1008943, 158562)	4.0
  (1008943, 168950)	1.0
  (1008944, 37349)	5.0
  (1008945, 136517)	5.0
  (1008946, 163038)	5.0
  (1008947, 14097)	5.0
  (1008948, 3090)	2.0
  (1008949, 145544)	5.0
  (1008950, 37019)	1.0
  (1008951, 106940)	3.0
  (1008952, 133839)	4.0
  (1008953, 166359)	4.0
  (1008954, 22045)	5.0
  (1008955, 35047)	5.0
  (1008956, 159326)	5.0
  (1008957, 120497)	5.0
  (1008958, 95150)	5.0
  (1008958, 105437)	5.0
  (1008958, 110187)	5.0
  (1008958, 136756)	5.0
  (1008958, 168068)	5.0
 

In [None]:
# Calculates the total possible number of user-item interactions -> max number of entries matrix could have
user_book_reviews.user_id.nunique() * user_book_reviews.title.nunique()

195986638406

In [None]:
# returns the number of non-zero entries in the sparse matrix.
sparse_matrix.getnnz()

1679121

In [None]:
# Find the mean for each item & user
def get_avg(sparse_matrix, _axis):
    # .A1 converts the sparse matrix result into a flat NumPy array.
    sums = sparse_matrix.sum(axis=_axis).A1  # sum of only non-zero entries
    counts = sparse_matrix.getnnz(axis=_axis)
    AVG = sums / counts
    return AVG


# Find users who rated both book i and book j.
def find_common_users(book_i_ratings, book_j_ratings):
    i_j_common_users = set(book_i_ratings.row) & set(book_j_ratings.row)
    i_j_common_users = np.array(list(i_j_common_users), dtype=book_i_ratings.row.dtype)
    return i_j_common_users


# Take rating vectors for book i and book j and keep only the ratings from common users.
def filter_for_common_users(common_users, book_i_ratings, book_j_ratings):
    # only get the ratings for the common users
    mask = np.isin(book_i_ratings.tocoo().row, common_users)
    book_i_ratings_filtered = book_i_ratings.tocoo().__class__(
        (
            book_i_ratings.tocoo().data[mask],
            (book_i_ratings.tocoo().row[mask], book_i_ratings.tocoo().col[mask]),
        ),
        shape=book_i_ratings.tocoo().shape,
    )

    mask = np.isin(book_j_ratings.tocoo().row, common_users)
    book_j_ratings_filtered = book_j_ratings.tocoo().__class__(
        (
            book_j_ratings.tocoo().data[mask],
            (book_j_ratings.tocoo().row[mask], book_j_ratings.tocoo().col[mask]),
        ),
        shape=book_j_ratings.tocoo().shape,
    )

    return book_i_ratings_filtered, book_j_ratings_filtered

In [None]:
# get the number of common users for each pair
coo = sparse_matrix.tocoo()
# This keeps the same structure but replaces ratings with 1.
binary = coo.copy()
binary.data = np.ones_like(binary.data)
# Compute overlap between books
book_pairs_to_common_users = binary.T.dot(binary)

# filter for pairs with #common_users > N
coo_pairs = book_pairs_to_common_users.tocoo()
N = 5
mask = coo_pairs.data >= N
book_pairs_to_common_users_filtered = coo_pairs.__class__(
    (coo_pairs.data[mask], (coo_pairs.row[mask], coo_pairs.col[mask])),
    shape=coo_pairs.shape,
)

In [29]:
print(book_pairs_to_common_users.tocoo())

  (0, 174618)	1.0
  (0, 172974)	1.0
  (0, 165841)	1.0
  (0, 165634)	1.0
  (0, 157157)	1.0
  (0, 150063)	1.0
  (0, 139856)	1.0
  (0, 125616)	1.0
  (0, 111457)	1.0
  (0, 105150)	1.0
  (0, 104275)	1.0
  (0, 38314)	1.0
  (0, 38039)	1.0
  (0, 0)	2.0
  (1, 184835)	1.0
  (1, 36869)	1.0
  (1, 10202)	1.0
  (1, 1224)	1.0
  (1, 194117)	1.0
  (1, 188481)	1.0
  (1, 181425)	1.0
  (1, 180623)	1.0
  (1, 180397)	1.0
  (1, 178660)	1.0
  (1, 174908)	1.0
  :	:
  (194242, 174938)	1.0
  (194242, 17426)	1.0
  (194242, 169344)	1.0
  (194242, 165846)	1.0
  (194242, 161471)	1.0
  (194242, 157067)	1.0
  (194242, 137761)	1.0
  (194242, 136340)	1.0
  (194242, 118535)	1.0
  (194242, 101592)	1.0
  (194242, 91836)	1.0
  (194242, 84247)	1.0
  (194242, 79233)	1.0
  (194242, 76631)	1.0
  (194242, 51936)	1.0
  (194242, 46366)	1.0
  (194242, 39683)	1.0
  (194242, 27210)	2.0
  (194242, 13299)	1.0
  (194242, 12749)	1.0
  (194242, 11206)	1.0
  (194242, 194242)	19.0
  (194243, 194243)	1.0
  (194244, 194244)	2.0
  (194245, 194

In [30]:
USER_AVGS = get_avg(sparse_matrix, _axis=1)
BOOKS_AVGS = get_avg(sparse_matrix, _axis=0)

# adjust the rating for each item vector v to be (v - user_avg)
user_book_to_rating_adjusted = sparse_matrix.copy()
coo = user_book_to_rating_adjusted.tocoo()
coo.data = coo.data - USER_AVGS[coo.row]
user_book_to_rating_adjusted.data = coo.data

In [31]:
print(user_book_to_rating_adjusted.tocoo())

  (0, 176927)	0.0
  (1, 163001)	0.0
  (2, 7687)	-1.0
  (2, 163845)	1.0
  (3, 135363)	0.0
  (4, 141878)	0.0
  (5, 70522)	0.0
  (6, 139754)	0.0
  (7, 28314)	0.0
  (8, 143398)	0.0
  (9, 41399)	0.33333333333333304
  (9, 130366)	0.33333333333333304
  (9, 191001)	-0.666666666666667
  (10, 170546)	0.0
  (11, 39433)	0.0
  (12, 20951)	0.0
  (13, 22203)	0.0
  (14, 36023)	0.0
  (15, 64763)	0.0
  (16, 153281)	0.0
  (17, 72640)	0.0
  (18, 135871)	0.0
  (19, 57044)	0.0
  (20, 141314)	0.0
  (21, 167697)	0.0
  :	:
  (1008942, 23216)	0.0
  (1008942, 143168)	0.0
  (1008943, 158562)	1.5
  (1008943, 168950)	-1.5
  (1008944, 37349)	0.0
  (1008945, 136517)	0.0
  (1008946, 163038)	0.0
  (1008947, 14097)	0.0
  (1008948, 3090)	0.0
  (1008949, 145544)	0.0
  (1008950, 37019)	0.0
  (1008951, 106940)	0.0
  (1008952, 133839)	0.0
  (1008953, 166359)	0.0
  (1008954, 22045)	0.0
  (1008955, 35047)	0.0
  (1008956, 159326)	0.0
  (1008957, 120497)	0.0
  (1008958, 95150)	0.0
  (1008958, 105437)	0.0
  (1008958, 110187)	0.0


In [32]:
"""for each item pair, find the cosine similarity IF we have >N common users

# Calculate item based similarity
similarity_map = {}

# compute similarity
for i, j in zip(
    book_pairs_to_common_users_filtered.row, book_pairs_to_common_users_filtered.col
):
    if (similarity_map.get(i, {}).get(j) is None) and (i != j):
        # book_i = cols.cat.categories[i]  # map index to actual book title
        # book_j = cols.cat.categories[j]  # map index to actual book title
        book_i_ratings = user_book_to_rating_adjusted[:, i].tocoo()
        book_j_ratings = user_book_to_rating_adjusted[:, j].tocoo()

        i_j_common_users = find_common_users(book_i_ratings, book_j_ratings)

        # filter to get ratings for common users only
        book_i_ratings_filtered, book_j_ratings_filtered = filter_for_common_users(
            i_j_common_users, book_i_ratings, book_j_ratings
        )

        # compute the dot product
        dot = book_j_ratings_filtered.data.dot(book_i_ratings_filtered.data)
        norm_i = np.linalg.norm(book_i_ratings_filtered.data)
        norm_j = np.linalg.norm(book_j_ratings_filtered.data)
        cos_sim = dot / (norm_i * norm_j)

        if i not in similarity_map:
            similarity_map[i] = {}
        if j not in similarity_map:
            similarity_map[j] = {}

        similarity_map[i][j] = cos_sim
        similarity_map[j][i] = cos_sim

"""

'for each item pair, find the cosine similarity IF we have >N common users\n\n# Calculate item based similarity\nsimilarity_map = {}\n\n# compute similarity\nfor i, j in zip(\n    book_pairs_to_common_users_filtered.row, book_pairs_to_common_users_filtered.col\n):\n    if (similarity_map.get(i, {}).get(j) is None) and (i != j):\n        # book_i = cols.cat.categories[i]  # map index to actual book title\n        # book_j = cols.cat.categories[j]  # map index to actual book title\n        book_i_ratings = user_book_to_rating_adjusted[:, i].tocoo()\n        book_j_ratings = user_book_to_rating_adjusted[:, j].tocoo()\n\n        i_j_common_users = find_common_users(book_i_ratings, book_j_ratings)\n\n        # filter to get ratings for common users only\n        book_i_ratings_filtered, book_j_ratings_filtered = filter_for_common_users(\n            i_j_common_users, book_i_ratings, book_j_ratings\n        )\n\n        # compute the dot product\n        dot = book_j_ratings_filtered.data.do

In [None]:

# computes adjusted pairwise similarity between rows of X by default - thats why we are using transpose:
similarity_matrix = cosine_similarity(
    user_book_to_rating_adjusted.T, dense_output=False
)

# keep the similarities for only the books that have atleast N common users
N = 5
similarity_matrix = similarity_matrix.multiply(book_pairs_to_common_users >= N)


In [34]:
# for a given target user:
# get the candidate books that can be recommended to the user:
#   books the user did not rate and they exist in our sim map
    # for the 3 best most recently rated book:
    # get the sim score between it and each of the candidate books
    # choose the k most similar books
    # for the k most similar books, calculate the predicted rating
    # recommend the 5 highest rated books

In [35]:
# Make sure it's COO
def get_k_neighbors(coo_mat, k):
    coo = coo_mat.tocoo()

    # Get indices of the top 3 values
    topk_idx = np.argsort(np.abs(coo.data))[-k:][::-1]  # sort descending

    # Get the corresponding row indices
    topk_rows = coo.row[topk_idx]
    topk_values = coo.data[topk_idx]

    # print("Top 3 row indices:", topk_rows)
    # print("Top 3 values:", topk_values)

    return topk_rows, topk_values

In [36]:
# Assume user 0 is a target user

target_user_id = "AVCGYZL8FQQTD"
target_user_idx = user_mapping.get_loc(target_user_id)
target_user = user_book_reviews[(user_book_reviews["user_id"] == target_user_id)]
target_mean = target_user["score"].mean()


# i want to recommend an unread book to this target user
# get the candidate books (unread by the user)
read_books_indices = [book_mapping.get_loc(t) for t in target_user.title]
candidate_books = list(set(sparse_matrix.tocoo().col) - set(read_books_indices))
print(len(candidate_books))

# get all the books that the target read : read_books_indices
# for each candidate book:
    # get the similarity between the candidate book and the read books
    # deviation for the read book: target user rating for read book - global avg for read book


# if this candidate has similarity scores with other candidates, filter only for candidates in read_books_indices
# i want to predict the rating for a candidate. i will filter the candidates for items that are in 
#items having sim score
target_candidate_to_predict = 8 #in principal, you should predict ratings for all candidates
items_having_sim_score_with_candidate = similarity_matrix[: , target_candidate_to_predict].tocoo().row     
items_having_sim_score_with_candidate_and_target_read_them = set(items_having_sim_score_with_candidate) & set(read_books_indices)

predicted_ratings = {}

# Precompute user deviations once
import time
time_now = int(time.time())

user_ratings = sparse_matrix[target_user_idx, read_books_indices].toarray().ravel()
deviations = user_ratings - BOOKS_AVGS[read_books_indices] # what for ? 

time_of_rating_i = target_user.set_index("title")["review_date"]
aligned_times = np.array([time_of_rating_i[book_mapping[i]] for i in read_books_indices])

# for relevance, let t be a time decay factor = 1 / 1+0.5*(today - time)
for cand in candidate_books:

    # similarities of candidate c to the books user read
    sim_col = similarity_matrix[read_books_indices, cand].toarray().ravel()
    
    
    # TODO : choose the k neighbors 
    # TODO : Exploitation vs exploration - 
    #           say we have k = 3. 
    #           2 should have high sim scores like 0.9
    #           1 should have sim between 0.6-0.9
    # TODO : Optimize parameters
    
    # keep only similarities that are nonzero
    mask = (sim_col != 0)
    if not mask.any():
        continue  # skip candidate if no similar items
    
    sim_col_ = sim_col[mask]
    deviations_ = deviations[mask]
    
    taste_change_factor = 0.3 # can be optimized
    time_factor = 1 / (1 + taste_change_factor * (time_now - aligned_times))
    time_factor = time_factor[mask]

    # predicted rating
    #numerator = sim_col_ @ deviations_
    numerator = (sim_col_ * deviations_ * time_factor).sum()
    denominator = np.abs(sim_col_).sum()
    pred_rating = BOOKS_AVGS[cand] + numerator/denominator

    predicted_ratings[cand] = pred_rating


194243


In [37]:
predicted_ratings

{19465: 4.29032258093371,
 119641: 3.904761905050453,
 153475: 3.7400000002885485}

In [38]:
""" ignore for now """
for idx in best_recent_title_indices:
    
    target_col = (
        similarity_matrix[:, idx].tocoo()
    )  # get the similarity scores between this book and all other books in the matrix

    mask = ~np.isin(target_col.row, read_books_indices)  # remove the read books

    target_candidate_similarities = coo_matrix(
        (target_col.data[mask], (target_col.row[mask], target_col.col[mask])),
        shape=similarity_matrix[:, idx].shape,
    )

    # check with Dr. Alex -- the candidates aren't all unread books, they're the unread books that exist in our similarity map
    #print(target_candidate_similarities)

    # get the k most similar books that the target user also rated: 
    #  check with insiyah, do i look at absolute sim score?
    topk_rows, topk_values = get_k_neighbors(target_candidate_similarities, k=15)
    
    print(
        f"Top k similar movies for {book_mapping[idx]} : {book_mapping[topk_rows]}"
    )
    print("Their similarity scores:", topk_values)
    print("--------------------------------------------")
    
    # predict the rating for the k-most similar books
    # so the neighbors have to be items that the target user also rated
    # item avg + deviation: 
    #           how did this target user's ratings deviate from the sim items avgs
    # deviation = sim_score_with_target*(target_user_rating - sim_item's_avg) 
    # numerator = sim_score * 
    

NameError: name 'best_recent_title_indices' is not defined