In [2]:
# import files
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [4]:
# Reading files
books_df = pd.read_csv("books_data.csv")
reviews_df = pd.read_csv("Books_rating.csv")

# Preprocessing files
def preprocess_reviews_df(reviews_df):
    """
    This function preprocesses the reviews dataframe

        Args:
            reviews_df (pd.DataFrame): dataframe of the original reviews

        Returns:
            reviews_df (pd.DataFrame): preprocessed dataframe
    """

    # rename columns for consistency
    reviews_df.rename(
        columns={
            "Id": "id",
            "Title": "title",
            "Price": "price",
            "User_id": "user_id",
            "profileName": "profile_name",
            "review/helpfulness": "helpfulness",
            "review/score": "score",
            "review/time": "review_date",
            "review/summary": "summary",
            "review/text": "text",
        },
        inplace=True,
    )

    # get the year of the review from the date (review/time)
    reviews_df["year"] = reviews_df["review_date"].apply(
        lambda x: datetime.utcfromtimestamp(x).year
    )

    # drop unnecessary cols
    reviews_df.drop(columns=["price"], inplace=True)

    # drop the null titles and users
    reviews_df = reviews_df.dropna(subset=["title", "user_id"])

    # Preprocess helpfulness
    # handle 0/0
    reviews_df["helpfulness"] = reviews_df["helpfulness"].replace("0/0", 0)

    # convert each helpfulness string to float
    reviews_df["helpfulness"] = reviews_df["helpfulness"].apply(
        lambda x: eval(x) if isinstance(x, str) and "/" in x else x
    )
    reviews_df["helpfulness"] = reviews_df["helpfulness"].astype(float)
    reviews_df["helpfulness"]

    return reviews_df

def preprocess_books_df(books_df):
    # drop irrelevant cols
    books_df.drop(columns=["previewLink", "infoLink", "ratingsCount", "publisher"], inplace=True)

    # rename cols
    books_df.rename(
        columns={
            "Title": "title",
            "publishedDate": "published_date",
        },
        inplace=True,
    )

    # drop null values in Title
    books_df.dropna(subset=["title"], inplace=True)

    # fix the dates, extract the year of the book
    books_df["published_date"] = books_df["published_date"].replace("1963*", 1963)
    books_df["published_date"] = (
        books_df["published_date"].astype(str).str.extract(r"(\d{4})")
    )
    books_df["published_date"] = books_df["published_date"].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdigit() else x
    )

    # calculate age (recency feature) - possibly for content based filtering if combined with category for ex
    books_df["age"] = datetime.today().year - books_df["published_date"]
    
    return books_df

In [5]:
reviews_df = preprocess_reviews_df(reviews_df)
books_df = preprocess_books_df(books_df)

  lambda x: datetime.utcfromtimestamp(x).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] = reviews_df["helpfulness"].replace("0/0", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] = reviews_df["helpfulness"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] 

In [None]:
# Exploring reviews_df
print(f'Shape: {reviews_df.shape}')
print(f'Number of unqiue users/reviews: {reviews_df['user_id'].nunique()}')
print(f'Number of unique books/titles {reviews_df['title'].nunique()}')
print(reviews_df.head())

Shape: (2438018, 10)
Number of unqiue users/reviews: 1008961
Number of unique books/titles 206711
           id                           title         user_id  \
0  1882931173  Its Only Art If Its Well Hung!   AVCGYZL8FQQTD   
1  0826414346        Dr. Seuss: American Icon  A30TK6U7DNS82R   
2  0826414346        Dr. Seuss: American Icon  A3UH4UZ4RSVO82   
3  0826414346        Dr. Seuss: American Icon  A2MVUWT453QH61   
4  0826414346        Dr. Seuss: American Icon  A22X4XUPKF66MR   

                         profile_name  helpfulness  score  review_date  \
0               Jim of Oz "jim-of-oz"     1.000000    4.0    940636800   
1                       Kevin Killian     1.000000    5.0   1095724800   
2                        John Granger     0.909091    5.0   1078790400   
3  Roy E. Perry "amateur philosopher"     1.000000    4.0   1090713600   
4     D. H. Richards "ninthwavestore"     1.000000    4.0   1107993600   

                                           summary  \
0           

In [None]:
# Drop duplicates based on specific columns
new_reviews_df = reviews_df.drop_duplicates(subset=['user_id', 'score','review_date','summary','text'])
print(f'new df : {new_reviews_df.shape}')


### User Based Collaborative Filtering


In [50]:
# Making Dictionaries
user_to_book = new_reviews_df.groupby('user_id')['title'].apply(list).to_dict()
book_to_user = new_reviews_df.groupby('title')['user_id'].apply(list).to_dict()

user_book = zip(new_reviews_df['user_id'], new_reviews_df['title'])
user_book_score = zip(user_book, reviews_df['score'])
user_book_to_score = dict(user_book_score)

In [51]:
from scipy.sparse import lil_matrix

# Map user IDs to row indices
users = list(user_to_book.keys())
user_to_index = {u: i for i, u in enumerate(users)}
index_to_user = {i: u for u, i in user_to_index.items()}

n_users = len(users)


While using list we get repeats

In [None]:
# Build a sparse userâ€“item matrix first

all_books = list(book_to_user.keys())
book_to_index = {b: i for i, b in enumerate(all_books)}

n_books = len(all_books)
user_item = lil_matrix((n_users, n_books))

# Fill with ratings
for (user, book), score in user_book_to_score.items():
    user_idx = user_to_index[user]
    book_idx = book_to_index[book]
    user_item[user_idx, book_idx] = score


In [None]:
# Build a sparse similarity matrix for users
