# Imports

In [1]:
# import files
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

# Model (ignore)

In [None]:
# --- Sample dataset (replace with your own later) ---
data = {
    "title": [
        "The Hobbit",
        "The Lord of the Rings",
        "Harry Potter and the Sorcerer's Stone",
        "Harry Potter and the Chamber of Secrets",
        "A Game of Thrones",
    ],
    "author": [
        "J.R.R. Tolkien",
        "J.R.R. Tolkien",
        "J.K. Rowling",
        "J.K. Rowling",
        "George R.R. Martin",
    ],
    "description": [
        "A hobbit goes on a journey with dwarves and a wizard.",
        "A group sets out to destroy a powerful ring.",
        "A boy discovers he is a wizard and attends a magical school.",
        "The young wizard faces a hidden chamber and a dark force.",
        "Noble families vie for the throne in a fantasy kingdom.",
    ],
}

books_df = pd.DataFrame(data)

# --- TF-IDF (turn text into vectors) ---
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(books_df["description"])

# --- Similarity scores ---
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


# --- Recommendation function ---
def recommend_books(title, top_n=3):
    if title not in books_df["title"].values:
        return ["Book not found in dataset."]

    idx = books_df[books_df["title"] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1 : top_n + 1]
    book_indices = [i[0] for i in sim_scores]
    return books_df["title"].iloc[book_indices].tolist()


# --- Test ---
print(recommend_books("The Hobbit"))

["Harry Potter and the Sorcerer's Stone", 'Harry Potter and the Chamber of Secrets', 'The Lord of the Rings']


# Data Processing

In [5]:
# read the data

books_df = pd.read_csv("books_data.csv")
reviews_df = pd.read_csv("Books_rating.csv")


In [6]:
def preprocess_reviews_df(reviews_df):
    """
    This function preprocesses the reviews dataframe

        Args:
            reviews_df (pd.DataFrame): dataframe of the original reviews

        Returns:
            reviews_df (pd.DataFrame): preprocessed dataframe
    """

    # rename columns for consistency
    reviews_df.rename(
        columns={
            "Id": "id",
            "Title": "title",
            "Price": "price",
            "User_id": "user_id",
            "profileName": "profile_name",
            "review/helpfulness": "helpfulness",
            "review/score": "score",
            "review/time": "review_date",
            "review/summary": "summary",
            "review/text": "text",
        },
        inplace=True,
    )

    # get the year of the review from the date (review/time)
    reviews_df["year"] = reviews_df["review_date"].apply(
        lambda x: datetime.utcfromtimestamp(x).year
    )

    # drop unnecessary cols
    reviews_df.drop(columns=["price"], inplace=True)

    # drop the null titles and users
    reviews_df = reviews_df.dropna(subset=["title", "user_id"])

    # Preprocess helpfulness
    # handle 0/0
    reviews_df["helpfulness"] = reviews_df["helpfulness"].replace("0/0", 0)

    # convert each helpfulness string to float
    reviews_df["helpfulness"] = reviews_df["helpfulness"].apply(
        lambda x: eval(x) if isinstance(x, str) and "/" in x else x
    )
    reviews_df["helpfulness"] = reviews_df["helpfulness"].astype(float)
    reviews_df["helpfulness"]

    return reviews_df

def preprocess_books_df(books_df):
    # drop irrelevant cols
    books_df.drop(columns=["previewLink", "infoLink", "ratingsCount", "publisher"], inplace=True)

    # rename cols
    books_df.rename(
        columns={
            "Title": "title",
            "publishedDate": "published_date",
        },
        inplace=True,
    )

    # drop null values in Title
    books_df.dropna(subset=["title"], inplace=True)

    # fix the dates, extract the year of the book
    books_df["published_date"] = books_df["published_date"].replace("1963*", 1963)
    books_df["published_date"] = (
        books_df["published_date"].astype(str).str.extract(r"(\d{4})")
    )
    books_df["published_date"] = books_df["published_date"].apply(
        lambda x: int(x) if isinstance(x, str) and x.isdigit() else x
    )

    # calculate age (recency feature) - possibly for content based filtering if combined with category for ex
    books_df["age"] = datetime.today().year - books_df["published_date"]
    
    return books_df

In [7]:
reviews_df = preprocess_reviews_df(reviews_df)
books_df = preprocess_books_df(books_df)

  lambda x: datetime.utcfromtimestamp(x).year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] = reviews_df["helpfulness"].replace("0/0", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] = reviews_df["helpfulness"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df["helpfulness"] 

In [8]:
def calc_review_counts(books_df):
    
    # calculate review count for each book
    review_counts_dict = reviews_df.groupby('title').agg(
        count = ('id', 'count')
    ).to_dict()['count']

    # add the count column to books_df
    books_df['count'] = books_df['title'].apply(lambda x: review_counts_dict.get(x))

    books_df.sort_values(by='count', ascending = False)
    
    return books_df

books_df = calc_review_counts(books_df)

In [9]:
reviews_df.head()

Unnamed: 0,id,title,user_id,profile_name,helpfulness,score,review_date,summary,text,year
0,1882931173,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",1.0,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,1999
1,826414346,Dr. Seuss: American Icon,A30TK6U7DNS82R,Kevin Killian,1.0,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,2004
2,826414346,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,John Granger,0.909091,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t...",2004
3,826414346,Dr. Seuss: American Icon,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",1.0,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",2004
4,826414346,Dr. Seuss: American Icon,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",1.0,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...,2005


In [10]:
books_df.head()

Unnamed: 0,title,description,authors,image,published_date,categories,age,count
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,1996.0,['Comics & Graphic Novels'],29.0,1.0
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,2005.0,['Biography & Autobiography'],20.0,9.0
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,2000.0,['Religion'],25.0,4.0
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,2005.0,['Fiction'],20.0,32.0
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,2003.0,,22.0,1.0


# Non-personalized Recommendations

## Weighted Scoring

## Bayesian Scoring

Note: function used from "Non_Personalized_Recommendations_Trending_Now" notebook from iLearn

In [None]:
def bayesian_scoring(reviews_df):
    #step 1: create array with title, average rating of each book, and # of ratings
    books_avg_ratings = reviews_df.groupby(['title']).agg(
        avg_rating=('score', 'mean'),  
        num_ratings=('score', 'count') 
    ).reset_index()
    
    #step 2: find m= Global average rating across all books
    m = reviews_df['score'].mean()
    #step 3: find C= confidence factor
    C = reviews_df['title'].value_counts().mean()
    #step 4: bayesian score
    books_avg_ratings['bayesian_score'] = (
        (C * m + books_avg_ratings['num_ratings'] * books_avg_ratings['avg_rating']) /
        (C + books_avg_ratings['num_ratings'])
    )
    #step 5: rank the movies based on their bayesian score
    books_avg_ratings_ranked = books_avg_ratings.sort_values(by=['bayesian_score'], ascending=False)
    return books_avg_ratings_ranked

In [62]:
#Top 5 Books based on Bayesian Scoring Ranking 
top5 = bayesian_scoring(reviews_df).head()

In [None]:
import requests
from IPython.display import HTML

def show_top_books_with_covers(df, title_col='title', n=5):
    """
    show the top books with cover 

        Args:
            df (pd.DataFrame): dataframe of the top books 
            title_col (str, optional): column name that contain book titles. Default = 'title'.
            n (int, optional): no. of books to display. Default = 5.
        
        Returns:
            IPython.display.HTML: An HTML table with book covers and titles
    
    
    Note: function and HTML is a refined version of -> https://github.com/masao/google_books_api_wrapper
    
    """

    def get_google_books_cover(title):
        url = f"https://www.googleapis.com/books/v1/volumes?q=intitle:{title}"
        try:
            response = requests.get(url, timeout=5).json()
            return response['items'][0]['volumeInfo']['imageLinks']['thumbnail']
        except Exception:
            return "https://via.placeholder.com/60x90?text=No+Cover"

    top_books = df.head(n).copy()
    top_books['cover_url'] = top_books[title_col].apply(get_google_books_cover)

    # Build HTML table
    html_table = [
        "<table style='border-collapse: collapse; text-align: left;'>",
        "<tr><th>Cover</th><th>Title</th></tr>"
    ]
    for _, row in top_books.iterrows():
        html_table.append(
            f"<tr style='border: 1px solid #ccc;'>"
            f"<td><img src='{row['cover_url']}' width='60'></td>"
            f"<td style='padding: 8px;'>{row[title_col]}</td>"
            f"</tr>"
        )
    html_table.append("</table>")

    return HTML(''.join(html_table))


In [61]:
show_top_books_with_covers(top5)

Cover,Title
,Lilla Belle: The First Stages
,Why revival tarries
,With the Old Breed: At Peleliu and Okinawa
,"The Ferret Calendar 2005, Ferret Music"
,the lion's paw


## Wilson Scoring

In [63]:
import numpy as np
def wilson_scoring(reviews_df):  
    # step 1: review/score column will help in implementing the thumbs-up/down ratio 
    # score > 4 = thumbs-up
    #score < 4 = thumbs-down

    reviews_wilson = reviews_df
    reviews_wilson['thumbs'] = (reviews_wilson['score'] >= 4.0).astype(int)
    
    #step 2: create array with title, total(thumbs-up), total # of reviews, and p=ratio 

    books_wilson = reviews_wilson.groupby('title').agg(
        thumbs_up=('thumbs', 'sum'),      # total thumbs-up across all reviews for that title
        thumbs_down=('thumbs', lambda x: (x == 0).sum()),  # total thumbs down’s 
        total_reviews=('thumbs', 'count')         # number of review rows for that title
    ).reset_index()
    
    #step 3: get p

    books_wilson['p'] = books_wilson['thumbs_up'] / books_wilson['total_reviews']
    #to avoid dividing by 0 error
    books_wilson['p'] = books_wilson['p'].fillna(0)
    books_wilson=books_wilson.sort_values(by=['total_reviews'], ascending=False)
    
    #step 4: calculate wilson score 

    books_wilson['wilson_score'] = (
    (books_wilson['p'] + (z**2) / (2 * books_wilson['total_reviews']) -
     z * np.sqrt(
         (books_wilson['p'] * (1 - books_wilson['p']) / books_wilson['total_reviews']) +
         (z**2) / (4 * books_wilson['total_reviews']**2)
     )) /
    (1 + (z**2) / books_wilson['total_reviews'])
    )
    books_wilson['wilson_score'] = books_wilson['wilson_score'].fillna(0)
    #normalize
    books_wilson['wilson_score'] = books_wilson['wilson_score'].clip(lower=0)
    books_wilson = books_wilson.sort_values(by=['wilson_score'], ascending=False)
    return books_wilson
    


In [64]:
#Top 5 Books based on Bayesian Scoring Ranking 
top5 = wilson_scoring(reviews_df).head()
show_top_books_with_covers(top5)

Cover,Title
,With the Old Breed: At Peleliu and Okinawa
,Why revival tarries
,Wildwood Wisdom
,Little Britches
,Lilla Belle: The First Stages


## Apriori Algorithm

## FP Growth

# Personalized Recommendations

## User-based collaborative filtering

## Item-based collaborative filtering