# Assignment 3: Book Recommendation System

### Part 1: Data Selection

In [29]:
import pandas as pd
import numpy as np

# Load datasets
books = pd.read_csv('Books.csv', sep=';', encoding='latin-1', low_memory=False)
ratings = pd.read_csv('Ratings.csv', sep=';', encoding='latin-1', low_memory=False)

print(books.head())
print(ratings.head())

print("Book Column Shape: ", books.shape)
print("Ratings Column Shape: ", ratings.shape)

         ISBN                                              Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 Author  Year                Publisher  
0    Mark P. O. Morford  2002  Oxford University Press  
1  Richard Bruce Wright  2001    HarperFlamingo Canada  
2          Carlo D'Este  1991          HarperPerennial  
3      Gina Bari Kolata  1999     Farrar Straus Giroux  
4       E. J. W. Barber  1999   W. W. Norton & Company  
   User-ID        ISBN  Rating
0   276725  034545104X       0
1   276726  0155061224       5
2   276727  0446520802       0
3   276729  052165615X       3
4   276729  0521795028       6
Book Column Shape:  (271379, 5)
Ratings Column Shape:  (1149780

### Part 2: Data Preprocessing

In [30]:
# Selecting required columns
books = books[['ISBN', 'Title', 'Author', 'Year', 'Publisher']]
ratings = ratings[['User-ID', 'ISBN', 'Rating']]

# Removing missing values
books.dropna(inplace=True)
ratings.dropna(inplace=True)

# Converting ratings to numeric
ratings['Rating'] = pd.to_numeric(ratings['Rating'], errors='coerce')

# Removing zero ratings
ratings = ratings[ratings['Rating'] > 0]

# Filtering active users and popular books
user_counts = ratings['User-ID'].value_counts()
book_counts = ratings['ISBN'].value_counts()

ratings = ratings[
    ratings['User-ID'].isin(user_counts[user_counts >= 30].index) &
    ratings['ISBN'].isin(book_counts[book_counts >= 50].index)
]

print("Reduced dataset shape: ", end="")
print(ratings.shape)
ratings_sample = ratings.head(5)
ratings_sample

Reduced dataset shape: (19341, 3)


Unnamed: 0,User-ID,ISBN,Rating
1456,277427,002542730X,10
1474,277427,0061009059,9
1522,277427,0316776963,8
1543,277427,0345413903,10
1578,277427,0385424736,9


### Part 3: Content Based Filtering

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Merging ratings with book metadata
books_cbf = ratings.merge(books, on='ISBN')

# Creating combined text feature
books_cbf['combined_features'] = (
    books_cbf['Title'] + ' ' +
    books_cbf['Author'] + ' ' +
    books_cbf['Publisher']
)

# Removing duplicates
books_cbf = books_cbf.drop_duplicates('ISBN')

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_cbf['combined_features'])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (531, 1400)


In [24]:
# Computing cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Index mapping
indices = pd.Series(books_cbf.index, index=books_cbf['Title'])

# Recommendation function
def recommend_books(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]
    book_indices = [i[0] for i in sim_scores]
    return books_cbf['Title'].iloc[book_indices]

# Example recommendation
recommend_books(books_cbf['Title'].iloc[0])

Unnamed: 0,Title
278,Wicked: The Life and Times of the Wicked Witch...
338,Everything's Eventual : 14 Dark Tales
145,Life of Pi
126,Life of Pi
2160,A Beautiful Mind: The Life of Mathematical Gen...
63,The Secret Life of Bees
1005,The Beach House
324,The Secret Life of Bees
999,Cradle and All
2318,The Beach House


### Part 4: Collaborative Filtering

In [25]:
# Creating user-item matrix
pivot_table = ratings.pivot_table(
    index='User-ID',
    columns='ISBN',
    values='Rating'
).fillna(0)

# Item-item similarity
item_similarity = cosine_similarity(pivot_table.T)

item_similarity_df = pd.DataFrame(
    item_similarity,
    index=pivot_table.columns,
    columns=pivot_table.columns
)

item_similarity_df.iloc[:5, :5]


ISBN,002542730X,0060096195,006016848X,0060173289,0060175400
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
002542730X,1.0,0.0,0.029427,0.0,0.087169
0060096195,0.0,1.0,0.0,0.0,0.061962
006016848X,0.029427,0.0,1.0,0.0,0.073099
0060173289,0.0,0.0,0.0,1.0,0.091464
0060175400,0.087169,0.061962,0.073099,0.091464,1.0


In [26]:
# Recommendation function
def recommend_cf(user_id, n=5):
    user_ratings = pivot_table.loc[user_id]
    rated_books = user_ratings[user_ratings > 0].index

    scores = item_similarity_df[rated_books].mean(axis=1)
    scores = scores.drop(rated_books)

    top_books = scores.sort_values(ascending=False).head(n).index
    return books[books['ISBN'].isin(top_books)][['Title', 'Author']]

# Example user
recommend_cf(pivot_table.index[0])

Unnamed: 0,Title,Author
37,To Kill a Mockingbird,Harper Lee
90,The Catcher in the Rye,J.D. Salinger
2143,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling
5506,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling
6933,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling
