In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the ratings CSV from the mounted Drive folder into a DataFrame.
# NOTE(review): the path is hard-coded to one Drive layout; anyone else running
# the notebook must place the file at the same location or edit this line.
books_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DSCI351/Preprocessed_data.csv')

**Data Cleaning**

In [None]:
# (rows, columns) of the data exactly as loaded
books_df.shape

(1031175, 19)

In [None]:
# Column labels available in the loaded data
books_df.columns

Index(['Unnamed: 0', 'user_id', 'location', 'age', 'isbn', 'rating',
       'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_s', 'img_m', 'img_l', 'Summary', 'Language', 'Category', 'city',
       'state', 'country'],
      dtype='object')

In [None]:
# Remove the columns this analysis treats as irrelevant; `books_df` itself is left untouched
cleaned_books_df = books_df.drop(columns=['Unnamed: 0', 'location', 'img_s', 'img_m', 'img_l'])

In [None]:
# Only keep users from 'usa'
# NOTE(review): str.contains performs a case-insensitive substring (regex) match, so any
# country value that merely contains "usa" is kept as well; rows with a missing country
# are dropped (na=False). Confirm whether an exact match on 'usa' was intended.
cleaned_books_df = cleaned_books_df[cleaned_books_df['country'].str.contains('usa', case=False, na=False)]

In [None]:
# (rows, columns) remaining after the country filter
cleaned_books_df.shape

(746495, 14)

In [None]:
# Keep only the rating rows whose 'Language' value is exactly 'en'
is_english = cleaned_books_df['Language'].eq('en')
cleaned_books_df = cleaned_books_df.loc[is_english]

In [None]:
# (rows, columns) remaining after the language filter
cleaned_books_df.shape

(475650, 14)

In [None]:
# Convert 'year_of_publication' to integer data type.
# `assign` returns a new frame instead of writing a column into the frame produced by
# the boolean filters above, which avoids pandas' SettingWithCopyWarning while leaving
# the column position and the resulting values unchanged.
cleaned_books_df = cleaned_books_df.assign(
    year_of_publication=pd.to_numeric(cleaned_books_df['year_of_publication']).astype('int64')
)

In [None]:
# Per-column dtypes and non-null counts after cleaning
cleaned_books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 475650 entries, 0 to 1031173
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              475650 non-null  int64  
 1   age                  475650 non-null  float64
 2   isbn                 475650 non-null  object 
 3   rating               475650 non-null  int64  
 4   book_title           475650 non-null  object 
 5   book_author          475650 non-null  object 
 6   year_of_publication  475650 non-null  int64  
 7   publisher            475650 non-null  object 
 8   Summary              475650 non-null  object 
 9   Language             475650 non-null  object 
 10  Category             475650 non-null  object 
 11  city                 475154 non-null  object 
 12  state                473625 non-null  object 
 13  country              475650 non-null  object 
dtypes: float64(1), int64(3), object(10)
memory usage: 54.4+ MB


In [None]:
# (rows, columns) of the cleaned data; the dtype conversion above does not remove rows
cleaned_books_df.shape

(475650, 14)

In [None]:
# Preview the first rows of the cleaned data
cleaned_books_df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
0,2,18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
16,2954,71.0,60973129,8,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",en,['1940-1949'],wichita,kansas,usa
19,35704,53.0,374157065,6,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",en,['Medical'],kansas city,missouri,usa
20,83160,65.0,374157065,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",en,['Medical'],oregon city,oregon,usa
21,110912,36.0,374157065,10,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",en,['Medical'],milpitas,california,usa


**Data Preprocessing**

Exclude users with fewer than 100 ratings and books with fewer than 20 ratings.

In [None]:
# Count the number of ratings per user
# (value_counts returns a Series indexed by user_id, sorted from most to fewest rows)
user_rating_counts = cleaned_books_df['user_id'].value_counts()
user_rating_counts

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
198711,3976
153662,3837
98391,3719
35859,3565
278418,2383
...,...
63994,1
60360,1
54245,1
30584,1


In [None]:
# Minimum number of rating rows a user needs in order to be kept
MIN_RATINGS_PER_USER = 100

# Filter users with at least MIN_RATINGS_PER_USER ratings (result is an Index of user_ids)
users_with_enough_ratings = user_rating_counts[user_rating_counts >= MIN_RATINGS_PER_USER].index
users_with_enough_ratings

Index([198711, 153662,  98391,  35859, 278418,  76352,  16795, 235105, 230522,
       110973,
       ...
        89014, 143163, 267249, 196985,  41781, 145431, 111578, 176667, 260944,
        27812],
      dtype='int64', name='user_id', length=814)

In [None]:
# Keep only the rows written by users that passed the rating-count threshold
is_active_user = cleaned_books_df['user_id'].isin(users_with_enough_ratings)
preprocessed_books_df = cleaned_books_df.loc[is_active_user]
preprocessed_books_df.shape

(262337, 14)

In [None]:
# Count the number of ratings per book
# Counting is keyed on 'book_title', so rows sharing a title are counted together
# regardless of their isbn.
book_rating_counts = preprocessed_books_df['book_title'].value_counts()
book_rating_counts

Unnamed: 0_level_0,count
book_title,Unnamed: 1_level_1
Wild Animus,296
Bridget Jones's Diary,224
Divine Secrets of the Ya-Ya Sisterhood: A Novel,209
The Nanny Diaries: A Novel,207
The Da Vinci Code,205
...,...
"The Mark of the Crown (Star Wars: Jedi Apprentice, Book 4)",1
"The Uncertain Path (Star Wars: Jedi Apprentice, Book 6)",1
"The Twisted Tale of Tiki Island (Give Yourself Goosebumps, No 21)",1
"Mark 947: A Life Shaped by God, Gender and Force of Will",1


In [None]:
# Minimum number of rating rows a title needs in order to be kept
MIN_RATINGS_PER_BOOK = 20

# Filter books with at least MIN_RATINGS_PER_BOOK ratings (result is an Index of titles)
books_with_enough_ratings = book_rating_counts[book_rating_counts >= MIN_RATINGS_PER_BOOK].index
books_with_enough_ratings

Index(['Wild Animus', 'Bridget Jones's Diary',
       'Divine Secrets of the Ya-Ya Sisterhood: A Novel',
       'The Nanny Diaries: A Novel', 'The Da Vinci Code', 'A Time to Kill',
       'The Secret Life of Bees', 'The Horse Whisperer',
       'Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))',
       'Snow Falling on Cedars',
       ...
       'The Story of Jonah (An Alice in Bibleland Storybook)',
       'Shadows of Steel', 'If Ever I Return, Pretty Peggy-O',
       'Running Scared', 'Cold Tea On A Hot Day', 'Motion to Suppress',
       'Only Love',
       'The Crepes of Wrath (Pennsylvania Dutch Mysteries with Recipes (Paperback))',
       'Odd Girl Out: The Hidden Culture of Aggression in Girls',
       'Serpent : A Novel from the NUMA Files (Numa Files Series)'],
      dtype='object', name='book_title', length=1768)

In [None]:
# Keep only the rows whose title passed the rating-count threshold
is_popular_book = preprocessed_books_df['book_title'].isin(books_with_enough_ratings)
preprocessed_books_df = preprocessed_books_df.loc[is_popular_book]
preprocessed_books_df.shape

(70435, 14)

In [None]:
# Preview the first rows that survived both the user and the book filters
preprocessed_books_df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
101,2977,25.0,440234743,0,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],richland,washington,usa
103,3363,29.0,440234743,0,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],knoxville,tennessee,usa
106,7346,49.0,440234743,9,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],sunnyvale,california,usa
110,9856,22.0,440234743,0,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],glendale,colorado,usa
115,13552,32.0,440234743,8,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],cordova,tennessee,usa


**Create Rating Matrix**

In [None]:
# Book-by-user table of ratings. When a (book_title, user_id) pair occurs on more than
# one row the values are averaged ('mean' is pivot_table's default, spelled out here).
rating_matrix = preprocessed_books_df.pivot_table(
    index='book_title',
    columns='user_id',
    values='rating',
    aggfunc='mean',
)
rating_matrix.shape

(1768, 813)

In [None]:
# Show the pivoted matrix; NaN marks (book, user) pairs with no rating row
rating_matrix

user_id,254,1733,2033,2110,2276,2766,2891,2977,3363,4017,...,273979,274061,274308,274808,275970,276680,277427,277639,278188,278418
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,,,,,,,,,,,...,,,0.0,,,,,,,
16 Lighthouse Road,,,,,,,,,,,...,,,,,,,,,0.0,
1984,9.0,,,,,,,,,,...,,,,,0.0,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2010: Odyssey Two,,,,,0.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You Belong to Me and Other True Cases (Ann Rule's Crime Files: Vol. 2),,,,,,,,,,,...,,,0.0,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,,,,,0.0,,...,,,,,0.0,,,,,
Zlata's Diary: A Child's Life in Sarajevo,,,,,,,,,0.0,,...,,,,,,,,,,
Zoya,,,,,,,,,,,...,0.0,,,,,,,,,


In [None]:
# Number of distinct rows (rating vectors) in the book-by-user matrix
unique_row_count = len(rating_matrix.drop_duplicates())
print(unique_row_count)

1768


In [None]:
# Fill missing values with 0 (assuming the absence of a rating is equivalent to 0)
# NOTE(review): the pivoted matrix already contains explicit 0.0 ratings, so after this
# fill an unrated (book, user) cell can no longer be told apart from a cell whose
# recorded rating was 0 -- confirm that this is the intended interpretation.
rating_matrix_filled = rating_matrix.fillna(0)
rating_matrix_filled

user_id,254,1733,2033,2110,2276,2766,2891,2977,3363,4017,...,273979,274061,274308,274808,275970,276680,277427,277639,278188,278418
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You Belong to Me and Other True Cases (Ann Rule's Crime Files: Vol. 2),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zlata's Diary: A Child's Life in Sarajevo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Rows whose user_id already appeared on an earlier row. This counts every row after a
# user's first one, so a large value only reflects that users have many rating rows.
duplicate_user_ids = preprocessed_books_df['user_id'].duplicated().sum()
print(f"Number of duplicate user IDs: {duplicate_user_ids}")

# `pivot` raises on repeated (index, columns) pairs, so the duplicates that actually
# matter are repeated (user_id, book_title) pairs. Report how many rows the
# de-duplication below is going to discard.
duplicate_user_book_pairs = preprocessed_books_df.duplicated(subset=['user_id', 'book_title']).sum()
print(f"Number of duplicate (user_id, book_title) pairs: {duplicate_user_book_pairs}")

# Keep the first row of every (user_id, book_title) pair
preprocessed_books_df_unique = preprocessed_books_df.drop_duplicates(subset=['user_id', 'book_title'])

# One row per user, one column per title; pairs without a rating row become 0
user_item_matrix = preprocessed_books_df_unique.pivot(index='user_id', columns='book_title', values='rating').fillna(0)

# Transpose the matrix for item-based filtering
item_user_matrix = user_item_matrix.T

Number of duplicate user IDs: 69622


In [None]:
# Exhaustive (brute-force) neighbour search over the book rows using cosine distance;
# 5 is the default neighbour count used when a query does not pass its own.
knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine')
knn.fit(item_user_matrix)


In [None]:
def recommend_books_knn(book_title, knn_model, item_user_matrix, top_n=5):
  """Return the titles of up to `top_n` books nearest to `book_title`.

  Args:
    book_title: Label of the query row in `item_user_matrix.index`
      (labels are assumed to be unique).
    knn_model: Object exposing `kneighbors(X, n_neighbors=...)` that returns
      `(distances, indices)`. The returned indices are used as row positions
      of `item_user_matrix`, so the model is expected to have been fitted on
      that same matrix.
    item_user_matrix: DataFrame with one row per book; the row belonging to
      `book_title` is used as the query vector.
    top_n: Maximum number of recommendations to return.

  Returns:
    List of index labels (book titles) in the order the model returned them,
    with the query book itself removed.

  Raises:
    KeyError: If `book_title` is not present in `item_user_matrix.index`.
  """
  # Row position of the query book
  book_index = item_user_matrix.index.get_loc(book_title)

  # Ask for one extra neighbour to leave room for the query book itself, but never
  # for more rows than the matrix holds (the model cannot return more than that).
  n_neighbors = min(top_n + 1, len(item_user_matrix))
  query_vector = item_user_matrix.iloc[book_index, :].to_numpy().reshape(1, -1)
  _, indices = knn_model.kneighbors(query_vector, n_neighbors=n_neighbors)

  # Remove the query book by position instead of assuming it is always the first
  # hit: with tied distances (identical or all-zero rating rows) it may appear
  # later in the list or not at all.
  recommended_indices = [i for i in indices[0] if i != book_index][:top_n]

  # Translate row positions back into book titles
  return [item_user_matrix.index[i] for i in recommended_indices]

In [None]:
# NOTE(review): `sorted_books` is not assigned in any cell shown above this one; on a
# fresh kernel this raises NameError unless it is defined elsewhere -- confirm.
# Titles at the first five positions of `sorted_books`
top_5_books = sorted_books.index[:5].tolist()

# Print the KNN recommendations for each of those titles
for book_title in top_5_books:
  recommendations_knn = recommend_books_knn(book_title, knn, item_user_matrix, top_n=5)
  print(f"Recommendations for '{book_title}': {recommendations_knn}")


Recommendations for 'The Da Vinci Code': ["Widow's Walk", 'TickTock', 'Touching Evil', 'Doing Good', 'The Arraignment']
Recommendations for 'Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))': ['Harry Potter and the Goblet of Fire (Book 4)', 'Harry Potter and the Prisoner of Azkaban (Book 3)', 'Harry Potter and the Order of the Phoenix (Book 5)', 'Harry Potter and the Chamber of Secrets (Book 2)', 'A Time to Kill']
Recommendations for 'The Secret Life of Bees': ['Under the Tuscan Sun', 'Good in Bed', "Dude, Where's My Country?", "Patty Jane's House of Curl", 'Snow Falling on Cedars']
Recommendations for 'Harry Potter and the Goblet of Fire (Book 4)': ['Harry Potter and the Prisoner of Azkaban (Book 3)', 'Harry Potter and the Order of the Phoenix (Book 5)', 'Harry Potter and the Chamber of Secrets (Book 2)', "Harry Potter and the Sorcerer's Stone (Book 1)", "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"]
Recommendations for 'Bridget Jones's Diary': ['T

In [None]:
# NOTE(review): `sorted_books` is not assigned in any cell shown above this one -- confirm
# where it comes from. Position 3 is the FOURTH entry of `sorted_books`, not the first;
# the variable name is kept unchanged so later cells that use it still work.
most_popular_book = sorted_books.index[3]
# Print the book title
print(most_popular_book)

# A rating at or above this value is treated as "liked" (label 1)
LIKED_RATING_THRESHOLD = 4

# Define a range of top_n values to test
top_n_values = range(1, 11)

for top_n in top_n_values:
  # Get recommendations using knn
  recommendations_knn = recommend_books_knn(most_popular_book, knn, item_user_matrix, top_n=top_n)

  # Ratings given by ANY user to any of the recommended titles
  actual_ratings = preprocessed_books_df.loc[
      preprocessed_books_df['book_title'].isin(recommendations_knn), 'rating'
  ].to_numpy()

  # Binary ground truth: 1 when the rating reaches the threshold, 0 otherwise
  actual_labels = (actual_ratings >= LIKED_RATING_THRESHOLD).astype(int)

  # Every recommended book is predicted as "liked". With an all-ones prediction there
  # are no false negatives, so recall is 1.0 whenever a positive label exists and
  # accuracy equals precision; only precision carries information here.
  predicted_labels = np.ones_like(actual_labels)

  # zero_division=0 is passed to every ratio metric so a degenerate label set yields 0
  # consistently instead of a warning on some metrics only.
  accuracy = accuracy_score(actual_labels, predicted_labels)
  precision = precision_score(actual_labels, predicted_labels, zero_division=0)
  recall = recall_score(actual_labels, predicted_labels, zero_division=0)
  f1 = f1_score(actual_labels, predicted_labels, zero_division=0)

  print(f"Top_n: {top_n}")
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-score: {f1:.4f}")
  print("--------------------")


Harry Potter and the Goblet of Fire (Book 4)
Top_n: 1
Accuracy: 0.6190
Precision: 0.6190
Recall: 1.0000
F1-score: 0.7647
--------------------
Top_n: 2
Accuracy: 0.5515
Precision: 0.5515
Recall: 1.0000
F1-score: 0.7109
--------------------
Top_n: 3
Accuracy: 0.5811
Precision: 0.5811
Recall: 1.0000
F1-score: 0.7350
--------------------
Top_n: 4
Accuracy: 0.5672
Precision: 0.5672
Recall: 1.0000
F1-score: 0.7238
--------------------
Top_n: 5
Accuracy: 0.4990
Precision: 0.4990
Recall: 1.0000
F1-score: 0.6658
--------------------
Top_n: 6
Accuracy: 0.4809
Precision: 0.4809
Recall: 1.0000
F1-score: 0.6495
--------------------
Top_n: 7
Accuracy: 0.4682
Precision: 0.4682
Recall: 1.0000
F1-score: 0.6378
--------------------
Top_n: 8
Accuracy: 0.4664
Precision: 0.4664
Recall: 1.0000
F1-score: 0.6361
--------------------
Top_n: 9
Accuracy: 0.4498
Precision: 0.4498
Recall: 1.0000
F1-score: 0.6205
--------------------
Top_n: 10
Accuracy: 0.4375
Precision: 0.4375
Recall: 1.0000
F1-score: 0.6087
-----