In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
books_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dsci351/Preprocessed_data.csv')

**Data Cleaning**

In [None]:
books_df.shape

(1031175, 19)

In [None]:
books_df.columns

Index(['Unnamed: 0', 'user_id', 'location', 'age', 'isbn', 'rating',
       'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_s', 'img_m', 'img_l', 'Summary', 'Language', 'Category', 'city',
       'state', 'country'],
      dtype='object')

In [None]:
# Drop irrelevant columns
cleaned_books_df = books_df.drop(['Unnamed: 0', 'location', 'img_s', 'img_m', 'img_l'], axis=1)

In [None]:
# Only keep users from 'usa'
# NOTE(review): str.contains performs a case-insensitive *substring* match, so any
# country value that merely contains the letters "usa" is kept as well - confirm
# this is intended, otherwise compare the normalized value for equality.
# na=False makes rows with a missing country evaluate to False, i.e. they are dropped.
cleaned_books_df = cleaned_books_df[cleaned_books_df['country'].str.contains('usa', case=False, na=False)]

In [None]:
cleaned_books_df.shape

(746495, 14)

In [None]:
# Only keep users who rated books in 'en' language
cleaned_books_df = cleaned_books_df[cleaned_books_df['Language'] == 'en']

In [None]:
cleaned_books_df.shape

(475650, 14)

In [None]:
# Convert 'year_of_publication' to integer data type.
# cleaned_books_df is the result of boolean-mask filtering, so writing a column into
# it in place makes pandas emit SettingWithCopyWarning; .assign() returns a new frame
# with the converted column in the same position and avoids the ambiguity.
cleaned_books_df = cleaned_books_df.assign(
    year_of_publication=pd.to_numeric(cleaned_books_df['year_of_publication']).astype('int64')
)

In [None]:
cleaned_books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 475650 entries, 0 to 1031173
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              475650 non-null  int64  
 1   age                  475650 non-null  float64
 2   isbn                 475650 non-null  object 
 3   rating               475650 non-null  int64  
 4   book_title           475650 non-null  object 
 5   book_author          475650 non-null  object 
 6   year_of_publication  475650 non-null  int64  
 7   publisher            475650 non-null  object 
 8   Summary              475650 non-null  object 
 9   Language             475650 non-null  object 
 10  Category             475650 non-null  object 
 11  city                 475154 non-null  object 
 12  state                473625 non-null  object 
 13  country              475650 non-null  object 
dtypes: float64(1), int64(3), object(10)
memory usage: 54.4+ MB


In [None]:
cleaned_books_df.shape

(475650, 14)

In [None]:
cleaned_books_df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
0,2,18.0,195153448,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,Provides an introduction to classical myths pl...,en,['Social Science'],stockton,california,usa
16,2954,71.0,60973129,8,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"Here, for the first time in paperback, is an o...",en,['1940-1949'],wichita,kansas,usa
19,35704,53.0,374157065,6,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",en,['Medical'],kansas city,missouri,usa
20,83160,65.0,374157065,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",en,['Medical'],oregon city,oregon,usa
21,110912,36.0,374157065,10,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"Describes the great flu epidemic of 1918, an o...",en,['Medical'],milpitas,california,usa


**Data Preprocessing**

Exclude users with fewer than 100 ratings, then exclude books with fewer than 20 ratings (book counts are computed on the rows that remain after the user filter).

In [None]:
# Count the number of ratings per user
user_rating_counts = cleaned_books_df['user_id'].value_counts()
user_rating_counts

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
198711,3976
153662,3837
98391,3719
35859,3565
278418,2383
...,...
63994,1
60360,1
54245,1
30584,1


In [None]:
# Filter users with at least 100 ratings
users_with_enough_ratings = user_rating_counts[user_rating_counts >= 100].index
users_with_enough_ratings

Index([198711, 153662,  98391,  35859, 278418,  76352,  16795, 235105, 230522,
       110973,
       ...
        89014, 143163, 267249, 196985,  41781, 145431, 111578, 176667, 260944,
        27812],
      dtype='int64', name='user_id', length=814)

In [None]:
# Exclude users with fewer than 100 ratings
preprocessed_books_df = cleaned_books_df[cleaned_books_df['user_id'].isin(users_with_enough_ratings)]
preprocessed_books_df.shape

(262337, 14)

In [None]:
# Count the number of ratings per book
book_rating_counts = preprocessed_books_df['book_title'].value_counts()
book_rating_counts

Unnamed: 0_level_0,count
book_title,Unnamed: 1_level_1
Wild Animus,296
Bridget Jones's Diary,224
Divine Secrets of the Ya-Ya Sisterhood: A Novel,209
The Nanny Diaries: A Novel,207
The Da Vinci Code,205
...,...
"The Mark of the Crown (Star Wars: Jedi Apprentice, Book 4)",1
"The Uncertain Path (Star Wars: Jedi Apprentice, Book 6)",1
"The Twisted Tale of Tiki Island (Give Yourself Goosebumps, No 21)",1
"Mark 947: A Life Shaped by God, Gender and Force of Will",1


In [None]:
# Filter books with at least 20 ratings
books_with_enough_ratings = book_rating_counts[book_rating_counts >= 20].index
books_with_enough_ratings

Index(['Wild Animus', 'Bridget Jones's Diary',
       'Divine Secrets of the Ya-Ya Sisterhood: A Novel',
       'The Nanny Diaries: A Novel', 'The Da Vinci Code', 'A Time to Kill',
       'The Secret Life of Bees', 'The Horse Whisperer',
       'Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))',
       'Snow Falling on Cedars',
       ...
       'The Story of Jonah (An Alice in Bibleland Storybook)',
       'Shadows of Steel', 'If Ever I Return, Pretty Peggy-O',
       'Running Scared', 'Cold Tea On A Hot Day', 'Motion to Suppress',
       'Only Love',
       'The Crepes of Wrath (Pennsylvania Dutch Mysteries with Recipes (Paperback))',
       'Odd Girl Out: The Hidden Culture of Aggression in Girls',
       'Serpent : A Novel from the NUMA Files (Numa Files Series)'],
      dtype='object', name='book_title', length=1768)

In [None]:
# Exclude books with fewer than 20 ratings
preprocessed_books_df = preprocessed_books_df[preprocessed_books_df['book_title'].isin(books_with_enough_ratings)]
preprocessed_books_df.shape

(70435, 14)

In [None]:
preprocessed_books_df.head()

Unnamed: 0,user_id,age,isbn,rating,book_title,book_author,year_of_publication,publisher,Summary,Language,Category,city,state,country
101,2977,25.0,440234743,0,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],richland,washington,usa
103,3363,29.0,440234743,0,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],knoxville,tennessee,usa
106,7346,49.0,440234743,9,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],sunnyvale,california,usa
110,9856,22.0,440234743,0,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],glendale,colorado,usa
115,13552,32.0,440234743,8,The Testament,John Grisham,1999,Dell,"A suicidal billionaire, a burnt-out Washington...",en,['Fiction'],cordova,tennessee,usa


**Create Rating Matrix**

In [None]:
rating_matrix = preprocessed_books_df.pivot_table(values='rating', index='user_id', columns='book_title')
rating_matrix.shape

(813, 1768)

In [None]:
rating_matrix

book_title,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,24 Hours,4 Blondes,7b,A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,A Case of Need,...,Wuthering Heights,Year of Wonders: A Novel of the Plague,Yesterday,You Belong To Me,You Belong to Me,You Belong to Me and Other True Cases (Ann Rule's Crime Files: Vol. 2),Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zlata's Diary: A Child's Life in Sarajevo,Zoya,"\O\"" Is for Outlaw"""
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,9.0,,,,,0.0,,,...,,,,,,,,,,
1733,,,,,,,,,,,...,,,,,,,,,,
2033,,,,,,,,,,,...,,,,,,,,,,
2110,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,0.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276680,,,,,,,,,,,...,,,,,,,,,,
277427,,,,,,10.0,,,,,...,,0.0,,,,,,,,
277639,,,,,,,,,,,...,,,,,,,,,,
278188,,0.0,,,,,,,,,...,,,,,,,,,,


In [None]:
# Get the number of unique rows
unique_row_count = rating_matrix.drop_duplicates().shape[0]
print(unique_row_count)

813


In [None]:
# Fill missing values with 0 (assuming the absence of a rating is equivalent to 0)
# NOTE(review): the source data already contains explicit ratings of 0 (visible in the
# head() output and in the pivoted matrix above), so after this step a book the user
# never rated cannot be told apart from one the user rated 0. Keep using the unfilled
# `rating_matrix` wherever "not rated" must be detected.
rating_matrix_filled = rating_matrix.fillna(0)
rating_matrix_filled

book_title,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,24 Hours,4 Blondes,7b,A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,A Case of Need,...,Wuthering Heights,Year of Wonders: A Novel of the Plague,Yesterday,You Belong To Me,You Belong to Me,You Belong to Me and Other True Cases (Ann Rule's Crime Files: Vol. 2),Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zlata's Diary: A Child's Life in Sarajevo,Zoya,"\O\"" Is for Outlaw"""
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277427,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Perform User-Based Collaborative Filtering with KNN**

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the data
scaler = StandardScaler()
rating_matrix_scaled = scaler.fit_transform(rating_matrix_filled)

In [None]:
# Create the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
knn.fit(rating_matrix_scaled)

In [None]:
def get_recommendations(user_id, rating_matrix, knn_model, rating_matrix_scaled, n_neighbors, n_recommendations=10, verbose=True):
    """Recommend items to ``user_id`` based on the ratings of its nearest neighbors.

    Parameters
    ----------
    user_id
        Label of the target user; must be present in ``rating_matrix.index``.
    rating_matrix : pandas.DataFrame
        One row per user, one column per item, NaN where the user has no rating.
    knn_model
        Fitted neighbors model exposing ``kneighbors(X, n_neighbors=...)``. Its
        row positions must correspond to the rows of ``rating_matrix``.
    rating_matrix_scaled
        Numeric, row-aligned representation of ``rating_matrix`` used as the query.
    n_neighbors : int
        Number of *other* users to use as neighbors.
    n_recommendations : int, default 10
        Maximum number of items to return.
    verbose : bool, default True
        When True, print the neighbors and their distances (original behavior).
        Pass False inside evaluation loops to avoid flooding the output.

    Returns
    -------
    pandas.Series
        Mean neighbor rating per item the user has not rated, sorted in descending
        order and limited to ``n_recommendations``. Items that none of the neighbors
        rated are omitted. Empty when there are no neighbors or nothing left to rate.

    Raises
    ------
    KeyError
        If ``user_id`` is not in ``rating_matrix.index``.
    """
    # Get index of the user
    user_index = rating_matrix.index.get_loc(user_id)

    # Ask for one extra neighbor because the query row is itself part of the fitted
    # data, but never for more rows than exist (kneighbors raises in that case).
    n_query = min(n_neighbors + 1, len(rating_matrix.index))
    distances, indices = knn_model.kneighbors([rating_matrix_scaled[user_index]], n_neighbors=n_query)
    distances = np.asarray(distances).flatten()
    indices = np.asarray(indices).flatten()

    # Remove the target user by position instead of assuming it is the first hit:
    # when another user has an identical vector the tie at distance 0 can put that
    # user first, and slicing [1:] would then keep the target as its own neighbor.
    is_other_user = indices != user_index
    neighbors_indices = indices[is_other_user][:n_neighbors]
    distances = distances[is_other_user][:n_neighbors]

    if len(neighbors_indices) == 0:
        print(f"No neighbors found for user {user_id}.")
        return pd.Series(dtype=float)

    # Get similar users
    similar_users = rating_matrix.index[neighbors_indices]
    if verbose:
        print(f"Similar users to {user_id}:")
        for i, user in enumerate(similar_users):
            print(f"User: {user}, Distance: {distances[i]}")

    # Ratings given by the nearest neighbors (NaN where a neighbor has no rating)
    neighbor_ratings = rating_matrix.iloc[neighbors_indices]

    # Recommend items that the user has not rated yet
    user_ratings = rating_matrix.loc[user_id]
    unrated_items = user_ratings[user_ratings.isna()].index

    if len(unrated_items) == 0:
        print(f"User {user_id} has rated all items.")
        return pd.Series(dtype=float)

    # Unweighted mean of the neighbors' ratings per unrated item. Items with no
    # neighbor rating at all have a NaN mean and are dropped: there is no evidence
    # to recommend them.
    recommendations = neighbor_ratings[unrated_items].mean().dropna().sort_values(ascending=False)

    return recommendations.head(n_recommendations)

In [None]:
user_id = 2033
if user_id not in rating_matrix.index:
    print(f"User ID {user_id} not found in the dataset.")
else:
    recommendations = get_recommendations(user_id, rating_matrix, knn, rating_matrix_scaled, n_neighbors=10, n_recommendations=10)
    print(recommendations)

Similar users to 2033:
User: 51386, Distance: 0.6993345281500734
User: 77809, Distance: 0.7042121104782539
User: 79186, Distance: 0.7701577083139439
User: 179978, Distance: 0.795280923782677
User: 208568, Distance: 0.7953880272221509
User: 208141, Distance: 0.8010742442401894
User: 201783, Distance: 0.8031639446178255
User: 175003, Distance: 0.8173567676878375
User: 219683, Distance: 0.8265117988582696
User: 170634, Distance: 0.8319682436038494
book_title
Skeleton Crew                                                       10.0
The Te of Piglet                                                    10.0
The 9 Steps to Financial Freedom                                    10.0
The Cat in the Hat                                                  10.0
Suzanne's Diary for Nicholas                                        10.0
Matilda                                                             10.0
What to Expect the First Year                                       10.0
Midnight in the Garden of Goo

In [None]:
user_id = 201783
if user_id not in rating_matrix.index:
    print(f"User ID {user_id} not found in the dataset.")
else:
    recommendations = get_recommendations(user_id, rating_matrix, knn, rating_matrix_scaled, n_neighbors=10, n_recommendations=10)
    print(recommendations)

Similar users to 201783:
User: 179978, Distance: 0.6639419305818689
User: 198711, Distance: 0.7286309706385543
User: 208141, Distance: 0.7426408755469847
User: 175003, Distance: 0.763641490895293
User: 170634, Distance: 0.7842833922709755
User: 2033, Distance: 0.8031639446178255
User: 196985, Distance: 0.8147167213522095
User: 259625, Distance: 0.8147167213522095
User: 210792, Distance: 0.8147167213522095
User: 133868, Distance: 0.8147167213522095
book_title
The 9 Steps to Financial Freedom                                                  10.000000
Christmas Box (Christmas Box Trilogy)                                             10.000000
Ender's Shadow                                                                    10.000000
Charlie and the Chocolate Factory                                                 10.000000
Matilda                                                                           10.000000
What to Expect the First Year                                                

**Evaluate Performance**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Split data into train and test sets
train_df, test_df = train_test_split(preprocessed_books_df, test_size=0.2, random_state=42)

In [None]:
# Construct rating matrix for train set
train_rating_matrix = train_df.pivot_table(values='rating', index='user_id', columns='book_title')
train_rating_matrix_filled = train_rating_matrix.fillna(0)

In [None]:
# Standardize the train data
scaler = StandardScaler()
train_rating_matrix_scaled = scaler.fit_transform(train_rating_matrix_filled)

In [None]:
# Initialize dictionary to store metrics for different n_neighbors
metrics = {}

# Define threshold for positive/negative classification
threshold = 7

In [None]:
import contextlib
import io

# train_rating_matrix and train_rating_matrix_scaled are built in the cells above.
# The training data does not depend on n, so the model is fitted once, not per n.
knn.fit(train_rating_matrix_scaled)

# Fallback prediction when a test book is not among the user's recommendations:
# the user's mean training rating (mean() skips the NaN entries).
user_mean_ratings = train_rating_matrix.mean(axis=1)

for n in range(1, 11):
    # Fresh accumulators for every n. Previously these lists were created once
    # outside the loop, so the metrics reported for n also contained the
    # predictions made for every smaller n.
    y_true, y_pred = [], []
    actual_ratings, predicted_ratings = [], []

    # A user's recommendations do not depend on the test row, so compute them once
    # per user instead of once per test rating.
    recommendations_by_user = {}

    # Evaluate on the test set
    for user_id, book_title, actual_rating in zip(test_df['user_id'], test_df['book_title'], test_df['rating']):
        if user_id not in train_rating_matrix.index:
            continue

        if user_id not in recommendations_by_user:
            # get_recommendations prints every neighbor; silence it here so the
            # notebook output is not flooded and truncated.
            with contextlib.redirect_stdout(io.StringIO()):
                recommendations_by_user[user_id] = get_recommendations(user_id, train_rating_matrix, knn, train_rating_matrix_scaled, n_neighbors=n, n_recommendations=10)
        recommendations = recommendations_by_user[user_id]

        # Use the neighbor-based score when available, otherwise the user's mean
        predicted_rating = recommendations.get(book_title, np.nan)
        if np.isnan(predicted_rating):
            predicted_rating = user_mean_ratings.loc[user_id]
        if np.isnan(predicted_rating):
            # No usable prediction at all: leave the sample out of every metric so
            # the classification and regression metrics cover the same rows.
            continue

        # Classify as positive (1) or negative (0) based on the threshold
        y_true.append(1 if actual_rating >= threshold else 0)
        y_pred.append(1 if predicted_rating >= threshold else 0)
        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)

    # Calculate metrics
    if y_true and y_pred:  # Ensure lists are not empty
        metrics[n] = {
            'Accuracy': accuracy_score(y_true, y_pred),
            # zero_division=0 keeps the score defined when nothing is predicted positive
            'Precision': precision_score(y_true, y_pred, zero_division=0),
            'Recall': recall_score(y_true, y_pred, zero_division=0),
            'F1-Score': f1_score(y_true, y_pred, zero_division=0),
            # The `squared=False` argument was deprecated and later removed from
            # mean_squared_error; taking the square root works on every version.
            'RMSE': float(np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))),
        }

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
User: 21576, Distance: 0.8674611047184805
User: 62272, Distance: 0.8762961119172726
User: 155219, Distance: 0.8856791203998149
User: 208147, Distance: 0.8868295758660509
User: 249894, Distance: 0.8935690516332712
User: 229741, Distance: 0.9022045025341527
Similar users to 21014:
User: 11993, Distance: 0.8704644307548683
User: 234359, Distance: 0.8948303216339095
User: 223154, Distance: 0.8954050656888959
User: 260897, Distance: 0.897644144391586
User: 30824, Distance: 0.9030897960815525
User: 30533, Distance: 0.907357997534677
User: 163804, Distance: 0.9077009088209219
User: 182993, Distance: 0.9091599333225612
User: 29855, Distance: 0.9102332570089068
User: 242361, Distance: 0.9119149459033173
Similar users to 185233:
User: 2110, Distance: 0.6924880611655417
User: 102359, Distance: 0.765911406700361
User: 236283, Distance: 0.7687213652832499
User: 52199, Distance: 0.7858183783709025
User: 126492, Distance: 0.815843640046

In [None]:
# Report every stored metric, one block of lines per neighborhood size
for n_neighbors, metric_values in metrics.items():
    print(f"Metrics for n_neighbors={n_neighbors}:")
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'RMSE'):
        print(f"  {metric_name}: {metric_values[metric_name]}")
    print()

Metrics for n_neighbors=1:
  Accuracy: 0.8271455952296444
  Precision: 0.6554054054054054
  Recall: 0.07677087455480808
  F1-Score: 0.13744243712362736
  RMSE: 3.142017965413242

Metrics for n_neighbors=2:
  Accuracy: 0.8261872648541209
  Precision: 0.622848200312989
  Recall: 0.07874950534230313
  F1-Score: 0.13982083260144035
  RMSE: 3.15512393741417

Metrics for n_neighbors=3:
  Accuracy: 0.8255365466979011
  Precision: 0.6048387096774194
  Recall: 0.07914523149980214
  F1-Score: 0.13997433803802636
  RMSE: 3.163431490696282

Metrics for n_neighbors=4:
  Accuracy: 0.8247675161496415
  Precision: 0.5851528384279476
  Recall: 0.07954095765730115
  F1-Score: 0.14004528827730361
  RMSE: 3.1743974090716214

Metrics for n_neighbors=5:
  Accuracy: 0.824149925463193
  Precision: 0.5713467048710602
  Recall: 0.07890779580530273
  F1-Score: 0.13866481223922114
  RMSE: 3.1840489219069137

Metrics for n_neighbors=6:
  Accuracy: 0.82384467949173
  Precision: 0.5644171779141104
  Recall: 0.078881

**Offline Evaluation Metrics**

Spearman rank correlation

In [None]:
from scipy.stats import spearmanr

def calculate_spearman_rank_correlation(test_df, rating_matrix, knn_model, rating_matrix_scaled, n_neighbors, n_recommendations=10):
    """Average per-user Spearman correlation between held-out and predicted ratings.

    For every user in ``test_df`` that is also a row of ``rating_matrix``, two score
    vectors are built over all columns of ``rating_matrix``: the user's ratings in
    ``test_df`` and the scores returned by ``get_recommendations``. Items missing
    from either side count as 0. The Spearman coefficient of the two vectors is
    computed per user and averaged.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Needs the columns 'user_id', 'book_title' and 'rating'.
    rating_matrix : pandas.DataFrame
        User x item matrix passed on to ``get_recommendations``.
    knn_model, rating_matrix_scaled, n_neighbors, n_recommendations
        Forwarded unchanged to ``get_recommendations``.

    Returns
    -------
    float or None
        Mean coefficient over the users for which it is defined, or None when it
        is not defined for any user.

    Notes
    -----
    NOTE(review): only the top ``n_recommendations`` items carry a predicted score,
    so most entries of both vectors are tied at 0; expect coefficients close to 0.
    """
    items = rating_matrix.columns
    user_spearman_scores = []

    for user_id in test_df['user_id'].unique():
        if user_id not in rating_matrix.index:
            continue

        # Get true ratings for the user
        true_ratings = test_df[test_df['user_id'] == user_id].set_index('book_title')['rating'].to_dict()

        # Get predictions for the user
        predictions = get_recommendations(user_id, rating_matrix, knn_model, rating_matrix_scaled, n_neighbors, n_recommendations=n_recommendations)

        # Score vectors aligned on the item catalogue. spearmanr ranks its inputs
        # itself (with proper tie handling), so it must receive the scores. The
        # previous code passed np.argsort output, i.e. item positions rather than
        # ranks, which does not measure rank agreement.
        true_scores = np.array([true_ratings.get(book, 0) for book in items], dtype=float)
        pred_scores = np.array([predictions.get(book, 0) for book in items], dtype=float)
        pred_scores = np.nan_to_num(pred_scores, nan=0.0)

        # The coefficient is undefined for fewer than two items or a constant vector
        if len(items) < 2:
            continue
        if true_scores.min() == true_scores.max() or pred_scores.min() == pred_scores.max():
            continue

        spearman_corr = spearmanr(true_scores, pred_scores).correlation
        if not np.isnan(spearman_corr):
            user_spearman_scores.append(spearman_corr)

    # Calculate the average Spearman rank correlation coefficient
    average_spearman = np.mean(user_spearman_scores) if user_spearman_scores else None

    return average_spearman

In [None]:
import contextlib
import io

# Initialize dictionary to store Spearman scores for different n_neighbors
spearman_scores = {}

for n in range(1, 11):
    # Evaluate against the *training* matrices:
    #  - `knn` was last fitted on train_rating_matrix_scaled in the evaluation cell
    #    above, so the row positions it returns refer to train_rating_matrix;
    #  - the full rating_matrix already contains the test ratings, so test books
    #    would count as "already rated" and could never be recommended.
    # The per-neighbor prints of get_recommendations are silenced so that the result
    # lines below are not pushed out of the (truncated) cell output.
    with contextlib.redirect_stdout(io.StringIO()):
        average_spearman = calculate_spearman_rank_correlation(test_df, train_rating_matrix, knn, train_rating_matrix_scaled, n_neighbors=n, n_recommendations=10)
    spearman_scores[n] = average_spearman
    print(f"Spearman Rank Correlation for top-10 recommendations for n_neighbors={n}: {average_spearman}")

# Print out the Spearman scores for different n_neighbors
print("\nSpearman Rank Correlation Scores for top-10 recommendations:")
for n_neighbors, spearman_corr in spearman_scores.items():
    print(f"n_neighbors={n_neighbors}: {spearman_corr}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
User: 255092, Distance: 0.8927519306484912
User: 223087, Distance: 0.9078278701095406
User: 249628, Distance: 0.9144243810696613
User: 24921, Distance: 0.9189824385151626
Similar users to 147224:
User: 115003, Distance: 0.7973936865531495
User: 2891, Distance: 0.8058918434013969
User: 230522, Distance: 0.8161616853913276
User: 127359, Distance: 0.8681741452700827
User: 251394, Distance: 0.8766999179980104
User: 95923, Distance: 0.8812812401537509
User: 251339, Distance: 0.8902462260910644
User: 271245, Distance: 0.8902462260910644
User: 238545, Distance: 0.8902462260910644
User: 198621, Distance: 0.8902462260910644
Similar users to 145641:
User: 235842, Distance: 0.8643697650693952
User: 133747, Distance: 0.9098764140608963
User: 56856, Distance: 0.9149311978942682
User: 271705, Distance: 0.9204281706504904
User: 101209, Distance: 0.9216942420659028
User: 92979, Distance: 0.9252715495434062
User: 120598, Distance: 0.92909

In [None]:
import contextlib
import io

# Initialize dictionary to store Spearman scores for different n_neighbors
spearman_scores = {}

for n in range(1, 11):
    # Evaluate against the *training* matrices:
    #  - `knn` was last fitted on train_rating_matrix_scaled in the evaluation cell
    #    above, so the row positions it returns refer to train_rating_matrix;
    #  - the full rating_matrix already contains the test ratings, so test books
    #    would count as "already rated" and could never be recommended.
    # The per-neighbor prints of get_recommendations are silenced so that the result
    # lines below are not pushed out of the (truncated) cell output.
    with contextlib.redirect_stdout(io.StringIO()):
        average_spearman = calculate_spearman_rank_correlation(test_df, train_rating_matrix, knn, train_rating_matrix_scaled, n_neighbors=n, n_recommendations=20)
    spearman_scores[n] = average_spearman
    print(f"Spearman Rank Correlation for top-20 recommendations for n_neighbors={n}: {average_spearman}")

# Print out the Spearman scores for different n_neighbors
print("\nSpearman Rank Correlation Scores for top-20 recommendations:")
for n_neighbors, spearman_corr in spearman_scores.items():
    print(f"n_neighbors={n_neighbors}: {spearman_corr}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
User: 255092, Distance: 0.8927519306484912
User: 223087, Distance: 0.9078278701095406
User: 249628, Distance: 0.9144243810696613
User: 24921, Distance: 0.9189824385151626
Similar users to 147224:
User: 115003, Distance: 0.7973936865531495
User: 2891, Distance: 0.8058918434013969
User: 230522, Distance: 0.8161616853913276
User: 127359, Distance: 0.8681741452700827
User: 251394, Distance: 0.8766999179980104
User: 95923, Distance: 0.8812812401537509
User: 251339, Distance: 0.8902462260910644
User: 271245, Distance: 0.8902462260910644
User: 238545, Distance: 0.8902462260910644
User: 198621, Distance: 0.8902462260910644
Similar users to 145641:
User: 235842, Distance: 0.8643697650693952
User: 133747, Distance: 0.9098764140608963
User: 56856, Distance: 0.9149311978942682
User: 271705, Distance: 0.9204281706504904
User: 101209, Distance: 0.9216942420659028
User: 92979, Distance: 0.9252715495434062
User: 120598, Distance: 0.92909

R-score

In [None]:
def calculate_r_score(user_id, recommendations, test_df, L=10, alpha=0.5):
    """Average discounted utility of a ranked recommendation list for one user.

    Walks ``recommendations`` (a Series whose index holds item titles, best first).
    For every item that also appears in the user's rows of ``test_df`` it adds
    ``max(actual - neutral, 0) / 2 ** ((rank - 1) / alpha)``, where ``neutral`` is
    the mean of the user's test ratings and ``rank`` is the 1-based list position.
    Iteration stops once position ``L`` has been handled.

    Returns the sum divided by the number of matched items, or NaN when no
    recommended item was found among the user's test ratings.
    """
    # Test ratings of this user, keyed by title
    held_out = test_df.loc[test_df['user_id'] == user_id].set_index('book_title')['rating']

    # The user's mean test rating acts as the neutral rating
    baseline = held_out.mean()

    contributions = []
    for position, (title, _predicted) in enumerate(recommendations.items()):
        if title in held_out.index:
            observed = held_out.loc[title]
            # A title rated more than once comes back as a Series; use its first value
            if isinstance(observed, pd.Series):
                observed = observed.iloc[0]

            gain = max(observed - baseline, 0)
            # position is zero-based, so it equals rank - 1
            discount = 2 ** (position / alpha)
            contributions.append(gain / discount)

        # The cut-off is checked after the item at this position has been handled
        if position + 1 >= L:
            break

    if not contributions:
        return np.nan

    return sum(contributions) / len(contributions)

In [None]:
def evaluate_r_score(knn_model, train_rating_matrix, test_df, rating_matrix_scaled, n, L, alpha=2):
    """Mean R-score over the test users that are present in the training matrix.

    For each such user, the top ``L`` recommendations are obtained from
    ``get_recommendations`` with ``n`` neighbors and scored by ``calculate_r_score``.
    Users with an empty recommendation list or a NaN score are left out.
    Returns NaN when no user produced a score.
    """
    known_users = train_rating_matrix.index
    per_user_scores = []

    for uid in test_df['user_id'].unique():
        if uid in known_users:
            recs = get_recommendations(uid, train_rating_matrix, knn_model, rating_matrix_scaled, n_neighbors=n, n_recommendations=L)
            if not recs.empty:
                score = calculate_r_score(uid, recs, test_df, L, alpha=alpha)
                if not np.isnan(score):
                    per_user_scores.append(score)

    return np.mean(per_user_scores) if len(per_user_scores) != 0 else np.nan

In [None]:
# Compute the R-score for every neighborhood size from 1 to 10
r_scores_by_n = {}
for n in range(1, 11):
    r_score = evaluate_r_score(
        knn,
        train_rating_matrix,
        test_df,
        train_rating_matrix_scaled,
        n=n,
        L=10,
        alpha=2,
    )
    print(f"R-Score for n={n}: {r_score}")
    r_scores_by_n[n] = r_score

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
User: 76499, Distance: 0.8946624130717009
User: 36715, Distance: 0.8967682873872496
User: 30824, Distance: 0.9011672732251189
User: 95923, Distance: 0.9038340707219226
User: 101305, Distance: 0.9083366882333019
Similar users to 106225:
User: 223154, Distance: 0.8438235251071975
User: 22625, Distance: 0.8453576426038543
User: 25601, Distance: 0.8755123685321472
User: 223087, Distance: 0.8849656684830727
User: 172512, Distance: 0.8877630147415905
User: 255092, Distance: 0.8887855961693872
User: 85993, Distance: 0.889278002260392
User: 73394, Distance: 0.9004511188054737
User: 249628, Distance: 0.9079652059001301
User: 231827, Distance: 0.916740932553414
Similar users to 147224:
User: 115003, Distance: 0.7534562269187941
User: 2891, Distance: 0.8231377444227675
User: 230522, Distance: 0.8318775862703595
User: 127359, Distance: 0.886097103316103
User: 95923, Distance: 0.8893412743109554
User: 251394, Distance: 0.8933344348647

In [None]:
print(r_scores_by_n)

{1: 0.7052956823633222, 2: 0.7351961556377895, 3: 0.5508598319349033, 4: 0.43867926366198395, 5: 0.5511304999328153, 6: 0.5039570280995685, 7: 0.6252825209137143, 8: 0.6008484670504077, 9: 0.60747675738075, 10: 0.3507794955789136}


In [None]:
# One line per neighborhood size, score shown as a plain Python float
print("R-scores (top-10 items) by n_neighbors:")
for key, value in r_scores_by_n.items():
    print(f"n_neighbors={key}:", float(value))

R-scores (top-10 items) by n_neighbors:
n_neighbors=1: 0.7052956823633222
n_neighbors=2: 0.7351961556377895
n_neighbors=3: 0.5508598319349033
n_neighbors=4: 0.43867926366198395
n_neighbors=5: 0.5511304999328153
n_neighbors=6: 0.5039570280995685
n_neighbors=7: 0.6252825209137143
n_neighbors=8: 0.6008484670504077
n_neighbors=9: 0.60747675738075
n_neighbors=10: 0.3507794955789136
