This is the code for the Cosine Similarity recommendation system. The limitations of this is:
1. Only take book data into account, and doesn't compare user-to-user
2. The amount of memory and computing power is large, with the whole book dataset python needs to allocate ~80Gb. Currently to get around this I'm using 20000 instances of book data.

*Tunable parameters:
- Number of training instances
- m: maximum number of books to consider for recommendations (take m books with highest rating from user and consider them for similarity)


In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from brs_data_preprocessing import get_preprocessed_data as preproc, merged_book_ratings as merge

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/legoeuro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
usersDf, bookDf, ratingDf = preproc('input/Users.csv', 'input/Books.csv', 'input/Ratings.csv')

#configs
#Convert all strings to categorical (might not be good for title)
bookDf['Book-Title'] = bookDf['Book-Title'].astype('category')
bookDf['Book-Author'] = bookDf['Book-Author'].astype('category')
bookDf['Publisher'] = bookDf['Publisher'].astype('category')

ratingDf['ISBN'] = ratingDf['ISBN'].astype(str)
bookDf['ISBN'] = bookDf["ISBN"].astype(str)
ratingDf['User-ID'] = ratingDf['User-ID'].astype(int)
usersDf['User-ID'] = ratingDf['User-ID'].astype(int)

ratingDf = ratingDf.iloc[:30000]
bookDf = bookDf.iloc[:10000]

#(optional) split location into multiple categories (from string split by ',')
# for now I will just skip location altogether
# The same goes for title, since title is unique for each book.
usersDf.drop(columns=['Location'], inplace=True)
bookDf.drop(columns=['Book-Title'], inplace=True)

# usersDf['Location'] = usersDf['Location'].apply(lambda x: x.split(','))
# locationList = usersDf['Location'].to_list()
# flattenList = map(lambda x: x.strip(), [item for row in locationList for item in row])

In [5]:
bookDf.head()

Unnamed: 0,ISBN,Book-Author,Year-Of-Publication,Publisher
0,195153448,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Carlo D'Este,1991,HarperPerennial
3,374157065,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [6]:
ratingDf.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
usersDf.head()

Unnamed: 0,User-ID,Age
1,276726,18.0
3,276729,17.0
5,276733,61.0
9,276745,26.0
10,276746,14.0


Generate input for XGBoost

In [8]:
def generateInput(X_u, X_b, y):
    """
    X_u: User features
    X_b: Book features
    y: Ratings
    tgt_users: Target users
    """
    merged = pd.merge(X_u, y, on='User-ID', how='inner')
    merged = pd.merge(merged, X_b, on='ISBN', how='inner')

    merged.drop(columns=['ISBN', 'User-ID'], inplace=True)

    # merged.fillna(0, inplace=True)
    merged = merged.dropna()
    return (merged.drop(columns=['Book-Rating']), merged['Book-Rating'])

In [9]:
from sklearn.model_selection import train_test_split
#Split into train and test
X, y = generateInput(usersDf, bookDf, ratingDf)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [12]:
y_train

548342      0
101705      0
688380      0
23712      10
1324514     0
           ..
892914      0
611664      0
1635249     9
229783      0
266282      0
Name: Book-Rating, Length: 1448265, dtype: int64

In [13]:
import xgboost as xgb

#tuneable parameters
model = xgb.XGBClassifier(tree_method="hist", device="cuda", learning_rate=0.1, max_depth=15, min_child_weight=5, n_estimators=250, enable_categorical=True)
model.fit(X_train, y_train, verbose=True)

: 

Apply cosine vectorizer to all books; Find user top 10 books that they have not read

In [None]:
def predict (model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['User-ID'])])
  
predictions = (X_test.groupby('User-ID')
               .apply(lambda x: predict(model, x)))
predictions

  .apply(lambda x: predict(model, x)))


User-ID
16.0            [-1.7457176, -1.634243, -2.035031, -2.204781]
22.0        [-2.0572004, -2.1904683, -2.1834931, -2.020229...
39.0        [-3.2873313, -3.4544895, -3.496933, -2.0658057...
53.0        [-3.5275736, -3.5275736, -3.6604211, -2.865101...
67.0        [-2.428505, -2.2839484, -2.3005881, -2.3111029...
                                  ...                        
278774.0    [-2.1939507, -2.1386213, -2.1386213, -2.048174...
278813.0                                         [-2.1935604]
278819.0                                         [-1.9607589]
278820.0                                         [-2.3529625]
278854.0    [-2.2929847, -2.2731333, -2.138771, -2.0048356...
Length: 266, dtype: object

In [None]:
#evaluate model
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(y_test, predictions))

ValueError: Found input variables with inconsistent numbers of samples: [152480, 266]