This is the code for the XGBoost recommendation system. The limitations of this is:
1. Take a lot of computing power and memory. The most data my laptop can run for this is 30000 ratings and 10000 books.

*Tunable parameters:
- Number of training instances
- Input for the XGBoost model
- What to use to encode the data: label, one-hot, or binary encoding


In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from brs_data_preprocessing import get_preprocessed_data as preproc, merged_book_ratings as merge

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/legoeuro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
usersDf, bookDf, ratingDf = preproc('Users.csv', 'Books.csv', 'Ratings.csv')

#configs
#Convert all strings to categorical (might not be good for title)
bookDf['Book-Title'] = bookDf['Book-Title'].astype('category')
bookDf['Book-Author'] = bookDf['Book-Author'].astype('category')
bookDf['Publisher'] = bookDf['Publisher'].astype('category')

ratingDf['ISBN'] = ratingDf['ISBN'].astype(str)
bookDf['ISBN'] = bookDf["ISBN"].astype(str)
ratingDf['User-ID'] = ratingDf['User-ID'].astype(int)
usersDf['User-ID'] = ratingDf['User-ID'].astype(int)

ratingDf = ratingDf.iloc[:30000]
bookDf = bookDf.iloc[:10000]

# bookDf.drop(columns=['Book-Title'], inplace=True)

COUNTRY_INDEX = 2
CITY_INDEX = 1
STATE_INDEX = 0
#For now I will use only country data
usersDf['Location'] = usersDf['Location'].apply(lambda x: x.split(','))
usersDf.drop(usersDf[usersDf['Location'].apply(lambda x: len(x) != 3)].index, inplace=True)
usersDf['Location'] = usersDf['Location'].apply(lambda x: x[CITY_INDEX])


Binary encoding (trade-off between data being ordered and memory cost)

In [3]:
from category_encoders import BinaryEncoder

enc_author = BinaryEncoder(cols=['Book-Author'])
bookDf = enc_author.fit_transform(bookDf)

enc_publisher = BinaryEncoder(cols=['Publisher'])
bookDf = enc_publisher.fit_transform(bookDf)

enc_location = BinaryEncoder(cols=['Location'])
usersDf = enc_location.fit_transform(usersDf)

Other encoding methods are label encoding (bad since our data is not ordinal), and one-hot encoding (not good when I tested - inefficient in memory and computing power)

In [4]:
# from sklearn import preprocessing
# from sklearn.decomposition import TruncatedSVD
# import nltk

# for label encoding
# bookDf['Book-Author'] = label_encoder.fit_transform(bookDf['Book-Author'])
# bookDf['Publisher'] = label_encoder.fit_transform(bookDf['Publisher'])
# usersDf['Location'] = label_encoder.fit_transform(usersDf['Location'])

# for one hot encoding
# authorCols = pd.get_dummies(bookDf['Book-Author'])
# publisherCols = pd.get_dummies(bookDf['Publisher'])
# bookDf = pd.concat([bookDf, authorCols, publisherCols], axis=1)
# bookDf.drop(columns=['Book-Author', 'Publisher'], inplace=True)

# locationCols = pd.get_dummies(usersDf['Location'])
# usersDf = pd.concat([usersDf, locationCols], axis=1)
# usersDf.drop(columns=['Location'], inplace=True)

# for SVD on publisher, author. SVD for users is beslow
# from testing on this dataset (truncated to 10000 books), SVD on publisher and author does not give better results
# svd = TruncatedSVD(n_components=100)
# authorSVD = CountVectorizer().fit_transform(bookDf['Book-Author'])

# authorSVD = svd.fit_transform(authorSVD)
# authorDF = pd.DataFrame(data=authorSVD).add_prefix('Book-Author-')
# bookDf = pd.concat([bookDf, authorDF], axis=1)
# bookDf.drop(columns=['Book-Author'], inplace=True)


In [5]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

stop = stopwords.words('english')
def preprocess(column):

    #make all words with lower letters
    column = column.str.lower()
    #getting rid of any punctution
    column = column.str.replace('http\S+|www.\S+|@|%|:|,|', '', case=False)
    #spliting each sentence to words to apply previous funtions on them 
    word_tokens = column.str.split()
    keywords = word_tokens.apply(lambda x: [item for item in x if item not in stop])
    #assemble words of each sentence again and assign them in new column
    for i in range(len(keywords)):
        keywords[i] = " ".join(keywords[i])
        column = keywords
    return column

bookDf['Book-Title'] = preprocess(bookDf['Book-Title'])
print(bookDf['Book-Title'])
bookVectorized = CountVectorizer().fit_transform(bookDf['Book-Title'])
# bookDf = pd.concat([bookDf, pd.DataFrame(data=bookVectorized.toarray())], axis=1)
# bookDf.drop(columns=['Book-Title'], inplace=True)

#too much columns --> SVD
svd = TruncatedSVD(n_components=100)
bookVectorized = svd.fit_transform(bookVectorized)
titleDF = pd.DataFrame(data=bookVectorized).add_prefix('Book-Title-')
bookDf = pd.concat([bookDf, titleDF], axis=1)
bookDf.drop(columns=['Book-Title'], inplace=True)




0                                     classical mythology
1                                            clara callan
2                                       decision normandy
3       flu: story great influenza pandemic 1918 searc...
4                                         mummies urumchi
                              ...                        
9995                 read tell says : stories (bard book)
9996                                           star rover
9997                                     die keltennadel.
9998                                     tod der datscha.
9999                                              dunkel.
Name: Book-Title, Length: 10000, dtype: object


Generate input for XGBoost

In [9]:
def generateInput(X_u, X_b, y):
    """
    X_u: User features
    X_b: Book features
    y: Ratings
    tgt_users: Target users
    """
    merged = pd.merge(y, X_u, on='User-ID', how='inner')
    merged = pd.merge(merged, X_b, on='ISBN', how='inner')

    merged.drop(columns=['ISBN', 'User-ID'], inplace=True)

    # merged.fillna(0, inplace=True)
    merged = merged.dropna()
    return (merged.drop(columns=['Book-Rating']), merged['Book-Rating'])

In [10]:
from sklearn.model_selection import train_test_split
#Split into train and test
X, y = generateInput(usersDf, bookDf, ratingDf)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



         Location_0  Location_1  Location_2  Location_3  Location_4  \
0                 0           0           0           0           0   
1                 0           0           0           0           0   
2                 0           0           0           0           0   
3                 0           0           0           0           0   
4                 0           0           0           0           0   
...             ...         ...         ...         ...         ...   
1799782           0           0           0           0           0   
1799783           0           0           0           0           0   
1799784           0           0           1           0           0   
1799785           0           0           0           0           0   
1799786           0           0           1           0           1   

         Location_5  Location_6  Location_7  Location_8  Location_9  ...  \
0                 0           0           0           0           0  ..

In [None]:
y_train

1065905    8
1249048    0
1547582    0
436463     0
464585     0
          ..
490951     0
238508     0
1222286    0
86228      0
1136330    0
Name: Book-Rating, Length: 1439829, dtype: int64

In [None]:
import xgboost as xgb

#tuneable parameters
model = xgb.XGBRegressor(tree_method="hist", learning_rate=0.1, max_depth=15, min_child_weight=5, n_estimators=250)
model.fit(X_train, y_train, verbose=True)

In [None]:
def predict (model, df):
    return model.predict(df)
  
predictions = predict(model, X_test)

predictions

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




array([-0.10176289,  0.08312488,  1.0975707 , ...,  0.04397249,
        1.4972544 , -0.05430675], dtype=float32)

In [None]:
#evaluate model
from sklearn.metrics import mean_squared_error
from math import sqrt
predictions = pd.DataFrame(predictions)
rms = sqrt(mean_squared_error(y_test, predictions))

rms

1.6241809822125899