In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from tensorflow import keras
from tensorflow import math
from catboost import CatBoostRegressor

In [2]:
data = pd.read_csv('data/preprocessed_data.csv')
data.head(5)

Unnamed: 0,User-ID,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,raw_title,categories
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning
3,6543,9,jamespatterson,2001.0,usa,34.0,1sttodieanovel,fiction
4,6543,6,alexandermccallsmith,2002.0,usa,34.0,theno1ladiesdetectiveagency,botswana


In [3]:
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Year-Of-Publication'] = data['Year-Of-Publication'].fillna(data['Year-Of-Publication'].median())

In [4]:
embedded_data = data[['User-ID', 'raw_title', 'Book-Rating']]
embedded_data['Book-Rating'] = embedded_data['Book-Rating'].astype(float)
embedded_data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embedded_data['Book-Rating'] = embedded_data['Book-Rating'].astype(float)


Unnamed: 0,User-ID,raw_title,Book-Rating
0,2313,themartianchronicles,7.0
1,2313,nonoboy,8.0
2,6543,fahrenheit451,8.0
3,6543,1sttodieanovel,9.0
4,6543,theno1ladiesdetectiveagency,6.0


In [5]:
label_encoder = LabelEncoder()
embedded_data['raw_title'] = label_encoder.fit_transform(embedded_data['raw_title'])
embedded_data = embedded_data.rename(columns={'User-ID': 'user-id', 'raw_title': 'book-id', 'Book-Rating': 'rating'})
embedded_data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embedded_data['raw_title'] = label_encoder.fit_transform(embedded_data['raw_title'])


Unnamed: 0,user-id,book-id,rating
0,2313,11266,7.0
1,2313,7036,8.0
2,6543,3469,8.0
3,6543,39,9.0
4,6543,11466,6.0


In [6]:
X_embedded = embedded_data[['user-id', 'book-id']]
y_embedded = embedded_data['rating']

In [7]:
all_users = X_embedded['user-id'].unique()
all_books = X_embedded['book-id'].unique()

In [8]:
user_input = keras.layers.Input(shape=(1,), name="User")
user_as_integer = keras.layers.IntegerLookup(vocabulary=all_users)(user_input)
user_embedding = keras.layers.Embedding(input_dim=len(all_users) + 1, output_dim=32, embeddings_regularizer=keras.regularizers.l2(0.1), name="user-embedding")(user_as_integer)

In [9]:
book_input = keras.layers.Input(shape=(1,), name="Book")
book_as_integer = keras.layers.IntegerLookup(vocabulary=all_books)(book_input)
book_embedding = keras.layers.Embedding(input_dim=len(all_books) + 1, output_dim=32, embeddings_regularizer=keras.regularizers.l2(0.1), name="book-embedding")(book_as_integer)

In [10]:
dot_product = keras.layers.Dot(axes=-1)([user_embedding, book_embedding])
flatten = keras.layers.Flatten()(dot_product)

In [11]:
output = keras.layers.Lambda(lambda x: 9 * math.sigmoid(x) + 1, name="Rating")(flatten)




In [12]:
model = keras.Model(inputs=[user_input, book_input], outputs=output)
model.compile(loss="mse", metrics=[keras.metrics.RootMeanSquaredError])

In [13]:
model.fit(x=[X_embedded['user-id'], X_embedded['book-id']], y=y_embedded, batch_size=256, epochs=40, validation_split=0.1)

Epoch 1/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 22.8450 - root_mean_squared_error: 2.8404 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 2/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 8.0639 - root_mean_squared_error: 2.8397 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 3/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 8.0931 - root_mean_squared_error: 2.8448 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 4/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 8.1314 - root_mean_squared_error: 2.8515 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 5/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - loss: 8.1406 - root_mean_squared_error: 2.8532 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 6/40
[1m245/245[0m [32m━━

<keras.src.callbacks.history.History at 0x238250465f0>

In [14]:
embedded_user_output = model.get_layer("user-embedding").output
embedded_book_output = model.get_layer("book-embedding").output
embedding_user_model = keras.Model(inputs=[user_input, book_input], outputs=embedded_user_output)
embedding_book_model = keras.Model(inputs=[user_input, book_input], outputs=embedded_book_output)

In [15]:
user_feature = embedding_user_model.predict(x=[X_embedded['user-id'], X_embedded['book-id']]).reshape((69659, 32)).tolist()
user_df = pd.DataFrame(user_feature)
user_df = user_df.add_suffix('_user')
book_feature = embedding_book_model.predict(x=[X_embedded['user-id'], X_embedded['book-id']]).reshape((69659, 32)).tolist()
book_df = pd.DataFrame(book_feature)
book_df = book_df.add_suffix('_book')

[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 981us/step
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 970us/step


In [17]:
data = data.drop(columns=['User-ID', 'raw_title'])
data.head(5)

Unnamed: 0,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,categories
0,7,raybradbury,1984.0,usa,23.0,fiction
1,8,johnokada,1978.0,usa,23.0,japanese
2,8,raybradbury,1976.0,usa,34.0,bookburning
3,9,jamespatterson,2001.0,usa,34.0,fiction
4,6,alexandermccallsmith,2002.0,usa,34.0,botswana


In [24]:
data = pd.concat([data, user_df, book_df], axis=1)
data.head(5)

Unnamed: 0,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,categories,0_user,1_user,2_user,3_user,...,22_book,23_book,24_book,25_book,26_book,27_book,28_book,29_book,30_book,31_book
0,7,raybradbury,1984.0,usa,23.0,fiction,5.383851e-35,2.213598e-35,5.186988e-35,2.470529e-35,...,9.475913999999999e-36,-8.186681e-36,-1.679035e-35,9.918262e-36,-2.76535e-36,-3.266017e-35,2.20553e-35,-3.903327e-36,2.279142e-35,-3.846418e-35
1,8,johnokada,1978.0,usa,23.0,japanese,5.383851e-35,2.213598e-35,5.186988e-35,2.470529e-35,...,4.005529e-35,4.530857e-35,2.558716e-35,-3.194719e-35,4.720458e-35,2.54722e-35,-2.309097e-35,4.423635e-35,5.765179e-35,2.776359e-35
2,8,raybradbury,1976.0,usa,34.0,bookburning,-2.248257e-35,1.282254e-35,1.618739e-35,-1.077885e-35,...,7.491415e-36,-1.126411e-35,2.433774e-35,-2.865133e-35,-1.06697e-35,3.17223e-36,-7.990629e-36,1.757548e-36,-6.606846e-36,-7.093147e-36
3,9,jamespatterson,2001.0,usa,34.0,fiction,-2.248257e-35,1.282254e-35,1.618739e-35,-1.077885e-35,...,-6.760812e-36,-8.619048e-37,-5.694455e-36,2.825074e-35,-9.676849999999999e-36,2.0241489999999998e-36,-1.01959e-35,-1.454614e-35,7.187703e-36,-4.760319e-36
4,6,alexandermccallsmith,2002.0,usa,34.0,botswana,-2.248257e-35,1.282254e-35,1.618739e-35,-1.077885e-35,...,2.2907709999999999e-35,-5.9623159999999997e-36,3.699239e-35,2.699168e-35,1.87926e-35,-4.142673e-36,5.9942459999999996e-36,1.632059e-35,-3.537558e-36,-2.238457e-35


In [31]:
X = data.drop(columns=['Book-Rating'])
y = data['Book-Rating']
cat_features = ['Book-Author', 'Location', 'categories']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
model_cat_boost = CatBoostRegressor(loss_function='RMSE', random_state=42)
model_cat_boost.fit(X_train, y_train, cat_features=cat_features, verbose=False)

<catboost.core.CatBoostRegressor at 0x23825a08af0>

In [36]:
predictions = model_cat_boost.predict(X_test).tolist()
np.sqrt(mean_squared_error(y_test.tolist(), predictions))

1.5498313095943363

In [37]:
r2_score(y_test.tolist(), predictions)

0.2554328207390917