In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from tensorflow import keras
from tensorflow import math
from catboost import CatBoostRegressor
from catboost import Pool
from catboost import cv

In [3]:
data = pd.read_csv('data/preprocessed_data.csv')
data.head(5)

Unnamed: 0,User-ID,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,raw_title,categories
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning
3,6543,9,jamespatterson,2001.0,usa,34.0,1sttodieanovel,fiction
4,6543,6,alexandermccallsmith,2002.0,usa,34.0,theno1ladiesdetectiveagency,botswana


In [4]:
data = data.rename(columns={'User-ID': 'user_id', 'Location': 'location', 'Age': 'age', 'raw_title': 'book_id', 'Book-Author': 'book_author', 'Year-Of-Publication': 'year_of_publication', 'categories': 'category', 'Book-Rating': 'book_rating'})
data.head(5)

Unnamed: 0,user_id,book_rating,book_author,year_of_publication,location,age,book_id,category
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning
3,6543,9,jamespatterson,2001.0,usa,34.0,1sttodieanovel,fiction
4,6543,6,alexandermccallsmith,2002.0,usa,34.0,theno1ladiesdetectiveagency,botswana


In [5]:
data.isna().sum()

user_id                    0
book_rating                0
book_author                0
year_of_publication       38
location                   0
age                    21675
book_id                    0
category                   0
dtype: int64

In [6]:
data['age'] = data['age'].fillna(data['age'].median())
data['year_of_publication'] = data['year_of_publication'].fillna(data['year_of_publication'].median())

In [7]:
data.isna().sum()

user_id                0
book_rating            0
book_author            0
year_of_publication    0
location               0
age                    0
book_id                0
category               0
dtype: int64

In [8]:
embedded_data = data[['user_id', 'book_id', 'book_rating']]
embedded_data['book_rating'] = embedded_data['book_rating'].astype(float)
embedded_data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embedded_data['book_rating'] = embedded_data['book_rating'].astype(float)


Unnamed: 0,user_id,book_id,book_rating
0,2313,themartianchronicles,7.0
1,2313,nonoboy,8.0
2,6543,fahrenheit451,8.0
3,6543,1sttodieanovel,9.0
4,6543,theno1ladiesdetectiveagency,6.0


In [9]:
label_encoder = LabelEncoder()
embedded_data['book_id'] = label_encoder.fit_transform(embedded_data['book_id'])
embedded_data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  embedded_data['book_id'] = label_encoder.fit_transform(embedded_data['book_id'])


Unnamed: 0,user_id,book_id,book_rating
0,2313,11266,7.0
1,2313,7036,8.0
2,6543,3469,8.0
3,6543,39,9.0
4,6543,11466,6.0


In [10]:
X_embedded = embedded_data[['user_id', 'book_id']]
y_embedded = embedded_data['book_rating']

In [11]:
all_users = X_embedded['user_id'].unique()
all_books = X_embedded['book_id'].unique()

In [12]:
user_input = keras.layers.Input(shape=(1,), name="user")
user_as_integer = keras.layers.IntegerLookup(vocabulary=all_users)(user_input)
user_embedding = keras.layers.Embedding(input_dim=len(all_users) + 1, output_dim=32, embeddings_regularizer=keras.regularizers.l2(0.1), name="user_embedding")(user_as_integer)

In [13]:
book_input = keras.layers.Input(shape=(1,), name="book")
book_as_integer = keras.layers.IntegerLookup(vocabulary=all_books)(book_input)
book_embedding = keras.layers.Embedding(input_dim=len(all_books) + 1, output_dim=32, embeddings_regularizer=keras.regularizers.l2(0.1), name="book_embedding")(book_as_integer)

In [14]:
dot_product = keras.layers.Dot(axes=-1)([user_embedding, book_embedding])
flatten = keras.layers.Flatten()(dot_product)

In [15]:
output = keras.layers.Lambda(lambda x: 9 * math.sigmoid(x) + 1, name="rating")(flatten)




In [16]:
embedding_model = keras.Model(inputs=[user_input, book_input], outputs=output)
embedding_model.compile(loss="mse", metrics=[keras.metrics.RootMeanSquaredError])

In [17]:
embedding_model.fit(x=[X_embedded['user_id'], X_embedded['book_id']], y=y_embedded, batch_size=256, epochs=40, validation_split=0.1)

Epoch 1/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 22.9034 - root_mean_squared_error: 2.8527 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 2/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 8.1193 - root_mean_squared_error: 2.8494 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 3/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 8.1546 - root_mean_squared_error: 2.8556 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 4/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 8.1412 - root_mean_squared_error: 2.8532 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 5/40
[1m245/245[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 8.1306 - root_mean_squared_error: 2.8514 - val_loss: 7.6130 - val_root_mean_squared_error: 2.7535
Epoch 6/40
[1m245/245[0m [32m━━

<keras.src.callbacks.history.History at 0x12df5dca5f0>

In [18]:
embedded_user_output = embedding_model.get_layer("user_embedding").output
embedded_book_output = embedding_model.get_layer("book_embedding").output
embedding_user_model = keras.Model(inputs=[user_input, book_input], outputs=embedded_user_output)
embedding_book_model = keras.Model(inputs=[user_input, book_input], outputs=embedded_book_output)

In [19]:
user_feature = embedding_user_model.predict(x=[X_embedded['user_id'], X_embedded['book_id']]).reshape((data.shape[0], 32)).tolist()
user_df = pd.DataFrame(user_feature)
user_df = user_df.add_suffix('_user')
book_feature = embedding_book_model.predict(x=[X_embedded['user_id'], X_embedded['book_id']]).reshape((data.shape[0], 32)).tolist()
book_df = pd.DataFrame(book_feature)
book_df = book_df.add_suffix('_book')

[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 840us/step
[1m2177/2177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 840us/step


In [20]:
data = data.drop(columns=['user_id', 'book_id'])
data.head(5)

Unnamed: 0,book_rating,book_author,year_of_publication,location,age,category
0,7,raybradbury,1984.0,usa,23.0,fiction
1,8,johnokada,1978.0,usa,23.0,japanese
2,8,raybradbury,1976.0,usa,34.0,bookburning
3,9,jamespatterson,2001.0,usa,34.0,fiction
4,6,alexandermccallsmith,2002.0,usa,34.0,botswana


In [21]:
data = pd.concat([data, user_df, book_df], axis=1)
data.head(5)

Unnamed: 0,book_rating,book_author,year_of_publication,location,age,category,0_user,1_user,2_user,3_user,...,22_book,23_book,24_book,25_book,26_book,27_book,28_book,29_book,30_book,31_book
0,7,raybradbury,1984.0,usa,23.0,fiction,-1.618703e-35,3.21625e-35,2.32477e-35,-4.432507e-35,...,2.635972e-35,-1.474804e-35,-6.191571999999999e-36,-1.605098e-35,-3.507932e-35,3.220586e-35,4.080667e-36,1.175463e-36,-3.585158e-35,-3.0717799999999996e-36
1,8,johnokada,1978.0,usa,23.0,japanese,-1.618703e-35,3.21625e-35,2.32477e-35,-4.432507e-35,...,4.966776e-35,-4.013646e-35,3.602758e-35,-5.66311e-35,2.905765e-35,-2.721745e-35,-4.050468e-35,-3.191607e-35,3.047668e-35,-2.668279e-35
2,8,raybradbury,1976.0,usa,34.0,bookburning,9.045199e-36,2.402558e-35,-2.382347e-35,-6.284702e-36,...,4.26439e-36,-7.029181e-36,9.1207e-38,-7.444526999999999e-36,-5.974926e-36,5.404129e-36,-1.190291e-35,5.922095e-36,-2.147113e-35,1.232466e-35
3,9,jamespatterson,2001.0,usa,34.0,fiction,9.045199e-36,2.402558e-35,-2.382347e-35,-6.284702e-36,...,9.800239e-36,1.192074e-35,8.240200999999999e-36,4.130697e-36,-4.337147e-36,-7.616007e-36,5.6848069999999997e-36,-2.086665e-35,-1.810264e-35,-2.0446539999999998e-36
4,6,alexandermccallsmith,2002.0,usa,34.0,botswana,9.045199e-36,2.402558e-35,-2.382347e-35,-6.284702e-36,...,-3.304633e-35,3.42219e-35,-1.022814e-35,1.343375e-35,2.372908e-35,1.814841e-35,-5.006544e-36,-1.380913e-35,-3.437544e-35,-3.467882e-35


In [24]:
X = data.drop(columns=['book_rating'])
y = data['book_rating']
cat_features = ['book_author', 'location', 'category']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
model_cat_boost = CatBoostRegressor()
params = {'depth': [4, 8, 12, 16], 'l2_leaf_reg': [0.5, 1, 1.5], 'iterations': [500, 1000, 1500], 'loss_function': 'RMSE', 'verbose': False, 'random_state': 42}
train_pool = Pool(X_train, y_train, cat_features=cat_features)
cv_results = cv(train_pool, params, fold_count=3, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CatBoostError: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/json_helper.h:173: Can't parse parameter "iterations" with value: [500,1000,1500]

In [42]:
model_cat_boost.fit(X_train, y_train, cat_features=cat_features, verbose=False)

<catboost.core.CatBoostRegressor at 0x12d949078b0>

In [31]:
predictions = model_cat_boost.predict(X_test).tolist()
np.sqrt(mean_squared_error(y_test.tolist(), predictions))

1.560285055145355

In [32]:
r2_score(y_test.tolist(), predictions)

0.24535460637944262

# Обертка

In [None]:
class RecSys:
    def __init__(self, book_embedding_model, user_embedding_model, main_model):
        self.book_embedding_model = book_embedding_model
        self.user_embedding_model = user_embedding_model
        self.main_model = main_model
    def get_recommendations(self, user_id, n=10):
        # sql запрос
        user_feature = self.user_embedding_model.predict(x=[data['user_id'], data['book_id']]).reshape((data.shape[0], 32)).tolist()
        user_df = pd.DataFrame(user_feature)
        user_df = user_df.add_suffix('_user')
        book_feature = self.book_embedding_model.predict(x=[data['user_id'], data['book_id']]).reshape((data.shape[0], 32)).tolist()
        book_df = pd.DataFrame(book_feature)
        book_df = book_df.add_suffix('_book')
        data = data.drop(columns=['user_id', 'book_id'])
        data = pd.concat([data, user_df, book_df], axis=1)
        X = data.drop(columns=['book_rating'])
        cat_features = ['book_author', 'location', 'category']
        predictions = model_cat_boost.predict(X).tolist()
        recommendations = pd.DataFrame({'book_id': data['book_id'], 'rating': predictions})
        recommendations = recommendations.sort_values(by='rating')
        return recommendations.head(n)['book_id'].tolist()
    def get_user_history(self, user_id, n=10):
        # sql запрос
    def get_popular_books(self, n=10):
        # sql запрос