In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

In [2]:
data = pd.read_csv('data/preprocessed_data.csv')
data.head(5)

Unnamed: 0,User-ID,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,raw_title,categories
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning
3,6543,9,jamespatterson,2001.0,usa,34.0,1sttodieanovel,fiction
4,6543,6,alexandermccallsmith,2002.0,usa,34.0,theno1ladiesdetectiveagency,botswana


In [3]:
data = data.rename(columns={'raw_title': 'title'})
data.head(5)

Unnamed: 0,User-ID,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,title,categories
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning
3,6543,9,jamespatterson,2001.0,usa,34.0,1sttodieanovel,fiction
4,6543,6,alexandermccallsmith,2002.0,usa,34.0,theno1ladiesdetectiveagency,botswana


In [4]:
data[['title', 'User-ID', 'Book-Rating']].isna().sum()

title          0
User-ID        0
Book-Rating    0
dtype: int64

# Первый этап обучения

In [5]:
reader = Reader(rating_scale=(1, 10))
surprise_data = Dataset.load_from_df(data[['User-ID', 'title', 'Book-Rating']], reader)
surprise_data.df.head(5)

Unnamed: 0,User-ID,title,Book-Rating
0,2313,themartianchronicles,7
1,2313,nonoboy,8
2,6543,fahrenheit451,8
3,6543,1sttodieanovel,9
4,6543,theno1ladiesdetectiveagency,6


In [6]:
trainset, testset = train_test_split(surprise_data, test_size=0.99, random_state=42)

In [7]:
model = SVD()
model.fit(trainset)
explicit_predictions = model.test(testset)

In [8]:
accuracy.rmse(explicit_predictions)

RMSE: 1.7995


1.7994647822864622

In [9]:
user_id = []
title = []
real_rating = []
predicted_rating = []
for i in explicit_predictions:
    user_id.append(i.uid)
    title.append(i.iid)
    real_rating.append(i.r_ui)
    predicted_rating.append(i.est)
explicit_data = pd.DataFrame(columns=['User-ID', 'title', 'Real_rating', 'Predicted_rating'])
explicit_data['User-ID'] = user_id
explicit_data['title'] = title
explicit_data['Real_rating'] = real_rating
explicit_data['Predicted_rating'] = predicted_rating
explicit_data.head(5)

Unnamed: 0,User-ID,title,Real_rating,Predicted_rating
0,18172,thehitchhikersguidetothegalaxy,9.0,8.095973
1,90614,seehowtheyrun,6.0,7.738831
2,219193,thehatboxbaby,7.0,7.731322
3,169252,thetaoofpooh,9.0,7.731322
4,272349,angelasashesamemoir,3.0,7.731322


In [10]:
explicit_data = explicit_data.sort_values('Predicted_rating', ascending=False)
explicit_data = explicit_data.groupby('User-ID').apply(lambda x: x.head(20)).reset_index(drop=True)
explicit_data.head(5)

Unnamed: 0,User-ID,title,Real_rating,Predicted_rating
0,8,janedoe,5.0,7.731322
1,12,ifidknownthenwhatiknownowwhynotlearnfromthemis...,10.0,7.731322
2,16,airframe,9.0,7.731322
3,17,prophet,3.0,7.731322
4,17,deathintheclouds,7.0,7.731322


In [11]:
testset = trainset.build_anti_testset()
predictions = model.test(testset)

In [12]:
user_id = []
title = []
predicted_rating = []
for i in predictions:
    user_id.append(i.uid)
    title.append(i.iid)
    predicted_rating.append(i.est)
implicit_data = pd.DataFrame(columns=['User-ID', 'title', 'Predicted_rating'])
implicit_data['User-ID'] = user_id
implicit_data['title'] = title
implicit_data['Predicted_rating'] = predicted_rating
implicit_data.head(5)

Unnamed: 0,User-ID,title,Predicted_rating
0,135149,thehearing,7.944095
1,135149,lastmanstanding,7.768169
2,135149,thevalleyofhorses,7.994058
3,135149,thebutcherboy,7.504219
4,135149,misery,7.892265


In [13]:
implicit_data = implicit_data.sort_values('Predicted_rating', ascending=False)
implicit_data = implicit_data.groupby('User-ID').apply(lambda x: x.head(20)).reset_index(drop=True)
implicit_data.head(5)

Unnamed: 0,User-ID,title,Predicted_rating
0,651,mydreamofyou,8.230393
1,651,theonlyone,8.202541
2,651,thehitchhikersguidetothegalaxy,8.159332
3,651,awomansplace,8.139248
4,651,ellenfoster,8.13676


# Второй этап обучения

In [14]:
merged_data = data.merge(explicit_data, left_on=['title', 'User-ID'], right_on=['title', 'User-ID'], how='inner')
merged_data.head(5)

Unnamed: 0,User-ID,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,title,categories,Real_rating,Predicted_rating
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction,7.0,7.731322
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese,8.0,7.731322
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning,8.0,8.013448
3,6543,7,ursulahegi,1997.0,usa,34.0,saltdancers,fiction,7.0,7.731322
4,6543,6,davasobel,1999.0,usa,34.0,galileosdaughterahistoricalmemoirofsciencefait...,biography&autobiography,6.0,7.748623


In [15]:
merged_data = merged_data.drop(['Real_rating'], axis=1)
merged_data.head(5)

Unnamed: 0,User-ID,Book-Rating,Book-Author,Year-Of-Publication,Location,Age,title,categories,Predicted_rating
0,2313,7,raybradbury,1984.0,usa,23.0,themartianchronicles,fiction,7.731322
1,2313,8,johnokada,1978.0,usa,23.0,nonoboy,japanese,7.731322
2,6543,8,raybradbury,1976.0,usa,34.0,fahrenheit451,bookburning,8.013448
3,6543,7,ursulahegi,1997.0,usa,34.0,saltdancers,fiction,7.731322
4,6543,6,davasobel,1999.0,usa,34.0,galileosdaughterahistoricalmemoirofsciencefait...,biography&autobiography,7.748623


In [16]:
catFeatures = ['Book-Author', 'Location', 'categories', 'title']
numFeatures = ['Year-Of-Publication', 'Age', 'Predicted_rating', 'User-ID']
target = 'Book-Rating'

In [17]:
X = merged_data[np.concatenate((numFeatures, catFeatures), axis=0)]
y = merged_data[target]

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
model = CatBoostRegressor(loss_function='RMSE', random_state=42)
model.fit(X_train, y_train, cat_features=catFeatures, verbose=False)

<catboost.core.CatBoostRegressor at 0x14a3baa8d60>

In [21]:
predictions = model.predict(X_test).tolist()
np.sqrt(mean_squared_error(y_test.tolist(), predictions))

1.754969209869911

# Делаем рекомендации

In [22]:
books_data = pd.read_csv('data/books_data.csv')
books_data.head(5)

Unnamed: 0,title,Book-Author,Year-Of-Publication,categories
0,themartianchronicles,raybradbury,1984.0,fiction
1,nonoboy,johnokada,1978.0,japanese
2,fahrenheit451,raybradbury,1976.0,bookburning
3,1sttodieanovel,jamespatterson,2001.0,fiction
4,theno1ladiesdetectiveagency,alexandermccallsmith,2002.0,botswana


In [23]:
user_data = pd.read_csv('data/user_data.csv')
user_data.head(5)

Unnamed: 0,User-ID,Location,Age
0,2313,usa,23.0
1,6543,usa,34.0
2,8680,usa,2.0
3,10314,usa,
4,23768,usa,45.0


In [24]:
merged_test = implicit_data.merge(user_data, left_on=['User-ID'], right_on=['User-ID'], how='inner')
merged_test.head(5)

Unnamed: 0,User-ID,title,Predicted_rating,Location,Age
0,651,mydreamofyou,8.230393,usa,26.0
1,651,theonlyone,8.202541,usa,26.0
2,651,thehitchhikersguidetothegalaxy,8.159332,usa,26.0
3,651,awomansplace,8.139248,usa,26.0
4,651,ellenfoster,8.13676,usa,26.0


In [26]:
merged_test = merged_test.merge(books_data, left_on=['title'], right_on=['title'], how='inner')
merged_test.head(5)

Unnamed: 0,User-ID,title,Predicted_rating,Location,Age,Book-Author,Year-Of-Publication,categories
0,651,mydreamofyou,8.230393,usa,26.0,nualao'faolain,2001.0,"womenauthors,irish"
1,2276,mydreamofyou,8.188369,usa,46.0,nualao'faolain,2001.0,"womenauthors,irish"
2,2461,mydreamofyou,8.099402,usa,59.0,nualao'faolain,2001.0,"womenauthors,irish"
3,3556,mydreamofyou,8.442663,usa,68.0,nualao'faolain,2001.0,"womenauthors,irish"
4,11287,mydreamofyou,8.145257,usa,54.0,nualao'faolain,2001.0,"womenauthors,irish"


In [27]:
merged_test_X = merged_test[np.concatenate((numFeatures, catFeatures), axis=0)]
merged_test_X.head(5)

Unnamed: 0,Year-Of-Publication,Age,Predicted_rating,User-ID,Book-Author,Location,categories,title
0,2001.0,26.0,8.230393,651,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
1,2001.0,46.0,8.188369,2276,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
2,2001.0,59.0,8.099402,2461,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
3,2001.0,68.0,8.442663,3556,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
4,2001.0,54.0,8.145257,11287,nualao'faolain,usa,"womenauthors,irish",mydreamofyou


In [28]:
merged_test_X.to_csv('data/merged_test_X.csv', index=False)

In [29]:
books_data = pd.read_csv('data/books_data.csv')
books_data.head(5)

Unnamed: 0,title,Book-Author,Year-Of-Publication,categories
0,themartianchronicles,raybradbury,1984.0,fiction
1,nonoboy,johnokada,1978.0,japanese
2,fahrenheit451,raybradbury,1976.0,bookburning
3,1sttodieanovel,jamespatterson,2001.0,fiction
4,theno1ladiesdetectiveagency,alexandermccallsmith,2002.0,botswana


In [31]:
merged_test_X = pd.read_csv('data/merged_test_X.csv')
merged_test_X.head(5)

Unnamed: 0,Year-Of-Publication,Age,Predicted_rating,User-ID,Book-Author,Location,categories,title
0,2001.0,26.0,8.230393,651,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
1,2001.0,46.0,8.188369,2276,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
2,2001.0,59.0,8.099402,2461,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
3,2001.0,68.0,8.442663,3556,nualao'faolain,usa,"womenauthors,irish",mydreamofyou
4,2001.0,54.0,8.145257,11287,nualao'faolain,usa,"womenauthors,irish",mydreamofyou


In [34]:
def getRecommendations(data, books_data, model, user_id, n):
    predictions = model.predict(data).tolist()
    merged_test_pred = data
    merged_test_pred['pred'] = predictions
    merged_test_pred = merged_test_pred.drop('Predicted_rating', axis=1)
    result = data[data['User-ID'] == user_id].sort_values('pred', ascending=False).head(n)['title'].tolist()
    return result

In [35]:
getRecommendations(merged_test_X, books_data, model, 651, 10)

['halfmagic',
 'gotellitonthemountain',
 '1984',
 'thehitchhikersguidetothegalaxy',
 'investingfordummiessecondedition',
 'serpentine',
 'theonlyone',
 'messiah',
 'messiah',
 'afinebalance']