In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

In [2]:
# Read the preprocessed ratings table (user, book and rating columns) from disk
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/preprocessed_data.csv')
data.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Author,Year-Of-Publication,Publisher,Location,Age,raw_title,categories
0,2313,553278223,7,raybradbury,1984.0,spectra,usa,23.0,themartianchronicles,fiction
1,2313,295955252,8,johnokada,1978.0,universityofwashingtonpress,usa,23.0,nonoboy,japanese
2,6543,345342968,8,raybradbury,1987.0,delrey,usa,34.0,fahrenheit451,bookburning
3,6543,446610038,9,jamespatterson,2002.0,warnervision,usa,34.0,1sttodieanovel,fiction
4,6543,1400031346,6,alexandermccallsmith,2002.0,anchor(uk),usa,34.0,theno1ladiesdetectiveagency,botswana


In [3]:
# Count missing values in the three columns the first-stage model consumes.
data.loc[:, ['ISBN', 'User-ID', 'Book-Rating']].isnull().sum()

ISBN           0
User-ID        0
Book-Rating    0
dtype: int64

# Первый этап обучения

In [4]:
# Wrap the (user, item, rating) triples in a surprise Dataset; ratings are
# declared to lie on a 1-10 scale.
reader = Reader(rating_scale=(1, 10))
surprise_data = Dataset.load_from_df(
    df=data[['User-ID', 'ISBN', 'Book-Rating']],
    reader=reader,
)
surprise_data.df.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,2313,553278223,7
1,2313,295955252,8
2,6543,345342968,8
3,6543,446610038,9
4,6543,1400031346,6


In [5]:
# Split the ratings for the first-stage model with a fixed seed; test_size=0.99
# holds out 99% of the ratings, so only the remaining 1% is used for fitting.
# NOTE(review): such a small training share means most held-out users/books are
# unseen by the model (the repeated 7.731322 predictions in the outputs look
# like a fallback value) - confirm this is intentional, e.g. to keep the later
# build_anti_testset() call tractable.
trainset, testset = train_test_split(surprise_data, test_size=0.99, random_state=42)

In [6]:
# Fit the first-stage collaborative-filtering model and score the held-out ratings.
# SVD starts from randomly initialised factors, so it is seeded with the same
# value (42) the notebook uses for the split and for CatBoost; without a seed
# every Restart & Run All would give different predictions and a different RMSE.
model = SVD(random_state=42)
model.fit(trainset)
explicit_predictions = model.test(testset)

In [7]:
# Report (and print) the RMSE of the first-stage model on the held-out ratings.
accuracy.rmse(predictions=explicit_predictions, verbose=True)

RMSE: 1.8000


1.8000249785235587

In [8]:
# Flatten the surprise predictions for the held-out ratings into parallel
# lists, one per attribute of each prediction.
user_id = [pred.uid for pred in explicit_predictions]
isbn = [pred.iid for pred in explicit_predictions]
real_rating = [pred.r_ui for pred in explicit_predictions]
predicted_rating = [pred.est for pred in explicit_predictions]

# Assemble the lists into one frame: true rating next to the model estimate.
explicit_data = pd.DataFrame({
    'User-ID': user_id,
    'ISBN': isbn,
    'Real_rating': real_rating,
    'Predicted_rating': predicted_rating,
})
explicit_data.head(5)

Unnamed: 0,User-ID,ISBN,Real_rating,Predicted_rating
0,18172,0671664964,9.0,7.731322
1,90614,0446603929,6.0,7.747922
2,219193,1565122992,7.0,7.731322
3,169252,0140067477,9.0,7.731322
4,272349,068484267X,3.0,7.731322


In [9]:
# Keep, for every user, the 20 held-out ratings with the highest predicted score.
# GroupBy.head() replaces groupby(...).apply(lambda x: x.head(20)): it is
# vectorised (no Python call per user) and does not rely on apply() handing the
# grouping column 'User-ID' to the lambda, which newer pandas releases deprecate.
explicit_data = (
    explicit_data
    .sort_values('Predicted_rating', ascending=False)
    .groupby('User-ID')
    .head(20)
    # A stable sort restores the user-by-user layout apply() produced while
    # keeping the descending score order inside each user.
    .sort_values('User-ID', kind='stable')
    .reset_index(drop=True)
)
explicit_data.head(5)

Unnamed: 0,User-ID,ISBN,Real_rating,Predicted_rating
0,8,1552041778,5.0,7.731322
1,12,1879384493,10.0,7.731322
2,16,345402871,9.0,7.731322
3,17,891076182,3.0,7.731322
4,17,425099148,7.0,7.731322


In [10]:
# Build the anti-testset from the training split (note: this rebinds `testset`)
# and let the fitted first-stage model score it.
testset = trainset.build_anti_testset(fill=None)
predictions = model.test(testset, verbose=False)

In [11]:
# Flatten the anti-testset predictions into parallel lists (no true rating is
# collected here, only the model estimate).
user_id = [pred.uid for pred in predictions]
isbn = [pred.iid for pred in predictions]
predicted_rating = [pred.est for pred in predictions]

# Assemble the candidate (user, book, estimated rating) table.
implicit_data = pd.DataFrame({
    'User-ID': user_id,
    'ISBN': isbn,
    'Predicted_rating': predicted_rating,
})
implicit_data.head(5)

Unnamed: 0,User-ID,ISBN,Predicted_rating
0,135149,451204891,7.74499
1,135149,446611778,7.670464
2,135149,553250531,7.839163
3,135149,385312377,7.56755
4,135149,451153553,8.06037


In [12]:
# Keep, for every user, the 20 candidate books with the highest predicted score.
# GroupBy.head() replaces groupby(...).apply(lambda x: x.head(20)): it is
# vectorised (no Python call per user) and does not rely on apply() handing the
# grouping column 'User-ID' to the lambda, which newer pandas releases deprecate.
implicit_data = (
    implicit_data
    .sort_values('Predicted_rating', ascending=False)
    .groupby('User-ID')
    .head(20)
    # A stable sort restores the user-by-user layout apply() produced while
    # keeping the descending score order inside each user.
    .sort_values('User-ID', kind='stable')
    .reset_index(drop=True)
)
implicit_data.head(5)

Unnamed: 0,User-ID,ISBN,Predicted_rating
0,651,316776963,8.350729
1,651,61000280,8.156658
2,651,345337662,8.146017
3,651,451153553,8.145964
4,651,345342968,8.141771


# Второй этап обучения

In [13]:
# Attach the first-stage predictions to the full rating rows; the inner join
# keeps only (book, user) pairs present in both tables.
merged_data = pd.merge(
    data,
    explicit_data,
    on=['ISBN', 'User-ID'],
    how='inner',
)
merged_data.head(n=5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Author,Year-Of-Publication,Publisher,Location,Age,raw_title,categories,Real_rating,Predicted_rating
0,2313,553278223,7,raybradbury,1984.0,spectra,usa,23.0,themartianchronicles,fiction,7.0,7.731322
1,2313,295955252,8,johnokada,1978.0,universityofwashingtonpress,usa,23.0,nonoboy,japanese,8.0,7.731322
2,6543,345342968,8,raybradbury,1987.0,delrey,usa,34.0,fahrenheit451,bookburning,8.0,8.026338
3,6543,684844826,7,ursulahegi,1997.0,touchstone,usa,34.0,saltdancers,fiction,7.0,7.731322
4,6543,140280553,6,davasobel,2000.0,penguinbooks,usa,34.0,galileosdaughterahistoricalmemoirofsciencefait...,biography&autobiography,6.0,7.768863


In [14]:
# Drop Real_rating, which mirrors Book-Rating (the target) in the merged preview.
# errors='ignore' keeps the cell re-runnable: the cell overwrites merged_data,
# so a second execution would otherwise raise KeyError for the missing column.
merged_data = merged_data.drop(columns=['Real_rating'], errors='ignore')
merged_data.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Author,Year-Of-Publication,Publisher,Location,Age,raw_title,categories,Predicted_rating
0,2313,553278223,7,raybradbury,1984.0,spectra,usa,23.0,themartianchronicles,fiction,7.731322
1,2313,295955252,8,johnokada,1978.0,universityofwashingtonpress,usa,23.0,nonoboy,japanese,7.731322
2,6543,345342968,8,raybradbury,1987.0,delrey,usa,34.0,fahrenheit451,bookburning,8.026338
3,6543,684844826,7,ursulahegi,1997.0,touchstone,usa,34.0,saltdancers,fiction,7.731322
4,6543,140280553,6,davasobel,2000.0,penguinbooks,usa,34.0,galileosdaughterahistoricalmemoirofsciencefait...,biography&autobiography,7.768863


In [15]:
# Columns handed to the second-stage model as categorical features.
catFeatures = [
    'Book-Author',
    'Publisher',
    'Location',
    'categories',
    'ISBN',
]
# Columns used as numeric features.
# NOTE(review): 'User-ID' is an identifier yet is listed as numeric - confirm intended.
numFeatures = [
    'Year-Of-Publication',
    'Age',
    'Predicted_rating',
    'User-ID',
]
# Column the second-stage model learns to predict.
target = 'Book-Rating'

In [16]:
# Feature matrix: numeric columns first, then categorical ones; y is the target column.
X = merged_data[list(numFeatures) + list(catFeatures)]
y = merged_data[target]

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [18]:
# Seeded 80/20 split of the second-stage features and target.
# NOTE(review): the name `train_test_split` is imported from both surprise and
# sklearn in this notebook, so it resolves to whichever import ran last; this
# call relies on the sklearn version being the one in scope.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Second-stage model: a seeded CatBoost regressor optimising RMSE
# (note: this rebinds `model`, which previously held the SVD model).
model = CatBoostRegressor(
    loss_function='RMSE',
    random_state=42,
)
model.fit(
    X=X_train,
    y=y_train,
    cat_features=catFeatures,
    verbose=False,
)

<catboost.core.CatBoostRegressor at 0x287864cdff0>

In [20]:
# Score the second-stage hold-out rows and report the RMSE
# (note: this rebinds `predictions` to a plain list of floats).
predictions = model.predict(X_test).tolist()
np.sqrt(
    mean_squared_error(y_true=y_test.tolist(), y_pred=predictions)
)

1.7527424930367588

# Делаем рекомендации

In [22]:
# Load the per-book attribute table and preview its first five rows.
books_data = pd.read_csv(filepath_or_buffer='data/books_data.csv')
books_data.head(n=5)

Unnamed: 0,ISBN,raw_title,Book-Author,Year-Of-Publication,Publisher,categories
0,553278223,themartianchronicles,raybradbury,1984.0,spectra,fiction
1,295955252,nonoboy,johnokada,1978.0,universityofwashingtonpress,japanese
2,345342968,fahrenheit451,raybradbury,1987.0,delrey,bookburning
3,446610038,1sttodieanovel,jamespatterson,2002.0,warnervision,fiction
4,1400031346,theno1ladiesdetectiveagency,alexandermccallsmith,2002.0,anchor(uk),botswana


In [23]:
# Load the per-user attribute table and preview its first five rows.
user_data = pd.read_csv(filepath_or_buffer='data/user_data.csv')
user_data.head(n=5)

Unnamed: 0,User-ID,Location,Age
0,2313,usa,23.0
1,6543,usa,34.0
2,8680,usa,2.0
3,10314,usa,
4,23768,usa,45.0


In [24]:
# Enrich every candidate (user, book) pair with the user's attributes;
# candidates whose user is absent from user_data are dropped by the inner join.
merged_test = pd.merge(
    implicit_data,
    user_data,
    on=['User-ID'],
    how='inner',
)
merged_test.head(n=5)

Unnamed: 0,User-ID,ISBN,Predicted_rating,Location,Age
0,651,316776963,8.350729,usa,26.0
1,651,61000280,8.156658,usa,26.0
2,651,345337662,8.146017,usa,26.0
3,651,451153553,8.145964,usa,26.0
4,651,345342968,8.141771,usa,26.0


In [25]:
# Enrich every candidate pair with the book's attributes;
# candidates whose ISBN is absent from books_data are dropped by the inner join.
merged_test = pd.merge(
    merged_test,
    books_data,
    on=['ISBN'],
    how='inner',
)
merged_test.head(n=5)

Unnamed: 0,User-ID,ISBN,Predicted_rating,Location,Age,raw_title,Book-Author,Year-Of-Publication,Publisher,categories
0,651,316776963,8.350729,usa,26.0,metalkprettyoneday,davidsedaris,2001.0,backbaybooks,humor
1,1903,316776963,7.885376,taiwan,36.0,metalkprettyoneday,davidsedaris,2001.0,backbaybooks,humor
2,2109,316776963,8.22306,canada,,metalkprettyoneday,davidsedaris,2001.0,backbaybooks,humor
3,2276,316776963,8.417924,usa,46.0,metalkprettyoneday,davidsedaris,2001.0,backbaybooks,humor
4,2354,316776963,8.183165,canada,,metalkprettyoneday,davidsedaris,2001.0,backbaybooks,humor


In [27]:
# Select the model's feature columns in training order: numeric first, then categorical.
merged_test_X = merged_test[list(numFeatures) + list(catFeatures)]
merged_test_X.head(n=5)

Unnamed: 0,Year-Of-Publication,Age,Predicted_rating,User-ID,Book-Author,Publisher,Location,categories,ISBN
0,2001.0,26.0,8.350729,651,davidsedaris,backbaybooks,usa,humor,316776963
1,2001.0,36.0,7.885376,1903,davidsedaris,backbaybooks,taiwan,humor,316776963
2,2001.0,,8.22306,2109,davidsedaris,backbaybooks,canada,humor,316776963
3,2001.0,46.0,8.417924,2276,davidsedaris,backbaybooks,usa,humor,316776963
4,2001.0,,8.183165,2354,davidsedaris,backbaybooks,canada,humor,316776963


In [28]:
# Persist the candidate feature table (without the index) so it can be reloaded later.
merged_test_X.to_csv(path_or_buf='data/merged_test_X.csv', index=False)

In [30]:
# Reload the per-book attribute table from disk and preview its first five rows.
books_data = pd.read_csv(filepath_or_buffer='data/books_data.csv')
books_data.head(n=5)

Unnamed: 0,ISBN,raw_title,Book-Author,Year-Of-Publication,Publisher,categories
0,553278223,themartianchronicles,raybradbury,1984.0,spectra,fiction
1,295955252,nonoboy,johnokada,1978.0,universityofwashingtonpress,japanese
2,345342968,fahrenheit451,raybradbury,1987.0,delrey,bookburning
3,446610038,1sttodieanovel,jamespatterson,2002.0,warnervision,fiction
4,1400031346,theno1ladiesdetectiveagency,alexandermccallsmith,2002.0,anchor(uk),botswana


In [31]:
# Reload the saved candidate feature table from disk and preview its first five rows.
merged_test_X = pd.read_csv(filepath_or_buffer='data/merged_test_X.csv')
merged_test_X.head(n=5)

Unnamed: 0,Year-Of-Publication,Age,Predicted_rating,User-ID,Book-Author,Publisher,Location,categories,ISBN
0,2001.0,26.0,8.350729,651,davidsedaris,backbaybooks,usa,humor,316776963
1,2001.0,36.0,7.885376,1903,davidsedaris,backbaybooks,taiwan,humor,316776963
2,2001.0,,8.22306,2109,davidsedaris,backbaybooks,canada,humor,316776963
3,2001.0,46.0,8.417924,2276,davidsedaris,backbaybooks,usa,humor,316776963
4,2001.0,,8.183165,2354,davidsedaris,backbaybooks,canada,humor,316776963


In [32]:
def getRecommendations(data, books_data, model, user_id, n):
    """Return the ISBNs of the top-``n`` candidate books for one user.

    Only the rows of ``data`` that belong to ``user_id`` are scored with
    ``model`` and ranked by the predicted value, highest first.

    Args:
        data: DataFrame of candidate rows; must contain 'User-ID' and 'ISBN'
            plus whatever feature columns ``model.predict`` expects.
        books_data: Unused; kept so existing callers keep working.
        model: Object exposing ``predict(frame)`` that returns one score per row.
        user_id: Value matched against the 'User-ID' column.
        n: Maximum number of ISBNs to return.

    Returns:
        List of at most ``n`` ISBN values ordered by descending predicted
        score; an empty list when ``data`` has no rows for ``user_id``.
    """
    # Score only this user's candidates instead of the whole table.
    user_rows = data[data['User-ID'] == user_id]
    if user_rows.empty:
        return []

    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    # The previous version wrote a 'pred' column into `data` itself, which
    # changed the feature set passed to model.predict on every later call.
    scored = user_rows.assign(pred=model.predict(user_rows))
    return scored.sort_values('pred', ascending=False).head(n)['ISBN'].tolist()

In [33]:
# Top-10 recommended ISBNs for user 651.
getRecommendations(
    data=merged_test_X,
    books_data=books_data,
    model=model,
    user_id=651,
    n=10,
)

['0804804966',
 '1853261483',
 '1564025276',
 '0064401847',
 '0345342968',
 '0451117298',
 '0679440585',
 '0440195934',
 '0553561669',
 '0345342763']