# Проектная работа. Рекомендательные системы

## Получение датасета 

In [1]:
import json
import gzip
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Библиотеки по машинному обучению
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# Библиотеки построения диаграмм
from matplotlib import pyplot as plt
import seaborn as sns

# Установка режима отображения диаграмм
%matplotlib inline
plt.rcParams["figure.figsize"] = (18, 5)
plt.style.use("ggplot")

В рамках проектной работы я взял для исследования данные из открытого источника amazon, которые будут похожи на реальные данные (я брал разные наборы данных о продаже техники, программного обеспечения и цифровой музыки – и алгоритмы работали схоже). Далее уже в рамках пилотного проекта будет запрошена реальная база данных клиентов и товаров.

Берем данные по продаже программного обеспечения по ссылке https://nijianmo.github.io/amazon/index.html,

In [2]:
# Функция чтения данных из файла gzip - json
def parse(path):
    for line in gzip.open(path, 'rb'):
        yield json.loads(line)

# Функция загрузки данных из файла json в датафрейм
def get_dataframe(path):
    data = {}
    for i, item in enumerate(parse(path)):
        data[i] = item
        
    return pd.DataFrame.from_dict(data, orient='index')

In [3]:
# Загрузка данных по рейтингу продаж программного обеспечения
# asin - идентификатор продукта, например, 111846130
# reviewerID - идентификатор рецензента, например, A3NHUQ33CFH3VM 
# overall - рейтинг
# reviewTime - время отзыва
# reviewText - текст отзыва
# summary - краткий отзыв
#rating_data = pd.read_csv('Appliances.csv', names=['asin', 'reviewer_id', 'rating', 'timestamp'])
rating_data = get_dataframe('Software.json.gz')
print(len(rating_data))
rating_data.head()

459436


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"03 11, 2014",A240ORQ2LF9LUI,77613252,{'Format:': ' Loose Leaf'},Michelle W,The materials arrived early and were in excell...,Material Great,1394496000,,
1,4.0,True,"02 23, 2014",A1YCCU0YRLS0FE,77613252,{'Format:': ' Loose Leaf'},Rosalind White Ames,I am really enjoying this book with the worksh...,Health,1393113600,,
2,1.0,True,"02 17, 2014",A1BJHRQDYVAY2J,77613252,{'Format:': ' Loose Leaf'},Allan R. Baker,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",ARE YOU KIDING ME?,1392595200,7.0,
3,3.0,True,"02 17, 2014",APRDVZ6QBIQXT,77613252,{'Format:': ' Loose Leaf'},Lucy,This book was missing pages!!! Important pages...,missing pages!!,1392595200,3.0,
4,5.0,False,"10 14, 2013",A2JZTTBSLS1QXV,77775473,,Albert V.,I have used LearnSmart and can officially say ...,Best study product out there!,1381708800,,


In [4]:
rating_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 459436 entries, 0 to 459435
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         459436 non-null  float64
 1   verified        459436 non-null  bool   
 2   reviewTime      459436 non-null  object 
 3   reviewerID      459436 non-null  object 
 4   asin            459436 non-null  object 
 5   style           234401 non-null  object 
 6   reviewerName    459412 non-null  object 
 7   reviewText      459370 non-null  object 
 8   summary         459380 non-null  object 
 9   unixReviewTime  459436 non-null  int64  
 10  vote            127853 non-null  object 
 11  image           1508 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 42.5+ MB


In [5]:
# Загрузка данных о программном обеспечении
#asin - идентификатор продукта, например. 0000013714
#title - описание товара
#brand - брэнд
#main_cat - категория
#description - описание
#rank - ранг
meta_data = get_dataframe('meta_Software.json.gz')
print(len(meta_data))
meta_data.head()

26790


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,[],,[],,HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK ...,[],,HOLT. RINEHART AND WINSTON,[],"25,550 in Software (",[],Software,,</div>,.a-box-inner{background-color:#fff}#alohaBuyBo...,0030672120,[],[],
1,[],,"[, <b>Latin rhythms that will get your kids si...",,"Sing, Watch, &amp; Learn Spanish (DVD + Guide)...",[],,McGraw Hill,[],"15,792 in Software (",[],Software,,</div>,,0071480935,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,[],,[<b>Connect is the only integrated learning sy...,,Connect with LearnSmart Access Card for Microb...,[],,McGraw-Hill Science/Engineering/Math,[],"16,900 in Software (",[],Software,,</div>,,007329506X,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
3,[],,[],,LearnSmart Standalone Access Card for Prescott...,[],,McGraw-Hill Education,[],"12,986 in Software (",[],Software,,</div>,,0073513458,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
4,[],,[<i>Anatomy &amp; Physiology Revealed Cat</i> ...,,Anatomy &amp; Physiology Revealed Student Acce...,"[0323394612, 0323227937, 1118527488]",,McGraw-Hill Education,[],"14,861 in Software (",[],Software,,</div>,$4.83,0073525758,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [6]:
# Оставляем только необходимые столбцы
rating_data2 = rating_data[['asin','reviewerID','overall','unixReviewTime','reviewText','summary']]
rating_data2.columns = ['item_id','user_id','rating','timestamp','text','summary']
rating_data2['timestamp'] = pd.to_datetime(rating_data2['timestamp'].astype(int), unit='s')
item_data2 = meta_data[['asin', 'title', 'brand', 'main_cat', 'rank']]
item_data2.columns = ['item_id', 'title', 'brand', 'main_cat', 'rank']
item_data2 = item_data2[item_data2['title'].isnull() == False].reset_index(drop=True)
#rating_data2.head()
#item_data2.head()

In [7]:
# Объединяем датасеты
data = rating_data2.merge(item_data2, on='item_id', how='left')
data.head()

Unnamed: 0,item_id,user_id,rating,timestamp,text,summary,title,brand,main_cat,rank
0,77613252,A240ORQ2LF9LUI,4.0,2014-03-11,The materials arrived early and were in excell...,Material Great,Connect Personal Health with LearnSmart 1 Seme...,McGraw-Hill Humanities/Social Sciences/Languages,Software,"15,675 in Software ("
1,77613252,A1YCCU0YRLS0FE,4.0,2014-02-23,I am really enjoying this book with the worksh...,Health,Connect Personal Health with LearnSmart 1 Seme...,McGraw-Hill Humanities/Social Sciences/Languages,Software,"15,675 in Software ("
2,77613252,A1BJHRQDYVAY2J,1.0,2014-02-17,"IF YOU ARE TAKING THIS CLASS DON""T WASTE YOUR ...",ARE YOU KIDING ME?,Connect Personal Health with LearnSmart 1 Seme...,McGraw-Hill Humanities/Social Sciences/Languages,Software,"15,675 in Software ("
3,77613252,APRDVZ6QBIQXT,3.0,2014-02-17,This book was missing pages!!! Important pages...,missing pages!!,Connect Personal Health with LearnSmart 1 Seme...,McGraw-Hill Humanities/Social Sciences/Languages,Software,"15,675 in Software ("
4,77775473,A2JZTTBSLS1QXV,5.0,2013-10-14,I have used LearnSmart and can officially say ...,Best study product out there!,LearnSmart Access Card for Experience Psychology,McGraw-Hill Education,Software,"9,130 in Software ("


In [8]:
data.to_csv('data_Software', sep='\t', encoding='utf-8')

# EDA

In [9]:
# Вывод информации о структуре данных
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500217 entries, 0 to 500216
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   item_id    500217 non-null  object        
 1   user_id    500217 non-null  object        
 2   rating     500217 non-null  float64       
 3   timestamp  500217 non-null  datetime64[ns]
 4   text       500148 non-null  object        
 5   summary    500152 non-null  object        
 6   title      499831 non-null  object        
 7   brand      499831 non-null  object        
 8   main_cat   499831 non-null  object        
 9   rank       499831 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 42.0+ MB


In [10]:
# Список товаров и кол-во их покупок
data.item_id.value_counts()

B00UB76290    8994
B00CTTEKJW    7939
B00NG7JVSQ    6395
B00H9A60O4    4730
B00E6LJ2SA    4048
              ... 
B00Y0CIJ0W       1
B00076YWJ2       1
B000HJMG2E       1
B00UT1W7PQ       1
B014R6K82M       1
Name: item_id, Length: 21663, dtype: int64

In [11]:
# Список покупателей и кол-во их покупок
data.user_id.value_counts()

A15S4XW3CRISZ5    132
A5JLAU2ARJ0BO     121
A3W4D8XOGLWUN5     73
A680RUE1FDO8B      71
A3VL4RXCWNSR3H     71
                 ... 
A33PEXCIXESRDO      1
ANXEIZVHOWZB1       1
A3N8U7VRBEWECZ      1
A1U74R5Q2LG6Y1      1
AO2Z8S5DLXDWA       1
Name: user_id, Length: 375147, dtype: int64

In [12]:
# Спискок рейтингов
data.rating.value_counts()

5.0    228984
1.0    113565
4.0     79843
3.0     43001
2.0     34824
Name: rating, dtype: int64

### Предобработка данных

In [13]:
# Исключение непопулярных товаров и покупателей, кто купил мало товаров
df = data.copy()
df = df[df.groupby('item_id').transform('count').iloc[:, 0] > 10]
df = df[df.groupby('user_id').transform('count').iloc[:, 0] > 3]

# Удаление пустых позиций из датафрейма
df = df.drop_duplicates(['item_id', 'user_id']).dropna()
df.shape

(35370, 10)

In [14]:
# Функция разбиения данных на тренировочную и тестовую выборки
def train_test_split(data, ratio=0.2, user_col='user_id', item_col='item_id',
                     rating_col='rating', time_col='timestamp'):
    data = data.sort_values(by=[time_col])
    
    indx = int(len(data) * (1 - ratio))
    train_data = data[[user_col, item_col, rating_col]][:indx]
    test_data = data[[user_col, item_col, rating_col]][indx:]
    
    return train_data, test_data

In [15]:
train_data, test_data = train_test_split(df, ratio=0.2)
print(train_data.shape, test_data.shape)

(28296, 3) (7074, 3)


In [16]:
train_data.head()

Unnamed: 0,user_id,item_id,rating
3854,A3H4TIVTTA5IBB,B0000296ZH,3.0
3336,A3QQO9GVE0GOFE,B00001YVBG,3.0
5244,A1U9LTA3EWSNY1,B00003IRBV,5.0
5054,A1U9LTA3EWSNY1,B00003IRBU,5.0
3974,A2DJDS2KLFR9YM,B00002K10B,4.0


### Колоборативная фильтрация

### User-based model

In [48]:
class UserBased(BaseEstimator):
    def fit(self, train_data, user_col='user_id', item_col='item_id', rating_col='rating'):
        data = train_data.copy()
        
        # Определение списка покупателей и товаров
        self.users = data[user_col].unique()
        self.items = data[item_col].unique()

        # Определение среднего рейтинга товара для каждого покупателя
        self.rating_mean = data.groupby(user_col)[rating_col].mean()
        
        # Расчет среднего рейтинга товара для каждого покупателя в датафрейме
        data[rating_col] = data[rating_col] - data.groupby(user_col)[rating_col].transform('mean')

        # Построение сводной таблицы рейтингов товаров в разрезе покупателя и товара
        self.ratings = pivot_table(data, values=rating_col, index=user_col, columns=item_col)

        # Расчет коэффициента схожести покупателей
        self.similarities = pd.DataFrame(cosine_similarity(self.ratings), index=self.ratings.index)
        
        return self

    def predict_rating(self, pr_user, pr_item):
        # если в обучающей выборке нет такого товара или покупателя то устнавливаем нулевой рейтинг
        if not pr_item in self.items or not pr_user in self.users:
            return 0

        # Расчет прогнозного значения рейтинга товара на основе коэффициента схожести
        numerator = np.dot(self.similarities.loc[pr_user], self.ratings.loc[:, pr_item])
        denominator = self.similarities.loc[pr_user].sum() - 1
        
        return self.rating_mean[pr_user] + numerator / denominator
    

    def predict(self, X, user_col='user_id', item_col='item_id'):
        # Расчет прогнозных значений рейтингов товаров
        rating_pred = test_data[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return rating_pred

In [49]:
# Функция построения сводных таблиц
def pivot_table(data, values='rating', index='user_id', columns='item_id'):
    rows, rows_pos = np.unique(data[index], return_inverse=True)
    cols, cols_pos = np.unique(data[columns], return_inverse=True)
    
    table = np.zeros((len(rows), len(cols)), dtype=np.float16)
    table[rows_pos, cols_pos] = data[values]
    
    return pd.DataFrame(table, index=rows, columns=cols)

# Функция расчета коэффициента RMSE
def RMSE(Y_true, Y_pred):
    return np.sqrt(mean_squared_error(Y_true, Y_pred))

In [50]:
# Построение модели и прогнозирование рейтинга товаров на основе схожести покупателей
print('Старт обучения UserBased...')
ub_model = UserBased().fit(train_data)
print('Старт прогнозирования...')
ub_pred = ub_model.predict(test_data)
print('RMSE = {:0.4f}'.format(RMSE(test_data['rating'], ub_pred)))

Старт обучения UserBased...
Старт прогнозирования...
RMSE = 3.6353


### Item-based model

In [51]:
class ItemBased(BaseEstimator):
    def fit(self, train_data, user_col='user_id', item_col='item_id', rating_col='rating'):
        data = train_data.copy()

        # Определение списка покупателей и товаров
        self.users = data[user_col].unique()
        self.items = data[item_col].unique()
        
        # Определение среднего рейтинга для каждого товара
        self.rating_mean = data.groupby(item_col)[rating_col].mean()

        # Расчет среднего рейтинга каждого товара в датафрейме
        data[rating_col] = data[rating_col] - data.groupby(item_col)[rating_col].transform('mean')

        # Построение сводной таблицы рейтингов товаров в разрезе покупателя и товара
        self.ratings = pivot_table(data, values=rating_col, index=item_col, columns=user_col)

        # Расчет коэффициента схожести товаров
        self.similarities = pd.DataFrame(cosine_similarity(self.ratings), index=self.ratings.index)

        return self
    
    def predict_rating(self, pr_user, pr_item):
        # Для отсутствующих покупателей и товаров устанавливаем нулевой рейтинг
        if not pr_item in self.items or not pr_user in self.users:
            return 0
        
        # Расчет прогнозного значения рейтинга товара на основе коэффициента схожести
        numerator = np.dot(self.similarities.loc[pr_item], self.ratings.loc[:, pr_user])
        denominator = self.similarities.loc[pr_item].sum() - 1
        
        return self.rating_mean[pr_item] + numerator / denominator
    
    def predict(self, test_data, user_col='user_id', item_col='item_id'):
        # Расчет прогнозных значений рейтингов товаров
        rating_pred = test_data[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return rating_pred

In [52]:
# Построение модели и прогнозирование рейтинга товаров на основе схожести товаров
print('Старт обучения ItemBased...')
ib_model = ItemBased().fit(train_data)
print('Старт прогнозирования...')
ib_pred = ib_model.predict(test_data)
print('RMSE = {:0.4f}'.format(RMSE(test_data['rating'], ib_pred)))

Старт обучения ItemBased...
Старт прогнозирования...
RMSE = 3.6713


### Проверка рекомендательной системы

In [53]:
df.head()

Unnamed: 0,item_id,user_id,rating,timestamp,text,summary,title,brand,main_cat,rank
21,321719816,A38NELQT98S4H8,4.0,2010-10-20,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
22,321719816,A3QJU4FEN8PQSZ,4.0,2010-10-18,"The demo is done with the PC version, with ref...",A good value,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
23,321719816,ACJT8MUC0LRF0,5.0,2010-10-16,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
24,321719816,A2RQ0AT4XZUIXL,5.0,2010-10-14,I've been working with Dreamweaver for a few y...,Wonderful introduction to Dreamweaver,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
25,321719816,AYUF7YETYOLNX,5.0,2010-10-12,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("


In [54]:
# Случайно выбранный покупатель для построения системы рекомендаций
user_id = 'A3QQO9GVE0GOFE'

In [55]:
# Товары, которые купил покупатель
df[df['user_id'] == 'A3QQO9GVE0GOFE']

Unnamed: 0,item_id,user_id,rating,timestamp,text,summary,title,brand,main_cat,rank
3336,B00001YVBG,A3QQO9GVE0GOFE,3.0,1999-12-01,"On the postive side, it looks just like the TV...",Great looking but thin on gameplay,The Simpsons: Virtual Springfield,by\n \n Fox Interactive Media,Video Games,"[>#63,976 in Video Games (See Top 100 in Video..."
17762,B00006BN8J,A3QQO9GVE0GOFE,2.0,2002-11-27,Our kids have enjoyed all three of the previou...,A disappointment. Try one of the other Thomas ...,Thomas &amp; Friends Building the New Line - ...,by\n \n Atari,Video Games,"[>#64,480 in Video Games (See Top 100 in Video..."
30938,B0001MXNVA,A3QQO9GVE0GOFE,4.0,2004-09-21,"Like a lot of Yu-Gi-Oh fans, my son enjoyed co...",A great way to learn how to play the card game,Yu-Gi-Oh Power of Chaos: Joey the Passion - PC,by\n \n Konami,Video Games,"[>#66,200 in Video Games (See Top 100 in Video..."
386987,B000029714,A3QQO9GVE0GOFE,4.0,2000-05-09,"Thomas, his friends, and the island of Sodor l...","Outstanding graphics & sound, a bit light in g...",Thomas the Tank Engine &amp; Friends - PC,by\n \n Atari,Video Games,"[>#85,005 in Video Games (See Top 100 in Video..."


In [56]:
# Построение вектора покупателя
user_vector = pivot_table(df).loc[user_id].reset_index()
user_vector['user_id'] = user_id

# Переименование столбцов датафрейма и задание нужной последовательности
user_vector.columns = ['item_id', 'rating', 'user_id']
user_vector = user_vector[['user_id', 'item_id', 'rating']]
user_vector = user_vector.merge(item_data2, how='left', on='item_id')

# Вывод первых позиций
user_vector.head()



Unnamed: 0,user_id,item_id,rating,title,brand,main_cat,rank
0,A3QQO9GVE0GOFE,77613252,0.0,Connect Personal Health with LearnSmart 1 Seme...,McGraw-Hill Humanities/Social Sciences/Languages,Software,"15,675 in Software ("
1,A3QQO9GVE0GOFE,321700945,0.0,Learn Adobe Photoshop Lightroom 3 by Video,Peach Pit Press,Software,"8,501 in Software ("
2,A3QQO9GVE0GOFE,321719816,0.0,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
3,A3QQO9GVE0GOFE,321719824,0.0,Learn Adobe Flash Professional CS5 by Video: C...,Peach Pit Press,Software,"11,954 in Software ("
4,A3QQO9GVE0GOFE,615179088,0.0,Human Japanese,Brak Software,Software,"3,689 in Software ("


In [57]:
# Построение прогноза рейтинга неприобретенных товаров
rating_null = user_vector[user_vector['rating']==0]
rating_pred = np.round((ub_model.predict(rating_null) + ib_model.predict(rating_null)) / 2, 1)

# Объединение реальных и прогнозных рейтингов товаров
item_rating = pd.concat([user_vector, rating_pred], axis=1).fillna(0)
item_rating.columns = ['user_id', 'item_id', 'rating', 'title', 'brand', 'main_cat', 'rank', 'pred_rating']

# Исключение товаров с низким реальным и прогнозным рейтингами
item_rating['total_rating'] = item_rating['rating'] + item_rating['pred_rating']
item_rating = item_rating[item_rating.total_rating >= 3].reset_index(drop=True)
item_rating.head()



Unnamed: 0,user_id,item_id,rating,title,brand,main_cat,rank,pred_rating,total_rating
0,A3QQO9GVE0GOFE,B00001YVBG,3.0,The Simpsons: Virtual Springfield,by\n \n Fox Interactive Media,Video Games,"[>#63,976 in Video Games (See Top 100 in Video...",0.0,3.0
1,A3QQO9GVE0GOFE,B00001YVBG,3.0,The Simpsons: Virtual Springfield,by\n \n Fox Interactive Media,Video Games,"[>#63,976 in Video Games (See Top 100 in Video...",0.0,3.0
2,A3QQO9GVE0GOFE,B000029714,4.0,Thomas the Tank Engine &amp; Friends - PC,by\n \n Atari,Video Games,"[>#85,005 in Video Games (See Top 100 in Video...",0.0,4.0
3,A3QQO9GVE0GOFE,B000029714,4.0,Thomas the Tank Engine &amp; Friends - PC,by\n \n Atari,Video Games,"[>#85,005 in Video Games (See Top 100 in Video...",0.0,4.0
4,A3QQO9GVE0GOFE,B00002S76T,0.0,eMerchant pro,BIG PICTURE TECHNOLOGIES,Software,"61,014 in Software (",4.5,4.5


In [58]:
item_rating.shape

(19, 9)

In [59]:
# Вывод ТОП-лучших из предлагаемых к покупке товаров по предсказанному рейтингу
item_rating[item_rating.pred_rating >= 4].sort_values('pred_rating', ascending=False)


Unnamed: 0,user_id,item_id,rating,title,brand,main_cat,rank,pred_rating,total_rating
15,A3QQO9GVE0GOFE,B000056B5W,0.0,Teach Me Blues Guitar,Voyetra,Software,[],5.0,5.0
9,A3QQO9GVE0GOFE,B00004W4AL,0.0,Quicken 2001 Basic,Intuit,Software,"7,089 in Software (",4.8,4.8
4,A3QQO9GVE0GOFE,B00002S76T,0.0,eMerchant pro,BIG PICTURE TECHNOLOGIES,Software,"61,014 in Software (",4.5,4.5
5,A3QQO9GVE0GOFE,B00003IRBV,0.0,Nancy Drew: Stay Tuned for Danger - PC,by\n \n Her Interactive,Video Games,"[>#47,965 in Video Games (See Top 100 in Video...",4.4,4.4
16,A3QQO9GVE0GOFE,B000056ORJ,0.0,Video Studio 5.0,ULead Systems,Software,"61,407 in Software (",4.3,4.3
8,A3QQO9GVE0GOFE,B00004UFGD,0.0,101 Languages of the World,Transparent Language,Software,"5,966 in Software (",4.2,4.2


# Библиотека Surprise

Используем библиотеку Surprise для Baseline, SVD, kNN и других методов

In [86]:
#!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-win_amd64.whl size=712976 sha256=0392ab57275a4fb7ba45c9ef25563b772161e783887fde34c1db80fb3fffc86f
  Stored in directory: c:\users\tima-\appdata\local\pip\cache\wheels\76\44\74\b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [66]:
import surprise
import json
import pandas as pd
import numpy as np
from collections import defaultdict

In [67]:
df.head()

Unnamed: 0,item_id,user_id,rating,timestamp,text,summary,title,brand,main_cat,rank
21,321719816,A38NELQT98S4H8,4.0,2010-10-20,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
22,321719816,A3QJU4FEN8PQSZ,4.0,2010-10-18,"The demo is done with the PC version, with ref...",A good value,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
23,321719816,ACJT8MUC0LRF0,5.0,2010-10-16,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
24,321719816,A2RQ0AT4XZUIXL,5.0,2010-10-14,I've been working with Dreamweaver for a few y...,Wonderful introduction to Dreamweaver,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("
25,321719816,AYUF7YETYOLNX,5.0,2010-10-12,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,Learn Adobe Dreamweaver CS5 by Video: Core Tra...,Peach Pit Press,Software,"9,956 in Software ("


In [68]:
df.describe()

Unnamed: 0,rating
count,35370.0
mean,3.72078
std,1.498078
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [69]:
reader = surprise.Reader(line_format='user item rating', rating_scale=(0, 5))

In [70]:
dataset = surprise.Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

In [71]:
dataset

<surprise.dataset.DatasetAutoFolds at 0x1dd31eda548>

## Baseline

In [72]:
bl = surprise.BaselineOnly()

In [73]:
%%time

surprise.model_selection.cross_validate(bl, dataset, measures=["RMSE", "MAE"], cv=10, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.3138  1.3178  1.3310  1.3372  1.3083  1.3235  1.3132  1.3168  1.3236  1.2980  1.3183  0.0107  
MAE (testset)     1.0796  1.0797  1.0961  1.1099  1.0746  1.0918  1.0838  1.0772  1.0850  1.0755  1.0853  0.0105  
Fit time          0.09    0.10    0.10    0.11    0.11    0.10    0.11    0.11    0.09    0.11    0.10    0.01    
Test time         0.02    0.01    0.02    0.02    0.02    0.02    0.02    0.01    0.01    0.01    0.02    0.00    
Wall time: 3.47 s


{'test_rmse': array([1.31383462, 1.3177953 , 1.33099915, 1.3372475 , 1.30833859,
        1.32348005, 1.31315812, 1.31678261, 1.32357254, 1.29797599]),
 'test_mae': array([1.07956827, 1.07973314, 1.09607718, 1.10989329, 1.07457866,
        1.09183371, 1.08383111, 1.07724751, 1.08504066, 1.07553067]),
 'fit_time': (0.09371066093444824,
  0.09963369369506836,
  0.10019612312316895,
  0.10813355445861816,
  0.10689520835876465,
  0.10034799575805664,
  0.10834002494812012,
  0.11263442039489746,
  0.08799934387207031,
  0.10643815994262695),
 'test_time': (0.016515016555786133,
  0.01461482048034668,
  0.019307374954223633,
  0.01659536361694336,
  0.0194857120513916,
  0.021147489547729492,
  0.01735973358154297,
  0.014113187789916992,
  0.014999866485595703,
  0.014000415802001953)}

Для Baseline значение RMSE = 1.3195

## SVD

In [74]:
svd1 = surprise.SVD(n_factors=6, n_epochs=30, lr_all=0.002, reg_all=0.05, random_state=42)

In [75]:
%%time

surprise.model_selection.cross_validate(svd1, dataset, measures=["RMSE", "MAE"], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.3076  1.2932  1.3428  1.3112  1.3162  1.3078  1.3367  1.3207  1.3226  1.3115  1.3170  0.0138  
MAE (testset)     1.0746  1.0645  1.1034  1.0756  1.0900  1.0722  1.0929  1.0864  1.0898  1.0791  1.0829  0.0111  
Fit time          0.80    0.75    0.84    0.77    0.83    0.86    0.79    0.77    0.81    0.78    0.80    0.03    
Test time         0.02    0.02    0.03    0.03    0.02    0.03    0.02    0.02    0.03    0.02    0.02    0.00    
Wall time: 9.25 s


{'test_rmse': array([1.30762569, 1.29317206, 1.3428375 , 1.31124599, 1.31617845,
        1.30775125, 1.33671198, 1.3207472 , 1.32262888, 1.31154682]),
 'test_mae': array([1.07456177, 1.0644908 , 1.10340455, 1.07556062, 1.09004952,
        1.07220209, 1.09294163, 1.08643321, 1.08981707, 1.07907622]),
 'fit_time': (0.7992308139801025,
  0.7529363632202148,
  0.84342360496521,
  0.7737212181091309,
  0.832634449005127,
  0.85835862159729,
  0.7941508293151855,
  0.7651560306549072,
  0.8143706321716309,
  0.777116060256958),
 'test_time': (0.022060871124267578,
  0.02478194236755371,
  0.029129505157470703,
  0.025961637496948242,
  0.02319622039794922,
  0.025460004806518555,
  0.022000789642333984,
  0.02483654022216797,
  0.02812361717224121,
  0.02326679229736328)}

In [76]:
# Ищем наилучшие параметры для SVD
param_grid = {'n_factors': [3, 6, 10, 20, 40], 'n_epochs': [30,], 
              'biased': [True, ], 'lr_all': [0.005, 0.001], 'reg_all': [0.05, 0.02],
              'random_state': [42, ]
             }

In [77]:
gs_svd = surprise.model_selection.GridSearchCV(surprise.SVD, param_grid, measures=["rmse", "mae"], cv=3)

In [78]:
%%time

gs_svd.fit(dataset)

Wall time: 53 s


In [79]:
vars(gs_svd.best_estimator["rmse"])

{'n_factors': 3,
 'n_epochs': 30,
 'biased': True,
 'init_mean': 0,
 'init_std_dev': 0.1,
 'lr_bu': 0.005,
 'lr_bi': 0.005,
 'lr_pu': 0.005,
 'lr_qi': 0.005,
 'reg_bu': 0.05,
 'reg_bi': 0.05,
 'reg_pu': 0.05,
 'reg_qi': 0.05,
 'random_state': 42,
 'verbose': False,
 'bsl_options': {},
 'sim_options': {'user_based': True}}

In [80]:
# С лучшими параметрами
svd = surprise.SVD(n_factors=3, n_epochs=30, biased = True, lr_all=0.005, reg_all=0.02, random_state=42)

In [81]:
%%time

surprise.model_selection.cross_validate(svd, dataset, measures=["RMSE", "MAE"], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.2371  1.2458  1.2483  1.2467  1.2567  1.2443  1.2836  1.2412  1.2572  1.2442  1.2505  0.0125  
MAE (testset)     0.9881  0.9957  0.9990  1.0055  1.0087  0.9970  1.0376  0.9931  1.0073  0.9931  1.0025  0.0133  
Fit time          0.82    0.74    0.80    0.87    0.87    0.81    0.95    0.77    0.83    0.84    0.83    0.05    
Test time         0.02    0.03    0.03    0.03    0.02    0.03    0.03    0.02    0.03    0.03    0.03    0.00    
Wall time: 9.42 s


{'test_rmse': array([1.23714722, 1.24583869, 1.24832874, 1.2467361 , 1.25674117,
        1.24429452, 1.28358372, 1.24117596, 1.25717294, 1.24423878]),
 'test_mae': array([0.98813649, 0.99567683, 0.99900009, 1.00546997, 1.00866867,
        0.99701554, 1.03755581, 0.9931284 , 1.00734705, 0.99307776]),
 'fit_time': (0.8217587471008301,
  0.7393434047698975,
  0.7999999523162842,
  0.8672659397125244,
  0.8726916313171387,
  0.8109302520751953,
  0.947413444519043,
  0.7715725898742676,
  0.8316671848297119,
  0.8443658351898193),
 'test_time': (0.022542476654052734,
  0.025002241134643555,
  0.025000572204589844,
  0.026012897491455078,
  0.023485183715820312,
  0.02900528907775879,
  0.0308685302734375,
  0.02378535270690918,
  0.02744770050048828,
  0.02899003028869629)}

Для SVD значение RMSE = 1.2502

## kNN

In [82]:
# Подбираем параметры
param_grid = {'k': [5, 10, 20, 40], 'min_k': [2,], 
              'sim_options': {
                  'name': ['cosine'],
                  'user_based': [True, False]
                  }
             }


In [83]:
gs = surprise.model_selection.GridSearchCV(surprise.KNNBaseline, param_grid, measures=["rmse", "mae"], cv=3)

In [84]:
%%time

gs.fit(dataset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Comput

In [85]:
vars(gs.best_estimator["rmse"])

{'bsl_options': {},
 'sim_options': {'name': 'cosine', 'user_based': False},
 'verbose': True,
 'k': 20,
 'min_k': 2}

In [86]:
# С лучшими параметрами
knn = surprise.KNNBaseline(k=40, min_k = 1, sim_options={'name':'cosine', 'user_based': True}, random_state=42)

In [87]:
surprise.model_selection.cross_validate(knn, dataset, measures=["RMSE", "MAE"], cv=10, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Comput

{'test_rmse': array([1.39316581, 1.39186798, 1.41234086, 1.41108828, 1.41691844,
        1.41445873, 1.39969009, 1.38709323, 1.39837112, 1.41944896]),
 'test_mae': array([1.09512523, 1.09715739, 1.10619693, 1.0928734 , 1.10676345,
        1.09577189, 1.09348049, 1.07132939, 1.08827655, 1.11246743]),
 'fit_time': (2.513823986053467,
  2.6343841552734375,
  2.750800609588623,
  2.656562566757202,
  2.4553465843200684,
  2.4111013412475586,
  2.3967607021331787,
  2.347466468811035,
  2.5778391361236572,
  2.598707437515259),
 'test_time': (0.19843721389770508,
  0.21052765846252441,
  0.25277256965637207,
  0.26184701919555664,
  0.20025134086608887,
  0.22941279411315918,
  0.2503011226654053,
  0.2295064926147461,
  0.2180023193359375,
  0.20223188400268555)}

Для KNN значение RMSE = 1.4056. Даже хуже, чем для Baseline

## Slope One

In [88]:
so = surprise.SlopeOne()

In [89]:
%%time

surprise.model_selection.cross_validate(so, dataset, measures=["RMSE", "MAE"], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.3685  1.3983  1.3809  1.3712  1.3917  1.3875  1.4133  1.3910  1.3763  1.4032  1.3882  0.0136  
MAE (testset)     0.9452  0.9696  0.9682  0.9661  0.9616  0.9603  0.9935  0.9741  0.9533  0.9735  0.9665  0.0124  
Fit time          0.44    0.43    0.41    0.46    0.44    0.41    0.42    0.42    0.49    0.47    0.44    0.03    
Test time         0.04    0.05    0.04    0.07    0.04    0.05    0.04    0.04    0.05    0.04    0.05    0.01    
Wall time: 5.73 s


{'test_rmse': array([1.36853196, 1.39828058, 1.38088451, 1.37121388, 1.39173991,
        1.38749872, 1.41325817, 1.39101996, 1.37628887, 1.40323407]),
 'test_mae': array([0.94517449, 0.96961051, 0.9681869 , 0.96612871, 0.96164749,
        0.96030332, 0.99352241, 0.97412876, 0.95328922, 0.97345441]),
 'fit_time': (0.44169139862060547,
  0.4280238151550293,
  0.4051814079284668,
  0.4617743492126465,
  0.43725132942199707,
  0.40837526321411133,
  0.41829514503479004,
  0.4236750602722168,
  0.48627376556396484,
  0.4713311195373535),
 'test_time': (0.03999662399291992,
  0.0454256534576416,
  0.04450654983520508,
  0.07186675071716309,
  0.04200172424316406,
  0.04675602912902832,
  0.03900027275085449,
  0.043886423110961914,
  0.051012277603149414,
  0.0409998893737793)}

## Co Clastering

In [90]:
param_grid = {
              'n_cltr_u': [3, 6, 10, 20],
              'n_cltr_i': [3, 6, 10, 20],
              'n_epochs': [30,],
              'random_state': [42,]
             }

In [91]:
gs_cc = surprise.model_selection.GridSearchCV(surprise.CoClustering, param_grid, measures=["rmse", "mae"], cv=3)

In [92]:
%%time

gs_cc.fit(dataset)

Wall time: 2min 44s


In [93]:
vars(gs_cc.best_estimator["rmse"])

{'bsl_options': {},
 'sim_options': {'user_based': True},
 'n_cltr_u': 6,
 'n_cltr_i': 3,
 'n_epochs': 30,
 'verbose': False,
 'random_state': 42}

In [94]:
# С лучшими параметрами
co = surprise.CoClustering(n_cltr_u=3, n_cltr_i = 3, n_epochs=30, random_state=42)

In [95]:
surprise.model_selection.cross_validate(co, dataset, measures=["RMSE", "MAE"], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.3495  1.3666  1.4306  1.3527  1.3612  1.3822  1.3421  1.3532  1.3387  1.3711  1.3648  0.0253  
MAE (testset)     0.9346  0.9499  0.9955  0.9461  0.9425  0.9697  0.9347  0.9327  0.9434  0.9609  0.9510  0.0186  
Fit time          2.68    2.74    2.73    2.76    2.61    2.79    2.80    2.76    2.72    2.76    2.74    0.05    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([1.34945685, 1.36659508, 1.4306453 , 1.35270113, 1.3612164 ,
        1.3822388 , 1.3421049 , 1.35321991, 1.33872894, 1.37108659]),
 'test_mae': array([0.93464951, 0.9498786 , 0.99552863, 0.94607381, 0.94254211,
        0.96966071, 0.93472782, 0.93265145, 0.94342193, 0.96089763]),
 'fit_time': (2.684061050415039,
  2.7429986000061035,
  2.7345476150512695,
  2.7638564109802246,
  2.6120030879974365,
  2.7900335788726807,
  2.7978954315185547,
  2.7583718299865723,
  2.715259552001953,
  2.7557880878448486),
 'test_time': (0.017999649047851562,
  0.02008843421936035,
  0.022008419036865234,
  0.01699995994567871,
  0.017999887466430664,
  0.01699995994567871,
  0.020039081573486328,
  0.016969680786132812,
  0.01821303367614746,
  0.017999887466430664)}

Для Co Clasting значение RMSE = 1.3571, практический такой же, как и для Baseline

## Расчитаем рекомендации для лучшей модели

In [96]:
# Берем лучшую модель, с лучшими параметрами
svd = surprise.SVD(n_factors=3, n_epochs=30, biased = True, lr_all=0.005, reg_all=0.02, random_state=42)

In [97]:
# В библиотеки surpise реализована правильная разбивка на тренировачную и тестовую часть, которая учитывает, 
# чтобы примеры были обязательно в обеих выборках
trainset = dataset.build_full_trainset()

In [98]:
%%time

svd.fit(trainset)

Wall time: 902 ms


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1dd4d14d388>

In [99]:
# Итератор для всех пар (u, i), которых НЕТ в обучающем наборе
testset = trainset.build_anti_testset()

In [100]:
products = {k:v for (k,v) in zip(df['item_id'], df['title'])}
products

{'0321719816': 'Learn Adobe Dreamweaver CS5 by Video: Core Training in Web Communication',
 '0321700945': 'Learn Adobe Photoshop Lightroom 3 by Video',
 '0321719824': 'Learn Adobe Flash Professional CS5 by Video: Core Training in Rich Media Communication',
 '0615179088': 'Human Japanese',
 '0763855553': 'Microsoft Office 365 Home | 1-year subscription, 5 users, PC/Mac Key Card',
 '0976963027': "Dave Ramsey's Personal Finance Software Version 5.3",
 '0982697813': 'NIV, GLO Premium, DVD: Multi-device',
 '1413313701': 'Quicken WillMaker Plus 2011 [Old Version]',
 '1413309674': 'Quicken WillMaker Plus 2009',
 '1413313728': 'Quicken Legal Business Pro 2011 [Old Version]',
 '1426296355': 'Complete National Geographic: Every Issue Since 1888',
 '158298302X': 'SUSE Linux 10.1',
 '1597750328': 'TOPO! National Geographic USGS Topographic Maps (California)',
 '1597750301': 'TOPO! National Geographic USGS Topographic Maps (Arizona)',
 '1600775411': 'Spanish Levels 1, 2 &amp; 3',
 '1615354336': 'En

In [101]:
def get_top_n(predictions, n=10):
    """Возвращает топ-N рекомендаций для каждого пользователя из набора прогнозов.

     Аргументы:
         предсказания (список объектов предсказаний): список предсказаний, каторый
             получен тестовым методом алгоритма.
         n(int): количество рекомендаций для вывода для каждого пользователя. По умолчанию
             это 10.

     Возвращает:
         словарь, в котором ключи представляют собой идентификаторы пользователей (необработанные), 
             а значения представляют собой списки кортежей:
         [(необработанный идентификатор элемента, оценка рейтинга), ...] размера n.    
    """

    # Сначала сопоставляем прогнозы с каждым пользователем.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Затем сортируем прогнозы для каждого пользователя и извлекаем n самых высоких.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [102]:
def get_recommendations(algo, user_id, n=20):
    global products, testset
    
    products_bought = set(df[df['user_id'] == user_id]['item_id'])
    products_bought_1 = [products[i] for i in products_bought]
    
    print(user_id)
    print('Куплено', products_bought_1)
    
    testset_1 = (el for el in testset if el[0] == user_id)
    
    predictions = algo.test(testset_1)
    
    products_rec = get_top_n(predictions, n=n)[user_id]
    
    
    products_rec = [p for p in products_rec if p[0] not in products_bought]
    
    products_rec_1 = [(products[p[0]], p[1]) for p in products_rec]
    
    print('\n\n\nРекомендации:', products_rec_1)
    
    return products_rec

In [103]:
rec = get_recommendations(algo=svd, user_id='A3QQO9GVE0GOFE')

A3QQO9GVE0GOFE
Куплено ['Yu-Gi-Oh Power of Chaos: Joey the Passion - PC', 'The Simpsons: Virtual Springfield', 'Thomas the Tank Engine &amp; Friends - PC', 'Thomas &amp; Friends  Building the New Line - PC/Mac']



Рекомендации: [('Key Span 4-Port USB Hub - White (UH-4WH)', 4.6484719180270275), ('Irocks Red USB 2.0 Illuminated 4PORT Hub', 4.639997572544239), ('Iogear POWERLINE/HOMEPLUG WALL MOUNT (GHPU21)', 4.632621441564319), ('Sony VAIO 802.11a/b/g Ethernet Converter (PCWADE80)', 4.629184346401905), ('World of Goo', 4.578937924108348), ('Corel Painter 2015 (Old Version)', 4.521944363150756), ('PhotoImpact 6.0', 4.460517645113921), ('Autodesk SketchBook Pro 7', 4.45761623387858), ("Apple iWork '08 - Old Version", 4.4541534839801935), ('Apple Mac OS X 10.4 Tiger [OLD VERSION]', 4.4447336969638735), ('TurboTax Deluxe Fed + Efile 2013 OLD VERSION', 4.439505985284535), ('Pajama Sam 3: You Are What You Eat From Your Head to Your Feet - PC/Mac', 4.415359346046722), ('H&amp;R Block Tax Softw

In [104]:
rec = get_recommendations(algo=bl, user_id='A3QQO9GVE0GOFE')

A3QQO9GVE0GOFE
Куплено ['Yu-Gi-Oh Power of Chaos: Joey the Passion - PC', 'The Simpsons: Virtual Springfield', 'Thomas the Tank Engine &amp; Friends - PC', 'Thomas &amp; Friends  Building the New Line - PC/Mac']



Рекомендации: [('Key Span 4-Port USB Hub - White (UH-4WH)', 4.783990814310533), ('Irocks Red USB 2.0 Illuminated 4PORT Hub', 4.775938023673797), ('Iogear POWERLINE/HOMEPLUG WALL MOUNT (GHPU21)', 4.756752201006135), ('Sony VAIO 802.11a/b/g Ethernet Converter (PCWADE80)', 4.756024220131915), ('Corel Painter 2015 (Old Version)', 4.379413594839381), ('Kindle for PC [Download]', 4.353078642111101), ("Apple iWork '08 - Old Version", 4.313876090907532), ('Norton Security Premium &ndash; 10 Devices &ndash; 1 Year Subscription - Instant Download - 2019 Ready', 4.297179651367568), ('TurboTax Deluxe Fed + Efile 2013 OLD VERSION', 4.286348012565313), ('Apple Mac OS X 10.4 Tiger [OLD VERSION]', 4.2827774242824335), ('Pajama Sam 3: You Are What You Eat From Your Head to Your Feet - PC/Mac

Алгоритмы baseline и svd рекомендуют практически одинаковые товары 

## Content-based recommender systems

In [36]:
item_data2.head()

Unnamed: 0,item_id,title,brand,main_cat,rank
0,0030672120,HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK ...,HOLT. RINEHART AND WINSTON,Software,"25,550 in Software ("
1,0071480935,"Sing, Watch, &amp; Learn Spanish (DVD + Guide)...",McGraw Hill,Software,"15,792 in Software ("
2,007329506X,Connect with LearnSmart Access Card for Microb...,McGraw-Hill Science/Engineering/Math,Software,"16,900 in Software ("
3,0073513458,LearnSmart Standalone Access Card for Prescott...,McGraw-Hill Education,Software,"12,986 in Software ("
4,0073525758,Anatomy &amp; Physiology Revealed Student Acce...,McGraw-Hill Education,Software,"14,861 in Software ("


In [42]:
item_data2.shape

(26790, 5)

In [37]:
# Выполняем TF-IDF и считаем скор для каждого названия товара.
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf.fit_transform(item_data2['title'])

In [38]:
# Считаем похожесть одного описания на другое, используя косинусовую меру.
cosine_similarities = cosine_similarity(tfidf_matrix)

In [39]:
tfidf_matrix.shape

(26790, 13203)

In [40]:
# Оставляем 50 наиболее похожих по описанию товаров для каждого товара 
# (убираем первый элемент - описание товара похоже больше всего само на себя)
similarities = {}
for i in range(len(cosine_similarities)):
    # сортировка по схожести и запись индексов
    similar_indx = cosine_similarities[i].argsort()[:-50:-1]
    similarities[item_data2['item_id'].iloc[i]] = [(cosine_similarities[i][x], 
                                                     item_data2.iloc[x]['title'], 
                                                     item_data2.iloc[x]['item_id']) for x in similar_indx][1:]


In [44]:
similarities

{'0030672120': [(0.27803654320409116, 'QuickStudy Physics 1', 'B000RPHPJM'),
  (0.26335580967091154, 'Thinkwell Physics I', '1605380261'),
  (0.2551411073843006, 'Nuclear Physics Training Course CD', 'B00068X4NG'),
  (0.24342922592001426, 'High Achiever Physics 2', 'B000J45PBQ'),
  (0.22624754006449938, 'LDS Library Lesson &amp; Talk Builder', '0977022242'),
  (0.2145463754082158, ' SmartBook for Core Concepts in Health', 'B012P5L9NE'),
  (0.20035109975756946,
   'Learning Resources Radius CD Card Set Math Concepts &amp; Vocabulary K-2',
   'B000P9AUJW'),
  (0.19957397229454887, 'PRINT IT 3 [CD-ROM]', 'B0009GV1Y4'),
  (0.19619116931408415, 'Quick Speed PC', 'B00IMNYMAG'),
  (0.19241151557532646, 'Award Quick', '1886715378'),
  (0.1900449316748076,
   ' LearnSmart for Core Concepts in Health, Brief',
   'B00JJ3VY5O'),
  (0.1891216746266291, 'My First CD-ROM: Preschool', 'B0001GJC7A'),
  (0.1891216746266291, 'My First CD-ROM: Preschool', 'B0001GJC7A'),
  (0.1888303055795214, 'Quick Books

In [41]:
# всего товаров
len(similarities)

21639

In [43]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, item, recom_item):
        rec_items = len(recom_item)

        print(f'The {rec_items} recommended items are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_item[i][1]} with {round(recom_item[i][0], 3)} similarity score")
            print("--------------------")

    def recommend(self, recommendation, verbose = True):
        # Выбрать товар для поиска рекомендаций
        item = recommendation['item']
        # Выбрать число товаров для рекомендаций
        number_items = recommendation['item_number']
        # Выбрать товары, наиболее похожее из матрицы
        recom_item = self.matrix_similar[item][:number_items]
        # распечатать
        if verbose:
            self._print_message(item=item, recom_item=recom_item)
        return recom_item

In [46]:
recommedations = ContentBasedRecommender(similarities)

In [47]:
idx = 42

recommendation = {
    "item": item_data2['item_id'].iloc[idx],
    "item_number": 5
}
print('Recommendations for ',item_data2['item_id'].iloc[idx], item_data2['title'].iloc[idx],':\n\n')
recom_item = recommedations.recommend(recommendation)

Recommendations for  0077773284 LearnSmart Access Card for Introduction to Managerial Accounting :


The 5 recommended items are:
Number 1:
 LearnSmart for Financial and Managerial Accounting with 0.647 similarity score
--------------------
Number 2:
 LearnSmart for Garrison Managerial Accounting with 0.579 similarity score
--------------------
Number 3:
LearnSmart Access Card for Fundamentals of Financial Accounting with 0.523 similarity score
--------------------
Number 4:
LearnSmart Access Card for Chemistry with 0.486 similarity score
--------------------
Number 5:
LearnSmart Access Card for Biology with 0.479 similarity score
--------------------


Далее можно построить комбинированную систему, например можно добавить к топ-рекомендациям полученных, например, с помощью модели SVD, рекомендации, которые контекстно похожи на "любимый" товар, который пользователь купил больше всего

## Feature augmentation recommender systems

Рассмотрим еще один комбинированный подход, где оценки полученные на основе алгоритма коллабаративной фильтрации, мы используем в системе на основе Content-base

In [63]:
# Оценки полученные на основе алгоритма коллабаративной фильтрации, мы используем в системе на основе Content-base
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf.fit_transform(item_rating['title'])

# Определение коэффициентов схожести товаров по наименованию
cosine_similarities = cosine_similarity(tfidf_matrix)

# Выборка товаров с лучшим прогнозным рейтингом и коэффициентом схожести
indx = item_rating[item_rating.rating >= 4].index.to_list()
similar_items = [similar_indx[:-50:-1] for similar_indx in cosine_similarities[indx].argsort(axis=1)]
similar_items = np.array(similar_items).flatten()


### Вывод ТОП-лучших для покупки товаров по схожести товаров

In [62]:
top = item_rating.iloc[similar_items]
top = top[top.rating == 0]
top = top.astype(str).drop_duplicates()
# #.sort_values('pred_rating', ascending=False)

top.head()

Unnamed: 0,user_id,item_id,rating,title,brand,main_cat,rank,pred_rating,total_rating
5,A3QQO9GVE0GOFE,B00003IRBV,0.0,Nancy Drew: Stay Tuned for Danger - PC,by\n \n Her Interactive,Video Games,"['>#47,965 in Video Games (See Top 100 in Vide...",4.4,4.4
11,A3QQO9GVE0GOFE,B00004WJOU,0.0,Curious George Preschool Learning Games - PC/Mac,Simon &amp; Schuster,Software,"10,994 in Software (",3.6,3.6
7,A3QQO9GVE0GOFE,B00004R8JP,0.0,The American Sign Language Dictionary,"Multimedia 2000, Inc.",Software,"15,795 in Software (",3.8,3.8
4,A3QQO9GVE0GOFE,B00002S76T,0.0,eMerchant pro,BIG PICTURE TECHNOLOGIES,Software,"61,014 in Software (",4.5,4.5
6,A3QQO9GVE0GOFE,B00004NHL6,0.0,Portuguese Now! 8.0,Transparent Language,Software,"51,451 in Software (",3.0,3.0


### Выводы

Были рассмотрены основные подходы для рекомендательных систем. 

Очень понравилась библиотеке Surprise, своей скоростью и разнообразием алгоритмов для построения рекомендательных систем. 

Эффективность алгоритмов оценивалась с помощью метрики RMSE.
Но, на практике нам нужно получить на самом деле топ-5 или топ-10 товаров, которые мы должны рекомендовать. Если для остальных тысячах товаров мы не угадаем с оценками, то это не страшно. Поэтому сравнивая системы с помощью RMSE или MSE не очень корректно. 

Оценивать систему нужно на бизнесе, проверяя через какое-то время, сколько из рекомендованных товаров в итоге купит пользователей.

Лучше использовать комбинированные подходы. Использовать множество простых алгоритмов, чем искать один супер-сложный, которые закроет всю область.

Обязательно нужна фильтрация по группам товаров, и для каждой из них желательно разработать свою рекомендательную модель. 
Для Content-base системы нужно смотреть не на последний товар (не на самый любимый), а на несколько последних (несколько любимых).

Можно еще добавлять в рекомендации самые ходовые и новинки из этой же категории, а потом случайно перемешивать.
