# Домашняя работа 3. Гибридные рекомендательные системы

Датасет ml-latest
 - Вспомнить подходы, которые мы разбирали
 - Выбрать понравившийся подход к гибридным системам
 - Написать свою

Решением будет ссылка на гитхаб с готовым ноутбуком

## Импорт библиотек

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm, tqdm_notebook

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Модели и метрики
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.preprocessing import normalize

%matplotlib inline

## Загрузка данных

In [2]:
movies = pd.read_csv('../data/ml-latest-small/movies.csv', )
links = pd.read_csv('../data/ml-latest-small/links.csv')
tags = pd.read_csv('../data/ml-latest-small/tags.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv',)

## Построение гибридной рекомендательной системы

 - Холодный старт (до 5 оценок) - рекомендуем наиболее популярные фильмы
 - Теплый старт (от 5 до 10 оценок) - рекомендуем фильмы на основе сожержания со стекингом
 - Горячий старт (от 10 до бесконечности оценок) - коллаборативная фильтрация с блендингом (Item, User based)

##### Холодный старт - наиболее популярные фильмы

In [3]:
# Рекомендательная система для холодного старта
movies_means = ratings.groupby('movieId').agg({'userId': np.count_nonzero, 
                                'rating': [np.median, np.var, np.average]})
movies_means.columns=['userid_count', 'movie_rating_median', 'movie_rating_var', 'movie_rating_average']
movies_means=movies_means.fillna(0)

In [4]:
movies_means_normalize = pd.DataFrame(data=normalize(movies_means), columns=movies_means.columns)
movies_means_normalize['movieId']=movies_means.index
movies_means_normalize.head()

Unnamed: 0,userid_count,movie_rating_median,movie_rating_var,movie_rating_average,movieId
0,0.999656,0.018598,0.003241,0.018231,1
1,0.998984,0.031786,0.00706,0.031167,2
2,0.996164,0.057471,0.021315,0.062444,3
3,0.87443,0.374756,0.090715,0.294451,4
4,0.996044,0.060982,0.016728,0.062434,5


In [5]:
# Нормируем количество оценок пользователей и рейтинг фильмов
movies_popularity = movies_means

# Популярность фильма = нормированный райтинг * нормированное количество оценок
movies_popularity['popularity'] = movies_popularity['userid_count'] * movies_popularity['movie_rating_average']
movies_popularity = movies_popularity.merge(movies, on='movieId', how='left', sort=False)[['movieId', 'title', 'genres', 'popularity']]

In [6]:
def cold_start(userId):
    movies = movies_popularity.sort_values('popularity', ascending=False)[['movieId', 'title', 'popularity']].head(10)
    return movies

In [7]:
cold_start(100)

Unnamed: 0,movieId,title,popularity
277,318,"Shawshank Redemption, The (1994)",1404.0
314,356,Forrest Gump (1994),1370.0
257,296,Pulp Fiction (1994),1288.5
1938,2571,"Matrix, The (1999)",1165.5
510,593,"Silence of the Lambs, The (1991)",1161.0
224,260,Star Wars: Episode IV - A New Hope (1977),1062.0
97,110,Braveheart (1995),955.5
2224,2959,Fight Club (1999),931.5
461,527,Schindler's List (1993),929.5
418,480,Jurassic Park (1993),892.5


#### Теплый старт - рекомендация на основе содержания

###### Сформируем следующие признаки, для того чтобы сделать Content-bases рекомендации:
    1) TF-IDF метрика на жанрах и тегах
    2) Средняя оценка пользователя и фильма
    3) Медианная оценка пользователя и фильма
    4) Дисперсия оценки пользователя и фильма
    5) Количество оценок пользователя и фильма
    
Так же обогатим модель результатами **логистической регрессии** для оценки пользователя, 
после обучим алгоритм **KNN** на полученных данных. 

Для каждого пользователя будет формироваться **своя модель** на основе поставленных им оценок и тэгов.

In [8]:
# Сгруппируем тэги для фильмов
grouped_tags = tags.groupby('movieId').agg({'tag': [(lambda x: "|".join(x)), np.count_nonzero]})
grouped_tags.columns=['all_tags', 'all_tags_count']

#### Формируем таблицу с фильмами

In [9]:
# Функция для TF-IDF метрики
def tf_idf(row, value, dictionary):
    return (1/len(row.split('|')))*dictionary[value] if value in row else 0

In [10]:
# Датасет фильмов и оценок пользователей к ним
movies_with_tags = movies.merge(grouped_tags, on='movieId', how='left', sort=False)\
                                 .merge(movies_means_normalize, on='movieId', how='left', sort=False)
movies_with_tags['all_tags'] = movies_with_tags['all_tags'].fillna('')
movies_with_tags = movies_with_tags.fillna(0)

In [11]:
# Сформируем список жанров:
genres_list = []
for i in movies.genres.str.split('|'):
    for j in i:
        genres_list.append(j)
        
### Итоговый словарь жанров:
genres_dict = {i:np.log(len(movies)/genres_list.count(i)) for i in genres_list}

In [12]:
# Добавим новые фичи в датасет (TF-IDF на жанрах):
for i in tqdm(genres_dict):
    movies_with_tags['tf_idf_'+i] = movies_with_tags.apply(lambda row: tf_idf(row['genres'], i, genres_dict), axis=1)

100%|██████████| 20/20 [00:02<00:00,  6.78it/s]


In [13]:
# Сформируем список тэгов
tags_list = []
for i in grouped_tags.all_tags.str.split('|'):
    for j in i :
        tags_list.append(j)
        
# Итоговый словарь тэгов
tags_dict = {i:np.log(len(movies)/tags_list.count(i)) for i in tags_list if tags_list.count(i)>5 and i!=''}

In [14]:
# Добавим новые фичи в датасет (TF-IDF на тэгах для пользователя и фильма):
for i in tqdm(tags_dict):
    movies_with_tags['tf_idf_'+i] = movies_with_tags\
                .apply(lambda row: tf_idf(row['all_tags'], i, tags_dict), axis=1)

100%|██████████| 125/125 [00:22<00:00,  5.61it/s]


In [15]:
movies_tf_idf = movies_with_tags.drop(['genres', 'title', 'all_tags'], axis=1)

##### Формируем таблицу с пользователями

In [16]:
# Средняя оценка, медианное значение, дисперсия, количество оценок пользователей
users_mean = ratings.groupby('userId').agg({'movieId': np.count_nonzero, 
                                'rating': [np.median, np.var, np.average]})
users_mean.columns=['movieid_count', 'user_rating_median', 'user_rating_var', 'user_rating_average']

In [17]:
users_mean_normalize = pd.DataFrame(data=normalize(users_mean), columns=users_mean.columns)
users_mean_normalize['userId']=users_mean.index
users_mean_normalize.head()

Unnamed: 0,movieid_count,user_rating_median,user_rating_var,user_rating_average,userId
0,0.999587,0.021543,0.002758,0.018813,1
1,0.981496,0.135379,0.021966,0.133628,2
2,0.99179,0.012715,0.111151,0.061946,3
3,0.999661,0.018512,0.007993,0.016455,4
4,0.992294,0.090209,0.022123,0.082008,5


In [18]:
# И посмотрим какие оценки в среднем пользователь ставит жанрам фильмов
user_ratings = ratings.merge(movies, on='movieId', how='left', sort=False)[['userId', 'movieId', 'rating', 'genres']]
for i in tqdm(genres_dict):
    user_ratings['genre_'+i] = user_ratings.apply(lambda row: row['rating'] if i in row['genres'] else None,axis=1)
    
user_ratings = user_ratings.groupby('userId').mean().drop(['movieId', 'rating'], axis=1).fillna(0)

100%|██████████| 20/20 [00:29<00:00,  1.46s/it]


In [19]:
users_values = users_mean_normalize.merge(user_ratings, on='userId', how='left', sort=False)

In [20]:
users_values.head(2)

Unnamed: 0,movieid_count,user_rating_median,user_rating_var,user_rating_average,userId,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Fantasy,...,genre_Horror,genre_Mystery,genre_Sci-Fi,genre_War,genre_Musical,genre_Documentary,genre_IMAX,genre_Western,genre_Film-Noir,genre_(no genres listed)
0,0.999587,0.021543,0.002758,0.018813,1,4.388235,4.689655,4.547619,4.277108,4.297872,...,3.470588,4.166667,4.225,4.5,4.681818,0.0,0.0,4.285714,5.0,0.0
1,0.981496,0.135379,0.021966,0.133628,2,4.166667,0.0,0.0,4.0,0.0,...,3.0,4.0,3.875,4.5,0.0,4.333333,3.75,3.5,0.0,0.0


##### Итоговый датасет для обучения

In [21]:
movies_users = ratings.merge(users_values, on='userId', how='left', sort=False)\
                      .merge(movies_tf_idf, on='movieId', how='left', sort=False)

data = movies_users.drop(['userId', 'movieId','timestamp'], axis=1)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Columns: 175 entries, rating to tf_idf_Magic
dtypes: float64(175)
memory usage: 135.4 MB


In [23]:
X = data.drop(['rating'], axis=1)
y = data['rating']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

## Обучение 
#### 1. Huber-регрессия

In [25]:
regr_model = HuberRegressor()
epsilons = [4, 5]
alphas =[0.0001, 0.001]

regr_params = dict(epsilon=epsilons, alpha=alphas)

regr_grid = GridSearchCV(regr_model, regr_params, cv=5, scoring='neg_mean_squared_error')

In [26]:
regr_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'epsilon': [4, 5], 'alpha': [0.0001, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [27]:
best_estimator = regr_grid.best_estimator_
best_estimator

HuberRegressor(alpha=0.001, epsilon=5, fit_intercept=True, max_iter=100,
        tol=1e-05, warm_start=False)

In [28]:
predictions = best_estimator.predict(X_test)
mean_squared_error(predictions, y_test)

0.8129338792618208

In [29]:
predictions

array([3.76610137, 4.07404676, 3.84858635, ..., 3.2899016 , 2.89040975,
       3.67551327])

###### Формируем рекомендации для пользователей на теплом старте

In [30]:
def warm_start(userId):
    moviesids = ratings[(ratings.userId==1)]['movieId'].values
    
    movies_not_wathed = movies_tf_idf[(~movies.movieId.isin(moviesids))]
    movies_not_wathed['key'] = 0
    users = users_values[(users_values.userId==userId)]
    users['key'] = 0
    
    movies_not_wathed = movies_not_wathed.merge(users, on='key', how='left')
    data = movies_not_wathed.drop(['userId', 'movieId', 'key'], axis=1).fillna(0)
    
    result = movies_not_wathed[['movieId']].merge(movies, on='movieId', how='inner')
    result['prediction'] = best_estimator.predict(data)
    
    return result.sort_values('prediction', ascending=False).head(10)

In [31]:
warm_start(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,movieId,title,genres,prediction
660,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,10.597331
4677,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,9.172502
238,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,9.166347
3331,4878,Donnie Darko (2001),Drama|Mystery|Sci-Fi|Thriller,8.005288
7140,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,7.44914
2856,4144,In the Mood For Love (Fa yeung nin wa) (2000),Drama|Romance,5.87518
8683,135536,Suicide Squad (2016),Action|Crime|Sci-Fi,5.735998
6980,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,5.545115
2515,3676,Eraserhead (1977),Drama|Horror,5.300437
1288,1921,Pi (1998),Drama|Sci-Fi|Thriller,5.250983


#### Горячий старт - SVD на Surprise (Коллаборативная фильтрация)

Для обучения модели будем использовать тех пользователей, которые поствили больше 10 оценок

In [37]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader

from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [38]:
movies_and_ratings = ratings.merge(movies, on='movieId', how='left')
dataset = pd.DataFrame({
    'uid': movies_and_ratings.userId,
    'iid': movies_and_ratings.movieId,
    'rating': movies_and_ratings.rating
})

In [39]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [45]:
# Установим параметры алгоритма

In [46]:
trainset, testset = train_test_split(data, test_size=.25)

In [63]:
##### Use grid search for model selecting:
n_factorss=[25, 50, 60, 75, 100]
reg_alls = [0.005, 0.02,  0.05, 0.1, 0.5]

param_grid = { 'n_factors': n_factorss,
               'reg_all': reg_alls }

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

gs.fit(data)

In [82]:
gs.best_score

{'rmse': 0.8741976715761056}

In [79]:
algo = gs.best_estimator['rmse']
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10e79beb8>

In [81]:
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8680


0.8679577927573275

In [83]:
# Подбираем 10 фильмов для известного пользователя
def hot_start(uid):
    df = pd.DataFrame(columns=['userId', 'movieId', 'title', 'rating_pred'])
    uid_movies = movies_and_ratings[(movies_and_ratings.userId==uid)].movieId.values
    iid_movies = [i for i in movies_and_ratings.movieId.unique() if i not in uid_movies]
    for i in iid_movies:
        if trainset.knows_item(i):
            if len(movies[(movies.movieId==i)])!=0:
                title = ''.join(movies[(movies.movieId==i)].title.values)
                prediction = algo.predict(uid=uid, iid=i)
                df = df.append({'userId': uid, 'movieId': i, 'title':title, 'rating_pred': prediction.est}, ignore_index=True)
    return df.sort_values('rating_pred', ascending=False).head(10)

In [85]:
hot_start(100)

Unnamed: 0,userId,movieId,title,rating_pred
203,100,318,"Shawshank Redemption, The (1994)",4.607177
273,100,912,Casablanca (1942),4.573458
1465,100,1262,"Great Escape, The (1963)",4.5684
270,100,904,Rear Window (1954),4.549644
214,100,1272,Patton (1970),4.529413
3,100,50,"Usual Suspects, The (1995)",4.501932
56,100,1196,Star Wars: Episode V - The Empire Strikes Back...,4.492901
817,100,1201,"Good, the Bad and the Ugly, The (Buono, il bru...",4.490251
825,100,1276,Cool Hand Luke (1967),4.489899
1585,100,1204,Lawrence of Arabia (1962),4.488606


In [109]:
# Функция для определения предсказаний рекомендательной системы
def create_predictions(userId):
    try: 
        count_films = users_mean[(users_mean.index==userId)].movieid_count.values[0]
        if count_films > 50:
            print('Use hot start with {0} number films'.format(count_films))
            movies = hot_start(userId)
        elif count_films>15:
            movies = warm_start(userId)
            print('Use warm start with {0} number films'.format(count_films))
        else:
            movies = cold_start(userId)
            print('Use cold start with {0} number films'.format(count_films))
        return movies
    except:
        print('No information about user')
        movies = cold_start(userId)
        return movies

In [111]:
create_predictions(1)

Use hot start with 232 number films


Unnamed: 0,userId,movieId,title,rating_pred
0,1,318,"Shawshank Redemption, The (1994)",5.0
1473,1,1204,Lawrence of Arabia (1962),5.0
76,1,912,Casablanca (1942),4.976283
1352,1,1262,"Great Escape, The (1963)",4.969381
95,1,1250,"Bridge on the River Kwai, The (1957)",4.963579
746,1,3275,"Boondock Saints, The (2000)",4.958239
73,1,904,Rear Window (1954),4.956128
1370,1,3836,Kelly's Heroes (1970),4.951676
1319,1,3030,Yojimbo (1961),4.950055
97,1,1266,Unforgiven (1992),4.936721
