### Задание к теме «Гибридные рекомендательные системы»

Датасет ml-latest

Вспомнить подходы, которые мы разбирали

Выбрать понравившийся подход к гибридным системам

Написать свою

In [1]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from tqdm import tqdm_notebook

import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [4]:
# задаем датасет в формате surprise
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [5]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [8]:
# обучаем модель KNNWithMeans
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7efc9c3006d8>

In [9]:
# делаем оценку качества на тесте
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8728


0.8728229416089248

In [10]:
# Определим пользователя и фильмы, которые он посмотрел и оценил
current_user_id = 20.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

In [11]:
scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [12]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [13]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [14]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [15]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
def recommend_for_user(user_id):
    """Функция рекомендаций"""
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = title_genres[last_user_movie]
    
    movie_genres = change_string(movie_genres)

    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]

    print('Last User Movie: ', last_user_movie, '\n')
    print('Recommendation for the User: ', '\n')
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [17]:
recommend_for_user(20.0)

Last User Movie:  What Lies Beneath (2000) 

Recommendation for the User:  

Kingdom II, The (Riget II) (1997) 4.167669172932332
The Phantom of the Opera (1962) 3.524253391545983
The Iguana With The Tongue of Fire (1971) 3.524253391545983
Pensione Paura (1977) 3.524253391545983
The Hearse (1980) 3.524253391545983
Full Circle (1978) 3.524253391545983
Creature from Black Lake (1976)  3.524253391545983
Blow Job (1980) 3.524253391545983
Restless Souls (Bag det stille ydre) (2005) 3.524253391545983
Robin Redbreast (1970) 3.524253391545983
