In [None]:
import pandas as pd
import numpy as np
import sklearn
import datetime
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from annoy import AnnoyIndex
import pickle

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model

In [None]:
# Fix the global NumPy RNG state so the random user sample below is reproducible.
SEED = 123
np.random.seed(SEED)

In [None]:
# Input files: user ratings and the movie catalogue.
RATINGS_CSV = 'ratings.csv'
MOVIES_CSV = 'movies.csv'

rating_dt = pd.read_csv(RATINGS_CSV)
movie_dt = pd.read_csv(MOVIES_CSV)

### Опишем гибридную модель

In [None]:
class EnsembleRecommender():

    """
    Hybrid ensemble that combines an NCF deep-learning model with an MF-ANN model.

    Both underlying models are expected to be pre-trained: the NCF network is
    loaded from ``NCF_MODEL_PATH`` and the Annoy index from ``ANN_INDEX_PATH``.
    """

    # Artifact locations read by this class (the Annoy index is also written by ``ann``).
    NCF_MODEL_PATH = 'dl_model.h5'
    ANN_INDEX_PATH = 'rating.ann'

    def __init__(self,rating_df,movie_df, rating_matrix, item_vector):
        """
        Store the data sets and build the id <-> encoded-index lookup tables.

        Parameters
        ----------
        rating_df : DataFrame with at least 'userId', 'movieId' and 'rating' columns.
        movie_df : DataFrame with at least 'movieId', 'title' and 'genres' columns.
        rating_matrix : stored as-is; not used by the methods of this class.
        item_vector : 2-D array-like of item embeddings, one row per movie.
        """
        self.rating_df = rating_df
        self.movie_df = movie_df
        self.user_ids = rating_df['userId'].unique()
        self.movie_ids = rating_df['movieId'].unique()
        self.user2user_encoded = {x: i for i, x in enumerate(self.user_ids)}
        self.movie2movie_encoded = {x: i for i, x in enumerate(self.movie_ids)}
        self.movie_encoded2movie = {i: x for i, x in enumerate(self.movie_ids)}
        self.rating_matrix = rating_matrix
        self.item_vector = item_vector


    def _load_ncf_model(self):
        # Load the Keras model once and reuse it on later calls
        # (the original reloaded the file on every recommendation request).
        model = getattr(self, '_ncf_model', None)
        if model is None:
            model = tf.keras.models.load_model(self.NCF_MODEL_PATH)
            self._ncf_model = model
        return model


    def NCF_recommendation(self,userId,top_k=10):
        """
        Recommend the ``top_k`` not-yet-rated movies with the highest NCF-predicted rating.

        Returns a DataFrame indexed by 'movieId' with 'title' and 'genres'
        columns, best prediction first.

        Raises
        ------
        KeyError : if ``userId`` does not occur in ``rating_df``.
        """
        model = self._load_ncf_model()

        # Encoded index of the requested user.
        client_encoded = self.user2user_encoded[userId]

        # Movies the user has already rated; a set gives O(1) membership tests.
        seen = self.rating_df['userId'] == userId
        movie_watched = set(self.rating_df.loc[seen, 'movieId'])

        # Candidate pool: encoded indices of every movie the user has not rated.
        movie_poll_encoded = [self.movie2movie_encoded[item]
                              for item in self.movie_ids
                              if item not in movie_watched]

        movies = self.movie_df.set_index('movieId')
        if not movie_poll_encoded or top_k <= 0:
            # Nothing to score: return an empty frame with the usual columns.
            return movies.iloc[0:0][['title', 'genres']]

        # Model inputs: the user index repeated for every candidate movie.
        user_input = np.full(len(movie_poll_encoded), client_encoded)
        movie_input = np.asarray(movie_poll_encoded)

        # Predicted rating for every candidate.
        ratings = np.asarray(model.predict([user_input, movie_input])).flatten()

        # Positions of the top_k highest predictions, best first.
        top_ratings_idx = ratings.argsort()[-top_k:][::-1]
        recommend_movieId = [self.movie_encoded2movie.get(movie_poll_encoded[x])
                             for x in top_ratings_idx]

        # Attach title/genres to the selected movies.
        top_movie_rec = pd.DataFrame({'movieId': recommend_movieId,
                                      'prediction': ratings[top_ratings_idx]}).set_index('movieId')
        top_movie_rec = top_movie_rec.join(movies)

        return top_movie_rec[['title','genres']]


    #
    # ANN model below
    #
    def get_rated_movies(self,userId,threshold=2):
        """
        Return ``(movie_ids, ratings)`` arrays for the movies that ``userId``
        rated with at least ``threshold``.
        """
        all_rates = self.rating_df[self.rating_df['userId'] == userId]
        liked = all_rates[all_rates['rating'] >= threshold]
        return liked['movieId'].values, liked['rating'].values


    def ann(self, metric, num_trees):
        """
        Build an Annoy (approximate nearest neighbours) index over the item
        vectors and save it to ``ANN_INDEX_PATH``.

        Each movie id from ``rating_df`` is used as the Annoy item id.
        """
        # NOTE(review): row i of item_vector is paired with movie_ids[i]
        # (order of first appearance in rating_df) - confirm that this matches
        # the column order the factorization model was trained on.
        f = len(self.item_vector[0])
        t = AnnoyIndex(f, metric)
        for i, movie_id in enumerate(self.movie_ids):
            # Fixed: the original read the non-existent attribute `item_vectors`.
            t.add_item(int(movie_id), self.item_vector[i])
        t.build(num_trees)
        t.save(self.ANN_INDEX_PATH)


    def ANN_recommendation(self,userId, dimension = 14, metric = 'angular',
                           num_tree=20, threshold=2, top_n=10):
        """
        Recommend movies that are nearest neighbours (in the saved Annoy index)
        of the movies ``userId`` rated at or above ``threshold``.

        ``dimension`` and ``num_tree`` are accepted for interface compatibility
        but are not used. Returns the matching rows of ``movie_df`` indexed by
        'movieId' (at most 20 distinct movies when more than 20 neighbours
        were collected).
        """
        f = len(self.item_vector[0])
        u = AnnoyIndex(f, metric)
        u.load(self.ANN_INDEX_PATH)

        high_rate_movie, rate = self.get_rated_movies(userId,threshold=threshold)
        movielist = []
        distancelist = []

        # Kept from the original heuristic: at least two liked movies are required.
        if len(high_rate_movie) > 1:
            for movieid, movie_rate in zip(high_rate_movie, rate):
                movie, dist = u.get_nns_by_item(int(movieid), top_n, include_distances=True)
                # The first neighbour is the query movie itself - skip it.
                movielist.extend(movie[1:])

                # Weight the distance by the user's rating: higher-rated
                # source movies make their neighbours rank closer.
                weighted_dist = (np.array(dist[1:]) / movie_rate).tolist()
                distancelist.extend(weighted_dist)

            # Keep only the 20 closest neighbours when more were collected.
            if len(movielist) > 20:
                sorted_recommend = np.array(movielist)[np.array(distancelist).argsort()]
                movielist = sorted_recommend[:20]

        top_movie_rec = self.movie_df.loc[self.movie_df['movieId'].isin(movielist)].set_index('movieId')

        return top_movie_rec


    def Popular_recommendation(self, top_k = 20, min_count = 1000, min_avg_rating = 4.0):
        """
        Non-personalised recommendation: the ``top_k`` most-rated movies whose
        rating count exceeds ``min_count`` and whose average rating exceeds
        ``min_avg_rating``.

        Returns rows of ``movie_df`` indexed by 'movieId', most popular first.
        """
        grouped = self.rating_df.groupby('movieId')['rating']
        df_grouped = pd.DataFrame({'count': grouped.size(),
                                   'avg_rating': grouped.mean()})

        # Fixed: the original discarded the result of sort_values, so the
        # slice below did not pick the most popular movies.
        df_grouped = df_grouped.sort_values(by=['count','avg_rating'], ascending=False)

        qualifies = (df_grouped['count'] > min_count) & (df_grouped['avg_rating'] > min_avg_rating)
        top_k_rec = df_grouped[qualifies].head(top_k)

        # Preserve the popularity order; skip ids missing from movie_df.
        movies = self.movie_df.set_index('movieId')
        ordered_ids = [movie_id for movie_id in top_k_rec.index.tolist()
                       if movie_id in movies.index]
        top_movie_rec = movies.loc[ordered_ids]

        return top_movie_rec

    def User_Classification(self,userId):
        """
        Classify a user by the number of distinct movies they rated.

        Returns '0' for an unknown user, '1-50', '51-150', or '151' (more than
        150 rated movies).
        """
        if userId not in self.user_ids:
            return '0'

        num_of_rated_movies = self.rating_df.loc[self.rating_df.userId == userId, 'movieId'].nunique()
        # Fixed: inclusive bounds. The original strict comparisons sent users
        # with exactly 1, 50, 51 or 150 ratings to the '151' class.
        if num_of_rated_movies <= 50:
            return '1-50'
        if num_of_rated_movies <= 150:
            return '51-150'
        return '151'


    def Recommend(self, userId):
        """
        Pick a recommendation strategy based on how many movies the user rated.

        - unknown user: popular movies;
        - 1-50 ratings: NCF only;
        - 51-150 ratings: 15 NCF + up to 5 ANN recommendations;
        - more than 150 ratings: 10 NCF + up to 10 ANN recommendations.
        """
        classification = self.User_Classification(userId)

        if classification == '0':
            return self.Popular_recommendation()
        if classification == '1-50':
            return self.NCF_recommendation(userId)

        if classification == '51-150':
            n_ncf, n_ann = 15, 5
        else:
            n_ncf, n_ann = 10, 10

        # Request exactly n_ncf rows: the original sliced [:15] from a result
        # that only ever held the default 10 rows.
        ncf_part = self.NCF_recommendation(userId, top_k=n_ncf)[:n_ncf]
        ann_part = self.ANN_recommendation(userId)[:n_ann]

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        return pd.concat([ncf_part, ann_part])


In [None]:
# Draw a random 10% of the users (without replacement) and keep only their ratings.
user_ids = rating_dt["userId"].unique().tolist()
num_all_user = len(user_ids)
sample_size = int(num_all_user * 0.1)
rand_userid = np.random.choice(user_ids, size=sample_size, replace=False)
in_sample = rating_dt['userId'].isin(rand_userid)
sample_df = rating_dt.loc[in_sample]

In [None]:
# Load the pickled factorization model and take its transposed components
# as the item vectors.
# SECURITY: pickle.load can execute arbitrary code - only load files from a trusted source.
filename = 'nmf_model.sav'
with open(filename, 'rb') as model_file:  # close the handle deterministically
    loaded_model = pickle.load(model_file)
item_vector = loaded_model.components_.T

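Замечание: pickle-файлы моделей следует загружать только из доверенных источников; об ограничениях безопасности и сопровождения такого способа сохранения моделей см. https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations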


In [None]:
# Build the user x movie rating matrix in the required shape
def movie_use_matrix_pivot(df_):
    """
    Pivot a long ratings table into a user-by-movie matrix.

    Rows are 'userId', columns are 'movieId', cells hold 'rating';
    missing user/movie combinations are filled with 0.

    Returns a tuple ``(dense, sparse)``: the pivoted DataFrame and a
    ``csr_matrix`` built from its values.
    """
    dense = df_.pivot(index='userId', columns='movieId', values='rating')
    dense = dense.fillna(0)
    # Compressed sparse-row copy of the same values.
    sparse = csr_matrix(dense.values)
    return dense, sparse

# Dense and sparse rating matrices for the sampled users.
rating_matrix, rating_matrix_cp = movie_use_matrix_pivot(df_=sample_df)

In [None]:
# Assemble the hybrid recommender from the sampled ratings, the movie
# catalogue, the rating matrix and the item vectors.
Ensemble = EnsembleRecommender(
    rating_df=sample_df,
    movie_df=movie_dt,
    rating_matrix=rating_matrix,
    item_vector=item_vector,
)

In [None]:
# Show the recommendations for the user with userId 14
Ensemble.Recommend(14)



Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5445,Minority Report (2002),Action|Crime|Mystery|Sci-Fi|Thriller
122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi
3911,Best in Show (2000),Comedy
508,Philadelphia (1993),Drama
527,Schindler's List (1993),Drama|War
136598,Vacation (2015),Adventure|Comedy
34072,"March of the Penguins (Marche de l'empereur, L...",Documentary
4975,Vanilla Sky (2001),Mystery|Romance|Sci-Fi|Thriller
4898,Novocaine (2001),Comedy|Crime|Mystery|Thriller
1104,"Streetcar Named Desire, A (1951)",Drama
