In [None]:
import pandas as pd
import numpy as np
import sklearn
import datetime
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, Dropout, Dense, BatchNormalization, concatenate, dot
from keras.optimizers import Adam
from keras.utils import plot_model, model_to_dot
from keras.constraints import non_neg
from IPython.display import SVG
from sklearn.metrics import mean_squared_error
import os

In [None]:
# Seed every source of randomness used below: NumPy drives the user sampling,
# TensorFlow drives weight initialisation, dropout and batch shuffling in Keras.
np.random.seed(123)
tf.random.set_seed(123)

In [None]:
import warnings
# Suppress all warnings from this point on (no category or module filter is given),
# so deprecation and pandas copy warnings will not be shown either.
warnings.filterwarnings('ignore')

In [None]:
# timestamp conversion helper
def dateparse(time_in_secs):
    """Turn a Unix timestamp (number or numeric string) into a local ``datetime``."""
    seconds = float(time_in_secs)
    return datetime.datetime.fromtimestamp(seconds)

In [None]:
# Load the ratings, movie metadata and tag tables.
rating_dt = pd.read_csv('ratings.csv')
movie_dt = pd.read_csv('movies.csv')
tag_dt = pd.read_csv('tags.csv')
# NOTE(review): assumes `timestamp` holds Unix epoch seconds (as the `dateparse`
# helper's `time_in_secs` argument suggests) - confirm against the file.
# `parse_dates=['timestamp']` does not convert such integers, so convert explicitly.
tag_dt['timestamp'] = pd.to_datetime(tag_dt['timestamp'], unit='s')

### Формируем данные в правильном виде и делаем train-test

In [None]:
# distinct user ids found in the ratings table, as a plain Python list
user_ids = pd.unique(rating_dt["userId"]).tolist()
num_all_user = len(user_ids)

In [None]:
# Randomly pick 10% of all users (without replacement) and keep only their ratings.
# The population is taken from rating_dt directly rather than from `user_ids`,
# because `user_ids` is overwritten below and re-running this cell would otherwise
# sample from the previous sample.
all_user_ids = rating_dt['userId'].unique()
rand_userid = np.random.choice(all_user_ids, size=int(len(all_user_ids) * 0.1), replace=False)
# .copy() gives an independent frame, so the encoded columns added below are not
# written onto a slice of rating_dt.
sample_df = rating_dt.loc[rating_dt['userId'].isin(rand_userid)].copy()

# Encode userId as contiguous 0..num_users-1 indices for the embedding layers.
user_ids = sample_df['userId'].unique()
num_users = len(user_ids)
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
user_encoded2user = {i: x for i, x in enumerate(user_ids)}
sample_df['user_encoded'] = sample_df['userId'].map(user2user_encoded)

# Encode movieId the same way (0..num_movies-1).
movie_ids = sample_df['movieId'].unique()
num_movies = len(movie_ids)
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
sample_df['movie_encoded'] = sample_df['movieId'].map(movie2movie_encoded)


In [None]:
# hold out 20% of the sampled ratings with a fixed random_state
train, test = train_test_split(sample_df, test_size=0.2, random_state=123)

# distinct encoded users / movies that appear in the training split
num_train_user = np.unique(train['user_encoded']).size
num_train_movie = np.unique(train['movie_encoded']).size

# report how many ids from the sample are absent from the training split
report_parts = [
    f'total movie: {num_movies}',
    f'\nmovie in train: {num_train_movie}',
    f'\nmovie not in train: {num_movies - num_train_movie} ({1 - num_train_movie / num_movies :.2f})',
    f'\n\ntotal user: {num_users}',
    f'\nuser in train: {num_train_user}',
    f'\nuser not in train: {num_users - num_train_user} ({1 - num_train_user / num_users :.2f})',
]
print(' '.join(report_parts))

### Модель

In [None]:
def NCF_model(embed_size = 10, drop_out_prob = 0.2):
    """
    Build and compile the two-branch recommendation model.

    The network takes an encoded user index and an encoded movie index and has
    two branches, each with its own embeddings:

    * MLP branch: user and movie embeddings are concatenated and passed through
      Dense(100) and Dense(50) blocks (each followed by batch normalisation and
      dropout), then a Dense(10) layer.
    * MF branch: the dot product of the user and movie embeddings.

    Both branch outputs are concatenated and fed to a single-unit Dense layer.

    The embedding table sizes come from the module-level ``num_movies`` and
    ``num_users``, which must be defined before this function is called.

    Args:
        embed_size: dimension of every embedding vector.
        drop_out_prob: dropout rate used by all Dropout layers.

    Returns:
        A Keras ``Model`` with inputs ``[user_input, movie_input]``, compiled
        with Adam (learning rate 0.01) and mean squared error loss.
    """
    # Inputs: one integer index per sample for each of movie and user.
    movie_input = Input(shape=[1],name='movie-input')
    user_input = Input(shape=[1], name='user-input')

    # MLP-branch embeddings for movies and users.
    movie_embedding_mlp = Embedding(num_movies, embed_size,
                                    name='movie-embedding-mlp')(movie_input)
    movie_vec_mlp = Flatten(name='flatten-movie-mlp')(movie_embedding_mlp)

    user_embedding_mlp = Embedding(num_users, embed_size,
                                   name='user-embedding-mlp')(user_input)
    user_vec_mlp = Flatten(name='flatten-user-mlp')(user_embedding_mlp)

    # MF-branch embeddings for movies and users (separate from the MLP ones).
    movie_embedding_mf = Embedding(num_movies, embed_size,
                                   name='movie-embedding-mf')(movie_input)
    movie_vec_mf = Flatten(name='flatten-movie-mf')(movie_embedding_mf)

    user_embedding_mf = Embedding(num_users, embed_size,
                                  name='user-embedding-mf')(user_input)
    user_vec_mf = Flatten(name='flatten-user-mf')(user_embedding_mf)

    # MLP tower over the concatenated embeddings.
    concat = concatenate([movie_vec_mlp, user_vec_mlp], axis=-1, name='concat')
    concat_dropout = Dropout(drop_out_prob)(concat)

    fc_1 = Dense(100, name='fc-1', activation='relu')(concat_dropout)
    fc_1_bn = BatchNormalization(name='batch-norm-1')(fc_1)
    fc_1_dropout = Dropout(drop_out_prob)(fc_1_bn)

    fc_2 = Dense(50, name='fc-2', activation='relu')(fc_1_dropout)
    fc_2_bn = BatchNormalization(name='batch-norm-2')(fc_2)
    fc_2_dropout = Dropout(drop_out_prob)(fc_2_bn)

    # Per-branch outputs.
    pred_mlp = Dense(10, name='pred-mlp', activation='relu')(fc_2_dropout)
    pred_mf = dot([movie_vec_mf, user_vec_mf], axes=1, normalize=False)

    # Layer name 'pred_mf' is kept as-is so previously saved models still match.
    combine_mlp_mf = concatenate([pred_mf, pred_mlp], axis=-1, name='pred_mf')

    # Final single-unit output.
    result = Dense(1, name='result', activation='relu')(combine_mlp_mf)

    model = Model([user_input, movie_input], result)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

    return model

#### Обзор модели

In [None]:
model = NCF_model()

# draw the layer graph and show it inline as SVG
model_graph = model_to_dot(model, dpi=50, show_shapes=True)
SVG(model_graph.create(prog='dot', format='svg'))

In [None]:
# Print Keras' textual summary of the model built in the previous cell.
model.summary()

In [None]:
# train the NCF model with 20-dimensional embeddings for 5 epochs
model = NCF_model(embed_size=20)
train_inputs = [train['user_encoded'], train['movie_encoded']]
history = model.fit(train_inputs, train['rating'], epochs=5)

# training loss per epoch on a log-scaled y axis
loss_per_epoch = pd.Series(history.history['loss'])
loss_per_epoch.plot(logy=True)
plt.xlabel("Epoch")
plt.ylabel("Train Error")
plt.show()

In [None]:
# Save the trained model to 'dl_model.h5' so it can be reused without retraining.
model.save('dl_model.h5')

# To restore it later:
# from keras.models import load_model
# model = load_model('dl_model.h5')

In [None]:
# сохраним для единого теста
# test.to_csv('test_movies.csv')

In [None]:
# RMSE on the held-out split.
# Predictions are rounded to 2 decimals; y_hat is reused by the next cell for display.
y_hat = np.round(model.predict([test['user_encoded'], test['movie_encoded']]), decimals=2)
y_true = test['rating']
# mean_squared_error returns the MSE; take the square root so the printed value
# is the root-mean-square error this cell is labelled as.
print(f'testing rms: {np.sqrt(mean_squared_error(y_true, y_hat))}')

In [None]:
# Predicted vs. true ratings side by side.
# y_hat comes from model.predict; np.ravel flattens it so each `prediction` cell
# holds a scalar rather than a one-element list.
d = {'prediction': np.ravel(y_hat).tolist(), 'true_value': y_true.values.tolist()}
test_pred = pd.DataFrame(d)
test_pred.head(5)

In [None]:
# сделаем расчет Топ - N
def NCF_recommendation(rec_model, client_id, top_k = 10):
    """Return the ``top_k`` highest-scored movies the user has not rated yet.

    Relies on the notebook-level objects ``user2user_encoded``,
    ``movie2movie_encoded``, ``movie_encoded2movie``, ``movie_ids``,
    ``sample_df`` and ``movie_dt``.

    Args:
        rec_model: object exposing ``predict([user_indices, movie_indices])``
            that returns one score per (user, movie) pair.
        client_id: raw ``userId``; must be a key of ``user2user_encoded``.
        top_k: maximum number of recommendations to return.

    Returns:
        DataFrame indexed by ``movieId`` with a ``prediction`` column followed
        by the columns of ``movie_dt``, ordered from highest to lowest score.

    Raises:
        KeyError: if ``client_id`` is not present in ``user2user_encoded``.
    """
    client_encoded = user2user_encoded[client_id]
    movie_watched = sample_df.loc[sample_df['userId'] == client_id, 'movieId'].values

    # One vectorised membership test instead of calling np.isin once per movie.
    all_movie_ids = np.asarray(movie_ids)
    unseen_movie_ids = all_movie_ids[~np.isin(all_movie_ids, movie_watched)]
    movie_poll_encoded = np.array(
        [movie2movie_encoded[item] for item in unseen_movie_ids], dtype='int64'
    )

    if len(movie_poll_encoded) == 0:
        # The user has rated every movie in the sample: nothing to score.
        ratings = np.empty(0, dtype='float32')
    else:
        user_column = np.full(len(movie_poll_encoded), client_encoded, dtype='int64')
        ratings = np.asarray(rec_model.predict([user_column, movie_poll_encoded])).ravel()

    # Best scores first; max() keeps a non-positive top_k from selecting every movie.
    top_ratings_idx = ratings.argsort()[::-1][:max(top_k, 0)]
    top_ratings = ratings[top_ratings_idx]
    recommend_movieId = [movie_encoded2movie.get(movie_poll_encoded[x]) for x in top_ratings_idx]

    # Attach movie metadata; the left join keeps the score ordering.
    top_movie_rec = pd.DataFrame({'movieId': recommend_movieId, 'prediction': top_ratings}).set_index('movieId')
    top_movie_rec = top_movie_rec.join(movie_dt.set_index('movieId'))

    return top_movie_rec

In [None]:
# Pick one random user from the sample and show their top-15 recommendations.
# Drawing a scalar (no size argument) avoids calling int() on a 1-element array,
# which recent NumPy versions deprecate.
client_id = int(np.random.choice(user_ids))
print(f'recommendation for client: {client_id}')
NCF_recommendation(model, client_id, top_k=15)