In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import math

### Класс для преобразования текстовых ярлыков в цифровые

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each selected column is encoded independently with a fresh
    ``LabelEncoder``. No encoder is stored on the instance: ``fit`` does
    nothing and ``transform`` re-fits on whatever data it receives.
    """

    def __init__(self, columns=None):
        # Iterable of column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        When ``self.columns`` is set, each listed column is cast to ``str``
        and encoded; a column that cannot be processed is reported on stdout
        and left unchanged (best effort). When ``self.columns`` is ``None``
        every column is encoded as-is and errors propagate.

        Args:
            X: pandas DataFrame. It is not modified.

        Returns:
            A new DataFrame with the encoded columns.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                try:
                    column_as_text = output[col].astype(str)
                    output[col] = LabelEncoder().fit_transform(column_as_text)
                except Exception as exc:
                    # Report the error itself: formatting ``output[col]`` here
                    # would raise again when the column does not exist.
                    print("Failed to process on {}: {}".format(col, exc))
        else:
            # ``items`` replaces ``iteritems``, which was removed in pandas 2.0.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Shortcut for ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)

### Загрузим файлы

In [None]:
# Load the tag table from the input directory.
tags = pd.read_csv('../input/tags.csv')

In [None]:
# Preview the first rows of the tag table.
tags.head()

In [None]:
# Load the training table from the input directory.
train = pd.read_csv('../input/train.csv')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Summary statistics of the numeric training columns.
train.describe()

In [None]:
# Load the movie table from the input directory.
movies = pd.read_csv('../input/movies.csv')

In [None]:
# Preview the first rows of the movie table.
movies.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Stand-alone encoder instance.
# NOTE(review): `le` is not referenced anywhere else in the visible notebook — confirm it is still needed.
le = LabelEncoder()

In [None]:
# Preview the tag table once more before its labels are encoded.
tags.head()

### Преобразуем ярлыки в цифры

In [None]:
# Copy of `tags` with the `tag` column replaced by integer label codes.
tags_with_num_labels = MultiColumnLabelEncoder(columns = ['tag']).fit_transform(tags)

In [None]:
# Preview the encoded tag table.
tags_with_num_labels.head()

In [None]:
# Copy of `movies` with `title` and `genres` replaced by integer label codes (each column encoded independently).
movies_with_num_labels = MultiColumnLabelEncoder(columns = ['title', 'genres']).fit_transform(movies)

In [None]:
# Preview the encoded movie table.
movies_with_num_labels.head()

### Класс для нормализации рейтинга

In [None]:
class NormalizedRating(object):
    """Per-movie vote statistics plus a weighted ("IMDB formula") score.

    The input frame must provide ``movieId`` and ``rating`` columns.

    Attributes set while fitting:
        n_quantile: quantile fraction supplied by the caller.
        vote_quantile: vote count found at ``n_quantile``; movies with fewer
            votes are dropped, and the same value is the ``m`` term of the
            weighted rating.
        vote_mean: mean of the per-movie average ratings (the ``C`` term).
    """

    def weighted_rating(self, data):
        """Return ``v / (v + m) * R + m / (v + m) * C`` for the given stats.

        Args:
            data: a row (Series) or a whole DataFrame exposing ``vote_count``
                (``v``) and ``vote_average`` (``R``).

        Returns:
            The weighted score: a scalar for a row, a Series for a frame.
        """
        # ``m`` is the minimum number of votes required to be listed, i.e. the
        # vote-count threshold. The quantile *fraction* (0..1) is not a vote
        # count and would make the prior term almost vanish.
        m = self.vote_quantile
        C = self.vote_mean
        v = data['vote_count']
        R = data['vote_average']
        # Calculation based on the IMDB formula
        return (v / (v + m) * R) + (m / (m + v) * C)

    def get_votes(self, group):
        """Return the mean and the count of one group of ratings as a dict."""
        return { "vote_average": group.mean(), "vote_count": group.count() }

    def normalize_data(self, X):
        """Aggregate ratings per movie, drop unpopular movies and score the rest.

        Requires ``self.n_quantile`` to be set. Sets ``self.vote_quantile``
        and ``self.vote_mean``.

        Returns:
            DataFrame with ``movieId``, ``vote_average``, ``vote_count`` and
            ``score`` for the movies whose vote count reaches the threshold.
        """
        rating_stats = (
            X.groupby('movieId')['rating']
            .agg(['mean', 'count'])
            .rename(columns={'mean': 'vote_average', 'count': 'vote_count'})
            .reset_index()
        )

        self.vote_quantile = rating_stats['vote_count'].quantile(self.n_quantile)
        self.vote_mean = rating_stats['vote_average'].mean()

        is_popular = rating_stats['vote_count'] >= self.vote_quantile
        normalized_data = rating_stats.loc[is_popular].copy()
        # ``weighted_rating`` is plain column arithmetic, so it can be applied
        # to the whole frame at once instead of row by row.
        normalized_data['score'] = self.weighted_rating(normalized_data)

        return normalized_data

    # Movies below the vote-count threshold are removed from the result; any
    # remaining NaN values are replaced with 0.
    def fit(self, X, n_quantile=0.90):
        """Compute the per-movie statistics table for ``X``.

        Note that this returns the statistics DataFrame, not ``self``.
        """
        self.n_quantile = n_quantile
        return self.normalize_data(X).fillna(0)

    def fit_transform(self, X, n_quantile=0.90):
        """Return ``X`` merged with the per-movie statistics.

        The merge uses pandas' defaults (inner join on the shared columns), so
        rows of movies that fall below the threshold are not in the result.
        """
        return X.merge(self.fit(X, n_quantile))

#### Нормализуем рейтинг

In [None]:
# Overlay the score histograms obtained with different popularity cut-offs,
# one histogram per quantile, labelled with the quantile value.
quantiles = [ 0.0, 0.1, 0.20, 0.40, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95, 0.99 ]
for quantile in quantiles:
    scored = NormalizedRating().fit_transform(train, n_quantile = quantile)
    ranked_scores = scored.sort_values('score', ascending=False).score
    plt.hist(ranked_scores, cumulative=False, label="{}".format(quantile))

plt.legend(loc='upper right')
plt.grid(True)
plt.show()

In [None]:
# Training rows joined with their per-movie statistics (cut-off at the 0.85 vote-count quantile), highest score first.
norm_rating = NormalizedRating().fit_transform(train, n_quantile = 0.85).sort_values('score', ascending=False)

In [None]:
# Distribution of the weighted score.
plt.hist(norm_rating.score)

In [None]:
# Distribution of the `rating` column over the same rows, for comparison with the score histogram.
plt.hist(norm_rating.rating)

In [None]:
# Rows with the highest weighted score (the frame is sorted by score, descending).
norm_rating.head()

In [None]:
# Summary statistics of the scored rows.
norm_rating.describe()

### Скомбинируем данные в одну таблицу

In [None]:
# Join the training rows with the scored rows (pandas default: inner join on all shared columns).
# NOTE(review): `norm_rating` was itself produced from `train` by fit_transform, which already
# merges the statistics into the input rows, so this second merge looks redundant — confirm.
combined = train.merge(norm_rating)

In [None]:
# Number of non-null values in each column of the combined table.
combined.count()

In [None]:
# Preview the combined table.
combined.head()

### Проанализируем данные

In [None]:
# Draw one titled histogram per column of the combined table.
for column in combined.columns:
    column_values = combined[column]
    plt.title("График для {}".format(column))
    plt.hist(column_values)
    plt.show()

In [None]:
# Distribution of the number of rows per user.
plt.hist(combined.groupby('userId').size())

In [None]:
# Row count per user as a two-column frame: `userId` and an unnamed count column labelled 0.
user_activity = combined.groupby('userId').size().reset_index()

In [None]:
# Rename the count column (label 0) to `n_count`, in place; `index=int` passes every index label through int().
user_activity.rename(index=int, columns={ 0: 'n_count' }, inplace=True)

In [None]:
# Summary statistics of the per-user row counts.
user_activity.describe()

In [None]:
# Users with fewer than 10 rows in the combined table.
low_active_users = user_activity[user_activity.n_count < 10]

In [None]:
# Total non-null counts per column, as a reference for the low-activity row count computed next.
combined.count()

In [None]:
# Number of rows in `combined` that belong to low-activity users.
# Indexing the Series with a list of user ids would look those ids up as *index labels*
# (row labels), not as `userId` values, so membership is tested with `isin` instead.
combined.userId.isin(low_active_users.userId).sum()

### Удалим данные с 0-м рейтингом

In [None]:
# Summary statistics before filtering.
combined.describe()

In [None]:
# Keep only the rows whose weighted score is non-zero.
X = combined[combined.score != 0]

In [None]:
# Summary statistics after filtering.
X.describe()

In [None]:
# Rows where the weighted movie score is lower than the rating given in that row.
X[X.score < X.rating].describe()

In [None]:
# Rows where the weighted movie score is higher than the rating given in that row.
X[X.score > X.rating].describe()

### Обучим модель

In [None]:
#from lightgbm import LGBMRanker, LGBMClassifier

In [None]:
#y = X['rating']
#x = X.drop(columns = ['rating'])

In [None]:
# Нужно разобраться с параметрами. На больших матрицах работает очень медленно
# model = LGBMRanker(n_estimators = 2, boosting_type='dart', max_depth=10)
#model = LGBMRanker(num_trees=2)
#model.fit(x, y.astype(int), group=np.array([x.shape[0]]))

In [None]:
#X['score']=model.predict(X)

### Обработаем полученный рейтинг в SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
from collections import Counter
from collections import defaultdict
from scipy.sparse import csr_matrix
import math

In [None]:
# Score: 0.02410
class TopRecommender(object):
    """Popularity baseline: always recommends the most frequent movie ids."""

    def fit(self, train_data):
        """Rank movie ids by how often they occur in ``train_data['movieId']``.

        Stores ``self.predictions`` as a list of ``(movie_id, frequency)``
        pairs, most frequent first.
        """
        movie_frequencies = Counter()
        movie_frequencies.update(train_data['movieId'])
        self.predictions = movie_frequencies.most_common()

    def predict(self, user_id, n_recommendations=10):
        """Return the ``n_recommendations`` most frequent movie ids.

        ``user_id`` is accepted for interface compatibility and not used.
        """
        top_pairs = self.predictions[:n_recommendations]
        return [pair[0] for pair in top_pairs]

In [None]:
# Score: 0.04682
class SVDRecommender(object):
    """Recommender based on a truncated SVD of the user x movie matrix.

    Users that were not present during ``fit`` receive the recommendations of
    a ``TopRecommender`` fitted on the same data.
    """

    def fit(self, data, n_components = 30):
        """Build the interaction matrix from ``data`` and fit the SVD model.

        Args:
            data: DataFrame with ``userId`` and ``movieId`` columns.
            n_components: number of SVD components.
        """
        self.top_recommender = TopRecommender()
        self.top_recommender.fit(data)

        # Lookup tables that hand out consecutive indices on first access:
        #   userId  -> row number in the matrix
        #   movieId -> column number in the matrix
        self.users = defaultdict(lambda: len(self.users))
        self.movies = defaultdict(lambda: len(self.movies))

        # User -> movie interaction matrix; every observed pair contributes 1.0.
        rows = data.userId.apply(lambda userId: self.users[userId])
        cols = data.movieId.apply(lambda movieId: self.movies[movieId])

        vals = [1.0] * len(cols)
        self.interaction_matrix = csr_matrix((vals, (rows, cols)))

        # Fit the SVD model.
        self.model = TruncatedSVD(n_components = n_components, algorithm='arpack')
        self.model.fit(self.interaction_matrix)

        # Reverse lookup column -> movie id, needed to report predictions.
        self.movies_reverse = {
            movie_idx: movie_id for movie_id, movie_idx in self.movies.items()
        }

    def predict(self, user_id, n_recommendations=10):
        """Return up to ``n_recommendations`` movie ids for ``user_id``."""
        if user_id not in self.users:
            # Unknown user: fall back to popularity, honouring the requested size.
            return self.top_recommender.predict(user_id, n_recommendations=n_recommendations)

        # Project the user's interaction row into the reduced space.
        user_interactions = self.interaction_matrix.getrow(self.users[user_id])
        user_low_dimensions = self.model.transform(user_interactions)
        return self.predict_low_dimension(user_low_dimensions, user_interactions, n_recommendations)

    def predict_low_dimension(self, user_low_dimensions, user_interactions, man_n=10):
        """Turn a reduced-space user vector into a list of movie ids.

        Args:
            user_low_dimensions: reduced representation of one user, as
                returned by ``self.model.transform``.
            user_interactions: the user's row of the interaction matrix;
                columns holding a non-zero value are never recommended.
            man_n: maximum number of recommendations.

        Returns:
            A list of movie ids ordered by decreasing predicted value. The
            list is shorter than ``man_n`` when there are not enough movies
            the user has not interacted with.
        """
        # Approximate reconstruction of the user's row.
        user_predictions = self.model.inverse_transform(user_low_dimensions)[0]
        recommendations = []

        # Walk the columns in order of decreasing predicted value.
        for movie_idx in np.argsort(user_predictions)[::-1]:
            # Recommend a movie only if the user has not interacted with it yet.
            if user_interactions[0, movie_idx] == 0.0:
                recommendations.append(self.movies_reverse[movie_idx])

            if len(recommendations) >= man_n:
                break

        # Always return a list, even when the candidates ran out early.
        return recommendations

In [None]:
# Score: 0.02474
# 3% улучшения по сравнению с простым TopRecommender.
class TopRecommenderNormalized(object):
    """Popularity baseline that ranks movies by their weighted score."""

    def fit(self, train_data, n_quantile = 0.90):
        """Score ``train_data`` with ``NormalizedRating`` and sort it by score.

        The sorted frame (highest score first) is stored in ``self.predictions``.
        """
        scored_rows = NormalizedRating().fit_transform(train_data, n_quantile = n_quantile)
        self.predictions = scored_rows.sort_values('score', ascending=False)

    def predict(self, user_id, n_recommendations=10):
        """Return the first ``n_recommendations`` distinct movie ids by score.

        ``user_id`` is accepted for interface compatibility and not used.
        """
        ranked_movie_ids = self.predictions['movieId'].unique()
        return ranked_movie_ids[:n_recommendations].tolist()

In [None]:
# Score depends on model parameters. See below
class SVDRecommenderWithUserRating(SVDRecommender):
    """SVD recommender whose interaction matrix stores transformed ratings.

    Prediction is inherited from ``SVDRecommender``; unknown users fall back
    to ``TopRecommenderNormalized``.
    """

    def replace_rating_with_score(self, row):
        """Raise ``row.rating`` to ``row.score`` when the score is larger."""
        if row.score > row.rating:
            row.rating = row.score
        return row

    def fit(self, data, model = None, normalized = False, n_quantile = 0.90, rating_power = pow, rating_power_n = 3, rating_shift_n = 0):
        """Build the rating-weighted interaction matrix and fit ``model`` on it.

        Args:
            data: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            model: estimator exposing ``fit``/``transform``/``inverse_transform``.
                It is stored and fitted as-is (not copied). ``None`` creates a
                fresh ``TruncatedSVD(n_components=30, algorithm='arpack')``.
            normalized: when true, ratings are first raised to the movie's
                weighted score wherever that score is higher.
            n_quantile: popularity cut-off passed to the rating normalizers.
            rating_power: function applied to ``rating + rating_shift_n``.
                The builtin ``pow`` additionally receives an exponent (see
                ``rating_power_n``); any other callable gets one argument.
            rating_power_n: exponent used with ``pow``. A falsy value makes
                the unshifted rating itself the exponent.
            rating_shift_n: constant added to each rating before the transform.
        """
        # A default estimator instance in the signature would be created once
        # and shared (and refitted) by every call, so it is built per call.
        if model is None:
            model = TruncatedSVD(n_components = 30, algorithm='arpack')

        self.top_recommender = TopRecommenderNormalized()
        self.top_recommender.fit(data, n_quantile = n_quantile)

        if normalized:
            normalized_data = NormalizedRating().fit_transform(data, n_quantile = n_quantile)
            data = data.merge(normalized_data)
            data = data.apply(self.replace_rating_with_score, axis=1).fillna(0)

        # Lookup tables that hand out consecutive indices on first access:
        #   userId  -> row number in the matrix
        #   movieId -> column number in the matrix
        self.users = defaultdict(lambda: len(self.users))
        self.movies = defaultdict(lambda: len(self.movies))

        # User -> movie interaction matrix coordinates.
        rows = data.userId.apply(lambda userId: self.users[userId])
        cols = data.movieId.apply(lambda movieId: self.movies[movieId])

        # Matrix values: the transformed user rating.
        if rating_power == pow:
            if rating_power_n:
                transform = lambda x: rating_power(x + rating_shift_n, rating_power_n)
            else:
                transform = lambda x: rating_power(x + rating_shift_n, x)
        else:
            # math.exp and any other single-argument callable.
            transform = lambda x: rating_power(x + rating_shift_n)
        vals = data.rating.apply(transform)

        self.interaction_matrix = csr_matrix((vals, (rows, cols)))

        # Fit the decomposition model.
        self.model = model
        self.model.fit(self.interaction_matrix)

        # Reverse lookup column -> movie id, needed to report predictions.
        self.movies_reverse = {
            movie_idx: movie_id for movie_id, movie_idx in self.movies.items()
        }

In [None]:
# Create the rating-aware recommender; it is fitted in a later cell.
recommender = SVDRecommenderWithUserRating()

In [None]:
# Score: 0.05280
# params = {
#    "model": TruncatedSVD(n_components = 50, algorithm='arpack'),
#    "rating_power": math.exp
#}

# Score: 0.04439
# params = {
#    "model": TruncatedSVD(n_components = 100, algorithm='arpack'),
#    "rating_power": math.exp
#}

# Score: 0.04624
# params = {
#    "model": TruncatedSVD(n_components = 75, algorithm='arpack'),
#    "rating_power": math.exp
#}

# Score: 0.05130
# params = {
#    "model": TruncatedSVD(n_components = 55, algorithm='arpack'),
#    "rating_power": math.exp
#}

# Score: 0.05291
# params = {
#     "model": TruncatedSVD(n_components = 45, algorithm='arpack'),
#     "rating_power": math.exp
# }

# Score: 0.05442
# params = {
#     "model": TruncatedSVD(n_components = 30, algorithm='arpack'),
#     "rating_power": math.exp
# }

# Score: 0.05436
# params = {
#     "model": TruncatedSVD(n_components = 24, algorithm='arpack'),
#     "rating_power": math.exp
#}

# Score: 0.05311
# params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": math.exp,
#    "normalized": True,
#    "n_quantile": 0.85
#}

# Score: 0.05446
# params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": math.exp
#}

# Score: 0.05442
# params = {
#    "model": TruncatedSVD(n_components = 26, algorithm='arpack'),
#    "rating_power": lambda x: math.exp(math.exp(x)),
#    "rating_shift_n": -4.5
#}

# Score: 0.05505
# params = {
#    "model": TruncatedSVD(n_components = 26, algorithm='arpack'),
#    "rating_power": lambda x: math.exp(math.exp(x)),
#    "rating_shift_n": -4
#}

# Score: 0.05244
# params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": lambda x: math.pow(math.pow(2, x), x),
#    "rating_shift_n": -4
#}

# Score: 0.05282
#params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": lambda x: math.pow(2, x),
#    "rating_shift_n": -4
#}

# Score: 0.05282
# params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": lambda x: math.pow(2, x),
#    "rating_shift_n": 0
#}

# Score: 0.04705
# params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": lambda x: math.pow(2, math.pow(2, x)),
#    "rating_shift_n": 0
#}

# Previous best score (superseded by the active params at the end of this cell)
# Score: 0.05511
# params = {
#    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
#    "rating_power": lambda x: math.exp(math.exp(x)),
#    "rating_shift_n": -4
#}

# Score: 0.05516
# LS: 0.006899
# Keyword arguments forwarded to SVDRecommenderWithUserRating.fit via **params.
params = {
    # Truncated SVD with 25 components and the ARPACK solver.
    "model": TruncatedSVD(n_components = 25, algorithm='arpack'),
    # Double exponential of the shifted rating; shifted values >= 1 are lowered by 0.1 first.
    "rating_power": lambda x: math.exp(math.exp(x if x < 1 else x - 0.1)),
    # Constant added to every rating before `rating_power` is applied.
    "rating_shift_n": -4
}

In [None]:
# Read the training file again under its own name; this frame is used for fitting and for the later train/validation split.
ratings = pd.read_csv('../input/train.csv')

In [None]:
# Fit on all ratings with the active parameter set.
recommender.fit(ratings, **params)

### Загрузим тестовые файлы

In [None]:
# Read the comma-separated list of test user ids.
with open('../input/test_user_id.list', 'r') as file:
    test_user_id = file.read()
# Strip surrounding whitespace (e.g. a trailing newline) and drop empty entries
# (e.g. after a trailing comma) so that every item is a clean id string.
test_user_id = [raw_id.strip() for raw_id in test_user_id.split(',') if raw_id.strip()]

### Предсказания для существующего пользователя

In [None]:
# Ten recommendations for user 138208 (the section heading treats this id as present in the training data).
recommender.predict(user_id=138208, n_recommendations=10)

#### Пользователь с NDCG@10 = 1.0 

In [None]:
# Ten recommendations for user 49443.
recommender.predict(user_id=49443, n_recommendations=10)

### Предсказания для отсутствующего пользователя

In [None]:
# User id 9999999 is expected to be absent from the fitted data, which exercises the fallback branch of predict().
recommender.predict(user_id=9999999, n_recommendations=10)

### Запишем предсказания

In [None]:
# Write one `userId,movieId` line per recommendation for every test user.
with open('submit.csv', 'w') as f:
    f.write('userId,movieId\n')
    for user_id in test_user_id:
        # Normalise the id once: the raw string may carry whitespace or a
        # trailing newline that must not end up inside the CSV.
        clean_user_id = int(user_id)
        # Treat a missing result as "no recommendations" instead of crashing.
        recommendations = recommender.predict(user_id=clean_user_id, n_recommendations=10) or []
        for rec in recommendations:
            f.write('{},{}\n'.format(clean_user_id, int(rec)))

### Проверим результаты

In [None]:
# Positional 75% / 25% split of the ratings, in file order.
n_train_rows = int(len(ratings) * 0.75)
train, validation = ratings.iloc[:n_train_rows], ratings.iloc[n_train_rows:]

In [None]:
# Re-create the recommender and fit it on the training split only, keeping the remaining rows for evaluation.
# NOTE(review): `params["model"]` is the same estimator object that was passed to the earlier
# full-data fit; fit() stores that object and refits it in place.
recommender = SVDRecommenderWithUserRating()
recommender.fit(train, **params)

In [None]:
def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain (0. for an empty input)
    Raises:
        ValueError: if ``method`` is neither 0 nor 1 (non-empty input only).
    """
    # ``np.asfarray`` was removed in NumPy 2.0; this is the documented equivalent.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.

def ndcg_at_k(r, k, method=0):
    """DCG of ``r`` divided by the DCG of its best possible ordering.

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain (0. when the ideal DCG is zero)
    """
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    return dcg_at_k(r, k, method) / ideal_dcg if ideal_dcg else 0.

In [None]:
# Accumulate NDCG@10 over the validation users; roughly one user in a thousand
# is printed in detail when `verbose` is set.
verbose = True
num_to_print = 10
total_ndcg = 0

for user_id, group in validation.groupby('userId'):
    # Held-out movies of this user, in validation order.
    ground_truth_films = group.movieId.astype(int).tolist()
    # Position of the first occurrence of each movie, for constant-time lookup.
    first_position = {}
    for position, movie_id in enumerate(ground_truth_films):
        first_position.setdefault(movie_id, position)

    # Treat a missing result as "no recommendations" instead of crashing.
    recommendations = recommender.predict(user_id, n_recommendations=20) or []

    # Earlier ground-truth movies get a higher relevance; misses get 0.
    relevance_scores = [
        len(ground_truth_films) - first_position[rec] if rec in first_position else 0
        for rec in recommendations
    ]
    user_ndcg = ndcg_at_k(relevance_scores, k=10)
    total_ndcg += user_ndcg

    if verbose and np.random.random() > 0.999:
        user_films_train = train[train.userId == user_id].movieId.values
        print('Идентификатор пользователя: ', user_id)
        print(
            'Фильмы в обучающей выборке для этого пользователя:',
            [movies[movies.movieId == movie_id].title.values[0] for movie_id in user_films_train[:num_to_print]],
            '\n'
        )
        print(
            'Просмотренные на самом деле фильмы: ',
            [movies[movies.movieId == movie_id].title.values[0] for movie_id in ground_truth_films[:num_to_print]],
            '\n'
        )

        print(
            'Рекомендации топ-рекомендера: ',
            [movies[movies.movieId == rec_id].title.values[0] for rec_id in recommendations],
            '\n'
        )
        print('Значение NDCG@10 = ', user_ndcg, '\n\n')

In [None]:
# Mean NDCG@10 per validation user. `total_ndcg` holds one term per user,
# so it is averaged over the number of users, not the number of rating rows.
total_ndcg / validation.userId.nunique()