In [109]:
import random

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

# !pip install surprise
# !pip install setuptools==58
# !pip install --user ml_metrics

from surprise import SVD, KNNWithMeans, Dataset, Reader, KNNBasic
from surprise.model_selection import GridSearchCV
from collections import defaultdict

## Подготовка данных

In [5]:
data_path = "./data/"

In [6]:
rating = pd.read_csv(data_path + "rating.csv")
df = rating[rating["userId"] <= 1000]

In [7]:
df = df[["userId", "movieId", "rating"]]

In [8]:
from IPython.display import clear_output

train_data = []
selected_movies = set()

for user_id in tqdm(range(1, 1001)):
    ind = np.random.choice(df[df["userId"] == user_id].index)
    movie_id = int(df.loc[ind, :]["movieId"])
    rating = df.loc[ind, :]["rating"]
    
    selected_movies.add(movie_id)
    train_data.append([user_id, movie_id, rating])
    df.drop(ind)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [9]:
unique_movies = df["movieId"].unique()

for movie_id in tqdm(unique_movies):
    if movie_id not in selected_movies:
        ind = np.random.choice(df[df["movieId"] == movie_id].index)
        user_id = int(df.loc[ind, :]["userId"])
        rating = df.loc[ind, :]["rating"]

        train_data.append([user_id, movie_id, rating])
        df.drop(ind)
        

  0%|          | 0/9746 [00:00<?, ?it/s]

In [10]:
train_size = int(0.8 * (len(df) + len(train_data)) - len(train_data))

In [13]:
train, test = train_test_split(df, train_size=train_size, shuffle=True, random_state=42)

In [14]:
train = pd.concat((train, pd.DataFrame(train_data, columns=train.columns)))

In [87]:
reader = Reader(rating_scale=(0, 5))
# data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# raw_ratings = data.raw_ratings                                             
# random.shuffle(raw_ratings)                                                

# threshold = int(.7 * len(raw_ratings))                                     
# trainset_raw_ratings = raw_ratings[:threshold]                             
# test_raw_ratings = raw_ratings[threshold:]                                 

# data.raw_ratings = trainset_raw_ratings
# trainset = data.build_full_trainset()
# testset = data.construct_testset(test_raw_ratings)
trainset = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
# testset = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)

In [88]:
testset = []
for user_id in tqdm(test['userId'].unique()):
    rated_movies = set(train[train['userId'] == user_id]['movieId'])
    for movie_id in set(train['movieId']) - rated_movies:
        testset.append((user_id, movie_id, 0.))

  0%|          | 0/999 [00:00<?, ?it/s]

## Получение топа рекомендаций (для моделей модуля `surpise`)

In [None]:
# функция взята из официального FAQ'a пакета `surprise`

def get_top_k(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = [movie_id for movie_id, _ in user_ratings[:n]]

    return top_n

## Метрика: MAP@K

In [None]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(predictions, test, threshold=4., k=10):
    recommendations = get_top_k(predictions, k)
    
    apks = []
    for user_id in tqdm(test['userId'].unique()):
        actual = list(test[(test['userId'] == user_id) & (test['rating'] >= threshold)]['movieId'])
        apks.append(apk(actual, recommendations[user_id], k))

    return np.mean(apks)

## SVD

In [16]:
params_svd = {
    'n_factors': [i for i in range(50, 251, 50)],
#     'lr_all': [10**i for i in range(-3, 1)],
#     'reg_all': [10**i for i in range(-3, 1)],
}

grid_search_svd = GridSearchCV(
    algo_class=SVD,
    param_grid=params_svd,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [17]:
grid_search_svd.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   32.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   37.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   42.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   49.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

In [18]:
best_svd = grid_search_svd.best_estimator['rmse']

In [55]:
predictions = best_svd.test(testset)

In [98]:
mapk(predictions, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.02570686048860652

## User-based approach

In [94]:
params_uknn = {
    'k': [i for i in range(10, 51, 10)],
    'sim_options': 
        {
            'name': ['msd', 'cosine', 'pearson', ],
            'user_based': [True, ],
            'min_support': [1, ],
        },
}

grid_search_uknn = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=params_uknn,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [95]:
grid_search_uknn.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.2s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.9s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.7s remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   11.0s remaining:    0.0

[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:  2.7min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  2.8min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:  2.8min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:  2.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:  2.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed:  3.0min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed:  3.0min

In [97]:
best_uknn = grid_search_uknn.best_estimator['rmse']

In [100]:
predictions_uknn = best_uknn.test(testset)

In [103]:
mapk(predictions_uknn, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.00047110602666158223

## Item-based approach

In [104]:
params_iknn = {
    'k': [i for i in range(10, 51, 10)],
    'sim_options': 
        {
            'name': ['msd', 'cosine', 'pearson', ],
            'user_based': [False, ],
            'min_support': [1, ],
        },
}

grid_search_iknn = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=params_iknn,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [105]:
grid_search_iknn.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.8s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   23.1s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   34.3s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   45.5s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   56.7s remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0

[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed: 18.3min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 18.7min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed: 19.1min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed: 19.6min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed: 20.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed: 21.4min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed: 21.9min

In [106]:
best_iknn = grid_search_iknn.best_estimator['rmse']

In [107]:
predictions_iknn = best_iknn.test(testset)

In [108]:
mapk(predictions_iknn, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.0004858032635810414

## user

In [110]:
params_ubknn = {
    'k': [i for i in range(10, 61, 10)],
    'sim_options': 
        {
            'name': ['msd', 'cosine', 'pearson', ],
            'user_based': [True, ],
            'min_support': [1, ],
        },
}

grid_search_ubknn = GridSearchCV(
    algo_class=KNNBasic,
    param_grid=params_ubknn,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [111]:
grid_search_ubknn.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.1s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.8s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.5s remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   10.9s remaining:    0.0

[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:  3.7min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  3.7min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:  3.8min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:  3.8min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:  3.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed:  3.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed:  4.0min

In [113]:
best_ubknn = grid_search_ubknn.best_estimator['rmse']

In [114]:
predictions_ubknn = best_ubknn.test(testset)

In [115]:
mapk(predictions_ubknn, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.0

## item

In [116]:
params_ibknn = {
    'k': [i for i in range(10, 51, 10)],
    'sim_options': 
        {
            'name': ['msd', 'cosine', 'pearson', ],
            'user_based': [False, ],
            'min_support': [1, ],
        },
}

grid_search_ibknn = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=params_ibknn,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [117]:
grid_search_ibknn.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.6s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   22.4s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   33.3s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   44.1s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   54.8s remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0

MemoryError: Unable to allocate 694. MiB for an array with shape (9536, 9536) and data type float64

In [None]:
best_ibknn = grid_search_ibknn.best_estimator['rmse']

In [None]:
predictions_ibknn = best_ibknn.test(testset)

In [None]:
mapk(predictions_ibknn, test, threshold=4., k=10)