## Импортим необходимое

In [203]:
import random
import itertools
from collections import defaultdict

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

# !pip install surprise
# !pip install lightfm

from surprise import SVD, KNNWithMeans, Dataset, Reader, KNNBasic
from surprise.model_selection import GridSearchCV

from lightfm.data import Dataset as LightFMDataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k

## Подготовка данных

In [2]:
data_path = "./data/"

In [3]:
rating = pd.read_csv(data_path + "rating.csv")
df = rating[rating["userId"] <= 1000]

In [4]:
df = df[["userId", "movieId", "rating"]]

In [5]:
from IPython.display import clear_output

train_data = []
selected_movies = set()

for user_id in tqdm(range(1, 1001)):
    ind = np.random.choice(df[df["userId"] == user_id].index)
    movie_id = int(df.loc[ind, :]["movieId"])
    rating = df.loc[ind, :]["rating"]
    
    selected_movies.add(movie_id)
    train_data.append([user_id, movie_id, rating])
    df.drop(ind)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [6]:
unique_movies = df["movieId"].unique()

for movie_id in tqdm(unique_movies):
    if movie_id not in selected_movies:
        ind = np.random.choice(df[df["movieId"] == movie_id].index)
        user_id = int(df.loc[ind, :]["userId"])
        rating = df.loc[ind, :]["rating"]

        train_data.append([user_id, movie_id, rating])
        df.drop(ind)

  0%|          | 0/9746 [00:00<?, ?it/s]

In [7]:
train_size = int(0.8 * (len(df) + len(train_data)) - len(train_data))

In [8]:
train, test = train_test_split(df, train_size=train_size, shuffle=True, random_state=42)

In [9]:
train = pd.concat((train, pd.DataFrame(train_data, columns=train.columns)))

## Подготовка данных (для моделей модуля `surpise`)

In [10]:
reader = Reader(rating_scale=(0, 5))
trainset = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)

In [11]:
testset = []
for user_id in tqdm(test['userId'].unique()):
    rated_movies = set(train[train['userId'] == user_id]['movieId'])
    for movie_id in set(train['movieId']) - rated_movies:
        testset.append((user_id, movie_id, 0.))

  0%|          | 0/999 [00:00<?, ?it/s]

## Получение топа рекомендаций (для моделей модуля `surpise`)

In [None]:
# функция взята из официального FAQ'a пакета `surprise`

def get_top_k(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = [movie_id for movie_id, _ in user_ratings[:n]]

    return top_n

## Метрика: MAP@K

In [None]:
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(predictions, test, threshold=4., k=10):
    recommendations = get_top_k(predictions, k)
    
    apks = []
    for user_id in tqdm(test['userId'].unique()):
        actual = list(test[(test['userId'] == user_id) & (test['rating'] >= threshold)]['movieId'])
        apks.append(apk(actual, recommendations[user_id], k))

    return np.mean(apks)

# Collaborative filtering

## SVD

In [16]:
params_svd = {
    'n_factors': [i for i in range(50, 251, 50)],
}

grid_search_svd = GridSearchCV(
    algo_class=SVD,
    param_grid=params_svd,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [17]:
grid_search_svd.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   32.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   37.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   42.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   49.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

In [18]:
best_svd = grid_search_svd.best_estimator['rmse']

In [55]:
predictions = best_svd.test(testset)

In [98]:
mapk(predictions, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.02570686048860652

## User-based approach

In [94]:
params_uknn = {
    'k': [i for i in range(10, 51, 10)],
    'sim_options': 
        {
            'name': ['msd', 'cosine', 'pearson', ],
            'user_based': [True, ],
            'min_support': [1, ],
        },
}

grid_search_uknn = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=params_uknn,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [95]:
grid_search_uknn.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.2s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.9s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.7s remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   11.0s remaining:    0.0

[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed:  2.7min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  2.8min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed:  2.8min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed:  2.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed:  2.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed:  3.0min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed:  3.0min

In [97]:
best_uknn = grid_search_uknn.best_estimator['rmse']

In [100]:
predictions_uknn = best_uknn.test(testset)

In [103]:
mapk(predictions_uknn, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.00047110602666158223

## Item-based approach

In [104]:
params_iknn = {
    'k': [i for i in range(10, 51, 10)],
    'sim_options': 
        {
            'name': ['msd', 'cosine', 'pearson', ],
            'user_based': [False, ],
            'min_support': [1, ],
        },
}

grid_search_iknn = GridSearchCV(
    algo_class=KNNWithMeans,
    param_grid=params_iknn,
    measures=['rmse'],
    joblib_verbose=1000,
    refit=True,
)

In [105]:
grid_search_iknn.fit(trainset)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.8s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   23.1s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   34.3s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   45.5s remaining:    0.0s
Computing the msd similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   56.7s remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0

[Parallel(n_jobs=1)]: Done  53 out of  53 | elapsed: 18.3min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 18.7min remaining:    0.0s
Computing the cosine similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  55 out of  55 | elapsed: 19.1min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  56 out of  56 | elapsed: 19.6min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  57 out of  57 | elapsed: 20.9min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  58 out of  58 | elapsed: 21.4min remaining:    0.0s
Computing the pearson similarity matrix...
Done computing similarity matrix.
[Parallel(n_jobs=1)]: Done  59 out of  59 | elapsed: 21.9min

In [106]:
best_iknn = grid_search_iknn.best_estimator['rmse']

In [107]:
predictions_iknn = best_iknn.test(testset)

In [108]:
mapk(predictions_iknn, test, threshold=4., k=10)

  0%|          | 0/999 [00:00<?, ?it/s]

0.0004858032635810414

# Collaborative + Content

## Подготовка данных (для модуля `lightfm`)

In [187]:
movie = pd.read_csv(data_path + 'movie.csv')

In [188]:
selected_movies = train['movieId'].unique()

In [189]:
movie = movie.loc[movie['movieId'].isin(selected_movies)]

In [190]:
movie['genres'] = movie.genres.str.split('|')
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [191]:
movie = movie[['movieId', 'genres']]

In [192]:
possible_genres = set()

for _, genres in movie.values:
    possible_genres.update(genres)

In [193]:
movie_features = list(map(lambda row: (row[0], tuple(*row[1:])), movie.values))

In [194]:
dataset = LightFMDataset()

In [195]:
dataset.fit(
    users=train['userId'].unique(), 
    items=movie_genres,
)

In [196]:
dataset.fit_partial(
    items=(movie_id for movie_id, _ in movie_features),
    item_features=(movie_descr for _, movie_descr in movie_features)
)

In [197]:
train_interactions, train_weights = dataset.build_interactions(train.values)

In [199]:
train_interactions, val_interaction = cross_validation.random_train_test_split(train_interactions, test_percentage=0.2, random_state=42)

In [200]:
movie_features_src = dataset.build_item_features((row for row in movie_features))

In [182]:
test_interactions, test_weights = dataset.build_interactions(test.values)

## Подбор параметров

In [208]:
def sample_hyperparameters():
    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 50),
        }

def random_search(train, test, item_features, num_samples=10):
    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, item_features=item_features, epochs=num_epochs)

        score = precision_at_k(model, test, item_features=item_features).mean()

        hyperparams["num_epochs"] = num_epochs

        yield score, hyperparams, model

In [209]:
score, hyperparams, model = max(random_search(train_interactions, val_interaction, movie_features_src), key=lambda x: x[0])

print("Best score {} at {}".format(score, hyperparams))

Best score 0.07181545346975327 at {'no_components': 38, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.005423487687580947, 'item_alpha': 3.2463490543101133e-09, 'user_alpha': 1.2872466828234375e-08, 'max_sampled': 14, 'num_epochs': 17}


## Final score на test'е

In [212]:
train_interactions, train_weights = dataset.build_interactions(train.values)

In [213]:
num_epochs = hyperparams.pop("num_epochs")

best_lightfm = model

In [214]:
best_lightfm.fit(
    train_interactions,
    item_features=movie_features_src, 
    sample_weight=train_weights,
    epochs=num_epochs,
    verbose=True,
)

Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:51<00:00,  3.06s/it]


<lightfm.lightfm.LightFM at 0x23392d79940>

In [215]:
np.mean(precision_at_k(best_lightfm, test_interactions, item_features=movie_features_src))

0.10220221