In [100]:
import pandas as pd
from surprise import KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [23]:
dataset.iid.unique()

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Flint (2017)', 'Bungo Stray Dogs: Dead Apple (2018)',
       'Andrew Dice Clay: Dice Rules (1991)'], dtype=object)

In [8]:
reader = Reader(rating_scale=(dataset.rating.min(), dataset.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [10]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.15)

In [19]:
algo1 = KNNBaseline(k=40, min_k=20, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo1.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x13621b70>

In [20]:
algo1.predict(uid=40, iid='Addams Family Values (1993)').est

3.3782662723957655

In [101]:
test_pred = algo1.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8647


0.8647243826765741

In [103]:
from surprise import CoClustering
algo2 = CoClustering()
algo2.fit(trainset)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


<surprise.prediction_algorithms.co_clustering.CoClustering at 0x1403fd10>

In [104]:
algo2.predict(uid=40, iid='Addams Family Values (1993)').est

3.2356793320075994

In [105]:
test_pred = algo2.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9560


0.9559757292873943

### На основе приведенных выше моделей создадим гибридную модель. CoClustering имеет высокую ошибку, но высокую скорость действия -> им будем отбирать топ 10 фильмов из тестовой выборки. А более точный, но затратный алгоритм KNNBaseline будем использовать для ранжирования топ 10, полученного на прошлом шаге. 

In [108]:
def complex_mod(algo1, algo2, testset, uid):
    
    user_df = pd.DataFrame(testset)
    user_df.columns = ['uid', 'iid', 'rating']
    user_df = user_df[user_df.uid == uid]
    
    user_df['Сс_rating'] = user_df.apply(lambda x: algo2.predict(uid=uid, iid=x[1]).est, axis=1)
    
    user_df = user_df.sort_values(by = ['Сс_rating'], ascending=False)
    user_df = user_df[0:10]
     
    user_df['KNN_rating'] = user_df.apply(lambda x: algo1.predict(uid=uid, iid=x[1]).est, axis=1)
    
    user_df = user_df.sort_values(by = ['KNN_rating'], ascending=False)
    user_df = user_df[:10][['iid','KNN_rating']]
    
    return user_df

### Лист рекомендаций для пользователя с id = 40

In [109]:
complex_mod(algo1, algo2, testset, 40)

Unnamed: 0,iid,KNN_rating
14997,"Fugitive, The (1993)",4.395822
2654,Toy Story (1995),4.233701
4792,Sense and Sensibility (1995),4.148053
5039,Heavy Metal (1981),4.010368
2521,Apollo 13 (1995),3.937087
3494,"Secret of Roan Inish, The (1994)",3.921037
5649,Dave (1993),3.887212
5852,Don Juan DeMarco (1995),3.833949
5707,Muppet Treasure Island (1996),3.732743
8074,While You Were Sleeping (1995),3.434785
