In [1]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [2]:
links = pd.read_csv('../1. Вводное занятие/links.csv')
movies = pd.read_csv('../1. Вводное занятие/movies.csv')
ratings = pd.read_csv('../1. Вводное занятие/ratings.csv')
tags = pd.read_csv('../1. Вводное занятие/tags.csv')

In [3]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [4]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [5]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [6]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [7]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [8]:
# %%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fc68f6b79d0>

In [9]:
test_pred = algo.test(testset)

In [10]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8682


0.8681508612296789

In [11]:
algo.predict(uid=2.0, iid='Mortal Kombat (1995)').est

2.5135802488100993

In [12]:
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [13]:
pd.DataFrame({'title': titles, 'score': scores}).sort_values('score', ascending=False).head(10)

Unnamed: 0,title,score
600,Dr. Strangelove or: How I Learned to Stop Worr...,4.460484
684,Rear Window (1954),4.396334
838,"Streetcar Named Desire, A (1951)",4.353039
948,Chinatown (1974),4.340268
2221,Fight Club (1999),4.32133
692,Casablanca (1942),4.298132
3629,"Lord of the Rings: The Fellowship of the Ring,...",4.29627
926,Raging Bull (1980),4.28938
2457,"Boondock Saints, The (2000)",4.285162
1490,Seven Samurai (Shichinin no samurai) (1954),4.283646


In [14]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [15]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [16]:
movie_genres[0]

'Adventure Animation Children Comedy Fantasy'

In [17]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [18]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [19]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]),
 array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
         6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]]))

In [20]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
2608,3489,Hook (1991),Adventure|Comedy|Fantasy
7865,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
8361,109042,Knights of Badassdom (2013),Adventure|Comedy|Fantasy
3302,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy


In [21]:
movies_with_ratings.sort_values('timestamp', inplace=True)

In [22]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [23]:
movies_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
15993,590,Dances with Wolves (1990),Adventure|Drama|Western,429.0,5.0,8.281246e+08
5936,222,Circle of Friends (1995),Drama|Romance,429.0,4.0,8.281246e+08
12093,434,Cliffhanger (1993),Action|Adventure|Thriller,429.0,4.0,8.281246e+08
16167,592,Batman (1989),Action|Crime|Thriller,429.0,5.0,8.281246e+08
6119,225,Disclosure (1994),Drama|Thriller,429.0,4.0,8.281246e+08
...,...,...,...,...,...,...
100797,187031,Jurassic World: Fallen Kingdom (2018),Action|Adventure|Drama|Sci-Fi|Thriller,514.0,2.5,1.537675e+09
100818,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,514.0,3.0,1.537675e+09
66129,5247,Smokey and the Bandit (1977),Action|Comedy,514.0,2.5,1.537757e+09
66121,5246,Smokey and the Bandit II (1980),Action|Comedy,514.0,1.5,1.537757e+09


In [24]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [31]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = title_genres[last_user_movie]
    
    movie_genres = change_string(movie_genres)

    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [32]:
recommend_for_user(10.0)

King and I, The (1956) 3.6115495695269297
West Side Story (1961) 3.4966500856069684
Once (2006) 3.4956126617996826
Burlesque (2010) 3.39616047645723
42nd Street (1933) 3.344641646943348
Rent (2005) 3.3368812532916055
Jazz Singer, The (1927) 3.2769778717725906
Camelot (1967) 3.2759052877185586
New York, New York (1977) 3.2607756519057185
Phantom of the Opera, The (2004) 3.2390659269908078


In [27]:
np.argsort([1,9,5,7])

array([0, 2, 3, 1])