In [63]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

%matplotlib inline

# Use the ggplot style and a wide default figure size for every plot in the notebook.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

In [64]:
# Load the user-movie ratings table from a tab-separated file.
filepath = './data/user_ratedmovies.dat'
df_rates = pd.read_csv(filepath, sep='\t')

In [65]:
# Load the movie metadata table; the file is tab-separated and decoded as ISO-8859-1.
# NOTE: `filepath` is rebound here, so it no longer points at the ratings file.
filepath = './data/movies.dat'
df_movies = pd.read_csv(filepath, sep='\t', encoding='iso-8859-1')

# Перекодируем ID фильмов и пользователей

In [66]:
df_rates.head()

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30


In [67]:
df_movies.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


In [68]:
from sklearn.preprocessing import LabelEncoder

In [69]:
df_rates.userID.min(), df_rates.userID.max()

(75, 71534)

In [70]:
df_rates.userID.nunique()

2113

In [71]:
# One encoder per ID space: user IDs and movie IDs are mapped independently.
enc_user, enc_mov = LabelEncoder(), LabelEncoder()

In [72]:
# Learn the raw-ID -> integer-label mapping from the IDs that occur in the ratings.
# The result is assigned back so the cell does not echo the encoder's repr.
enc_user = enc_user.fit(df_rates['userID'].values)
enc_mov = enc_mov.fit(df_rates['movieID'].values)

In [73]:
# Keep only the movies that have at least one rating, so that every remaining
# movie ID is known to enc_mov.  The comparison uses the raw IDs, so this has
# to run before the ratings' movieID column is re-encoded.
# .copy() makes the result an independent frame; the later in-place write to
# its 'id' column then does not trigger pandas' SettingWithCopyWarning.
idx = df_movies.loc[:, 'id'].isin(df_rates.movieID)
df_movies = df_movies.loc[idx].copy()

In [74]:
# Replace the raw user/movie IDs with their encoded labels in both tables.
# The same movie encoder is applied to df_rates.movieID and df_movies.id, so the
# two columns stay comparable after this cell.
# NOTE(review): the columns are overwritten in place, so this cell is not safe to
# run twice on the same frames - a second run would feed already-encoded values
# back into transform().
df_rates.loc[:, 'userID'] = enc_user.transform(df_rates.loc[:, 'userID'].values)
df_rates.loc[:, 'movieID'] = enc_mov.transform(df_rates.loc[:, 'movieID'].values)
df_movies.loc[:, 'id'] = enc_mov.transform(df_movies.loc[:, 'id'].values)

In [75]:
df_rates.head()

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,0,2,1.0,29,10,2006,23,17,16
1,0,31,4.5,29,10,2006,23,23,44
2,0,105,4.0,29,10,2006,23,30,8
3,0,151,2.0,29,10,2006,23,16,52
4,0,154,4.0,29,10,2006,23,29,30


## Матрица рейтингов

In [76]:
from scipy.sparse import coo_matrix, csr_matrix

In [77]:
# Sparse user x movie ratings matrix: rows are encoded user IDs, columns are
# encoded movie IDs, stored values are the ratings.  The shape is inferred from
# the largest row/column index present.
R = coo_matrix(
    (df_rates['rating'].values,
     (df_rates['userID'].values, df_rates['movieID'].values))
)

In [78]:
R

<2113x10109 sparse matrix of type '<type 'numpy.float64'>'
	with 855598 stored elements in COOrdinate format>

# SVD на матрице рейтингов

In [79]:
from scipy.sparse.linalg import svds

In [80]:
# Truncated SVD of the ratings matrix with k=6 components.
# Per the shapes echoed below: u is (users, 6), s is (6,), vt is (6, movies).
# NOTE(review): user/movie pairs that are absent from R enter the factorisation as zeros.
u, s, vt = svds(R, k=6)

In [81]:
u.shape

(2113, 6)

In [82]:
s.shape

(6,)

In [83]:
vt.shape

(6, 10109)

In [84]:
from sklearn.neighbors import NearestNeighbors

In [85]:
# Nearest-neighbour index returning 10 neighbours per query by default.
nn = NearestNeighbors(n_neighbors=10)

In [87]:
# Transpose so that each row of v is the 6-component factor vector of one movie.
v = vt.T

In [88]:
# Index the movie factor vectors; the estimator's repr is echoed as the cell output.
nn.fit(v)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

In [93]:
# For every movie, fetch the indices of its 10 nearest movies in factor space.
# Only the indices are used, so ask for them alone instead of binding the
# distance array to `_`, which would shadow IPython's "last output" variable.
# The query points are the indexed points themselves, so each movie normally
# appears among its own neighbours.
ind = nn.kneighbors(v, n_neighbors=10, return_distance=False)

In [94]:
ind[:10]

array([[   0, 4550, 7871, 5976, 3983, 6128,  351,  565, 1149, 2671],
       [   1, 1337, 3662, 1839,  501, 2383,  101, 8920,  144, 5107],
       [   2,  266,  414,    4, 7875, 4291, 4345, 6884,  505, 3484],
       [   3, 5810, 5614, 4409, 1459, 6732, 3409, 3574, 3147, 3551],
       [   4,  241,  266, 7875,    2, 6335,  505, 6514, 5564, 2899],
       [   5, 3940,   15,  413,  991, 1323, 1094, 4670, 4441, 2052],
       [   6,  225, 3534, 2272,  271, 3745, 1150, 2989, 2897, 6154],
       [   7,  583, 4751, 6155, 3675, 1328,   12, 4414,  160, 1821],
       [   8, 3942, 1296, 3582, 5828, 1440, 5795, 3903, 5304,  803],
       [   9,  198,  304, 1797,   92, 3321,  144,  423, 5027, 1796]])

In [95]:
# Titles ordered by encoded movie id, so they can be looked up by position.
# NOTE(review): positional lookup assumes df_movies holds exactly one row per
# encoded id - verify against the data.
movie_titles = df_movies.sort_values(by='id')['title'].values

In [96]:
# Column labels for the neighbour table: the query movie, then neighbours 1..9.
cols = ['movie'] + list(map('nn_{}'.format, range(1, 10)))

In [97]:
# One row per movie: its own title followed by the titles of its 9 nearest neighbours.
# The 'movie' column is taken from movie_titles directly rather than from ind[:, 0]:
# kneighbors does not guarantee that a query point comes back as its own first
# neighbour when several points are at distance 0 (identical factor vectors), and
# in that case ind[:, 0] would label the row with a different movie.
# column_stack also fails loudly if movie_titles and ind disagree on the number of movies.
df_ind_nn = pd.DataFrame(
    data=np.column_stack([movie_titles, movie_titles[ind[:, 1:]]]),
    columns=cols
)

In [99]:
# Boolean mask selecting the rows whose query-movie title contains "Terminator".
idx = df_ind_nn['movie'].str.contains('Terminator')

In [100]:
df_ind_nn.loc[idx].head()

Unnamed: 0,movie,nn_1,nn_2,nn_3,nn_4,nn_5,nn_6,nn_7,nn_8,nn_9
566,Terminator 2: Judgment Day,Terminator Salvation,Die Hard,Alien,Aliens,Batman,Total Recall,Indiana Jones and the Last Crusade,Mission: Impossible III,True Lies
1119,Terminator Salvation,Terminator 2: Judgment Day,Die Hard,Aliens,Total Recall,Alien,Batman,True Lies,Die Hard: With a Vengeance,Indiana Jones and the Last Crusade
6126,Terminator 3: Rise of the Machines,Bad Boys,Desperado,Demolition Man,AVP: Alien vs. Predator,Naked Gun 33 1/3: The Final Insult,Last Action Hero,Judge Dredd,Beverly Hills Cop III,Hot Shots! Part Deux


# Похожесть пользователей

In [101]:
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity

In [102]:
# Cosine similarity between the rows (users) of R; movies a user has not rated
# take part as zeros.
# NOTE: D is rebound further down by the squareform(d) cell.
D = cosine_similarity(R)

In [103]:
D.shape

(2113, 2113)

In [105]:
from scipy.spatial.distance import cosine
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances

In [104]:
def similarity(u, v):
    """Cosine similarity of two rating vectors over their co-rated positions.

    Only positions where both `u` and `v` are non-zero are compared.

    Parameters
    ----------
    u, v : array-like of equal length
        Rating vectors in which 0 marks "not rated".

    Returns
    -------
    The value ``1 - cosine_distance`` on the shared positions, or ``0`` when
    the two vectors have no position that is non-zero in both.
    """
    both_rated = np.logical_and(u != 0, v != 0)
    # Guard clause: nothing in common means no evidence of similarity.
    if not both_rated.any():
        return 0
    return 1 - cosine(u[both_rated], v[both_rated])

In [None]:
def _corated_cosine_condensed(ratings):
    """Pairwise co-rated cosine similarity between the rows of a sparse matrix.

    Gives the same values as ``pdist(ratings.toarray(), metric=similarity)``
    (up to floating-point rounding), but uses three sparse matrix products
    instead of one Python-level metric call per pair of rows.

    For rows a and b, with the sums running over columns that are non-zero in both:
        sim(a, b) = sum(a_i * b_i) / sqrt(sum(a_i ** 2) * sum(b_i ** 2))
    Pairs with no such column get 0.

    Parameters
    ----------
    ratings : scipy.sparse matrix, shape (n_rows, n_cols)
        Zero / missing entries mean "not rated".

    Returns
    -------
    numpy.ndarray, shape (n_rows * (n_rows - 1) / 2,)
        Condensed vector in the same pair ordering as scipy's pdist.
    """
    rated = ratings.tocsr().copy()
    rated.eliminate_zeros()          # explicit zeros must not count as "rated"

    mask = rated.copy()
    mask.data[:] = 1                 # 1 where a rating exists

    squared = rated.copy()
    squared.data **= 2

    # dots[a, b]: only columns rated by both rows contribute a non-zero product.
    dots = rated.dot(rated.T).toarray()
    # norm_sq[a, b]: squared norm of row a restricted to the columns rated by row b.
    norm_sq = squared.dot(mask.T).toarray()
    denom = np.sqrt(norm_sq * norm_sq.T)

    sim = np.zeros(dots.shape, dtype=float)
    np.divide(dots, denom, out=sim, where=denom > 0)   # no shared column -> stays 0
    np.clip(sim, -1.0, 1.0, out=sim)                   # absorb rounding overshoot
    np.fill_diagonal(sim, 0.0)                         # the condensed form has no diagonal
    return squareform(sim, checks=False)


# Condensed vector of co-rated cosine similarities between all pairs of users.
d = _corated_cosine_condensed(R)

In [108]:
d.shape

(2231328,)

In [109]:
# Expand the condensed pairwise vector into a square, symmetric user-by-user matrix.
# squareform fills the diagonal with 0, so a user's similarity to themselves reads as 0 here.
# This rebinds D, replacing the plain cosine-similarity matrix computed earlier.
D = squareform(d)

In [110]:
D.shape

(2113, 2113)