In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import FeatureUnion

%matplotlib inline

In [91]:
links = pd.read_csv('data/links.csv')
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')
tags = pd.read_csv('data/tags.csv')

In [92]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [93]:
mean_ratings = ratings.groupby('movieId')['rating'].mean()
mean_ratings = mean_ratings.to_frame()
movies = movies.merge(mean_ratings, left_on='movieId', right_on='movieId')
movies.head(10)

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429
5,6,Heat (1995),Action|Crime|Thriller,3.946078
6,7,Sabrina (1995),Comedy|Romance,3.185185
7,8,Tom and Huck (1995),Adventure|Children,2.875
8,9,Sudden Death (1995),Action,3.125
9,10,GoldenEye (1995),Action|Adventure|Thriller,3.496212


In [94]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
ct = ColumnTransformer([
        ('scaled_rating', StandardScaler(), ['rating'])
    ], remainder='passthrough')
movies = pd.DataFrame(ct.fit_transform(movies), columns=['scaled_mean_rating', 'movieId', 'title', 'genres'])

In [96]:
movies.head(10)

Unnamed: 0,scaled_mean_rating,movieId,title,genres
0,0.757025,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,0.194716,2,Jumanji (1995),Adventure|Children|Fantasy
2,-0.003257,3,Grumpier Old Men (1995),Comedy|Romance
3,-1.040786,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,-0.219606,5,Father of the Bride Part II (1995),Comedy
5,0.785936,6,Heat (1995),Action|Crime|Thriller
6,-0.088826,7,Sabrina (1995),Comedy|Romance
7,-0.44543,8,Tom and Huck (1995),Adventure|Children
8,-0.158018,9,Sudden Death (1995),Action
9,0.268747,10,GoldenEye (1995),Action|Adventure|Thriller


In [97]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [98]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [99]:
movie_genres

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller',
 'Comedy Drama Romance',
 'Comedy Horror',
 'Adventure Animation Children',
 'Drama',
 'Action Adventure Romance',
 'Crime Drama',
 'Drama Romance',
 'Comedy',
 'Comedy',
 'Action Comedy Crime Drama Thriller',
 'Comedy Crime Thriller',
 'Crime Drama Horror Mystery Thriller',
 'Action Crime Thriller',
 'Drama SciFi',
 'Drama Romance',
 'Drama',
 'Children Drama',
 'Drama Romance',
 'Adventure Drama Fantasy Mystery SciFi',
 'Crime Drama',
 'Drama',
 'Mystery SciFi Thriller',
 'Children Drama',
 'Crime Drama',
 'Children Comedy',
 'Comedy Romance',
 'Drama',
 'Drama War',
 'Action Crime Drama',
 'Drama',
 'Action Adventure Fantasy',
 'Comedy Drama Thriller',
 'Drama Romance',
 'Mystery Thriller',
 'Animation Children Drama Musical Romance',
 'Drama R

In [100]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [102]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [103]:
X_train_tfidf.toarray().shape

(9724, 20)

In [104]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean')
neigh.fit(X_train_tfidf)

In [105]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [106]:
res

(array([[0.42059728, 0.53289122, 0.54309891, 0.54309891, 0.54309891,
         0.54309891, 0.54309891]]),
 array([[6757, 9078, 3297, 3577, 8343, 2604, 3571]]))

In [117]:
list(movies.iloc[res[1][0]].sort_values(by=['scaled_mean_rating'], ascending=False)['movieId'].array)

[143559, 4467, 4911, 3489, 60074, 109042, 4899]