In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [None]:
# Load the four CSV tables from the lecture-1 directory.
# `movies` and `tags` feed the recommenders built in this notebook.
movies = pd.read_csv('../lecture-1/movies.csv')
tags = pd.read_csv('../lecture-1/tags.csv')
links = pd.read_csv('../lecture-1/links.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')

In [None]:
# Preview the first 10 rows of the movies table.
movies.head(10)

In [None]:
def change_string(s):
    """Turn a '|'-separated label string into space-separated tokens.

    Spaces and hyphens inside each label are removed first, so a multi-word
    or hyphenated label (e.g. 'Sci-Fi') becomes a single token ('SciFi').
    The '|' separators are then replaced by single spaces.

    Parameters
    ----------
    s : str
        Labels separated by '|'.

    Returns
    -------
    str
        The compacted labels separated by single spaces.
    """
    compact = s.replace(' ', '')
    compact = compact.replace('-', '')
    # All original spaces are gone by now, so every space in the result
    # marks a former '|' boundary.
    return compact.replace('|', ' ')

In [None]:
# One normalized, space-separated genre string per row of `movies`.
movie_genres = list(map(change_string, movies.genres.values))

In [None]:
# Peek at the first 10 normalized genre strings.
movie_genres[:10]

In [None]:
# Fit a bag-of-words vectorizer on the genre strings and build the count
# matrix in one step. `count_vect` is reused later to vectorize queries.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [None]:
# Re-weight the raw genre counts with TF-IDF. The fitted transformer is kept
# so queries can be transformed with the same weights.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Nearest-neighbour index over the genre TF-IDF rows:
# 7 neighbours per query, Euclidean distance, all available cores.
neigh = NearestNeighbors(
    n_neighbors=7,
    metric='euclidean',
    n_jobs=-1,
)
neigh.fit(X_train_tfidf)

In [None]:
# Build a query from a '|'-separated genre list, normalized the same way as
# the training strings.
test = change_string("Adventure|Comedy|Fantasy|Crime")

# Vectorize with the already-fitted vectorizer and TF-IDF transformer
# (transform only, no refit).
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# Query the fitted neighbour model; return_distance=True requests the
# distances alongside the neighbour indices.
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
# Inspect the raw kneighbors result for the genre query.
res

In [None]:
# Show the matched movies: res[1] is the neighbour-index part of the
# kneighbors result and [0] selects the single query row; the indices are
# used as row positions into `movies`.
movies.iloc[res[1][0]]

In [None]:
# Preview the movies table before joining it with tags.
movies.head()

In [None]:
# Preview the tags table.
tags.head()

In [None]:
# Left-join tags onto movies by movieId: one output row per (movie, tag)
# pair; movies with no tag keep a single row with missing tag columns.
movies_with_tags = movies.join(
    tags.set_index('movieId'),
    on='movieId',
    how='left',
)

In [None]:
# Preview the joined movie/tag table.
movies_with_tags.head()

In [None]:
# All joined rows for a single title, as a spot check of the join.
movies_with_tags.loc[movies_with_tags['title'] == 'Toy Story (1995)']

In [None]:
# Distinct values in the tag column of the joined table.
movies_with_tags.tag.unique()

In [None]:
# Drop every row that has a missing value in any column (this removes the
# movies that received no tag in the join). Rebinding the name instead of
# mutating in place avoids silently changing a frame that earlier cells have
# already displayed.
movies_with_tags = movies_with_tags.dropna()

In [None]:
# Number of distinct titles remaining in the joined table.
movies_with_tags.title.unique().shape

In [None]:
# Build two parallel lists, one entry per title:
#   movies[i]      -> the title
#   tag_strings[i] -> all of its tags joined by spaces, each tag compacted
#                     (spaces and hyphens removed) so it forms one token.
# NOTE: this rebinds `movies` from the DataFrame loaded earlier to a plain
# list of titles; cells below index it as a list.
movies = []
tag_strings = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    compact_tags = [
        str(raw_tag).replace(' ', '').replace('-', '')
        for raw_tag in group.tag.values
    ]
    movies.append(movie)
    tag_strings.append(' '.join(compact_tags))

In [None]:
# Peek at the first 5 per-title tag strings.
tag_strings[:5]

In [None]:
# Fit a fresh bag-of-words vectorizer on the tag strings.
# This rebinds `count_vect` and `X_train_counts`, replacing the genre model.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [None]:
# Re-weight the tag counts with TF-IDF.
# This rebinds `tfidf_transformer` and `X_train_tfidf` for the tag model.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Nearest-neighbour index over the tag TF-IDF rows:
# 10 neighbours per query, Manhattan distance, all available cores.
neigh = NearestNeighbors(
    n_neighbors=10,
    metric='manhattan',
    n_jobs=-1,
)
neigh.fit(X_train_tfidf)

In [None]:
# Print every position in the title list at which 'Magnolia (1999)' occurs.
for position, title in enumerate(movies):
    if title == 'Magnolia (1999)':
        print(position)

In [None]:
# Show the tag string for 'Magnolia (1999)'. The position is looked up by
# title rather than hard-coded, so the cell keeps pointing at the right
# movie if the data (and therefore the ordering) changes. `movies` and
# `tag_strings` are parallel lists at this point in the notebook.
tag_strings[movies.index('Magnolia (1999)')]

In [None]:
# Build a tag query. Tags are passed '|'-separated so that change_string
# yields one space-separated token per tag ('pixar pixar fun'). Passing a
# space-separated string would have every space stripped, collapsing the
# query into the single token 'pixarpixarfun'.
test = change_string('pixar|pixar|fun')

# Vectorize with the tag vectorizer and TF-IDF transformer fitted above
# (transform only, no refit).
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# Query the tag neighbour model; return_distance=True requests the distances
# alongside the neighbour indices.
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
# Inspect the raw kneighbors result for the tag query.
res

In [None]:
# Print the title of each neighbour returned for the single query row;
# `movies` is the list of titles built alongside `tag_strings`.
for neighbor_idx in res[1][0]:
    print(movies[neighbor_idx])