In [34]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score

In [22]:
metadata = pd.read_csv('movies.csv', low_memory=False)
metadata_rating = pd.read_csv('ratings.csv', low_memory=False)
metadata.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [90]:
tfidf = TfidfVectorizer(stop_words='english')
metadata['title'] = metadata['title'].fillna('')
tfidf_matrix = tfidf.fit_transform(metadata['title'])
movie_name=['Toy Story']
movie_tfidf = tfidf.transform(movie_name)

In [126]:
n_neighbors = 11
KNN = NearestNeighbors(n_neighbors,p=2)
KNN.fit(tfidf_matrix)
NNs = KNN.kneighbors(movie_tfidf,return_distance=True)

In [127]:
def get_recommendation(top, df_all, scores):
  recommendation = pd.DataFrame(columns = [ 'movieId', 'title'])
  count = 0
  for i in top:
      recommendation.at[count, 'movieId'] = df_all['movieId'][i]
      recommendation.at[count, 'title'] = df_all['title'][i]
      recommendation.at[count, 'score'] =  scores[count]
      count += 1
  return recommendation

In [128]:
top = NNs[1][0][1:]
index_score = NNs[0][0][1:]
df_all = metadata[['movieId','title']]
recommendation_knn=get_recommendation(top,df_all,index_score)
recommendation_knn

Unnamed: 0,movieId,title,score
0,3114,Toy Story 2 (1999),0.456561
1,1,Toy Story (1995),0.468064
2,106022,Toy Story of Terror (2013),0.668171
3,4929,"Toy, The (1982)",0.770886
4,120474,Toy Story That Time Forgot (2014),0.849089
5,80141,"Christmas Toy, The (1986)",0.887103
6,122078,The Toy Wife (1938),0.936509
7,159856,The Toy Box (1971),0.939533
8,143537,Toy Masters (2014),0.941782
9,5843,Toy Soldiers (1991),0.956857


In [118]:
metadata_rating = pd.read_csv('ratings.csv')
metadata_rating.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [129]:
metadata_user_rating = metadata_rating[metadata_rating.userId==15]
metadata_user_rating

Unnamed: 0,userId,movieId,rating,timestamp
1291,15,1,4.0,1370810063
1292,15,70,2.0,1370808983
1293,15,185,3.0,1370810193
1294,15,296,5.0,1370809602
1295,15,318,4.0,1370809456
...,...,...,...,...
1448,15,98809,4.0,1370809831
1449,15,98961,3.0,1372872275
1450,15,99112,2.5,1372872812
1451,15,99114,5.0,1370809675


In [130]:
recomm_user = pd.merge(recommendation_knn, metadata_user_rating, on="movieId")
recomm_user = recomm_user[recomm_user.rating > 3.0]
recomm_user

Unnamed: 0,movieId,title,score,userId,rating,timestamp
0,3114,Toy Story 2 (1999),0.456561,15,4.0,1370810245
1,1,Toy Story (1995),0.468064,15,4.0,1370810063


In [131]:
recommendation_knn_title = recommendation_knn['title']
recommendation_knn_title.size

10

In [132]:
recommendation_user_title = recomm_user['title']
for i in range(recommendation_user_title.size,recommendation_knn_title.size):
    recommendation_user_title[i] = ""
recommendation_user_title.size

10

In [134]:
accuracy_score(recommendation_knn_title, recommendation_user_title)

2

In [9]:
metadata_mv = pd.read_csv('movies.csv')
user_data_on_tag = metadata_mv.reset_index().merge(metadata_user_rating, on='movieId')
user_data_on_tag[['movieId','title','rating','genres']].drop_duplicates()

Unnamed: 0,movieId,title,rating,genres
0,307,Three Colors: Blue (Trois couleurs: Bleu) (1993),3.5,Drama
1,481,Kalifornia (1993),3.5,Drama|Thriller
2,1091,Weekend at Bernie's (1989),1.5,Comedy
3,1257,Better Off Dead... (1985),4.5,Comedy|Romance
4,1449,Waiting for Guffman (1996),4.5,Comedy
5,1590,Event Horizon (1997),2.5,Horror|Sci-Fi|Thriller
6,1591,Spawn (1997),1.5,Action|Adventure|Sci-Fi|Thriller
7,2134,Weird Science (1985),4.5,Comedy|Fantasy|Sci-Fi
8,2478,¡Three Amigos! (1986),4.0,Comedy|Western
9,2840,Stigmata (1999),3.0,Drama|Thriller


In [41]:
from sklearn.metrics.pairwise import euclidean_distances
D = euclidean_distances(tfidf_matrix)
metadata_title = metadata[metadata['title'].notnull()].copy()
indices = pd.Series(metadata_title['title'])
inddict = indices.to_dict()
inddict = dict((v,k) for k,v in inddict.items())

In [11]:
def recommend_eucledian_distance(title):
    ind = inddict[title]
    distance = list(enumerate(D[ind]))
    distance = sorted(distance, key=lambda x: x[1])
    distance = distance[1:6]
    movie_index = [i[0] for i in distance]

    return metadata_title['title'].iloc[movie_index]

In [12]:
recommend_eucledian_distance('Toy Story')

42                                     Toy Story 3
2656    Chiamatemi Francesco - Il Papa della gente
4140                   To Be Frank, Sinatra at 100
4401                           The Helix... Loaded
4431                                   Food Chains
Name: title, dtype: object