# Movies and tags data loading 

In [1]:
#importing the relevant python libraries
import pandas as pd
import numpy as np

In [2]:
#Loading the movies csv file into a dataframe
movies_df = pd.read_csv('movies.csv',usecols=['movieId','title','genres'],
                        dtype={'movieId': 'int32', 'title': 'str', 'genres': 'str'})
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
#Loading the genome relevance score csv file into a dataframe
genome_scores_df = pd.read_csv('Genome-scores.csv')
genome_scores_df

Unnamed: 0,movieId,tag_id,relevance
0,1,1,0.500891
1,2,1,0.499823
2,3,1,0.499874
3,5,1,0.499875
4,7,1,0.499911
...,...,...,...
2293543,183611,1459,0.499725
2293544,184471,1459,0.499719
2293545,187593,1459,0.499717
2293546,187595,1459,0.499768


In [4]:
#Merging the relevance score and movies dataframes
merge_df = pd.merge(genome_scores_df,movies_df,on='movieId')
merge_df

Unnamed: 0,movieId,tag_id,relevance,title,genres
0,1,1,0.500891,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,0.499961,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,3,0.499807,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,4,0.499736,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,1,5,0.499736,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
2293543,193565,1455,0.499331,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
2293544,193565,1456,0.499623,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
2293545,193565,1457,0.499331,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
2293546,193565,1458,0.499527,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi


In [5]:
## Creating a Pivot matrix
final_tag_relevance_df = merge_df.pivot_table(index='title',columns='tag_id',values='relevance').fillna(0)
final_tag_relevance_df

tag_id,1,2,3,4,5,6,7,8,9,10,...,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.509666,0.502885,0.504418,0.499448,0.499448,0.499648,0.499763,0.500419,0.499702,0.499740,...,0.500109,0.500109,0.500109,0.499042,0.499664,0.499042,0.499461,0.499042,0.499323,0.499541
...And Justice for All (1979),0.499911,0.499954,0.499855,0.499802,0.499802,0.499874,0.499880,0.499907,0.499893,0.499907,...,0.499957,0.499957,0.499957,0.499657,0.499880,0.499657,0.499807,0.499657,0.499758,0.499836
10 Cloverfield Lane (2016),0.499877,0.499961,0.499796,0.499721,0.499721,0.499822,0.499844,0.502242,0.502760,0.514212,...,0.499939,0.499939,0.499939,0.499516,0.499830,0.499516,0.499728,0.499516,0.499658,0.499768
10 Things I Hate About You (1999),0.499911,0.499954,0.499855,0.499802,0.499802,0.499874,0.499880,0.499907,0.499893,0.499907,...,0.499957,0.499957,0.499957,0.499657,0.499880,0.499657,0.499807,0.499657,0.499758,0.499836
101 Dalmatians (1996),0.499875,0.499935,0.499796,0.499721,0.499721,0.499822,0.499830,0.499869,0.504808,0.499868,...,0.499939,0.499939,0.499939,0.499516,0.499830,0.499516,0.499727,0.499516,0.499658,0.499768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zero Dark Thirty (2012),0.499804,0.499899,0.499680,0.499563,0.499563,0.499721,0.499734,0.499795,0.499764,0.499794,...,0.500355,0.500355,0.500355,0.499242,0.499734,0.499242,0.499573,0.499242,0.499464,0.499637
Zombieland (2009),0.509602,0.502937,0.503271,0.499526,0.499526,0.499697,0.500196,0.499946,0.499744,0.499776,...,0.500697,0.500697,0.500697,0.499178,0.499712,0.499178,0.499537,0.499178,0.499419,0.499606
Zoolander (2001),0.502911,0.505838,0.543560,0.499522,0.499522,0.499695,0.499759,0.499775,0.499742,0.499774,...,0.499896,0.499896,0.499896,0.499170,0.499709,0.499170,0.499533,0.499170,0.499414,0.499602
Zulu (1964),0.499911,0.499954,0.499855,0.499802,0.499802,0.499874,0.499880,0.503074,0.499893,0.499907,...,0.499957,0.499957,0.499957,0.499657,0.499880,0.499657,0.499807,0.499657,0.499758,0.499836


# Using the KNN model

In [6]:
from scipy.sparse import csr_matrix

In [7]:
final_tag_relevance_df_matrix = csr_matrix(final_tag_relevance_df.values)
final_tag_relevance_df_matrix

<1572x1459 sparse matrix of type '<class 'numpy.float64'>'
	with 2293548 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.neighbors import NearestNeighbors

In [9]:
model_knn = NearestNeighbors(metric = 'manhattan', algorithm = 'kd_tree')

In [10]:
import sklearn

In [11]:
sorted(sklearn.neighbors.VALID_METRICS['kd_tree'])

['chebyshev',
 'cityblock',
 'euclidean',
 'infinity',
 'l1',
 'l2',
 'manhattan',
 'minkowski',
 'p']

In [12]:
model_knn.fit(final_tag_relevance_df_matrix)



NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [13]:
final_tag_relevance_df

tag_id,1,2,3,4,5,6,7,8,9,10,...,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.509666,0.502885,0.504418,0.499448,0.499448,0.499648,0.499763,0.500419,0.499702,0.499740,...,0.500109,0.500109,0.500109,0.499042,0.499664,0.499042,0.499461,0.499042,0.499323,0.499541
...And Justice for All (1979),0.499911,0.499954,0.499855,0.499802,0.499802,0.499874,0.499880,0.499907,0.499893,0.499907,...,0.499957,0.499957,0.499957,0.499657,0.499880,0.499657,0.499807,0.499657,0.499758,0.499836
10 Cloverfield Lane (2016),0.499877,0.499961,0.499796,0.499721,0.499721,0.499822,0.499844,0.502242,0.502760,0.514212,...,0.499939,0.499939,0.499939,0.499516,0.499830,0.499516,0.499728,0.499516,0.499658,0.499768
10 Things I Hate About You (1999),0.499911,0.499954,0.499855,0.499802,0.499802,0.499874,0.499880,0.499907,0.499893,0.499907,...,0.499957,0.499957,0.499957,0.499657,0.499880,0.499657,0.499807,0.499657,0.499758,0.499836
101 Dalmatians (1996),0.499875,0.499935,0.499796,0.499721,0.499721,0.499822,0.499830,0.499869,0.504808,0.499868,...,0.499939,0.499939,0.499939,0.499516,0.499830,0.499516,0.499727,0.499516,0.499658,0.499768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zero Dark Thirty (2012),0.499804,0.499899,0.499680,0.499563,0.499563,0.499721,0.499734,0.499795,0.499764,0.499794,...,0.500355,0.500355,0.500355,0.499242,0.499734,0.499242,0.499573,0.499242,0.499464,0.499637
Zombieland (2009),0.509602,0.502937,0.503271,0.499526,0.499526,0.499697,0.500196,0.499946,0.499744,0.499776,...,0.500697,0.500697,0.500697,0.499178,0.499712,0.499178,0.499537,0.499178,0.499419,0.499606
Zoolander (2001),0.502911,0.505838,0.543560,0.499522,0.499522,0.499695,0.499759,0.499775,0.499742,0.499774,...,0.499896,0.499896,0.499896,0.499170,0.499709,0.499170,0.499533,0.499170,0.499414,0.499602
Zulu (1964),0.499911,0.499954,0.499855,0.499802,0.499802,0.499874,0.499880,0.503074,0.499893,0.499907,...,0.499957,0.499957,0.499957,0.499657,0.499880,0.499657,0.499807,0.499657,0.499758,0.499836


In [14]:
final_tag_relevance_df.shape

(1572, 1459)

# Already rated movies selection for a user

In [18]:
rating_df = pd.read_csv('ratings.csv')
rating_df = rating_df.drop('timestamp', axis=1)

In [19]:
movies_df = pd.read_csv('movies.csv')
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [20]:
rating_df = pd.merge(rating_df, movies_df, on='movieId')
rating_df

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller


In [21]:
rating_df = rating_df.sort_values(['userId', 'rating'], ascending=False).reset_index()
rating_df = rating_df.drop('index',axis=1)
rating_df

Unnamed: 0,userId,movieId,rating,title,genres
0,610,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,610,6,5.0,Heat (1995),Action|Crime|Thriller
2,610,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
3,610,260,5.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
4,610,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
...,...,...,...,...,...
100831,1,2253,2.0,Toys (1992),Comedy|Fantasy
100832,1,2338,2.0,I Still Know What You Did Last Summer (1998),Horror|Mystery|Thriller
100833,1,2389,2.0,Psycho (1998),Crime|Horror|Thriller
100834,1,2617,2.0,"Mummy, The (1999)",Action|Adventure|Comedy|Fantasy|Horror|Thriller


In [22]:
rating_df = rating_df.set_index(['userId','movieId'])
rating_df                            

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,title,genres
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
610,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
610,6,5.0,Heat (1995),Action|Crime|Thriller
610,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
610,260,5.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
610,296,5.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
...,...,...,...,...
1,2253,2.0,Toys (1992),Comedy|Fantasy
1,2338,2.0,I Still Know What You Did Last Summer (1998),Horror|Mystery|Thriller
1,2389,2.0,Psycho (1998),Crime|Horror|Thriller
1,2617,2.0,"Mummy, The (1999)",Action|Adventure|Comedy|Fantasy|Horror|Thriller


In [53]:
user = 205
rating_df.loc[[user],['rating','title']]

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,title
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
205,71462,5.0,"Cove, The (2009)"
205,260,4.5,Star Wars: Episode IV - A New Hope (1977)
205,6874,4.5,Kill Bill: Vol. 1 (2003)
205,99114,4.5,Django Unchained (2012)
205,4896,4.5,Harry Potter and the Sorcerer's Stone (a.k.a. ...
205,81845,4.5,"King's Speech, The (2010)"
205,91542,4.5,Sherlock Holmes: A Game of Shadows (2011)
205,179817,4.5,Darkest Hour (2017)
205,356,4.0,Forrest Gump (1994)
205,3033,4.0,Spaceballs (1987)


In [54]:
rated_movies = rating_df.loc[[user],['rating','title']]['title'].tolist()

# Recommendations

In [55]:
def recommend(movie):
    s = "title == [ " + "\"" + movie + "\"]"
    distances, indices = model_knn.kneighbors(final_tag_relevance_df.query(s).to_numpy().reshape(1, -1), n_neighbors = 100)
    
    j = 0
    print('Recommendations for {0}:\n'.format(movie))
    for i in range(0, len(distances.flatten())):
        if j >= 5:
            break
        
        if (final_tag_relevance_df.index[indices.flatten()[i]]) in rated_movies:
            continue
        else:
            j += 1
            print('{0}: {1}, with distance of {2}:'.format(i, final_tag_relevance_df.index[indices.flatten()[i]], distances.flatten()[i]))

In [56]:
movie_list = merge_df['title'].unique().tolist()

In [57]:
len(movie_list)

1572

In [58]:
count = 0
for i in rated_movies:
    if i in movie_list:
        count = count + 1
        print(count)
        recommend(i)
    if count >= 5:
        break

1
Recommendations for Star Wars: Episode IV - A New Hope (1977):

1: Star Wars: Episode V - The Empire Strikes Back (1980), with distance of 1.2478956253613274:
2: Star Trek (2009), with distance of 1.2897275781390605:
3: Predator (1987), with distance of 1.305212824899476:
4: Star Wars: Episode VI - Return of the Jedi (1983), with distance of 1.4226661429499854:
5: The Butterfly Effect (2004), with distance of 1.4371769011123223:
2
Recommendations for Kill Bill: Vol. 1 (2003):

1: Kill Bill: Vol. 2 (2004), with distance of 0.49458918878421776:
2: Unforgiven (1992), with distance of 0.49458918878421776:
3: Mean Creek (2004), with distance of 0.49458918878421776:
4: Fury (1936), with distance of 0.49458918878421776:
5: Hero (Ying xiong) (2002), with distance of 0.5298740582067181:
3
Recommendations for Django Unchained (2012):

1: The Hateful Eight (2015), with distance of 0.9716202789220065:
2: Tron: Legacy (2010), with distance of 1.3183153832773913:
3: Reservoir Dogs (1992), with dis