# Recommendation Systems 2

## K Nearest Neighbor Cosine Similarity

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [58]:
import sys
sys.path.append('../../')

from A_Model_Recommendation.ETL import ETL_class

We're just goint to use the columns that will be relevant in the model applied

In [59]:
path_rating = './../dataset/ratings/'
path_titles = './../dataset/titles/'
etl = ETL_class(path_titles, path_rating)

df_movies = etl.get_movies()[['id', 'title', 'listed_in']]
print(df_movies.shape)
df_movies.head()

(22998, 3)


Unnamed: 0,id,title,listed_in
0,as1,the grand seduction,"comedy, drama"
1,as2,take care good night,"drama, international"
2,as3,secrets of deception,"action, drama, suspense"
3,as4,pink: staying true,documentary
4,as5,monster maker,"drama, fantasy"


In [60]:
df_ratings = etl.get_ratings()[['userId', 'rating', 'movieId']].rename(columns={'movieId':'id'})
print(df_ratings.shape)
df_ratings.head()

(11024165, 3)


Unnamed: 0,userId,rating,id
0,1,1.0,as680
1,1,4.5,ns2186
2,1,5.0,hs2381
3,1,5.0,ns3663
4,1,5.0,as9500


Due the large amout of data we get had better to handle with a little portion of the whole

In [61]:
df_count = df_ratings[['userId','id']].groupby('id').count()
df_count.reset_index(inplace=True)
df_count.rename(columns={'userId': 'count'}, inplace=True)

# The filter on movies is that each movie must be count with 550 or more grades
df_count = df_count.loc[df_count['count'] >= 550]
print(df_count.shape)
df_count.head()

(17, 2)


Unnamed: 0,id,count
1120,as2006,552
3553,as4197,576
4321,as4889,554
5111,as56,550
6664,as6998,556


In [62]:
print(df_count['count'].min())
print(df_count['count'].max())

550
576


In [63]:
# this is the final data we shall deal with
df = pd.merge(df_ratings, df_count, on='id', how='left')
df.dropna(inplace=True)
print(df.shape)
df.head()

(9438, 4)


Unnamed: 0,userId,rating,id,count
378,9,3.0,as4889,554.0
773,12,5.0,as2006,552.0
1286,16,4.5,ns7279,558.0
1933,24,3.0,as9065,560.0
2305,24,2.0,as9469,556.0


**Whatever the model we use, we need to define a vector space to clustering, since as previous analysis we've shown it's better employ an unsupervised model to desing a recommendation system**

#### BEWARE TO THE FOLLOWING CELL!!

In [64]:
# this' going to be our vector space
from scipy.sparse import csr_matrix

movies_rating = df.pivot_table(index='id', columns='userId', values='rating').fillna(0)

# 'cause the movies_rating matrix has a lot of zeros, we use the sparce method to
# save efficiently this matrix, and the model will be trained with this new matrix
movie_rating_matrix = csr_matrix(movies_rating.values)
movies_rating.head()

userId,9,12,16,24,41,65,68,75,79,95,...,270684,270734,270737,270757,270795,270809,270822,270850,270887,270896
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
as2006,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as4197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
as4889,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as6998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0


## Cosine Similarity

Now, what will it be the model to implement?

The model will be the cosine similarity. Let's see the following picture<br>

<img src="./../.src/3D-cosine.png" alt="Cosine" width="300" height="300"/>

The visual illustrate the distance between point or nodes the model takes. This distance is the cosine of the angle related to two point, for that we created a sample space or vector space which is the *movies_rating* in our case. This sample space allows us to visualite where the cluster are formed and apply some model.

The distance is given by the equation:

<img src="./../.src/eq-cosine.png" alt="Cosine" width="500" height="100"/>

So the model help us is NearestNeighbers, where the metric is setup as 'cosine' metric.

In [65]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_rating_matrix)

In [66]:
# the model is already trained, so we can chose any movie and find which movies
# are nearer to the input movie
random_index = np.random.choice(movies_rating.shape[0]) # we choose it randomly
print(random_index)
distances, indices = model_knn.kneighbors(movies_rating.iloc[random_index, :].values.reshape(1,-1), n_neighbors=10)

3


In [67]:
# these are the most related movies
indices

array([[ 3,  5,  4,  9,  7,  8,  1,  2, 13, 12]])

In [68]:
# so, the most related movies and the distances between them and the input value are
for ii in range(0, len(distances.flatten())):
    if ii == 0:
        print('Recomendations for {}:\n'.format(movies_rating.index[random_index]))
    else:
        idx = indices.flatten()[ii]
        mox = movies_rating.index[idx]
        dix = distances.flatten()[ii]
        print('{}: {}, with discance of {}:'.format(ii, mox, dix))

Recomendations for as56:

1: as848, with discance of 0.9711835660342725:
2: as6998, with discance of 0.973844403349408:
3: hs2360, with discance of 0.974193626761002:
4: as9469, with discance of 0.9749528305217471:
5: ds552, with discance of 0.9780828386688203:
6: as4197, with discance of 0.9792641509815775:
7: as4889, with discance of 0.9796085353703744:
8: ns5992, with discance of 0.9798126701617894:
9: ns1847, with discance of 0.9811588831971081:


In [69]:
print(indices[0][0])
idx = movies_rating.index[random_index]
df_movies.loc[df_movies['id'] == idx]

3


Unnamed: 0,id,title,listed_in
55,as56,yatra (malayalam),"drama, international"


In [70]:
recom = {movies_rating.index[k]: v for (k, v) in zip(indices.flatten(), distances.flatten())}
recom

{'as56': 0.0,
 'as848': 0.9711835660342725,
 'as6998': 0.973844403349408,
 'hs2360': 0.974193626761002,
 'as9469': 0.9749528305217471,
 'ds552': 0.9780828386688203,
 'as4197': 0.9792641509815775,
 'as4889': 0.9796085353703744,
 'ns5992': 0.9798126701617894,
 'ns1847': 0.9811588831971081}

In [74]:
df_total = [pd.DataFrame(None, columns=['id', 'title', 'listed_in'])]
for k in recom.keys():
    df_total.append(df_movies.loc[df_movies['id'] == k][['id', 'title', 'listed_in']])
df_total = pd.concat(df_total)
df_total

Unnamed: 0,id,title,listed_in
55,as56,yatra (malayalam),"drama, international"
847,as848,superbook,"animation, faith and spirituality, kids"
6997,as6998,sideways,"comedy, drama, romance"
2359,hs2360,dietland,"comedy, drama"
9468,as9469,all good things,"drama, romance, suspense"
551,ds552,national treasure,"action-adventure, mystery, thriller"
4196,as4197,from other worlds,"comedy, science fiction"
4888,as4889,a stone in the water,"horror, suspense"
5991,ns5992,14 cameras,"horror movies, thrillers"
1846,ns1847,el-khawaga's dilemma,"action & adventure, comedies, international mo..."


Now, what about if we had a user and a movie that the user have not watched yet? 

So we need find the similarity between this movie and the movies in the historical user's records, and if there's some similarity bewteen we can choose to recomment the movie or not.

For that it's defined a parameter 'threshold' to "measure" the similary whether or not the similarity is greater than the treshold we can take a decision.

In [None]:
user = 9
title = df_movies.loc[df_movies['id'] == 'as2006', 'title'].values[0]

# records for this user
records = df.loc[df['userId'] == user, 'id'].values

records

array(['as4889'], dtype=object)

In [None]:
# movie id to get similarity
recom = df_movies.loc[df_movies['title'] == title, 'id'].values[0]
recom

'as2006'

In [None]:
# vector related to this movies
sample2 = movies_rating.loc[recom, :].values.reshape(1,-1)
sample2

array([[0., 5., 0., ..., 0., 0., 0.]])

In [None]:
# here we're looking for the most related movies to the original one
Knneighbors = 8
distances, indices = model_knn.kneighbors(sample2, n_neighbors=Knneighbors)
idx_sort = np.argsort(distances[0])[::-1]
indices = [indices[0][ii] for ii in idx_sort]

# we choose some of the most related
knn = int(np.sqrt(Knneighbors)) + 1
most_similar_samples = [movies_rating.iloc[indices[x]].values.reshape(1,-1) for x in range(knn)]
most_similar_samples

[array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.]])]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# we compute the similarity between the original movie with each movie in the
# historical records
similarity = []
for r in records:
    sample1 = movies_rating.loc[r, :].values.reshape(1,-1)
    MSS = [cosine_similarity(sample1, mss)[0][0] for mss in most_similar_samples]
    similarity.append(MSS)

similarity = np.array(similarity).flatten()
similarity

array([0.02215407, 0.0139096 , 0.02830649])

In [None]:
# Finally, rest to decise if the enough similarity with any movie to recomment
matching = 0.7 # this could work as our threshold to decise

is_greater = similarity > matching

if is_greater.any():
    print("The movie '{}' is recommended for the user '{}'".format(title,user))
else:
    print("The user '{}' may not like the film '{}'".format(user, title))

The user '9' may not like the film 'jonas kaufmann - a global star in private'


In [None]:
matching = 0.025 # a non-sense threshold

is_greater = similarity > matching

if is_greater.any():
    print("The movie '{}' is recommended for the user '{}'".format(title,user))
else:
    print("The user '{}' may not like the film '{}'".format(user, title))

The movie 'jonas kaufmann - a global star in private' is recommended for the user '9'
