# Recommendation Systems 2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from ETL import ETL_class

In [2]:
path_rating = './dataset/ratings/'
path_titles = './dataset/titles/'
etl = ETL_class(path_titles, path_rating)

df_movies = etl.get_movies()[['id', 'title', 'listed_in']]
print(df_movies.shape)
df_movies.head()

(22998, 3)


Unnamed: 0,id,title,listed_in
0,as1,the grand seduction,"comedy, drama"
1,as2,take care good night,"drama, international"
2,as3,secrets of deception,"action, drama, suspense"
3,as4,pink: staying true,documentary
4,as5,monster maker,"drama, fantasy"


In [3]:
df_ratings = etl.get_ratings()[['userId', 'rating', 'movieId']].rename(columns={'movieId':'id'})
print(df_ratings.shape)
df_ratings.head()

(11024165, 3)


Unnamed: 0,userId,rating,id
0,1,1.0,as680
1,1,4.5,ns2186
2,1,5.0,hs2381
3,1,5.0,ns3663
4,1,5.0,as9500


In [4]:
df_count = df_ratings[['userId','id']].groupby('id').count()
df_count.reset_index(inplace=True)
df_count.rename(columns={'userId': 'count'}, inplace=True)
df_count.head()

Unnamed: 0,id,count
0,as1,502
1,as10,513
2,as100,430
3,as1000,485
4,as1001,469


In [5]:
print(df_count['count'].min())
print(df_count['count'].max())

396
576


In [6]:
df_count.loc[df_count['count'] >= 500]

Unnamed: 0,id,count
0,as1,502
1,as10,513
16,as1012,508
25,as1020,529
28,as1023,502
...,...,...
22956,ns961,505
22962,ns967,535
22964,ns969,509
22974,ns978,515


In [7]:
df = pd.merge(df_ratings, df_count, on='id', how='left').rename(columns={'listed_in':'genre'})
print(df.shape)
df.head()

(11024165, 4)


Unnamed: 0,userId,rating,id,count
0,1,1.0,as680,458
1,1,4.5,ns2186,501
2,1,5.0,hs2381,485
3,1,5.0,ns3663,446
4,1,5.0,as9500,482


In [8]:
df = df.loc[df['count'] >= 500]
movies_rating = df.pivot_table(index='id', columns='userId', values='rating').fillna(0)
movies_rating.head()

userId,1,2,3,4,5,6,7,8,9,10,...,270886,270887,270889,270890,270891,270892,270893,270894,270895,270896
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
as1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as1012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
as1020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
as1023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

movie_rating_matrix = csr_matrix(movies_rating.values)

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_rating_matrix)

In [25]:
random_index = np.random.choice(movies_rating.shape[0])
print(random_index)
distances, indices = model_knn.kneighbors(movies_rating.iloc[random_index, :].values.reshape(1,-1), n_neighbors=10)

3893


In [None]:
indices

In [26]:
for ii in range(0, len(distances.flatten())):
    if ii == 0:
        print('Recomendations for {}:\n'.format(movies_rating.index[random_index]))
    else:
        idx = indices.flatten()[ii]
        mox = movies_rating.index[idx]
        dix = distances.flatten()[ii]
        print('{}: {}, with discance of {}:'.format(ii, mox, dix))

Recomendations for ns7341:

1: ns3544, with discance of 0.9548819696925881:
2: as2920, with discance of 0.95671745559677:
3: as4685, with discance of 0.9576410811110894:
4: hs2201, with discance of 0.957705320811717:
5: as6176, with discance of 0.9584987688869077:
6: as5491, with discance of 0.9593210784261056:
7: hs164, with discance of 0.95968981139392:
8: ns8077, with discance of 0.9611008623877612:
9: ns1130, with discance of 0.9611711334597457:


In [33]:
print(indices[0][0])
df_movies.loc[df_movies['id'] == 'hs975']

3893


Unnamed: 0,id,title,listed_in
974,hs975,men in kilts: a roadtrip with sam and graham,"documentaries, lifestyle & culture"


In [28]:
recom = {movies_rating.index[k]: v for (k, v) in zip(indices.flatten(), distances.flatten())}
recom

{'ns7341': 1.887379141862766e-15,
 'ns3544': 0.9548819696925881,
 'as2920': 0.95671745559677,
 'as4685': 0.9576410811110894,
 'hs2201': 0.957705320811717,
 'as6176': 0.9584987688869077,
 'as5491': 0.9593210784261056,
 'hs164': 0.95968981139392,
 'ns8077': 0.9611008623877612,
 'ns1130': 0.9611711334597457}

In [35]:
for k in recom.keys():
    print(df_movies.loc[df_movies['id'] == k]['title'])

7340    los tiempos de pablo escobar
Name: title, dtype: object
3543    serial killer with piers morgan
Name: title, dtype: object
2919    cannabis to save my life
Name: title, dtype: object
4684    the herald and the horror
Name: title, dtype: object
2200    diesel brothers
Name: title, dtype: object
6175    mila and morphle's magical games
Name: title, dtype: object
5490    take a stan
Name: title, dtype: object
163    american dad!
Name: title, dtype: object
8076    spy kids 3: game over
Name: title, dtype: object
1129    secrets of great british castles
Name: title, dtype: object
