In [80]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import pandas as pd

import gc #garbage collector

In [81]:
from surprise import SVD
from surprise import Dataset, SVDpp
from surprise import Reader
from surprise.model_selection import train_test_split

In [82]:
from surprise import accuracy
from collections import defaultdict

In [83]:
data = pd.read_csv('Datos_ratings.csv')

In [84]:
data = data.drop(['index'], axis=1)

In [85]:
data

Unnamed: 0,userId,score,movieId
0,1,1.0,as680
1,1,4.5,ns2186
2,1,5.0,hs2381
3,1,5.0,ns3663
4,1,5.0,as9500
...,...,...,...
12524284,124380,4.5,ns5272
12524285,124380,2.5,ns5492
12524286,124380,3.5,hs305
12524287,124380,4.5,ns7881


In [86]:
movies = pd.read_csv('Datos_PI_2.csv')

In [87]:
df_movies = movies.drop(['show_id','type', 'director', 'cast', 'country', 'date_added',	'release_year',	'rating', 'duration', 'duration_int', 'duration_type', 'description', 'ScoreMedia', 'index',  'userId'], axis=1)

In [89]:
df_movies

Unnamed: 0,title,listed_in,id
0,the grand seduction,"comedy, drama",as1
1,take care good night,"drama, international",as2
2,secrets of deception,"action, drama, suspense",as3
3,pink: staying true,documentary,as4
4,monster maker,"drama, fantasy",as5
...,...,...,...
22993,zodiac,"cult movies, dramas, thrillers",ns8803
22994,zombie dumb,"kids' tv, korean tv shows, tv comedies",ns8804
22995,zombieland,"comedies, horror movies",ns8805
22996,zoom,"children & family movies, comedies",ns8806


In [90]:
df_movies = pd.merge(data, df_movies, left_on='movieId', right_on='id')

In [91]:
df_movies = df_movies.drop(['id'], axis=1)

In [92]:
df_movies

Unnamed: 0,userId,score,movieId,title,listed_in
0,1,1.0,as680,the english civil war,"documentary, special interest"
1,583,4.5,as680,the english civil war,"documentary, special interest"
2,765,5.0,as680,the english civil war,"documentary, special interest"
3,2116,3.0,as680,the english civil war,"documentary, special interest"
4,2143,3.0,as680,the english civil war,"documentary, special interest"
...,...,...,...,...,...
12524284,123652,3.0,ns720,power rangers dino fury,kids' tv
12524285,123831,3.5,ns720,power rangers dino fury,kids' tv
12524286,123886,3.5,ns720,power rangers dino fury,kids' tv
12524287,123923,3.5,ns720,power rangers dino fury,kids' tv


In [93]:
reader = Reader()

In [94]:
N_filas = 100000 # Limitamos el dataset a N_filas

data = Dataset.load_from_df(data[['userId', 'movieId', 'score']][:N_filas], reader)

In [95]:
#Separo en train y test
train, test = train_test_split(data, test_size=0.25)

#Instanciamos el algoritmo y entrenamos
svd = SVDpp()
svd.fit(train)
preds = svd.test(test)

#Métricas de evaluacion
accuracy.mae(preds)
accuracy.rmse(preds)

MAE:  0.7615
RMSE: 0.9797


0.9797055257202059

In [96]:
# Creamos todo el dataset completo con Train y Test
trainfull = data.build_full_trainset()

In [97]:
#Instanciamos de nuevo el algoritmo
svd = SVDpp()
#Entrenamos el algoritmo
svd.fit(trainfull)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x20888f37820>

In [98]:
#realizamos una prediccion para ver que todo funciona
svd.predict(uid=1, iid=1)

Prediction(uid=1, iid=1, r_ui=None, est=4.214333647156485, details={'was_impossible': False})

In [99]:
#Creamos una función que pasandole, un usuario, un DataFrame, un algoritmo y el número de recomendaciones que queremos
def recommend_system(userId, dataframe, algorithm, n_commends):
    """
con los parametros, devuelve las mejores n peliculas.
Parametros
-----------
userId: El Id de usuario de la persona que queremos recomendarle peliculas.
dataframe: El dataframe que utilizamos.
algoritmo: El algoritmo que creamos para recomendar peliculas.
n_commends: El numero de peliculas recomendadas
return
------
IDs de peliculas que a un usuario especifico le gustarian..
    """
    movie_ids = dataframe['movieId'].to_list()
    movies_watched = dataframe[dataframe["userId"] == userId]["movieId"]
    movies_no_watched = [movie for movie in movie_ids if movie not in movies_watched]

    preds = [algorithm.predict(uid=userId, iid=movie) for movie in movies_no_watched]
    commends_ratting = {pred[1]:pred[3] for pred in preds}
    order_dict = {k: v for k, v in sorted(commends_ratting.items(), key=lambda item: item[1])}

    top_predictions = list(order_dict.keys())[:n_commends]

    return dataframe[dataframe["movieId"].isin(top_predictions)][["title"]].drop_duplicates()

In [100]:
#Función que nos da las películas de un usuario que más le gustan, según la puntuación que le ha dado
def check_movies_user(userId, dataframe, n):
    return dataframe[dataframe["userId"] ==userId].sort_values("score", ascending=False)[:n] 

In [101]:
#Vemos las películas que ve, y las que le hemos recomendado.
movies_recommended = recommend_system(1, df_movies, svd, 5)
print(f"Movies user likes:", check_movies_user(1, df_movies, 20))
print("ID of the movies recommended:", movies_recommended)

Movies user likes:        userId  score movieId                                           title  \
14345       1    5.0  ns1310                        my teacher, my obsession   
5515        1    5.0  ns7360                                      lucky days   
11604       1    5.0   hs871                                     mayans m.c.   
10513       1    5.0  as4340                                bright hill road   
9947        1    5.0  as6909                                    mexican moon   
9350        1    5.0  ns5413                                  criminal minds   
8773        1    5.0  hs2056                    biography: the trump dynasty   
8227        1    5.0  as1577                                       mr. robot   
12753       1    5.0  ns8584                              thorne: sleepyhead   
6037        1    5.0  ns3980                       charlie's colorforms city   
3870        1    5.0  as6112                                          salome   
2166        1    5.0 