# Sistemas de recomendación basados en MovieLens

In [1]:
import surprise

In [33]:
import pandas as pd
import numpy as np
SEMILLA_ALEATORIEDAD = 123
np.random.seed(SEMILLA_ALEATORIEDAD)

In [3]:
movies = pd.read_csv('ml-latest-small/movies.csv', sep=',')
print('Shape of this dataset: ', movies.shape)
movies.head()

Shape of this dataset:  (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings = pd.read_csv('ml-latest-small/ratings.csv', sep=',')
print('Shape of this dataset: ', ratings.shape)
ratings.head()

Shape of this dataset:  (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
users = pd.read_csv('ml-latest-small/ratings.csv',sep=';')
print('Shape of this dataset :',users.shape)
users.head()

Shape of this dataset : (100836, 1)


Unnamed: 0,"userId,movieId,rating,timestamp"
0,"1,1,4.0,964982703"
1,"1,3,4.0,964981247"
2,"1,6,4.0,964982224"
3,"1,47,5.0,964983815"
4,"1,50,5.0,964982931"


# Filtrado colaborativo (Collaborative Filtering)

## Basado en memoria

Creamos la matriz de interacciones usuario-item

In [8]:
rating_pivot = ratings.pivot_table(values='rating',columns='userId',index='movieId').fillna(0)
print('Shape of this pivot table :',rating_pivot.shape)
rating_pivot.head()

Shape of this pivot table : (9724, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


Aplicamos NN

In [10]:
from sklearn.neighbors import NearestNeighbors
nn_algo = NearestNeighbors(metric='cosine')
nn_algo.fit(rating_pivot)

Clase que implementa el motor de recomendación. Es en función de una película pasada, por lo que el algoritmo es basado en item, pues la similitud se calcula en fución de los items

La función de recomendar basándose en historial, es para recomendar películas a un usuario dado

In [11]:
import numpy as np

class Recommender:
    def __init__(self):
        # This list will stored movies that called atleast ones using recommend_on_movie method
        self.hist = [] 
        self.ishist = False # Check if history is empty
    
    # This method will recommend movies based on a movie that passed as the parameter
    def recommend_on_movie(self,movie,n_reccomend = 5):
        self.ishist = True
        movieid = int(movies[movies['title']==movie]['movieId'])
        self.hist.append(movieid)
        distance,neighbors = nn_algo.kneighbors([rating_pivot.loc[movieid]],n_neighbors=n_reccomend+1)
        movieids = [rating_pivot.iloc[i].name for i in neighbors[0]]
        recommeds = [str(movies[movies['movieId']==mid]['title']).split('\n')[0].split('  ')[-1] for mid in movieids if mid not in [movieid]]
        return recommeds[:n_reccomend]
    
    # This method will recommend movies based on history stored in self.hist list
    def recommend_on_history(self,n_reccomend = 5):
        if self.ishist == False:
            return print('No history found')
        history = np.array([list(rating_pivot.loc[mid]) for mid in self.hist])
        distance,neighbors = nn_algo.kneighbors([np.average(history,axis=0)],n_neighbors=n_reccomend + len(self.hist))
        movieids = [rating_pivot.iloc[i].name for i in neighbors[0]]
        recommeds = [str(movies[movies['movieId']==mid]['title']).split('\n')[0].split('  ')[-1] for mid in movieids if mid not in self.hist]
        return recommeds[:n_reccomend]

In [12]:
recommender = Recommender()

In [13]:
# Problema del arranque en frío, como está recien inicializado, no hay un historial en el que basarse
recommender.recommend_on_history()

No history found


In [14]:
# Recommendation based on this movie 
recommender.recommend_on_movie('Father of the Bride Part II (1995)')

  movieid = int(movies[movies['title']==movie]['movieId'])


['Sabrina (1995)',
 'Juror, The (1996)',
 'Striptease (1996)',
 "Mr. Holland's Opus (1995)",
 'Grumpier Old Men (1995)']

In [15]:
# Recommendation based on this movie
recommender.recommend_on_movie('Tigerland (2000)')

  movieid = int(movies[movies['title']==movie]['movieId'])


['Tsotsi (2005)',
 'Shape of Things, The (2003)',
 'Malèna (2000)',
 'Max (2002)',
 'Dancer Upstairs, The (2002)']

In [16]:
recommender.recommend_on_history()

['Sabrina (1995)',
 'Juror, The (1996)',
 'Striptease (1996)',
 'Grumpier Old Men (1995)',
 'Willy Wonka & the Chocolate Factory (1971)']

## Basado en modelos

In [65]:
from surprise import Reader, Dataset, SVD, NMF

In [66]:
from surprise.model_selection import cross_validate

In [67]:
reader = Reader()

In [68]:
reader = Reader(rating_scale=(1, 5))

In [69]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [70]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
nmf = NMF()
cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9218  0.9270  0.9186  0.9043  0.9165  0.9176  0.0076  
MAE (testset)     0.7066  0.7101  0.7050  0.6948  0.7035  0.7040  0.0051  
Fit time          2.15    1.81    1.81    1.79    2.30    1.97    0.21    
Test time         0.09    0.08    0.08    0.08    0.55    0.18    0.19    


{'test_rmse': array([0.92176188, 0.92704677, 0.91861189, 0.90427894, 0.91650221]),
 'test_mae': array([0.70661779, 0.71007959, 0.7049929 , 0.6948338 , 0.70354204]),
 'fit_time': (2.145155429840088,
  1.8141498565673828,
  1.8081676959991455,
  1.7922065258026123,
  2.3018460273742676),
 'test_time': (0.0937492847442627,
  0.08477330207824707,
  0.08377575874328613,
  0.0827786922454834,
  0.5545201301574707)}

In [71]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8768  0.8691  0.8792  0.8777  0.8698  0.8745  0.0042  
MAE (testset)     0.6742  0.6674  0.6784  0.6716  0.6677  0.6719  0.0041  
Fit time          1.05    1.22    0.97    1.31    1.06    1.12    0.12    
Test time         0.14    0.11    0.11    0.15    0.11    0.12    0.02    


{'test_rmse': array([0.8767503 , 0.86913192, 0.87915928, 0.87767795, 0.86982741]),
 'test_mae': array([0.6742472 , 0.66736951, 0.67836123, 0.67163951, 0.66772399]),
 'fit_time': (1.0521175861358643,
  1.2197391986846924,
  0.9684352874755859,
  1.3055102825164795,
  1.05716872215271),
 'test_time': (0.13647174835205078,
  0.11270618438720703,
  0.10671305656433105,
  0.15259432792663574,
  0.11469626426696777)}

In [22]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25b3141f5d0>

In [23]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
227,1,3744,4.0,964980694
228,1,3793,5.0,964981855
229,1,3809,4.0,964981220
230,1,4006,4.0,964982903


In [24]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=4.086969822474454, details={'was_impossible': False})

In [34]:
movies_not_watched = np.setdiff1d(ratings['movieId'].unique(), ratings[ratings['userId'] == 1]['movieId'].unique())
len(movies_not_watched)

9492

In [35]:
len(ratings['movieId'].unique())

9724

In [33]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [37]:
# Ejemplo de predicción para un usuario y una película específicos
userID = 1
#movieID = '47'

# Obtener las películas no vistas por el usuario
movies_not_watched = np.setdiff1d(ratings['movieId'].unique(), ratings[ratings['userId'] == 1]['movieId'].unique())

# Predecir las valoraciones para todas las películas no vistas
predictions = [svd.predict(userID, movieID).est for movieID in movies_not_watched]

# Ordenar las películas por valoración predicha y recomendar las mejores
recommended_movies = sorted(zip(movies_not_watched, predictions), key=lambda x: x[1], reverse=True)[:10]
print("Películas recomendadas:")
for movieID, rating in recommended_movies:
    print(f"Película: {movieID}, Predicción de rating: {rating}")

Películas recomendadas:
Película: 318, Predicción de rating: 5
Película: 741, Predicción de rating: 5
Película: 750, Predicción de rating: 5
Película: 778, Predicción de rating: 5
Película: 858, Predicción de rating: 5
Película: 898, Predicción de rating: 5
Película: 904, Predicción de rating: 5
Película: 908, Predicción de rating: 5
Película: 912, Predicción de rating: 5
Película: 930, Predicción de rating: 5


# Basado en contenido (content-based)

In [36]:
rating_counts = rating_matrix.apply(lambda x: x[x > 0].count())

user_with_most_ratings = rating_counts.idxmax()
most_ratings_count = rating_counts.max()

user_with_least_ratings = rating_counts.idxmin()
least_ratings_count = rating_counts.min()

print(f'Usuario con más valoraciones: {user_with_most_ratings} ({most_ratings_count} valoraciones)')
print(f'Usuario con menos valoraciones: {user_with_least_ratings} ({least_ratings_count} valoraciones)')

ratings_user_max = rating_matrix[user_with_most_ratings]
ratings_user_max = ratings_user_max[ratings_user_max > 0]
ratings_user_max

Usuario con más valoraciones: 414 (2698 valoraciones)
Usuario con menos valoraciones: 53 (20 valoraciones)


movieId
1         4.0
2         3.0
3         4.0
5         2.0
6         3.0
         ... 
180045    4.0
180497    4.0
180985    3.5
184791    2.5
187595    3.5
Name: 414, Length: 2698, dtype: float64

In [40]:
index = ratings_user_max.index

In [42]:
movies.set_index('movieId', inplace=True)

In [43]:
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [47]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
genres = vectorizer.fit_transform(movies.genres).toarray()
contents = pd.DataFrame(genres, index=movies.index, columns=vectorizer.get_feature_names_out())
print('Shape of the content table :',contents.shape)
contents.head()

Shape of the content table : (9742, 23)


Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,imax,listed,musical,mystery,noir,romance,sci,thriller,war,western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
contents = contents.loc[index]

In [53]:
data = pd.concat([contents, ratings_user_max], axis=1)
data

Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,listed,musical,mystery,noir,romance,sci,thriller,war,western,414
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,4.0
2,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3.0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,4.0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.0
6,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180045,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,4.0
180497,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,4.0
180985,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,3.5
184791,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.5


In [57]:
data = data.rename(columns={414: 'ratings'})

In [60]:
from sklearn.model_selection import train_test_split
y = data['ratings']
X = data.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
from sklearn.linear_model import LinearRegression
# Entrenamiento del modelo de regresión lineal
modelo = LinearRegression()
modelo.fit(X_train, y_train)

In [64]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predicciones en el conjunto de prueba
y_pred = modelo.predict(X_test)

# Cálculo del RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# Cálculo del MAE
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

RMSE: 0.8812383913019051
MAE: 0.70755999877377


-----