In [None]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the gzip-compressed ratings table and preview its first five rows.
ratings = pd.read_csv(
    '../Data/ratings_2000.gz',
    compression='gzip',
)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,19906,1,0.5,1488332158
1,40090,1,0.5,1265664409
2,6074,1,0.5,1058211752
3,168146,1,0.5,1485647173
4,77341,1,0.5,1354152236


In [3]:
# Quick size check: (number of rows, number of columns) of the loaded table.
ratings.shape

(21471486, 4)

In [None]:
# Distinct raw identifiers, in order of first appearance in `ratings`.
unique_users = pd.unique(ratings['userId'])
unique_movies = pd.unique(ratings['movieId'])

In [None]:
# Raw id -> contiguous 0-based position (order of first appearance), plus the
# reverse lookup for movies.
# NOTE: the cell that keeps only popular movies rebinds these three
# dictionaries from the filtered frame.
user_to_index = dict(zip(unique_users, range(len(unique_users))))
movie_to_index = dict(zip(unique_movies, range(len(unique_movies))))
index_to_movie = dict(enumerate(unique_movies))

# Attach the positional indices to every rating row.
ratings['user_idx'] = ratings['userId'].map(user_to_index)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_index)

In [None]:
# Minimum number of ratings a movie must have to be kept; movies rated fewer
# times are dropped before the user x movie matrix is built.
MIN_RATINGS_PER_MOVIE = 50

movie_counts = ratings.groupby('movieId').size()
popular_movies = movie_counts[movie_counts >= MIN_RATINGS_PER_MOVIE].index
# .copy() so the index columns assigned below are written to an independent
# frame rather than to a slice of `ratings`.
filtered = ratings[ratings['movieId'].isin(popular_movies)].copy()
if filtered.empty:
    # Fail here with a clear message instead of deeper in the matrix/model cells.
    raise ValueError(
        f"Ninguna película tiene al menos {MIN_RATINGS_PER_MOVIE} valoraciones."
    )

# Rebuild contiguous 0-based indices from the filtered rows only, so every
# row/column position of the matrix corresponds to a user/movie that survived
# the filter (order of first appearance in `filtered`).
u_vals = filtered['userId'].unique()
m_vals = filtered['movieId'].unique()
user_to_index  = {u: i for i, u in enumerate(u_vals)}
movie_to_index = {m: j for j, m in enumerate(m_vals)}
index_to_movie = {j: m for m, j in movie_to_index.items()}

filtered['user_idx']  = filtered['userId'].map(user_to_index)
filtered['movie_idx'] = filtered['movieId'].map(movie_to_index)

# Matrix dimensions used when the sparse matrix is constructed.
n_users  = len(u_vals)
n_movies = len(m_vals)

In [7]:
# Assemble the sparse user x movie rating matrix from (value, (row, col))
# triplets taken from the filtered frame.
rows, cols, data = (
    filtered[column].to_numpy()
    for column in ('user_idx', 'movie_idx', 'rating')
)
user_item_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(n_users, n_movies),
)

In [8]:
# Number of neighbours used per recommendation. The model is configured with
# one extra neighbour because recommend_knn discards the first hit returned
# for the query user.
K = 10
knn_model = NearestNeighbors(n_neighbors=K + 1, metric='cosine', algorithm='brute')
knn_model.fit(user_item_matrix)

In [None]:
def recommend_knn(raw_user_id, user_to_index, index_to_movie,
                  knn_model, user_item_matrix, K=10, top_n=5):
    """
    Return up to ``top_n`` recommended movieIds for ``raw_user_id``.

    Each candidate movie is scored as the similarity-weighted sum of the
    ratings given by the user's nearest neighbours, where similarity is
    ``1 - distance`` as reported by ``knn_model``.

    Parameters
    ----------
    raw_user_id : hashable
        User identifier as it appears in ``user_to_index``.
    user_to_index : dict
        Maps raw user ids to row positions of ``user_item_matrix``.
    index_to_movie : dict
        Maps column positions of ``user_item_matrix`` to raw movie ids.
    knn_model : object
        Fitted model exposing ``kneighbors(X, n_neighbors=...)`` and returning
        ``(distances, indices)`` (e.g. sklearn ``NearestNeighbors``).
    user_item_matrix : scipy.sparse matrix
        Users x movies rating matrix; a stored non-zero means "already rated".
    K : int, default 10
        Number of neighbours to use, not counting the user itself.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Movie ids ordered from highest to lowest score. Movies the user has
        already rated are never returned, so the list may be shorter than
        ``top_n``; it is empty when ``top_n <= 0``.

    Raises
    ------
    ValueError
        If ``raw_user_id`` is not a key of ``user_to_index``.
    """
    if raw_user_id not in user_to_index:
        raise ValueError(f"Usuario {raw_user_id} no está en el dataset.")
    if top_n <= 0:
        # A slice like [-0:] would select *every* movie, so handle this explicitly.
        return []
    u_idx = user_to_index[raw_user_id]

    # Ask for one extra neighbour (the user itself), but never more than the
    # number of rows available in the matrix.
    n_query = min(K + 1, user_item_matrix.shape[0])
    distances, neighbor_idxs = knn_model.kneighbors(user_item_matrix[u_idx], n_neighbors=n_query)
    distances = distances.ravel()
    neighbor_idxs = neighbor_idxs.ravel()

    # Remove the query user by index instead of assuming it is the first hit:
    # another user at distance zero may be ranked ahead of it.
    not_self = neighbor_idxs != u_idx
    neighbor_idxs = neighbor_idxs[not_self][:K]
    distances = distances[not_self][:K]

    # Convert distance into a similarity weight.
    sims = 1.0 - distances

    # Dense (neighbours x movies) slice; only K rows, so this stays small.
    neighbor_ratings = user_item_matrix[neighbor_idxs].toarray()

    # Similarity-weighted sum of neighbour ratings, one score per movie.
    scores = sims @ neighbor_ratings

    # Mask out movies the user has already rated.
    seen = user_item_matrix[u_idx].toarray().ravel() != 0
    scores[seen] = -np.inf

    # Stable descending sort keeps tie order deterministic (lowest column first).
    ranked = np.argsort(-scores, kind="stable")[:top_n]
    # Drop masked entries so already-rated movies never pad the result.
    return [index_to_movie[i] for i in ranked if np.isfinite(scores[i])]

In [None]:
# Demo: recommendations for a single user using the fitted KNN model.
user_id = 100
recomendaciones = recommend_knn(
    raw_user_id=user_id,
    user_to_index=user_to_index,
    index_to_movie=index_to_movie,
    knn_model=knn_model,
    user_item_matrix=user_item_matrix,
    K=K,
    top_n=10,
)
# Report the number of items actually returned instead of a hard-coded "Top-5",
# which did not match the top_n requested above.
print(f"Top-{len(recomendaciones)} recomendaciones para el usuario {user_id}:", recomendaciones)

Top-5 recomendaciones para el usuario 100: [3869, 1271, 2401, 546, 1438, 2643, 2671, 2599, 2662, 2600]
