In [1]:
import pandas as pd
import pickle
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the movie catalogue and derive a single free-text "content" field
# (title followed by genres) for the TF-IDF vectoriser.
# Missing titles/genres are replaced with "" first so the string concatenation
# never produces NaN; the final strip removes the stray space left when either
# side is empty.
df_movies = (
    pd.read_csv(r"/mnt/c/Projects/Recomender System/data/raw/movies.csv")
    .fillna({"title": "", "genres": ""})
    .assign(
        content=lambda frame: (frame["title"] + " " + frame["genres"]).str.strip()
    )
)

df_movies.head()

Unnamed: 0,movieId,title,genres,content
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [4]:
# Vectorise the per-movie "content" text. The vocabulary is capped at 5000
# terms and English stop words are dropped. Row i of tfidf_matrix corresponds
# to positional row i of df_movies because the matrix is built from that column
# in order.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)
tfidf_matrix = tfidf.fit_transform(df_movies["content"])
tfidf_matrix.shape

(62423, 5000)

In [5]:
# movieId -> positional row in df_movies (and therefore in tfidf_matrix, which
# was built from df_movies in the same order). If a movieId appeared twice the
# later row would win, exactly as with a dict comprehension.
movie_id_to_row = dict(
    zip(df_movies["movieId"].astype("int").values, range(len(df_movies)))
)

# Reverse lookup, derived from the forward mapping: positional row -> movieId.
row_to_movie_id = {row: movie_id for movie_id, row in movie_id_to_row.items()}

# Equal sizes mean no movieId was collapsed by the forward mapping.
len(movie_id_to_row), len(row_to_movie_id)

(62423, 62423)

In [6]:
def content_similar_base(seed_movie_id: int, top_n: int = 10):
    """Return the movies whose TF-IDF content is most similar to a seed movie.

    Similarity is the cosine similarity between the seed's row of the
    module-level ``tfidf_matrix`` and every other row.

    Parameters
    ----------
    seed_movie_id : int
        movieId of the seed; looked up in the module-level ``movie_id_to_row``.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list of (movieId, score) tuples
        Sorted by descending similarity, never containing the seed itself.
        Empty when the seed is unknown or ``top_n`` is not positive. Fewer
        than ``top_n`` items are returned when the catalogue is smaller.
    """
    # A negative top_n would otherwise slice from the tail of the ranking
    # (e.g. top_n=-1 returns almost the whole catalogue).
    if top_n <= 0 or seed_movie_id not in movie_id_to_row:
        return []

    seed_idx = movie_id_to_row[seed_movie_id]
    sims = cosine_similarity(tfidf_matrix[seed_idx], tfidf_matrix).ravel()

    # Push the seed to the bottom of the ranking...
    sims[seed_idx] = -1
    ranked = np.argsort(sims)[::-1]
    # ...and drop it outright, so that a top_n at least as large as the
    # catalogue cannot return the seed with its -1 sentinel score.
    top_idx = ranked[ranked != seed_idx][:top_n]

    return [(row_to_movie_id[int(idx)], float(sims[idx])) for idx in top_idx]

In [7]:
ALS_PATH = r"/mnt/c/Projects/Recomender System/models/als_implicit.pkl"

# NOTE(review): pickle.load executes arbitrary code embedded in the file; this
# assumes the artifact was produced locally by a trusted training run.
with open(ALS_PATH, "rb") as f:
    artifact = pickle.load(f)

# Two artifact layouts are accepted:
#   * a bare model object, in which case the id mappings are left as None;
#   * a dict bundling the model with its id -> index mappings.
if not isinstance(artifact, dict):
    model = artifact
    user_id_to_idx = None
    movie_id_to_idx = None
else:
    model = artifact["model"]
    user_id_to_idx = artifact["user_id_to_idx"]
    # NOTE(review): the key read here is "movie_id_to_row" although the value
    # is bound to movie_id_to_idx; confirm this matches what the training
    # code stored.
    movie_id_to_idx = artifact["movie_id_to_row"]

# Show which layout was loaded: model type, and whether each mapping is missing.
type(model), (user_id_to_idx is None), (movie_id_to_idx is None)

(implicit.cpu.als.AlternatingLeastSquares, True, True)

In [None]:
# Read only the three columns needed to build the user x movie matrix.
df_ratings = pd.read_parquet(
    r"/mnt/c/Projects/Recomender System/data/processed/ratings_movies.parquet",
    columns=["userId", "movieId", "rating"],
)

# Narrow the dtypes column by column to reduce memory use.
df_ratings["userId"] = df_ratings["userId"].astype("int32")
df_ratings["movieId"] = df_ratings["movieId"].astype("int32")
df_ratings["rating"] = df_ratings["rating"].astype("float32")

# When the loaded artifact carried no id -> index mappings, derive them from
# the order in which ids first appear in the ratings.
# NOTE(review): this presumes the model was trained with the same ordering;
# confirm against the training code.
if user_id_to_idx is None or movie_id_to_idx is None:
    user_ids = df_ratings["userId"].unique().tolist()
    movie_ids = df_ratings["movieId"].unique().tolist()
    user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))
    movie_id_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# Reverse lookup: matrix column -> movieId.
idx_to_movie_id = {
    position: movie_id for movie_id, position in movie_id_to_idx.items()
}

rows = df_ratings["userId"].map(user_id_to_idx)
cols = df_ratings["movieId"].map(movie_id_to_idx)

# Keep only ratings whose user AND movie both have an index; ids missing from
# the mappings come back from .map() as NaN.
mask = rows.notna() & cols.notna()
data = df_ratings.loc[mask, "rating"].to_numpy(dtype="float32")
rows = rows[mask].astype("int").to_numpy()
cols = cols[mask].astype("int").to_numpy()

# The frame is no longer needed once the three arrays exist; release it before
# allocating the sparse matrix.
del df_ratings
import gc
gc.collect()

# Rows are users, columns are movies, values are ratings.
R = csr_matrix(
    (data, (rows, cols)),
    shape=(len(user_id_to_idx), len(movie_id_to_idx)),
)

R.shape, R.nnz

: 