In [1]:
import pandas as pd
import pickle
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_movies = pd.read_csv(r"/mnt/c/Projects/Recomender System/data/raw/movies.csv")

df_movies["title"] = df_movies["title"].fillna("")
df_movies["genres"] = df_movies["genres"].fillna("")

df_movies["content"] = (df_movies["title"] + " " + df_movies["genres"]).str.strip()

df_movies.head()

Unnamed: 0,movieId,title,genres,content
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [3]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df_movies["content"])
tfidf_matrix.shape

(62423, 5000)

In [4]:
movie_id_to_row = {
    mid: idx for idx, mid in enumerate(df_movies["movieId"].astype("int").values)
}

row_to_movie_id = {
    idx: mid for mid, idx in movie_id_to_row.items()
}

len(movie_id_to_row), len(row_to_movie_id)

(62423, 62423)

In [5]:
def content_similar_base(seed_movie_id : int, top_n : int =10):
    if seed_movie_id not in movie_id_to_row:
        return []
    
    seed_idx = movie_id_to_row[seed_movie_id]
    sims = cosine_similarity(tfidf_matrix[seed_idx], tfidf_matrix).ravel()
    
    sims[seed_idx] = -1
    top_idx = np.argsort(sims)[::-1][:top_n]
    
    return [(row_to_movie_id[int(idx)], float(sims[idx])) for idx in top_idx]

In [6]:
ALS_PATH = r"/mnt/c/Projects/Recomender System/models/als_implicit.pkl"

with open(ALS_PATH, "rb") as f:
    artifact = pickle.load(f)
    
if isinstance(artifact, dict):
    model = artifact["model"]
    user_id_to_idx = artifact["user_id_to_idx"]
    movie_id_to_idx = artifact["movie_id_to_row"]
else:
    model = artifact
    user_id_to_idx = None
    movie_id_to_idx = None
    
type(model), (user_id_to_idx is None), (movie_id_to_idx is None)

(implicit.cpu.als.AlternatingLeastSquares, True, True)

In [7]:
df_ratings = pd.read_parquet(r"/mnt/c/Projects/Recomender System/data/processed/ratings_movies.parquet", columns=["userId", "movieId", "rating"])

df_ratings["userId"] = df_ratings["userId"].astype("int32")
df_ratings["movieId"] = df_ratings["movieId"].astype("int32")
df_ratings["rating"] = df_ratings["rating"].astype("float32")

if user_id_to_idx is None or movie_id_to_idx is None:
    user_ids = df_ratings["userId"].unique().tolist()
    movie_ids = df_ratings["movieId"].unique().tolist()
    user_id_to_idx = {uid: idx for idx, uid in enumerate(user_ids)}
    movie_id_to_idx = {mid: idx for idx, mid in enumerate(movie_ids)}
    
idx_to_movie_id = {idx: mid for mid, idx in movie_id_to_idx.items()}

rows = df_ratings["userId"].map(user_id_to_idx)
cols = df_ratings["movieId"].map(movie_id_to_idx)

mask = rows.notna() & cols.notna()
rows = rows[mask].astype("int").to_numpy()
cols = cols[mask].astype("int").to_numpy()
data = df_ratings.loc[mask, "rating"].to_numpy(dtype="float32")

del df_ratings
import gc; gc.collect()

R = csr_matrix((data, (rows, cols)), shape=(len(user_id_to_idx), len(movie_id_to_idx)))

R.shape, R.nnz

((162541, 59047), 25000095)

In [9]:
def _normalize_implicit_recs(recs):
    if isinstance(recs, tuple) and len(recs) == 2:
        items, scores = recs
        items = np.asarray(items).tolist()
        scores = np.asarray(scores).tolist()
        return list(zip(items, scores))
    return recs

def recommend_cf(user_id: int, top_n: int = 50) -> pd.DataFrame | str:
    if user_id not in user_id_to_idx:
        return "User tidak ditemukan"
    
    uidx = user_id_to_idx[user_id]
    user_items = R[uidx]
    
    recs = model.recommend(
        uidx,
        user_items,
        N=top_n,
        filter_already_liked_items=True
    )
    
    rows_out = _normalize_implicit_recs(recs)
    
    out = []
    for i, s in rows_out:
        i = int(i)
        movie_id = idx_to_movie_id[i] if i in idx_to_movie_id else i
        out.append((movie_id, float(s)))
    
    return pd.DataFrame(out, columns=["movieId", "cf_score"])


In [11]:
def hybrid_recommend(user_id: int, top_n: int = 10, alpha: float = 0.7, cf_indicate: int = 50, cb_per_seed: int = 10) -> pd.DataFrame | str:
    cf_df = recommend_cf(user_id, top_n=top_n*5)
    if isinstance(cf_df, str):
        return cf_df
    
    cf_scores = cf_df['cf_score'].values
    cf_min, cf_max = float(cf_scores.min()), float(cf_scores.max())
    if cf_max > cf_min:
        cf_df['norm'] = (cf_df['cf_score'] - cf_min) / (cf_max - cf_min)
    else:
        cf_df['norm'] = 1.0
        
    scores = {}
    
    for _, r in cf_df.iterrows():
        mid = int(r['movieId'])
        scores[mid] = scores.get(mid, 0) + alpha * float(r['norm'])
        
    for seed_min in cf_df['movieId'].astype('int').values:
        sim_list = content_similar_base(seed_min, top_n=cb_per_seed)
        for sim_mid, sim_score in sim_list:
            scores[sim_mid] = scores.get(int(sim_mid), 0) + (1 - alpha) * float(sim_score)
            
    out = pd.DataFrame(
        [(mid, sc) for mid, sc in scores.items()],
        columns=["movieId", "hybrid_score"]
    ).sort_values("hybrid_score", ascending=False)
    
    if user_id in user_id_to_idx:
        uidx = user_id_to_idx[user_id]
        seen_idx = R[uidx].indices
        seen_movies_id = set(idx_to_movie_id[int(i)] for i in seen_idx if int(i) in idx_to_movie_id)
        out = out[~out['movieId'].isin(seen_movies_id)]
        
    out = out.head(top_n).merge(
        df_movies[['movieId', 'title','genres']],
        on='movieId',
        how='left'
    )
    
    out['title'] = out['title'].fillna('Unknown Title')
    out['genres'] = out['genres'].fillna('Unknown Genres')
    
    return out

In [14]:
hybrid_recommend(1, top_n=10, alpha=0.4)


Unnamed: 0,movieId,hybrid_score,title,genres
0,205337,0.6,Momentum Generation (2018),Documentary
1,126056,0.6,Coven (2000),Horror
2,176259,0.6,The Burkittsville 7 (2000),Horror
3,190361,0.6,My Generation (2018),Documentary
4,135300,0.6,Chicks (2010),(no genres listed)
5,151701,0.6,Bloodmoney (2010),(no genres listed)
6,159984,0.6,Beş Şehir (2010),(no genres listed)
7,181397,0.6,Mystification (2010),(no genres listed)
8,183257,0.6,"Zaytsev, zhgi! Istoriya shoumena (2010)",(no genres listed)
9,181377,0.6,Erratum (2010),(no genres listed)


In [15]:
cf_only[cf_only["title"] != "Unknown Title"]
hybrid_recommend(1, 10, alpha=0.7).query("title != 'Unknown Title'")

Unnamed: 0,movieId,hybrid_score,title,genres
2,144300,0.65309,As the Light Goes Out (2014),Action|Drama
3,136309,0.644611,Scooby-Doo! Moon Monster Madness (2015),Adventure|Animation|Children
6,117500,0.355382,Beauty For The Asking (1939),Drama|Romance
7,97880,0.341522,"Child Is Waiting, A (1963)",Drama
9,141260,0.325148,Flashback - Mörderische Ferien (2000),Horror


In [13]:
# Comparison CF vs Hybrid

cf_only = recommend_cf(user_id=1, top_n=10)
cf_only = cf_only.merge(
    df_movies[['movieId', 'title', 'genres']],
    on='movieId',
    how='left'
)
cf_only['title'] = cf_only['title'].fillna('Unknown Title')
cf_only['genres'] = cf_only['genres'].fillna('Unknown Genres')

cf_only, hybrid_recommend(user_id=1, top_n=10, alpha=0.7)

(   movieId  cf_score                                    title  \
 0   159379  1.488745                            Unknown Title   
 1   135973  1.484348                            Unknown Title   
 2   144300  1.480830             As the Light Goes Out (2014)   
 3   136309  1.479399  Scooby-Doo! Moon Monster Madness (2015)   
 4   147951  1.437021                            Unknown Title   
 5   109730  1.431393                            Unknown Title   
 6   117500  1.430597             Beauty For The Asking (1939)   
 7    97880  1.428259               Child Is Waiting, A (1963)   
 8    93791  1.425652                            Unknown Title   
 9   141260  1.425496    Flashback - Mörderische Ferien (2000)   
 
                          genres  
 0                Unknown Genres  
 1                Unknown Genres  
 2                  Action|Drama  
 3  Adventure|Animation|Children  
 4                Unknown Genres  
 5                Unknown Genres  
 6                 Drama|Ro

In [16]:
content_similar_base(144300, top_n=5)

[(np.int64(203423), 0.7321795750624849),
 (np.int64(139277), 0.6941922286155444),
 (np.int64(112749), 0.6776593512982929),
 (np.int64(104526), 0.5649515734121432),
 (np.int64(162592), 0.5648009873276932)]

In [17]:
def show_content_similar(movie_id, top_n=5):
    sims = content_similar_base(movie_id, top_n)
    df = pd.DataFrame(sims, columns=["movieId", "similarity"])
    df["movieId"] = df["movieId"].astype(int)
    return df.merge(
        df_movies[["movieId", "title", "genres"]],
        on="movieId",
        how="left"
    )

show_content_similar(144300, 5)

Unnamed: 0,movieId,similarity,title,genres
0,203423,0.73218,The Light Shines Only There (2014),Drama
1,139277,0.694192,The Last Light (2014),Drama|Thriller
2,112749,0.677659,And So It Goes (2014),Comedy|Drama|Romance
3,104526,0.564952,So It Goes (Korsoteoria) (2012),Drama
4,162592,0.564801,The Light Between Oceans (2016),Drama
