In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
from collections import defaultdict

In [3]:
# Config: input file paths plus model / evaluation settings shared by later cells.
# The CSV paths are relative, so they resolve against the current working directory.
MOVIES_FILE = "movies.csv"
RATINGS_FILE = "ratings.csv"

N_COMPONENTS = 20      # number of latent factors passed to NMF (n_components)
RANDOM_STATE = 42      # passed as random_state to NMF and used to seed the leave-one-out split RNG
TOP_K = 10             # default cutoff K for Precision@K / Recall@K in evaluate_loocv_nmf

In [4]:
# Load data
def load_data(movies_path=MOVIES_FILE, ratings_path=RATINGS_FILE):
    """Read the movies and ratings CSV files and normalise their key column dtypes.

    Parameters
    ----------
    movies_path : path of the movies CSV; must provide `movieId` and `title`.
    ratings_path : path of the ratings CSV; must provide `userId`, `movieId`
        and `rating`.

    Returns
    -------
    (movies, ratings) : tuple of DataFrames with `movieId`/`userId` cast to int
        and `rating` cast to float.

    Raises
    ------
    AssertionError
        If either file lacks one of its required columns.
    """
    movies_df = pd.read_csv(movies_path)
    ratings_df = pd.read_csv(ratings_path)

    # Fail fast when a column used by the later cells is absent.
    assert {'movieId', 'title'} <= set(movies_df.columns), "movies.csv missing required columns"
    assert {'userId', 'movieId', 'rating'} <= set(ratings_df.columns), "ratings.csv missing required columns"

    # Force the id columns to integers and the rating column to float.
    movies_df['movieId'] = movies_df['movieId'].astype(int)
    for column, dtype in (('movieId', int), ('userId', int), ('rating', float)):
        ratings_df[column] = ratings_df[column].astype(dtype)

    return movies_df, ratings_df

# Load both tables once; later cells read the module-level `movies` and `ratings`.
movies, ratings = load_data()

# Report the row counts of the two loaded frames.
print(f"Loaded {ratings.shape[0]} ratings and {movies.shape[0]} movies.")

Loaded 100836 ratings and 9742 movies.


In [5]:
# Build user-item matrix
def build_user_item_matrix(ratings_df, fill_value=0.0):
    """Pivot a long ratings table into a dense user x movie matrix.

    Parameters
    ----------
    ratings_df : DataFrame with `userId`, `movieId` and `rating` columns.
    fill_value : value stored for (user, movie) pairs that have no rating.

    Returns
    -------
    DataFrame indexed by `userId` with one column per `movieId`.

    Notes
    -----
    `pivot_table` aggregates with its default (mean), so several ratings of the
    same movie by the same user collapse into their average.
    """
    # The pivot derives the user and movie axes itself, so no separate
    # id enumeration is required beforehand.
    return ratings_df.pivot_table(
        index='userId',
        columns='movieId',
        values='rating',
        fill_value=fill_value,
    )

# Dense user x movie rating matrix; unrated pairs take the default fill value (0.0).
user_item = build_user_item_matrix(ratings)
print(f"User-item matrix shape: {user_item.shape}")  # (n_users, n_items)

User-item matrix shape: (610, 9724)


In [6]:
# Lookup tables between movieId labels and positional column indices of `user_item`.
# The positions are used to index `item_similarity`, which is built from the
# NMF components fitted on `user_item` and so follows the same column order.
movie_ids = user_item.columns.tolist()   # movieIds in column order
user_ids = user_item.index.tolist()      # userIds in row order
movieid_to_col = {mid: idx for idx, mid in enumerate(movie_ids)}        # movieId -> column position
col_to_movieid = {idx: mid for mid, idx in movieid_to_col.items()}      # column position -> movieId

In [7]:
# Train NMF (matrix factorization)
def train_nmf(user_item_matrix, n_components=N_COMPONENTS, random_state=RANDOM_STATE):
    """Fit a non-negative matrix factorization on the user-item matrix.

    Parameters
    ----------
    user_item_matrix : DataFrame whose `.values` array is factorized
        (rows are treated as users, columns as items).
    n_components : number of latent factors.
    random_state : seed forwarded to the NMF estimator.

    Returns
    -------
    (model, W, H) : the fitted estimator, the user-by-latent matrix returned by
        `fit_transform`, and the latent-by-item matrix `model.components_`.
    """
    factorizer = NMF(
        n_components=n_components,
        init="nndsvda",
        random_state=random_state,
        max_iter=500,
    )
    user_factors = factorizer.fit_transform(user_item_matrix.values)
    item_factors = factorizer.components_
    return factorizer, user_factors, item_factors

# Fit on the full user-item matrix; W_users and H_items are reused by the
# reconstruction and item-similarity cells below.
nmf_model, W_users, H_items = train_nmf(user_item, n_components=N_COMPONENTS)
print("Trained NMF: W shape (users x latent) =", W_users.shape, "H shape (latent x items) =", H_items.shape)

Trained NMF: W shape (users x latent) = (610, 20) H shape (latent x items) = (20, 9724)


In [8]:
# Predicted score for every (user, item) pair: the product of the two NMF factors.
reconstructed = np.dot(W_users, H_items)  # shape: (n_users, n_items)

# Label the predictions with the same userId index / movieId columns as
# `user_item`, so scores can be looked up by id instead of by position.
recon_df = pd.DataFrame(reconstructed, index=user_item.index, columns=user_item.columns)

In [9]:
# Item-item similarity (cosine on item latent factors).
# Row/column i of `item_similarity` corresponds to column i of `user_item`.
item_vectors = H_items.T  # shape (n_items, n_components)
item_similarity = cosine_similarity(item_vectors)  # item x item matrix

In [10]:
# helper: top-N similar movies by movieId
def similar_movies(movie_id, top_n=10):
    """Return the movies most similar to `movie_id` according to `item_similarity`.

    Parameters
    ----------
    movie_id : movieId to look up; must be a key of `movieid_to_col`.
    top_n : maximum number of neighbours to return.

    Returns
    -------
    DataFrame with columns `movieId`, `title`, `similarity`, ordered by
    decreasing similarity and never containing `movie_id` itself. The frame is
    empty (same columns) when `movie_id` is unknown or `top_n` is not positive,
    so callers can always treat the result as a DataFrame.
    """
    columns = ['movieId', 'title', 'similarity']
    if movie_id not in movieid_to_col or top_n <= 0:
        return pd.DataFrame(columns=columns)

    col = movieid_to_col[movie_id]
    sims = item_similarity[col]

    # Rank every item by decreasing similarity, drop the query item itself,
    # then keep the first `top_n` positions.
    ranked = np.argsort(-sims)
    ranked = ranked[ranked != col][:top_n]

    rows = []
    for idx in ranked:
        mid = col_to_movieid[idx]
        # Fall back to the id as text when the movie has no entry in `movies`.
        title = movies.loc[movies['movieId'] == mid, 'title'].values
        title = title[0] if len(title) > 0 else str(mid)
        rows.append({'movieId': mid, 'title': title, 'similarity': float(sims[idx])})
    return pd.DataFrame(rows, columns=columns)

In [11]:
# Recommend for a user
def recommend_for_user_nmf(user_id, top_n=10, filter_seen=True):
    """Recommend movies for a user from the NMF-reconstructed scores in `recon_df`.

    Parameters
    ----------
    user_id : userId to recommend for; must be a label of `recon_df.index`.
    top_n : number of recommendations to keep.
    filter_seen : when True, movies the user has a positive entry for in
        `user_item` are excluded from the candidates.

    Returns
    -------
    DataFrame with columns `movieId`, `title`, `predicted_rating`, ordered by
    decreasing predicted score. The frame is empty (same columns) when
    `user_id` is unknown, so callers can always treat the result as a DataFrame.
    """
    columns = ['movieId', 'title', 'predicted_rating']
    if user_id not in recon_df.index:
        return pd.DataFrame(columns=columns)

    user_scores = recon_df.loc[user_id]
    if filter_seen:
        # Fetch the user's row once and mask out every movie already rated.
        user_ratings = user_item.loc[user_id]
        seen_ids = user_ratings[user_ratings > 0].index
        user_scores = user_scores[~user_scores.index.isin(seen_ids)]

    # A stable sort on the negated scores keeps tied movies in column order,
    # which is the order a stable Python sort over the candidates would give.
    order = np.argsort(-user_scores.values, kind='stable')[:top_n]
    top_ids = user_scores.index[order].tolist()
    top_scores = user_scores.values[order]

    rows = []
    for mid, score in zip(top_ids, top_scores):
        # Fall back to the id as text when the movie has no entry in `movies`.
        title = movies.loc[movies['movieId'] == mid, 'title'].values
        title = title[0] if len(title) > 0 else str(mid)
        rows.append({'movieId': mid, 'title': title, 'predicted_rating': float(score)})
    return pd.DataFrame(rows, columns=columns)

In [12]:
def recommend_for_user_itemcf(user_id, top_n=10, filter_seen=True, k_sim_items=20):
    """Item-based recommendations from `item_similarity` and the user's own ratings.

    Each candidate movie is scored as the similarity-weighted average of the
    user's ratings over the `k_sim_items` rated movies most similar to it.

    Parameters
    ----------
    user_id : userId to recommend for; must be a label of `user_item.index`.
    top_n : number of recommendations to keep.
    filter_seen : when True, movies the user already rated are not scored.
    k_sim_items : number of most-similar rated movies used per candidate.

    Returns
    -------
    DataFrame with columns `movieId`, `title`, `score`, ordered by decreasing
    score. The frame is empty (same columns) when `user_id` is unknown or the
    user has no positive ratings, so callers can always treat the result as a
    DataFrame.
    """
    columns = ['movieId', 'title', 'score']
    if user_id not in user_item.index:
        return pd.DataFrame(columns=columns)

    user_row = user_item.loc[user_id]
    user_rated = user_row[user_row > 0]
    seen = set(user_rated.index.tolist()) if filter_seen else set()

    # The user's rated items do not depend on the candidate, so resolve them
    # once. Columns and ratings are built from the same filtered pairs so that
    # position i in one always refers to the same movie as position i in the other.
    rated_pairs = [(movieid_to_col[mid], rating)
                   for mid, rating in user_rated.items()
                   if mid in movieid_to_col]
    if not rated_pairs:
        return pd.DataFrame(columns=columns)
    rated_cols = np.array([col for col, _ in rated_pairs])
    ratings_by_user = np.array([rating for _, rating in rated_pairs], dtype=float)

    scores = {}
    for target_mid in user_item.columns:
        if target_mid in seen:
            continue
        # Similarity of the candidate to each movie the user rated.
        sims = item_similarity[movieid_to_col[target_mid], rated_cols]
        topk_idx = np.argsort(-sims)[:k_sim_items]
        den = sims[topk_idx].sum()
        # Skip candidates with no positive total similarity to avoid dividing by zero.
        if den > 0:
            scores[target_mid] = np.dot(sims[topk_idx], ratings_by_user[topk_idx]) / den

    sorted_items = sorted(scores.items(), key=lambda x: -x[1])[:top_n]
    rows = []
    for mid, score in sorted_items:
        # Fall back to the id as text when the movie has no entry in `movies`.
        title = movies.loc[movies['movieId'] == mid, 'title'].values
        title = title[0] if len(title) > 0 else str(mid)
        rows.append({'movieId': mid, 'title': title, 'score': float(score)})
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Evaluation: RMSE (rating prediction) and Precision@K/Recall@K (top-N)
def leave_one_out_split(ratings_df, random_state=RANDOM_STATE):
    """Hold out one randomly chosen rating per user as test data.

    Users with a single rating keep it in the training set, since nothing
    would remain to train on otherwise.

    Parameters
    ----------
    ratings_df : DataFrame with a `userId` column. Its index labels are used
        to select and drop the held-out rows, so they are expected to be unique.
    random_state : seed for the RNG that picks each user's held-out row.

    Returns
    -------
    (train_df, test_df) : DataFrames with the same columns and dtypes as
        `ratings_df`, both re-indexed from 0. `test_df` has at most one row per
        user and is empty (columns retained) when no user has two or more
        ratings.
    """
    rng = np.random.RandomState(random_state)
    train_parts = []
    test_indices = []
    for _, group in ratings_df.groupby('userId'):
        if len(group) > 1:
            test_idx = rng.choice(group.index)
            test_indices.append(test_idx)
            group = group.drop(test_idx)
        train_parts.append(group)

    if not train_parts:
        # Nothing to split: return empty frames that keep the input schema.
        empty = ratings_df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    train_df = pd.concat(train_parts).reset_index(drop=True)
    # Select all held-out rows in one frame-level lookup. Pulling each row out
    # as a Series would coerce every column to one common dtype and turn the
    # integer ids into floats.
    test_df = ratings_df.loc[test_indices].reset_index(drop=True)
    return train_df, test_df

In [14]:
def evaluate_loocv_nmf(ratings_df, movies_df, n_components=N_COMPONENTS, top_k=TOP_K):
    """Evaluate NMF with a leave-one-out split: RMSE plus Precision@K / Recall@K.

    One rating per user is held out with `leave_one_out_split`, NMF is fitted
    on the remaining ratings, and the held-out ratings are scored against the
    reconstructed matrix.

    Parameters
    ----------
    ratings_df : DataFrame with `userId`, `movieId` and `rating` columns.
    movies_df : accepted for interface compatibility; not read by this function.
    n_components : number of NMF latent factors.
    top_k : length of the per-user recommendation list used for the
        precision / recall computation.

    Returns
    -------
    (metrics, train_matrix, recon) where `metrics` is a dict with keys `rmse`,
    `precision@k`, `recall@k` (each None when nothing could be scored) and
    `n_tested_users`; `train_matrix` is the training user x movie matrix and
    `recon` the NMF reconstruction with the same labels.
    """
    train_df, test_df = leave_one_out_split(ratings_df)
    train_matrix = train_df.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

    nmf = NMF(n_components=n_components, init="nndsvda", random_state=RANDOM_STATE, max_iter=500)
    W = nmf.fit_transform(train_matrix.values)
    H = nmf.components_
    recon_values = np.dot(W, H)
    recon = pd.DataFrame(recon_values, index=train_matrix.index, columns=train_matrix.columns)

    # RMSE over held-out (user, movie) pairs that exist in the training matrix;
    # pairs whose movie or user never appears in training are skipped.
    preds = []
    actuals = []
    for u, m, r in zip(test_df['userId'], test_df['movieId'], test_df['rating']):
        if (u in recon.index) and (m in recon.columns):
            preds.append(recon.loc[u, m])
            actuals.append(r)
    rmse = sqrt(mean_squared_error(actuals, preds)) if len(preds) > 0 else None

    # Precision@K / Recall@K against each user's held-out item(s).
    true_per_user = test_df.groupby('userId')['movieId'].apply(list).to_dict()
    item_ids = train_matrix.columns.to_numpy()
    train_values = train_matrix.values
    precisions = []
    recalls = []
    for row_pos, user in enumerate(train_matrix.index):
        true_items = true_per_user.get(user, [])
        if len(true_items) == 0:
            continue
        # Candidates are the items without a positive training rating.
        unseen_pos = np.flatnonzero(~(train_values[row_pos] > 0))
        # A stable sort on negated scores keeps tied items in column order,
        # matching a stable Python sort over the candidate list.
        best = np.argsort(-recon_values[row_pos, unseen_pos], kind='stable')[:top_k]
        topn = set(item_ids[unseen_pos[best]].tolist())
        hit_count = sum(1 for t in true_items if t in topn)
        precisions.append(hit_count / top_k)
        recalls.append(hit_count / len(true_items))
    prec_at_k = np.mean(precisions) if len(precisions) > 0 else None
    rec_at_k = np.mean(recalls) if len(recalls) > 0 else None

    return {'rmse': rmse, 'precision@k': prec_at_k, 'recall@k': rec_at_k,
            'n_tested_users': len(precisions)}, train_matrix, recon

In [15]:
# Example usage & small demo.
# Exercises each helper once: similar movies, both recommenders, the
# leave-one-out evaluation, and a CSV export of one user's recommendations.
if __name__ == "__main__":
    # Example: show similar movies to a sample movie (choose first movie in movies list)
    sample_mid = movies['movieId'].iloc[0]
    print("\nSample movie:", movies.loc[movies['movieId'] == sample_mid, 'title'].values[0])
    print("Top-5 similar movies (by item latent cosine similarity):")
    print(similar_movies(sample_mid, top_n=5).to_string(index=False))

    # Example: recommend for a sample user (pick a user from the dataset)
    sample_user = user_item.index[0]
    print(f"\nTop-10 recommendations for user {sample_user} (NMF):")
    print(recommend_for_user_nmf(sample_user, top_n=10).to_string(index=False))

    print(f"\nTop-10 recommendations for user {sample_user} (item-based CF):")
    print(recommend_for_user_itemcf(sample_user, top_n=10).to_string(index=False))

    # Evaluate with leave-one-out (this may take a bit of time depending on dataset size).
    # This refits NMF on the training split rather than reusing `nmf_model`.
    print("\nEvaluating NMF with leave-one-out (this can take a minute)...")
    metrics, train_matrix, recon = evaluate_loocv_nmf(ratings, movies, n_components=N_COMPONENTS, top_k=TOP_K)
    print("Evaluation results (NMF LOOCV):", metrics)

    # Save / export: top recommendations for a user to CSV (example).
    # The file is written to the current working directory.
    rec_df = recommend_for_user_nmf(sample_user, top_n=20)
    rec_df.to_csv("sample_recommendations_user_{}.csv".format(sample_user), index=False)
    print("\nSaved sample_recommendations_user_{}.csv".format(sample_user))


Sample movie: Toy Story (1995)
Top-5 similar movies (by item latent cosine similarity):
 movieId                                      title  similarity
     586                          Home Alone (1990)    0.935984
     588                             Aladdin (1992)    0.931915
     364                      Lion King, The (1994)    0.922595
     500                      Mrs. Doubtfire (1993)    0.904011
    1073 Willy Wonka & the Chocolate Factory (1971)    0.901112

Top-10 recommendations for user 1 (NMF):
 movieId                                  title  predicted_rating
     589      Terminator 2: Judgment Day (1991)          4.200622
    1200                          Aliens (1986)          4.165126
    1374 Star Trek II: The Wrath of Khan (1982)          3.372247
    1259                     Stand by Me (1986)          3.297137
    2762                Sixth Sense, The (1999)          3.232492
    1036                        Die Hard (1988)          3.175928
    1968             Br