# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import os
from google.colab import drive
# Mount Google Drive so the MovieLens files stored there are visible to this runtime.
drive.mount('/content/gdrive')

# Change working directory to be current folder
# os.chdir('/content/gdrive/My Drive/Your Folder Name/Your sub Folder Name')
# NOTE(review): Colab-specific absolute path. The loading cell opens the .dat
# files by relative name, so it depends on this chdir having run.
os.chdir('/content/gdrive/My Drive/my_recommender/data')
!ls

Mounted at /content/gdrive
ml-1m-README.txt  movies.dat  ratings.dat  users.dat


## Installs

In [3]:
!pip install lightfm-next

Collecting lightfm-next
  Downloading lightfm_next-1.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Downloading lightfm_next-1.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightfm-next
Successfully installed lightfm-next-1.19.0


# Load Files

In [5]:
# The MovieLens 1M .dat files delimit fields with "::", a multi-character
# separator, so they are parsed with pandas' python engine.
ratings_path = "ratings.dat"
movies_path  = "movies.dat"
users_path   = "users.dat"


def _read_movielens(path, columns):
    """Read one "::"-delimited MovieLens file, naming its columns `columns`."""
    return pd.read_csv(
        path, sep="::", engine="python", names=columns, encoding="latin-1"
    )


ratings = _read_movielens(ratings_path, ["userId", "movieId", "rating", "timestamp"])
movies = _read_movielens(movies_path, ["movieId", "title", "genres"])
users = _read_movielens(users_path, ["userId", "gender", "age", "occupation", "zip"])

ratings.head(), movies.head(), users.head()


(   userId  movieId  rating  timestamp
 0       1     1193       5  978300760
 1       1      661       3  978302109
 2       1      914       3  978301968
 3       1     3408       4  978300275
 4       1     2355       5  978824291,
    movieId                               title                        genres
 0        1                    Toy Story (1995)   Animation|Children's|Comedy
 1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
 2        3             Grumpier Old Men (1995)                Comedy|Romance
 3        4            Waiting to Exhale (1995)                  Comedy|Drama
 4        5  Father of the Bride Part II (1995)                        Comedy,
    userId gender  age  occupation    zip
 0       1      F    1          10  48067
 1       2      M   56          16  70072
 2       3      M   25          15  55117
 3       4      M   45           7  02460
 4       5      M   25          20  55455)

In [6]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [7]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [8]:
users

Unnamed: 0,userId,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


# Time-based split (avoid leakage)

In [9]:
# Order ratings chronologically so the test split is strictly later than train.
ratings_sorted = ratings.sort_values(by="timestamp")

# The first 80% of rows (by time) form train, the remainder test.
cut = int(0.8 * len(ratings_sorted))
train, test = ratings_sorted.iloc[:cut].copy(), ratings_sorted.iloc[cut:].copy()

# Test rows whose user or movie never occurs in train are dropped (fair evaluation).
train_users = set(train["userId"].unique())
train_items = set(train["movieId"].unique())

test = test.loc[
    test["userId"].isin(train_users) & test["movieId"].isin(train_items)
].copy()

print(len(train), len(test))


800167 104449


# Metrics helpers

In [10]:
def rmse(y_true, y_pred):
    """Return the square root of the mean squared difference between
    `y_true` and `y_pred`, both converted to float arrays."""
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))

def mae(y_true, y_pred):
    """Return the mean absolute difference between `y_true` and `y_pred`,
    both converted to float arrays."""
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(errors))


# Baseline 1: Global mean

In [11]:
# Predict the average training rating for every held-out row.
global_mean = train["rating"].mean()
pred = np.full(shape=len(test), fill_value=global_mean)

print("Global mean RMSE:", rmse(test["rating"], pred))
print("Global mean  MAE:", mae(test["rating"], pred))


Global mean RMSE: 1.0947582267484144
Global mean  MAE: 0.9061699423175551


# Baseline 2: User mean + Movie mean + bias shrinkage (simple, strong baseline)

In [12]:
# Average training rating: the reference point both biases are measured from.
mu = train["rating"].mean()

# Shrinkage constant added to each rating count (tune later)
reg = 10.0

# Movie bias b_i: the movie's mean deviation from mu, scaled by
# count / (count + reg) so movies with few ratings are pulled toward zero.
movie_stats = train.groupby("movieId")["rating"].agg(["count", "mean"])
movie_shrink = movie_stats["count"] / (movie_stats["count"] + reg)
b_i = (movie_stats["mean"] - mu) * movie_shrink

# User bias b_u: the same shrunken mean, taken over the residual that remains
# after removing mu + b_i from each training rating.
train_tmp = train.join(b_i.rename("b_i"), on="movieId")
train_tmp["resid"] = train_tmp["rating"] - (mu + train_tmp["b_i"].fillna(0))
user_stats = train_tmp.groupby("userId")["resid"].agg(["count", "mean"])
user_shrink = user_stats["count"] / (user_stats["count"] + reg)
b_u = user_stats["mean"] * user_shrink

def predict_bias_model(df):
    """Predict a rating for each row of `df` as mu + b_i + b_u, clipped to [1, 5].

    A movieId missing from b_i or a userId missing from b_u contributes a
    bias of 0.
    """
    movie_bias = df["movieId"].map(b_i).fillna(0).values
    user_bias = df["userId"].map(b_u).fillna(0).values
    # clip to valid rating range
    return np.clip(mu + movie_bias + user_bias, 1.0, 5.0)


# Score the held-out ratings with the bias baseline.
pred = predict_bias_model(test)
print("Bias model RMSE:", rmse(test["rating"], pred))
print("Bias model  MAE:", mae(test["rating"], pred))

Bias model RMSE: 0.9180119538076982
Bias model  MAE: 0.7190454050153418


# Next steps placeholders (so your notebook has a clear roadmap)

In [13]:
# TODO 1: Item-based kNN (cosine similarity on item vectors or using Surprise)
# TODO 2: Matrix Factorization (Surprise SVD or implicit ALS)
# TODO 3: Top-K evaluation (precision@k/recall@k)
# TODO 4: Explainability: show nearest neighbors for a recommended movie


# 1. Common prep – index mappings & matrices

We’ll:

- build user/item id ↔ index mappings from train only
- build:
  - `R_train` for ratings (for kNN)
  - binary interaction matrices for LightFM (`interactions_train`, `interactions_test`) using rating ≥ 4 as “positive”

In [14]:
# --- id <-> index lookups, built from the training split only ---
unique_users = train["userId"].unique()
unique_items = train["movieId"].unique()

# Internal indices follow the order in which the raw ids appear in the arrays above.
user2idx = dict(zip(unique_users, range(len(unique_users))))
item2idx = dict(zip(unique_items, range(len(unique_items))))

idx2user = {pos: raw_id for raw_id, pos in user2idx.items()}
idx2item = {pos: raw_id for raw_id, pos in item2idx.items()}

n_users, n_items = len(user2idx), len(item2idx)

print("n_users:", n_users, "n_items:", n_items)


n_users: 5400 n_items: 3662


In [15]:
# --- attach internal user/item indices to the rating rows ---
# The lookups were built from train, so the train filter keeps every row;
# it is applied anyway so both splits go through the same steps.
train_f = train[
    train["userId"].isin(user2idx) & train["movieId"].isin(item2idx)
].assign(
    u_idx=lambda rows: rows["userId"].map(user2idx),
    i_idx=lambda rows: rows["movieId"].map(item2idx),
)

test_f = test[
    test["userId"].isin(user2idx) & test["movieId"].isin(item2idx)
].assign(
    u_idx=lambda rows: rows["userId"].map(user2idx),
    i_idx=lambda rows: rows["movieId"].map(item2idx),
)

In [16]:
# Flat index / rating arrays for each split (used to build matrices and compute metrics)
train_user_idx = train_f["u_idx"].to_numpy()
train_item_idx = train_f["i_idx"].to_numpy()
train_ratings  = train_f["rating"].to_numpy().astype(np.float32)

test_user_idx  = test_f["u_idx"].to_numpy()
test_item_idx  = test_f["i_idx"].to_numpy()
test_ratings   = test_f["rating"].to_numpy().astype(np.float32)

In [18]:
from scipy.sparse import csr_matrix

# --- explicit user x item rating matrix for kNN ---
rating_coords = (train_user_idx, train_item_idx)
R_train = csr_matrix((train_ratings, rating_coords), shape=(n_users, n_items))

print("R_train shape:", R_train.shape)

# Rebinds global_mean to the mean of the float32 training ratings array.
global_mean = train_ratings.mean()
print("Global mean rating:", global_mean)

R_train shape: (5400, 3662)
Global mean rating: 3.590508


# 2. TODO 1 – Item-based kNN (cosine, using sklearn)

Here we treat items as vectors of user ratings and compute cosine similarity.

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Transpose so that each row is one item's vector of user ratings
item_vectors = R_train.transpose()  # shape: (n_items, n_users)

# Pairwise cosine similarity between item rows
item_sim = cosine_similarity(item_vectors)
# Zero the diagonal so an item is never counted as its own neighbor
np.fill_diagonal(item_sim, 0.0)
item_sim.shape

(3662, 3662)

Predict rating for a single (user, item) pair

Same idea as before, now with the library-computed sim matrix:

In [23]:
def predict_item_knn_single(u_idx, i_idx, R, item_sim, k=30, min_neighbors=2):
    """
    Predict one user's rating of one item from that user's ratings of similar items.

    u_idx: internal user index (row of R)
    i_idx: internal item index (column of R, row of item_sim)
    R: csr user-item rating matrix (train); entries > 0 are treated as ratings
    item_sim: item-item cosine similarity matrix
    k: maximum number of the user's rated items kept as neighbors
    min_neighbors: minimum number of neighbors with positive similarity
        required to produce a kNN estimate

    Returns a Python float in every case:
      * the similarity-weighted average of the neighbor ratings, or
      * the user's mean training rating when fewer than `min_neighbors`
        neighbors have positive similarity, or
      * the module-level `global_mean` when the user has no training ratings.
    """
    # User row -> dense vector
    user_ratings = R[u_idx].toarray().ravel()  # (n_items,)
    rated_items = np.flatnonzero(user_ratings > 0)

    if rated_items.size == 0:
        return float(global_mean)  # no history -> fallback

    sims = item_sim[i_idx, rated_items]

    # Keep the k rated items most similar to the target item.
    if rated_items.size > k:
        top_idx = np.argpartition(-sims, k)[:k]
        sims = sims[top_idx]
        rated_items = rated_items[top_idx]

    # Only positively similar neighbors vote.
    mask = sims > 0
    if mask.sum() < min_neighbors:
        # Too few usable neighbors: fall back to the user's mean rating.
        # The user has at least one rating here (checked above), so the mean
        # is always defined.
        return float(user_ratings[user_ratings > 0].mean())

    sims = sims[mask]
    neigh_ratings = user_ratings[rated_items[mask]]

    # The small epsilon guards the division; sims are all positive at this point.
    pred = np.sum(sims * neigh_ratings) / (np.sum(np.abs(sims)) + 1e-8)
    return float(pred)


Evaluate item-kNN on test

In [24]:
# Score every held-out (user, item) pair with the item-kNN predictor.
pred_knn = np.empty_like(test_ratings, dtype=float)

for idx, (u, i) in enumerate(zip(test_user_idx, test_item_idx)):
    pred_knn[idx] = predict_item_knn_single(
        u, i, R_train, item_sim, k=30, min_neighbors=2
    )

print("Item-kNN RMSE:", rmse(test_ratings, pred_knn))
print("Item-kNN MAE :", mae(test_ratings, pred_knn))


Item-kNN RMSE: 1.0376889136974956
Item-kNN MAE : 0.8016472530143118


# 3. Build implicit interaction matrices for LightFM

For LightFM we’ll treat rating ≥ 4 as a positive interaction
(and ignore the rest).

In [None]:
# A rating of 4.0 or higher counts as a positive interaction; other rows are left out.
train_pos = train_f.loc[train_f["rating"] >= 4.0]
test_pos  = test_f.loc[test_f["rating"] >= 4.0]

train_rows, train_cols = train_pos["u_idx"].values, train_pos["i_idx"].values
train_data = np.ones(train_rows.shape, dtype=np.float32)

test_rows, test_cols = test_pos["u_idx"].values, test_pos["i_idx"].values
test_data = np.ones(test_rows.shape, dtype=np.float32)

# Both matrices share the (n_users, n_items) shape so their indices line up.
interactions_train = csr_matrix(
    (train_data, (train_rows, train_cols)), shape=(n_users, n_items)
)
interactions_test = csr_matrix(
    (test_data, (test_rows, test_cols)), shape=(n_users, n_items)
)

print("Train positives:", interactions_train.nnz)
print("Test positives :", interactions_test.nnz)


# 4. TODO 2 – Matrix Factorization with LightFM (WARP)

We’ll train a WARP model (ranking-optimized MF).

In [None]:
from lightfm import LightFM

# WARP is a ranking loss, in line with the precision@k / recall@k metrics
# this notebook reports for the model.
model = LightFM(no_components=50, loss="warp", random_state=42)

model.fit(interactions_train, epochs=20, num_threads=4, verbose=True)


# 5. TODO 3 – Top-K evaluation (precision@k / recall@k)

We’ll use lightfm.evaluation to compute these metrics on held-out test positives.

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k

k = 10

# Both metrics take the same arguments; each returns one value per evaluated
# user, which is averaged into a single score.
shared_eval_args = dict(train_interactions=interactions_train, k=k, num_threads=4)

prec_k = precision_at_k(model, interactions_test, **shared_eval_args).mean()
rec_k = recall_at_k(model, interactions_test, **shared_eval_args).mean()

print(f"LightFM WARP – Precision@{k}: {prec_k:.4f}")
print(f"LightFM WARP – Recall@{k}   : {rec_k:.4f}")


You can also compare multiple k:

In [None]:
# Report mean precision and recall at several cut-offs.
for k in (5, 10, 20):
    pk, rk = (
        metric(
            model,
            interactions_test,
            train_interactions=interactions_train,
            k=k,
            num_threads=4,
        ).mean()
        for metric in (precision_at_k, recall_at_k)
    )
    print(f"k={k:2d} | precision={pk:.4f} | recall={rk:.4f}")


# 6. TODO 4 – Explainability: nearest neighbors for a movie (LightFM embeddings)

Now we’ll:

1. Grab the item embeddings from LightFM
2. Compute cosine similarity in embedding space
3. Retrieve nearest neighbors for a movie and show titles/genres

Helper: search movie by name

In [None]:
def find_movie_ids_by_substring(substring, movies_df, max_results=10, regex=False):
    """
    Return up to `max_results` movies whose title contains `substring`.

    substring: text to look for, matched case-insensitively
    movies_df: frame with movieId, title and genres columns
    max_results: maximum number of rows returned
    regex: when False (default) `substring` is matched literally, so titles
        containing regex metacharacters such as "(", "*" or "." can be
        searched as typed; pass True to treat it as a regular expression

    Returns the matching rows' movieId, title and genres columns, in the
    order they appear in `movies_df`. Missing titles never match.
    """
    mask = movies_df["title"].str.contains(
        substring, case=False, na=False, regex=regex
    )
    return movies_df.loc[mask, ["movieId", "title", "genres"]].head(max_results)


Example:

In [None]:
find_movie_ids_by_substring("Matrix", movies)

Nearest neighbors in embedding space

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# item embedding matrix from LightFM
item_embeddings = model.item_embeddings  # shape: (n_items, no_components)

def get_similar_movies_lightfm(movie_raw_id, movies_df, k=10):
    """
    Look up the k movies whose LightFM item embeddings have the highest
    cosine similarity to the given movie's embedding.

    movie_raw_id: original movieId from MovieLens; must be a key of item2idx
    movies_df: frame with movieId, title and genres columns
    k: number of neighbors to look up

    Returns a frame with movieId, title, genres and similarity, most similar
    first. Neighbors that have no row in movies_df are not included.
    Raises ValueError when the movie is not in item2idx.
    """
    if movie_raw_id not in item2idx:
        raise ValueError(f"MovieId {movie_raw_id} not in training set")
    target = item2idx[movie_raw_id]

    # Similarity of the target embedding against every item embedding.
    query = item_embeddings[target].reshape(1, -1)
    sims = cosine_similarity(query, item_embeddings).ravel()

    # Give the query movie the lowest possible cosine so it ranks last.
    sims[target] = -1.0

    neighbor_idx = np.argsort(-sims)[:k]
    neighbor_ids = [idx2item[pos] for pos in neighbor_idx]
    sim_by_id = dict(zip(neighbor_ids, sims[neighbor_idx]))

    # isin() returns rows in movies_df order, so the similarity is attached
    # by movieId and the frame is sorted afterwards.
    neighbors = (
        movies_df.loc[movies_df["movieId"].isin(neighbor_ids)]
        .copy()
        .set_index("movieId")
    )
    neighbors["similarity"] = neighbors.index.map(sim_by_id)
    neighbors = neighbors.reset_index()

    return neighbors[["movieId", "title", "genres", "similarity"]].sort_values(
        "similarity", ascending=False
    )


Usage:

In [None]:
# e.g. pick "Toy Story (1995)"
# Only a cell's final expression is echoed automatically, so the search
# result is displayed explicitly; otherwise the lookup would be discarded.
display(find_movie_ids_by_substring("Toy Story", movies))

target_movie_id = 1  # replace with the id you found
print("Target movie:")
display(movies[movies["movieId"] == target_movie_id][["movieId", "title", "genres"]])

print("\nNearest neighbors (LightFM embeddings):")
display(get_similar_movies_lightfm(target_movie_id, movies, k=10))


# (Optional) User-specific recommendations with LightFM

In [None]:
def recommend_for_user_lightfm(user_raw_id, model, movies_df, N=10):
    """
    Return the N highest-scoring movies for a user, skipping movies the user
    already has a positive interaction with in interactions_train.

    user_raw_id: original userId; must be a key of user2idx
    model: object whose predict(user_index, item_indices) returns one score per item
    movies_df: frame with movieId, title and genres columns
    N: number of recommendations to look up

    Returns a frame with movieId, title, genres and score, best first.
    Recommended ids that have no row in movies_df are not included.
    Raises ValueError when the user is not in user2idx.
    """
    if user_raw_id not in user2idx:
        raise ValueError("User not in training set")
    u_idx = user2idx[user_raw_id]

    # One score per item index for this user.
    scores = model.predict(u_idx, np.arange(n_items))

    # Items already interacted with in train can never be ranked.
    already_seen = interactions_train[u_idx].toarray().ravel() > 0
    scores[already_seen] = -np.inf

    best_idx = np.argsort(-scores)[:N]
    score_by_id = {idx2item[pos]: scores[pos] for pos in best_idx}

    recs = (
        movies_df.loc[movies_df["movieId"].isin(list(score_by_id))]
        .copy()
        .set_index("movieId")
    )
    recs["score"] = recs.index.map(score_by_id)
    recs = recs.reset_index()

    return recs[["movieId", "title", "genres", "score"]].sort_values(
        "score", ascending=False
    )


Usage:

In [None]:
# Use the user of the first row of train_f as the example.
some_user = train_f["userId"].iat[0]
print("Recommendations for user:", some_user)
display(
    recommend_for_user_lightfm(
        user_raw_id=some_user, model=model, movies_df=movies, N=10
    )
)
