Notebook 04: Recommendation Generation & Evaluation

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

In [4]:
ratings = pd.read_csv(
    "/content/ratings.dat",
    sep="::",
    engine="python",
    names=["user_id", "movie_id", "rating", "timestamp"]
)

ratings["datetime"] = pd.to_datetime(ratings["timestamp"], unit="s")

movies = pd.read_csv(
    "/content/movies.dat",
    sep="::",
    engine="python",
    encoding="latin-1",
    names=["movie_id", "title", "genres"]
)



In [5]:
ratings.head(), ratings.shape

(   user_id  movie_id  rating  timestamp            datetime
 0        1      1193       5  978300760 2000-12-31 22:12:40
 1        1       661       3  978302109 2000-12-31 22:35:09
 2        1       914       3  978301968 2000-12-31 22:32:48
 3        1      3408       4  978300275 2000-12-31 22:04:35
 4        1      2355       5  978824291 2001-01-06 23:38:11,
 (1000209, 5))

In [6]:
ratings_sorted = ratings.sort_values("datetime")

n = len(ratings_sorted)
train_end = int(0.8 * n)
val_end = int(0.9 * n)

train = ratings_sorted.iloc[:train_end]
val = ratings_sorted.iloc[train_end:val_end]
test = ratings_sorted.iloc[val_end:]


In [7]:
user_item = train.pivot_table(
    index="user_id",
    columns="movie_id",
    values="rating"
).fillna(0)


In [8]:
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item)
item_factors = svd.components_

pred_matrix = np.dot(user_factors, item_factors)

pred_df = pd.DataFrame(
    pred_matrix,
    index=user_item.index,
    columns=user_item.columns
)


In [9]:
def rmse_from_pred_df(df, pred_df):
    y_true, y_pred = [], []
    for _, row in df.iterrows():
        if row.user_id in pred_df.index and row.movie_id in pred_df.columns:
            y_true.append(row.rating)
            y_pred.append(pred_df.loc[row.user_id, row.movie_id])
    return np.sqrt(mean_squared_error(y_true, y_pred)), len(y_true)


In [10]:
global_mean = train["rating"].mean()

def baseline_rmse(df, mean_value):
    y_true = df["rating"].values
    y_pred = np.full(len(df), mean_value)
    return np.sqrt(mean_squared_error(y_true, y_pred))

baseline_val_rmse = baseline_rmse(val, global_mean)
baseline_test_rmse = baseline_rmse(test, global_mean)

print("Baseline (Global Mean) RMSE")
print(f"Val RMSE:  {baseline_val_rmse:.4f}")
print(f"Test RMSE: {baseline_test_rmse:.4f}")


Baseline (Global Mean) RMSE
Val RMSE:  1.1191
Test RMSE: 1.0893


In [11]:
svd_val_rmse, val_n = rmse_from_pred_df(val, pred_df)
svd_test_rmse, test_n = rmse_from_pred_df(test, pred_df)

print("Matrix Factorization (TruncatedSVD) RMSE")
print(f"Val RMSE:  {svd_val_rmse:.4f} (n={val_n})")
print(f"Test RMSE: {svd_test_rmse:.4f} (n={test_n})")

print("\nImprovement vs Baseline")
print(f"Val improvement:  {baseline_val_rmse - svd_val_rmse:.4f}")
print(f"Test improvement: {baseline_test_rmse - svd_test_rmse:.4f}")


Matrix Factorization (TruncatedSVD) RMSE
Val RMSE:  3.2503 (n=23841)
Test RMSE: 3.2409 (n=80608)

Improvement vs Baseline
Val improvement:  -2.1312
Test improvement: -2.1516


In [12]:
def get_topk_recs(user_id, k=10):
    scores = pred_df.loc[user_id].sort_values(ascending=False)

    seen_movies = set(
        train.loc[train.user_id == user_id, "movie_id"].astype(int)
    )

    recs = [int(mid) for mid in scores.index if int(mid) not in seen_movies]
    return recs[:k]


In [18]:
sample_user = pred_df.index[0]
recs10 = get_topk_recs(sample_user, k=10)

print("Sample user:", sample_user)
print("Top-10 recommended movie_ids:", recs10)

movies[movies.movie_id.isin(recs10)][["movie_id", "title"]]


Sample user: 635
Top-10 recommended movie_ids: [1221, 912, 1193, 1225, 1148, 3481, 1580, 923, 919, 3578]


Unnamed: 0,movie_id,title
900,912,Casablanca (1942)
907,919,"Wizard of Oz, The (1939)"
911,923,Citizen Kane (1941)
1132,1148,"Wrong Trousers, The (1993)"
1176,1193,One Flew Over the Cuckoo's Nest (1975)
1203,1221,"Godfather: Part II, The (1974)"
1207,1225,Amadeus (1984)
1539,1580,Men in Black (1997)
3412,3481,High Fidelity (2000)
3509,3578,Gladiator (2000)


## Qualitative Evaluation of Recommendations

The Top-10 recommendations generated for a sample user demonstrate that the model produces coherent and high-quality suggestions.
The recommended movies include critically acclaimed and popular titles such as The Godfather: Part II, Citizen Kane, Casablanca, and Gladiator.

This qualitative check complements RMSE-based evaluation by confirming that the ranked outputs are interpretable and plausible for real users.