In [1]:
!pip install tensorflow



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Load the two raw tables that every later cell is derived from.
#
# Conceptual collaborative-filtering notation built from reviews.csv and animes.csv:
#   Y     - ratings matrix,                shape (num_items, num_users)
#   R     - indicator of present ratings,  shape (num_items, num_users)
#   Ynorm - ratings with the per-item mean subtracted
#   Ymean - mean rating of each item
# num_users comes from the unique profiles in reviews.csv and num_items from the
# unique anime ids in animes.csv.
# NOTE(review): the cells below keep only the observed ratings as
# (anime_idx, profile_idx, score, score_norm) rows instead of materializing
# dense Y / R matrices.

animes_df = pd.read_csv("data/animes.csv")
reviews_df = pd.read_csv("data/reviews.csv")

2025-12-30 19:16:43.567104: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-30 19:16:43.612125: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-30 19:16:44.573792: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [3]:
# Map raw IDs to contiguous 0-based indices. `unique()` preserves first-appearance
# order, so index i always refers to the i-th distinct ID seen in the table.
anime_id_to_idx = {aid: idx for idx, aid in enumerate(animes_df['uid'].unique())}
user_id_to_idx  = {uid: idx for idx, uid in enumerate(reviews_df['profile'].unique())}

# Attach the indices to every review. A review whose anime_uid is missing from
# animes_df gets NaN here (and therefore a NaN score_norm below).
reviews_df['anime_idx'] = reviews_df['anime_uid'].map(anime_id_to_idx)
reviews_df['profile_idx'] = reviews_df['profile'].map(user_id_to_idx)

num_items = len(anime_id_to_idx)

# Mean score per anime index. groupby drops NaN keys, so only mapped anime appear.
# NOTE(review): the means use every review, including rows that a later cell
# places in the test split.
anime_mean_by_idx = reviews_df.groupby('anime_idx')['score'].mean()
anime_means = anime_mean_by_idx.to_dict()

# Dense lookup array; anime without any review keep the default mean of 0.
anime_means_array = np.zeros(num_items)
# The group keys are float when any review was unmapped, so cast before indexing.
anime_means_array[anime_mean_by_idx.index.to_numpy().astype(int)] = anime_mean_by_idx.to_numpy()

# Normalize ratings (subtract per-anime mean). Vectorized lookup instead of a
# row-wise apply: same values, no Python-level loop over every review, and no
# IndexError when anime_idx is NaN (the result is simply NaN for that row).
reviews_df['score_norm'] = reviews_df['score'] - reviews_df['anime_idx'].map(anime_mean_by_idx)

# Keep only needed columns (triplet + normalized score)
ratings_df = reviews_df[['anime_idx', 'profile_idx', 'score', 'score_norm']].copy()

In [None]:
# -----------------------------------------------------------------------------
# Hyperparameters - adjust these to trade model capacity against overfitting
# -----------------------------------------------------------------------------
num_features = 10       # size of each latent vector (rows of X and W)
lambda_ = 100           # L2 penalty weight applied to X and W in the cost
learning_rate = 0.1     # step size handed to the optimizer
iterations = 300        # number of full-batch gradient steps in the training loop

# Every known profile gets one slot, plus a trailing slot for the simulated new user.
num_users = 1 + len(user_id_to_idx)

# Seed first, then draw X, W and b in this exact order so the starting point is repeatable.
tf.random.set_seed(42)
X = tf.Variable(
    tf.random.normal(shape=(num_items, num_features), dtype=tf.float64),
    name='X',
)
W = tf.Variable(
    tf.random.normal(shape=(num_users, num_features), dtype=tf.float64),
    name='W',
)
b = tf.Variable(
    tf.random.normal(shape=(1, num_users), dtype=tf.float64),
    name='b',
)

print(f"Model config: {num_features} features, lambda={lambda_}")

In [5]:
# Simulate a new user by appending a handful of hand-written ratings to ratings_df.
# Edit `new_ratings` below to change what the simulated user has rated.
new_user_id = "new_user"
new_user_idx = num_users - 1   # the extra trailing slot reserved when num_users was computed

# (anime_idx, profile_idx, score) triplets for the simulated user.
new_ratings = [
    (0, new_user_idx, 10),
    (50, new_user_idx, 8),
    (100, new_user_idx, 9),
    (150, new_user_idx, 7),
    (200, new_user_idx, 6),
    (250, new_user_idx, 8),
    (300, new_user_idx, 9),
    (350, new_user_idx, 7),
]

new_df = pd.DataFrame(new_ratings, columns=['anime_idx', 'profile_idx', 'score'])

# Fail fast on a bad index: a negative value would otherwise silently wrap around
# when used to index the means array.
assert new_df['anime_idx'].between(0, len(anime_means_array) - 1).all(), \
    "new_ratings contains an anime_idx outside the known catalogue"

# Normalize the new ratings with the same per-anime means used for the reviews.
new_df['score_norm'] = new_df['score'] - anime_means_array[new_df['anime_idx'].to_numpy()]

# Drop any rows a previous run of this cell already added for the new user, so
# re-executing the cell replaces the simulated ratings instead of duplicating them.
existing_ratings = ratings_df[ratings_df['profile_idx'] != new_user_idx]
ratings_df = pd.concat([existing_ratings, new_df], ignore_index=True)


In [6]:
from sklearn.model_selection import train_test_split

# Step 1: discard ratings that lack an anime index, a profile index, or a normalized score.
required_columns = ['anime_idx', 'profile_idx', 'score_norm']
ratings_df = ratings_df.dropna(subset=required_columns)

# Step 2: random 80/20 split over individual ratings (not over users or items),
# i.e. the test set asks "can we predict ratings we have not seen?".
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

n_total = len(ratings_df)
n_train = len(train_df)
n_test = len(test_df)
print(f"Total ratings: {n_total:,}")
print(f"Training ratings: {n_train:,} ({n_train/n_total*100:.1f}%)")
print(f"Test ratings: {n_test:,} ({n_test/n_total*100:.1f}%)")

Total ratings: 192,120
Training ratings: 153,696 (80.0%)
Test ratings: 38,424 (20.0%)


In [7]:
def cofi_cost_func_triplet(X, W, b, anime_idx_tensor, user_idx_tensor, ratings_tensor, lambda_):
    """Regularized squared-error cost evaluated on a list of observed ratings.

    Each rating k is predicted as dot(X[anime_idx[k]], W[user_idx[k]]) + b[0, user_idx[k]].

    Args:
        X: 2-D item parameters; rows are selected with ``anime_idx_tensor``.
        W: 2-D user parameters; rows are selected with ``user_idx_tensor``.
        b: 2-D user biases; entries of its first row are selected with ``user_idx_tensor``.
        anime_idx_tensor: item index of every rating.
        user_idx_tensor: user index of every rating.
        ratings_tensor: target value of every rating.
        lambda_: weight of the L2 penalty on ``X`` and ``W`` (``b`` is not penalized).

    Returns:
        Scalar tensor: half the summed squared error plus ``lambda_ / 2`` times the
        summed squares of ``X`` and ``W``.
    """
    item_rows = tf.gather(X, anime_idx_tensor)
    user_rows = tf.gather(W, user_idx_tensor)
    user_bias = tf.gather(b[0], user_idx_tensor)

    residual = tf.reduce_sum(item_rows * user_rows, axis=1) + user_bias - ratings_tensor

    data_term = 0.5 * tf.reduce_sum(residual**2)
    penalty = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return data_term + penalty


In [None]:
# Training cell - consumes the hyperparameters defined earlier in the notebook.
# Pack the training split into constant tensors, one entry per training rating.
anime_idx_tensor = tf.constant(train_df['anime_idx'].to_numpy(), dtype=tf.int32)
user_idx_tensor = tf.constant(train_df['profile_idx'].to_numpy(), dtype=tf.int32)
ratings_tensor = tf.constant(train_df['score_norm'].to_numpy(), dtype=tf.float64)

# Optimizer that applies the gradient updates in the training loop.
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

def calculate_rmse(X, W, b, df, anime_means_array):
    """Root-mean-squared error between predicted and raw scores for the rows of ``df``.

    The prediction for a row is the dot product of its item and user parameter
    rows, plus the user's bias, plus the item's mean from ``anime_means_array``.

    Args:
        X, W, b: objects exposing ``.numpy()``; ``X`` and ``W`` are indexed by row
            and ``b`` as ``b[0, user]``.
        df: DataFrame with ``anime_idx``, ``profile_idx`` and ``score`` columns.
        anime_means_array: array of per-item means indexed by ``anime_idx``.

    Returns:
        The RMSE as a NumPy float.
    """
    items = df['anime_idx'].values.astype(int)
    users = df['profile_idx'].values.astype(int)
    targets = df['score'].values

    item_params = X.numpy()
    user_params = W.numpy()
    user_bias = b.numpy()

    interaction = np.sum(item_params[items] * user_params[users], axis=1)
    predicted = interaction + user_bias[0, users] + anime_means_array[items]

    squared_error = (predicted - targets) ** 2
    return np.sqrt(np.mean(squared_error))

print(f"Training: {len(train_df):,} ratings | features={num_features}, lambda={lambda_}")
print("-" * 60)

# Full-batch gradient descent: every step evaluates the cost on all training ratings.
# The loop variable is named `step` (not `iter`) so the builtin iter() is not
# shadowed for the rest of the kernel session.
trainable = [X, W, b]
best_test_rmse = float('inf')
for step in range(iterations):
    # Record the forward pass so the cost can be differentiated w.r.t. X, W and b.
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_triplet(X, W, b, anime_idx_tensor, user_idx_tensor, ratings_tensor, lambda_)
    grads = tape.gradient(cost_value, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    # Report progress every 20 steps; " *" flags a new best test RMSE so far.
    # best_test_rmse is only used for this marker - training always runs all steps.
    if step % 20 == 0:
        train_rmse = calculate_rmse(X, W, b, train_df, anime_means_array)
        test_rmse = calculate_rmse(X, W, b, test_df, anime_means_array)
        improved = test_rmse < best_test_rmse
        if improved:
            best_test_rmse = test_rmse
        marker = " *" if improved else ""
        print(f"Iter {step:3d} | Train: {train_rmse:.4f} | Test: {test_rmse:.4f}{marker}")

# Final metrics after the last step (which is not necessarily a logged step).
print("-" * 60)
train_rmse = calculate_rmse(X, W, b, train_df, anime_means_array)
test_rmse = calculate_rmse(X, W, b, test_df, anime_means_array)
print(f"Final - Train: {train_rmse:.4f} | Test: {test_rmse:.4f} | Gap: {test_rmse - train_rmse:.4f}")

In [None]:
# Summary of the final train/test metrics with a rough overfitting verdict.
rmse_gap = test_rmse - train_rmse   # positive when the model does worse on unseen ratings
banner = "=" * 60

print(banner)
print("RESULTS")
print(banner)
print(f"Config: {num_features} features, lambda={lambda_}")
print(f"Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")
print(f"Overfitting gap: {rmse_gap:.4f}")
print()

# Thresholds on the gap: > 1.0 severe, > 0.5 moderate, otherwise acceptable.
if rmse_gap > 1.0:
    verdict = "Still overfitting. Try: lambda=500 or num_features=5"
elif rmse_gap > 0.5:
    verdict = "Moderate overfitting - getting better!"
else:
    verdict = "Good fit!"
print(verdict)
print()
print("Benchmarks: <1.0 Excellent | 1.0-1.5 Good | 1.5-2.0 OK | >2.0 Poor")

In [None]:
# Get feature vector and bias for the new user
w_new = W[new_user_idx].numpy()     # (num_features,)
b_new = b[0, new_user_idx].numpy()  # scalar

# Predicted raw score for every anime: latent dot product + user bias, with the
# per-anime mean added back (training targets had that mean subtracted).
my_predictions = X.numpy().dot(w_new) + b_new + anime_means_array
ix = np.argsort(my_predictions)[::-1]   # anime indices, best prediction first

# Anime indices were assigned from animes_df['uid'].unique(), i.e. in order of
# first appearance. Row position in animes_df only matches that index when no
# uid repeats, so build a de-duplicated view (keep='first' preserves the same
# order) and look titles/ids up there instead of in animes_df directly.
anime_catalog = animes_df.drop_duplicates(subset='uid').reset_index(drop=True)
assert len(anime_catalog) == len(my_predictions), \
    "anime catalogue and prediction vector are misaligned"

print("\nNew user's actual ratings vs predicted ratings:")
for _, row in new_df.iterrows():
    idx = int(row['anime_idx'])
    actual = row['score']
    predicted = my_predictions[idx]

    # Title of the anime that owns this index.
    title = anime_catalog['title'].iloc[idx]

    print(f"Anime idx {idx}: {title}, Actual = {actual}, Predicted = {predicted:.2f}")

print("\nTop 10 anime recommendations for the new user:")
for i in ix[:10]:
    anime_id = anime_catalog['uid'].iloc[i]
    title = anime_catalog['title'].iloc[i]
    print(f"Anime ID {anime_id}: {title} (Predicted Rating: {my_predictions[i]:.2f})")

In [None]:
# Hand the trained parameters (X, W, b), the per-anime means and both
# ID -> index maps to the project-local save helper.
# NOTE(review): presumably this writes them to disk; the storage format and
# location are defined in save_and_load_cf_recommender - confirm there.
# NOTE(review): W and b carry one extra trailing slot for the simulated
# "new_user", which has no key in user_id_to_idx - confirm consumers of the
# saved model expect that.
from save_and_load_cf_recommender import save_collab_model, load_collab_model
save_collab_model(X, W, b, anime_means_array, anime_id_to_idx, user_id_to_idx)


In [None]:
# Read the saved model back; the helper returns six values in the same order
# they were passed to save_collab_model.
(
    X_loaded,
    W_loaded,
    b_loaded,
    anime_means_array_loaded,
    anime_id_to_idx_loaded,
    user_id_to_idx_loaded,
) = load_collab_model()


In [None]:
# Verify the save/load round trip returned the same parameters.
# A notebook cell only displays its last expression, so collect every comparison
# and assert on all of them instead of silently discarding the X and W results.
roundtrip_checks = {
    "X": bool(np.allclose(X.numpy(), X_loaded.numpy())),
    "W": bool(np.allclose(W.numpy(), W_loaded.numpy())),
    "b": bool(np.allclose(b.numpy(), b_loaded.numpy())),
}
assert all(roundtrip_checks.values()), f"Save/load round trip changed parameters: {roundtrip_checks}"
roundtrip_checks
