In [27]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

# Read the tab-separated ratings file. It carries no header row, so the column
# names are supplied explicitly (presumably the MovieLens 100k layout -- confirm).
data_path = './ml-100k/u.data'
cols = ['user_id','item_id','rating','timestamp']
df = pd.read_csv(data_path, sep='\t', header=None, names=cols)

# Matrix dimensions come from the largest ids in the *full* data set, so every
# id that appears in either split has a valid row/column.
n_users = int(df['user_id'].max())
n_items = int(df['item_id'].max())

# Seeded random 80/20 hold-out split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f'Loaded: total={len(df)} rows; train={len(train_df)} rows, test={len(test_df)} rows')
print(f'Users={n_users}, Items={n_items}')

In [None]:

# Dense user x item matrix of training ratings. Ids are 1-based, so the
# row/column index is id - 1; cells without a training rating stay at 0.
R_train = np.zeros((n_users, n_items), dtype=float)

# Vectorised scatter instead of a Python-level loop over rows. Columns are
# selected by name, so the result no longer depends on the column order of
# train_df. keep='last' reproduces the loop's last-write-wins behaviour should
# a (user, item) pair ever appear more than once.
train_pairs = train_df.drop_duplicates(subset=['user_id', 'item_id'], keep='last')
R_train[
    train_pairs['user_id'].to_numpy(dtype=int) - 1,
    train_pairs['item_id'].to_numpy(dtype=int) - 1,
] = train_pairs['rating'].to_numpy()

# True where a training rating was written (assumes every real rating is > 0).
train_mask = R_train > 0
R_train.shape

In [None]:

# Low-rank factorisation of the training matrix: R_train ~= U @ H.
# NOTE(review): R_train is zero-filled, so unrated cells are fed to the SVD as
# literal 0 ratings; predictions for sparse users/items will be pulled towards
# the lower clip bound. Confirm this is the intended baseline.
n_components = 50
svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
U = svd.fit_transform(R_train)  # reduced representation, one row per row of R_train
H = svd.components_             # one row per latent component

# Reconstruct the full matrix and clamp it to the 1-5 range in one step.
R_pred = np.clip(U @ H, 1.0, 5.0)

In [None]:

# Evaluate on the held-out ratings.
# One fancy-indexing lookup fetches the prediction for every test (user, item)
# pair, replacing the per-row Python loop. Columns are picked by name rather
# than by position. Ids are 1-based, hence the "- 1".
# y_true / y_pred are NumPy arrays here (they were Python lists before); the
# metric functions below accept either.
y_true = test_df['rating'].to_numpy()
y_pred = R_pred[
    test_df['user_id'].to_numpy(dtype=int) - 1,
    test_df['item_id'].to_numpy(dtype=int) - 1,
]

mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error
mae = mean_absolute_error(y_true, y_pred)
print(f'Performance on test set -> RMSE: {rmse:.4f}, MAE: {mae:.4f}')

In [None]:

# Spot-check a single cell of the reconstructed matrix.
# Ids are 1-based while the matrix is 0-based, hence the "- 1" on both axes.
user_id, item_id = 296, 150
pred_value = R_pred[user_id - 1][item_id - 1]
print(f'Predicted rating for user {user_id} on item {item_id}: {pred_value:.2f}')

In [None]:

def get_top_n_from_matrix(R_pred, train_df, user_id, n=5):
    """Return the best-scoring items a user has not rated in the training data.

    Parameters
    ----------
    R_pred : 2-D array
        Predicted ratings, indexed as ``[user_id - 1, item_id - 1]``.
    train_df : pandas.DataFrame
        Must have ``user_id`` and ``item_id`` columns. Every item this user
        rated here is excluded from the recommendations.
    user_id : int
        1-based user id.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of (item_id, predicted_rating) tuples
        Sorted by predicted rating, highest first. Items with equal scores
        keep ascending item-id order.

    Raises
    ------
    IndexError
        If ``user_id`` is outside ``1..R_pred.shape[0]``. Without this check
        an id of 0 or below would wrap around through negative indexing and
        silently return another user's scores.
    """
    user_idx = int(user_id) - 1
    n_users, n_items = R_pred.shape[0], R_pred.shape[1]
    if not 0 <= user_idx < n_users:
        raise IndexError(
            f'user_id {user_id} is out of range for a matrix with {n_users} users'
        )

    user_scores = np.asarray(R_pred[user_idx])

    # 0-based column indices of items already rated by this user; ids that do
    # not correspond to a matrix column are ignored.
    seen = train_df.loc[train_df.user_id == int(user_id), 'item_id'].to_numpy(dtype=int) - 1
    seen = seen[(seen >= 0) & (seen < n_items)]

    unseen = np.ones(n_items, dtype=bool)
    unseen[seen] = False
    candidate_idx = np.flatnonzero(unseen)

    # A stable sort on the negated scores gives descending order while leaving
    # tied items in ascending item order.
    order = np.argsort(-user_scores[candidate_idx], kind='stable')
    top_idx = candidate_idx[order][:n]
    return [(int(idx) + 1, user_scores[idx]) for idx in top_idx]

# Demo: five highest-scoring unseen items for user 196.
top5 = get_top_n_from_matrix(R_pred, train_df, user_id=196, n=5)
print('Top 5 recommendations for user 196:')
for recommendation in top5:
    item, score = recommendation
    print(f'Item {item}, Predicted rating: {score:.2f}')