In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
from surprise import Dataset, Reader, SVD as SurpriseSVD
from surprise.model_selection import train_test_split as surprise_train_test_split

In [3]:
ratings = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.data', 
    sep='\t', 
    names=['user_id', 'item_id', 'rating', 'timestamp']
)

ratings.drop('timestamp', axis=1, inplace=True)

n_users = ratings.user_id.nunique()
n_items = ratings.item_id.nunique()
print(f"Users: {n_users}, Items: {n_items}, Ratings: {len(ratings)}")

Users: 943, Items: 1682, Ratings: 100000


In [4]:
class LatentFactorModel:
    def __init__(self, n_factors=50, learning_rate=0.005, reg=0.02, n_epochs=50):
        self.n_factors = n_factors
        self.lr = learning_rate
        self.reg = reg
        self.n_epochs = n_epochs
        
    def fit(self, train):
        self.global_mean = train.rating.mean()
        self.user_biases = np.zeros(n_users)
        self.item_biases = np.zeros(n_items)
        self.user_factors = np.random.normal(scale=1/self.n_factors, size=(n_users, self.n_factors))
        self.item_factors = np.random.normal(scale=1/self.n_factors, size=(n_items, self.n_factors))
        
        for epoch in range(self.n_epochs):
            for user, item, rating in train[['user_id', 'item_id', 'rating']].values:
                user, item = int(user)-1, int(item)-1
                
                prediction = (
                    self.global_mean 
                    + self.user_biases[user] 
                    + self.item_biases[item] 
                    + np.dot(self.user_factors[user], self.item_factors[item])
                )
                
                error = rating - prediction
                
                self.user_biases[user] += self.lr * (error - self.reg * self.user_biases[user])
                self.item_biases[item] += self.lr * (error - self.reg * self.item_biases[item])
                
                uf = self.user_factors[user]
                itf = self.item_factors[item]
                
                self.user_factors[user] += self.lr * (error * itf - self.reg * uf)
                self.item_factors[item] += self.lr * (error * uf - self.reg * itf)
    
    def predict(self, test):
        preds = []
        for user, item in test[['user_id', 'item_id']].values:
            user, item = int(user)-1, int(item)-1
            pred = (
                self.global_mean 
                + self.user_biases[user] 
                + self.item_biases[item] 
                + np.dot(self.user_factors[user], self.item_factors[item])
            )
            preds.append(pred)
        return np.clip(preds, 1, 5)

In [5]:
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

model = LatentFactorModel(n_factors=50, learning_rate=0.005, reg=0.02, n_epochs=50)
start_time = time.time()
model.fit(train)
custom_train_time = time.time() - start_time

preds = model.predict(test)

rmse = np.sqrt(mean_squared_error(test.rating, preds))
mae = mean_absolute_error(test.rating, preds)
print(f"Custom Model: RMSE = {rmse:.4f}, MAE = {mae:.4f}, Time = {custom_train_time:.2f}s")

Custom Model: RMSE = 0.9178, MAE = 0.7175, Time = 96.65s


In [6]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

model = SurpriseSVD(n_factors=50, n_epochs=50, lr_all=0.005, reg_all=0.02)
start_time = time.time()
model.fit(trainset)
surprise_train_time = time.time() - start_time

surprise_preds = [model.predict(uid, iid).est for (uid, iid, _) in testset]

y_true = [r for (_, _, r) in testset]
rmse = np.sqrt(mean_squared_error(y_true, surprise_preds))
mae = mean_absolute_error(y_true, surprise_preds)
print(f"Surprise Model: RMSE = {rmse:.4f}, MAE = {mae:.4f}, Time = {surprise_train_time:.2f}s")

Surprise Model: RMSE = 0.9706, MAE = 0.7613, Time = 2.08s


In [7]:
new_user_id = ratings['user_id'].max() + 1
pixar = [1, 71, 993] 
new_ratings = pd.DataFrame({
    'user_id': [new_user_id] * 3,
    'item_id': pixar,
    'rating': [5, 5, 5]
})
updated_ratings = pd.concat([ratings, new_ratings], ignore_index=True)

In [8]:
model = LatentFactorModel(n_factors=50, learning_rate=0.005, reg=0.02, n_epochs=50)

n_users = updated_ratings.user_id.nunique()
n_items = updated_ratings.item_id.nunique()
model.fit(updated_ratings)

In [20]:
movies = pd.read_csv(
    'http://files.grouplens.org/datasets/movielens/ml-100k/u.item', 
    sep='|', 
    encoding='latin-1',
    header=None,
    names=['item_id', 'title'] + [f'f{i}' for i in range(23)]
)[['item_id', 'title']]

all_movies = ratings['item_id'].unique()

input_data = pd.DataFrame({
    'user_id': [new_user_id] * len(all_movies),
    'item_id': all_movies
})

predictions = model.predict(input_data)

results = input_data.copy()
results['predicted_rating'] = predictions

results = results.merge(movies, on='item_id', how='left')

top_10 = results.sort_values('predicted_rating', ascending=False).head(10)
for i, row in enumerate(top_10.itertuples(), 1):
    print(f"{i}. {row.title} [{row.item_id}]")

1. Casablanca (1942) [483]
2. Schindler's List (1993) [318]
3. Wrong Trousers, The (1993) [169]
4. Wallace & Gromit: The Best of Aardman Animation (1996) [114]
5. Usual Suspects, The (1995) [12]
6. Shawshank Redemption, The (1994) [64]
7. Rear Window (1954) [603]
8. Pather Panchali (1955) [1449]
9. 12 Angry Men (1957) [178]
10. Star Wars (1977) [50]
