In [42]:
# Title:  CSEN272 Project 2 
# Author: Yanxu Wu (W1650780)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Change test files for different results
train_file = 'train.txt'
test_file = 'test20.txt'

# Load the training data
train_data = pd.read_csv(train_file, sep=' ', header=None, names=['user', 'movie', 'rating'])

# Load the test data
test_data = pd.read_csv(test_file, sep=' ', header=None, names=['user', 'movie', 'rating'])

# Split the training data into training and validation sets
train_set, val_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Get the number of users and movies dimensions
n_users = max(train_data['user'].max(), test_data['user'].max()) + 1
n_items = max(train_data['movie'].max(), test_data['movie'].max()) + 1
print(n_users)
print(n_items)

# SVD++ Model
class SVDpp:
    def __init__(self, n_epochs, n_users, n_items, n_factors, lr, reg_rate, random_seed=0):
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg_rate = reg_rate
        self.n_factors = n_factors
        np.random.seed(random_seed)
        self.pu = np.random.randn(n_users, n_factors) / np.sqrt(n_factors)
        self.qi = np.random.randn(n_items, n_factors) / np.sqrt(n_factors)
        self.yj = np.random.randn(n_items, n_factors) / np.sqrt(n_factors)
        self.bu = np.zeros(n_users, np.double)
        self.bi = np.zeros(n_items, np.double)
        self.global_bias = 0
        self.Iu = {u: [] for u in range(n_users)}
        
    def reg_sum_yj(self, u):
        sum_yj = np.zeros(self.n_factors, np.double)
        for j in self.Iu[u]:
            sum_yj += self.yj[j]
        return sum_yj / np.sqrt(len(self.Iu[u])) if self.Iu[u] else sum_yj
        
    def predict(self, u, i, feedback_vec_reg):
        return self.global_bias + self.bu[u] + self.bi[i] + np.dot(self.qi[i], self.pu[u] + feedback_vec_reg)
        
    def fit(self, train_set, val_set, verbose=True, patience=2, min_delta=0.0001):
        self.global_bias = np.mean(train_set.rating)
        # Record the items rated by each user
        g = train_set.groupby(['user'])
        for uid, df_uid in g:
            self.Iu[uid] = list(df_uid['movie'])
        
        best_val_mae = float('inf')
        epochs_no_improve = 0
        
        for epoch in range(self.n_epochs):
            total_error = 0
            for index, row in train_set.iterrows():
                u, i, r = int(row['user']), int(row['movie']), row['rating']
                feedback_vec_reg = self.reg_sum_yj(u)
                error = r - self.predict(u, i, feedback_vec_reg)
                total_error += abs(error)
                self.bu[u] += self.lr * (error - self.reg_rate * self.bu[u])
                self.bi[i] += self.lr * (error - self.reg_rate * self.bi[i])
                tmp_pu = self.pu[u]
                tmp_qi = self.qi[i]
                self.pu[u] += self.lr * (error * self.qi[i] - self.reg_rate * self.pu[u])
                self.qi[i] += self.lr * (error * (tmp_pu + feedback_vec_reg) - self.reg_rate * self.qi[i])
                for j in self.Iu[u]:
                    self.yj[j] += self.lr * (error / np.sqrt(len(self.Iu[u])) * tmp_qi - self.reg_rate * self.yj[j])
            
            train_mae = total_error / len(train_set)
            val_mae = self.evaluate(val_set)
            
            if verbose:
                print(f'Epoch: {epoch+1}, Train MAE: {train_mae:.4f}, Val MAE: {val_mae:.4f}')
            
            # Check for improvement
            if best_val_mae - val_mae > min_delta:
                best_val_mae = val_mae
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1
                
            # Early stopping
            if epochs_no_improve >= patience:
                print(f'Early stopping after {epoch+1} epochs')
                break
                
        return self
    
    def evaluate(self, test_set):
        predictions = test_set.apply(lambda x: self.predict(int(x['user']), int(x['movie']), self.reg_sum_yj(int(x['user']))), axis=1)
        mae = np.mean(np.abs(test_set['rating'] - predictions))
        return mae
    
    def predict_single(self, user, movie):
        return self.predict(user, movie, self.reg_sum_yj(user))

# Train the model on the training data
svdpp = SVDpp(n_epochs=50, n_users=n_users, n_items=n_items, n_factors=30, lr=0.008, reg_rate=0.09)
svdpp.fit(train_set, val_set, verbose=True, patience=2, min_delta=0.0001)

# Predict a single rating from train data for testing purpose
predicted_single_rating = svdpp.predict_single(117, 1)
print(predicted_single_rating)

# Prepare to generate predictions for unknown ratings
result = []

# Iterate over each user block in the test data
for user in range(401, 501):  # Adjust user range based on the test file (401, 501 for test20.txt)
    user_test_data = test_data[test_data['user'] == user]
    unknown_ratings = user_test_data[user_test_data['rating'] == 0]

    if len(unknown_ratings) > 0:
        # Predict the unknown ratings
        for index, row in unknown_ratings.iterrows():
            u, m = int(row['user']), int(row['movie'])
            predicted_rating = int(round(svdpp.predict_single(u, m)))
            predicted_rating = min(max(predicted_rating, 1), 5)
            result.append((u, m, predicted_rating))

# Convert the result list to a DataFrame and sort it
result_df = pd.DataFrame(result, columns=['user', 'movie', 'rating'])
result_df.sort_values(by=['user', 'movie'], inplace=True)

# Save the predictions (change to result20.txt for test20.txt)
result_df.to_csv('result20.txt', sep=' ', header=False, index=False)




501
1001
Epoch: 1, Train MAE: 0.9037, Val MAE: 0.8700
Epoch: 2, Train MAE: 0.8272, Val MAE: 0.8402
Epoch: 3, Train MAE: 0.7924, Val MAE: 0.8257
Epoch: 4, Train MAE: 0.7698, Val MAE: 0.8169
Epoch: 5, Train MAE: 0.7530, Val MAE: 0.8109
Epoch: 6, Train MAE: 0.7396, Val MAE: 0.8063
Epoch: 7, Train MAE: 0.7283, Val MAE: 0.8027
Epoch: 8, Train MAE: 0.7185, Val MAE: 0.7997
Epoch: 9, Train MAE: 0.7096, Val MAE: 0.7972
Epoch: 10, Train MAE: 0.7014, Val MAE: 0.7951
Epoch: 11, Train MAE: 0.6936, Val MAE: 0.7933
Epoch: 12, Train MAE: 0.6861, Val MAE: 0.7919
Epoch: 13, Train MAE: 0.6788, Val MAE: 0.7908
Epoch: 14, Train MAE: 0.6715, Val MAE: 0.7897
Epoch: 15, Train MAE: 0.6643, Val MAE: 0.7888
Epoch: 16, Train MAE: 0.6571, Val MAE: 0.7880
Epoch: 17, Train MAE: 0.6499, Val MAE: 0.7872
Epoch: 18, Train MAE: 0.6426, Val MAE: 0.7865
Epoch: 19, Train MAE: 0.6352, Val MAE: 0.7859
Epoch: 20, Train MAE: 0.6278, Val MAE: 0.7854
Epoch: 21, Train MAE: 0.6203, Val MAE: 0.7849
Epoch: 22, Train MAE: 0.6128, Val 