In [97]:
# Title:  CSEN272 Project 2 
# Author: Yanxu Wu (W1650780)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Change test files for different results
train_file = 'train_test_file/train.txt'
test_file = 'train_test_file/test20.txt'

# Load the training data
train_data = pd.read_csv(train_file, sep=' ', header=None, names=['user', 'movie', 'rating'])

# Load the test data
test_data = pd.read_csv(test_file, sep=' ', header=None, names=['user', 'movie', 'rating'])

# Add known ratings from the test data to the training data
known_test_data = test_data[test_data['rating'] != 0]
combined_data = pd.concat([train_data, known_test_data])

# Split the combined data into training and validation sets
train_set, val_set = train_test_split(combined_data, test_size=0.2, random_state=42)

#print(val_set)

# Get the number of users and movies dimensions
n_users = max(combined_data['user'].max(), test_data['user'].max()) + 1
n_items = max(combined_data['movie'].max(), test_data['movie'].max()) + 1

# refiend SVD Model
class SVD:
    def __init__(self, n_epochs, n_users, n_items, n_factors, lr, reg_rate, random_seed=0):
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg_rate = reg_rate
        self.n_factors = n_factors
        np.random.seed(random_seed)
        self.pu = np.random.randn(n_users, n_factors) / np.sqrt(n_factors)
        self.qi = np.random.randn(n_items, n_factors) / np.sqrt(n_factors)
        self.yj = np.random.randn(n_items, n_factors) / np.sqrt(n_factors)
        self.bu = np.zeros(n_users, np.double)
        self.bi = np.zeros(n_items, np.double)
        self.global_bias = 0
        self.Iu = {u: [] for u in range(n_users)}
        
    def reg_sum_yj(self, u):
        sum_yj = np.zeros(self.n_factors, np.double)
        for j in self.Iu[u]:
            sum_yj += self.yj[j]
        return sum_yj / np.sqrt(len(self.Iu[u])) if self.Iu[u] else sum_yj
        
    def predict(self, u, i, feedback):
        return self.global_bias + self.bu[u] + self.bi[i] + np.dot(self.qi[i], self.pu[u] + feedback)
        
    def fit(self, train_set, val_set, verbose=True, patience=2, min_delta=0.0001):
        self.global_bias = np.mean(train_set.rating)
        # Record the items rated by each user
        group = train_set.groupby(['user'])
        for uid, df_uid in group:
            self.Iu[uid] = list(df_uid['movie'])
            
        # record the mae performance after each iteration
        best_val_mae = float('inf')
        epochs_no_improve = 0
        
        for epoch in range(self.n_epochs):
            total_error = 0
            squared_error = 0
            
            for index, row in train_set.iterrows():
                u, i, r = int(row['user']), int(row['movie']), row['rating']
                feedback = self.reg_sum_yj(u)
                error = r - self.predict(u, i, feedback)
                total_error += abs(error)
                squared_error += error ** 2
                self.bu[u] += self.lr * (error - self.reg_rate * self.bu[u])
                self.bi[i] += self.lr * (error - self.reg_rate * self.bi[i])
                tmp_pu = self.pu[u]
                tmp_qi = self.qi[i]
                self.pu[u] += self.lr * (error * self.qi[i] - self.reg_rate * self.pu[u])
                self.qi[i] += self.lr * (error * (tmp_pu + feedback) - self.reg_rate * self.qi[i])
                
                for j in self.Iu[u]:
                    self.yj[j] += self.lr * (error / np.sqrt(len(self.Iu[u])) * tmp_qi - self.reg_rate * self.yj[j])
            
            train_mae = total_error / len(train_set)
            train_mse = squared_error / len(train_set)
            train_rmse = np.sqrt(train_mse)
            val_mae, val_mse, val_rmse = self.evaluate(val_set)
            
            if verbose:
                print(f'Epoch: {epoch+1}, Train MAE: {train_mae:.4f}, Train RMSE: {train_rmse:.4f}, Train MSE: {train_mse:.4f}, Val MAE: {val_mae:.4f}, Val RMSE: {val_rmse:.4f}, Val MSE: {val_mse:.4f}')
            
            # Check for improvement
            if best_val_mae - val_mae > min_delta:
                best_val_mae = val_mae
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1
                
            # Early stopping
            if epochs_no_improve >= patience:
                print(f'Early stopping after {epoch+1} epochs')
                break
            
            # Reduce learning rate after each epoch
            self.lr *= 0.9
        
        return self
    
    def evaluate(self, test_set):
        predictions = test_set.apply(lambda x: self.predict(int(x['user']), int(x['movie']), self.reg_sum_yj(int(x['user']))), axis=1)
        mae = np.mean(np.abs(test_set['rating'] - predictions))
        mse = mean_squared_error(test_set['rating'], predictions)
        rmse = np.sqrt(mse)
        return mae, mse, rmse
    
    def predict_single(self, user, movie):
        return self.predict(user, movie, self.reg_sum_yj(user))

# Best hyperparameter settings found using gridsearch
best_params = {
    'n_epochs': 65,
    'n_factors': 30,
    'lr': 0.01, 
    'reg_rate': 0.02
}

# Train the best model on the full training data
svd_best = SVD(
    n_epochs=best_params['n_epochs'],
    n_users=n_users,
    n_items=n_items,
    n_factors=best_params['n_factors'],
    lr=best_params['lr'],
    reg_rate=best_params['reg_rate']
)

svd_best.fit(combined_data, val_set, verbose=True, patience=2, min_delta=0.0001)

# Predict a single rating from train data for testing purpose
#predicted_single_rating = svd_best.predict_single(69, 129)
#print(predicted_single_rating)

# Prepare to generate predictions for unknown ratings
result = []

# Iterate over each user block in the test data
for user in range(401, 501):  # Adjust user range based on the test file (e.g. (301, 401) for test10.txt)
    user_test_data = test_data[test_data['user'] == user]
    unknown_ratings = user_test_data[user_test_data['rating'] == 0]

    if len(unknown_ratings) > 0:
        # Predict the unknown ratings
        for index, row in unknown_ratings.iterrows():
            u, m = int(row['user']), int(row['movie'])
            predicted_rating = int(round(svd_best.predict_single(u, m)))
            predicted_rating = min(max(predicted_rating, 1), 5)
            result.append((u, m, predicted_rating))

# Convert the result list to a DataFrame and sort it
result_df = pd.DataFrame(result, columns=['user', 'movie', 'rating'])
result_df.sort_values(by=['user', 'movie'], inplace=True)

# Save the predictions (change to result10.txt for test10.txt)
result_df.to_csv('result20.txt', sep=' ', header=False, index=False)








Epoch: 1, Train MAE: 0.8800, Train RMSE: 1.0836, Train MSE: 1.1742, Val MAE: 0.8315, Val RMSE: 1.0301, Val MSE: 1.0610
Epoch: 2, Train MAE: 0.8060, Train RMSE: 1.0052, Train MSE: 1.0105, Val MAE: 0.7878, Val RMSE: 0.9836, Val MSE: 0.9675
Epoch: 3, Train MAE: 0.7724, Train RMSE: 0.9681, Train MSE: 0.9371, Val MAE: 0.7610, Val RMSE: 0.9538, Val MSE: 0.9097
Epoch: 4, Train MAE: 0.7500, Train RMSE: 0.9423, Train MSE: 0.8880, Val MAE: 0.7412, Val RMSE: 0.9315, Val MSE: 0.8677
Epoch: 5, Train MAE: 0.7330, Train RMSE: 0.9224, Train MSE: 0.8508, Val MAE: 0.7255, Val RMSE: 0.9137, Val MSE: 0.8349
Epoch: 6, Train MAE: 0.7192, Train RMSE: 0.9060, Train MSE: 0.8208, Val MAE: 0.7125, Val RMSE: 0.8988, Val MSE: 0.8079
Epoch: 7, Train MAE: 0.7076, Train RMSE: 0.8919, Train MSE: 0.7955, Val MAE: 0.7016, Val RMSE: 0.8860, Val MSE: 0.7851
Epoch: 8, Train MAE: 0.6975, Train RMSE: 0.8797, Train MSE: 0.7738, Val MAE: 0.6921, Val RMSE: 0.8749, Val MSE: 0.7654
Epoch: 9, Train MAE: 0.6887, Train RMSE: 0.8688,