### SVD Model - This should get MAE of around 0.52 ###


In [1]:
import autograd.numpy as ag_np
import numpy as np
import pandas as pd
import os

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# %pip install autograd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets

# Relative path of the data directory (judging by its name, the MovieLens-100k files).
# NOTE(review): the cells visible below build their paths from `data_folder`
# rather than this constant — confirm whether it is still needed.
DATA_DIR = './data_movie_lens_100k'


In [None]:
# Load the data through the project helper, called with its default arguments,
# and unpack the five values it returns.
# NOTE(review): presumably each *_tuple is one data split and n_users / n_items
# are the user and item counts — confirm against train_valid_test_loader.
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()


In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import os
from surprise import SVD, Reader, Dataset
from surprise.model_selection import GridSearchCV, train_test_split
from surprise import accuracy
from sklearn.preprocessing import OneHotEncoder


# Folder (relative to the working directory) that holds the CSV files
data_folder = 'data_movie_lens_100k'

# Load the side information about users and movies
user_info = pd.read_csv(os.path.join(data_folder, 'user_info.csv'))
movie_info = pd.read_csv(os.path.join(data_folder, 'movie_info.csv'))

# --- One-hot encode user features (age bucket and gender flag) ---
# pd.cut uses right-closed bins here: (0, 18], (18, 30], (30, 40], (40, 50], (50, 100].
user_info['age_group'] = pd.cut(user_info['age'], bins=[0, 18, 30, 40, 50, 100], labels=["0-18", "19-30", "31-40", "41-50", "50+"])
# Each feature table gets its own encoder so that its fitted categories (and
# therefore its output column names) cannot be overwritten by the other one.
# The encoder's default sparse output is densified with .toarray(); this avoids
# the `sparse=` keyword, which newer scikit-learn releases renamed to
# `sparse_output=` and then removed.
user_encoder = OneHotEncoder()
encoded_user_info = user_encoder.fit_transform(user_info[['age_group', 'is_male']]).toarray()

# --- One-hot encode movie features (release year, treated as a category) ---
movie_info['release_year'] = movie_info['release_year'].astype(str)
movie_encoder = OneHotEncoder()
encoded_movie_info = movie_encoder.fit_transform(movie_info[['release_year']]).toarray()
# `encoder` used to end up bound to the movie encoder; keep that name available
# for any other cell that still refers to it.
encoder = movie_encoder

# Wrap the encoded matrices in DataFrames, labelling the columns with the
# feature names of the encoder that actually produced each matrix.
# These frames are NOT merged into the ratings below: the Surprise dataset is
# built from (user_id, item_id, rating) only.
user_info_encoded = pd.DataFrame(encoded_user_info, columns=user_encoder.get_feature_names_out())
movie_info_encoded = pd.DataFrame(encoded_movie_info, columns=movie_encoder.get_feature_names_out())

# Attach the raw (un-encoded) user and movie columns to every rating row
train_data = pd.read_csv(os.path.join(data_folder, "ratings_all_development_set.csv"))
train_data = pd.merge(train_data, user_info, on="user_id", how="left")
train_data = pd.merge(train_data, movie_info, on="item_id", how="left")

# Convert to Surprise format (ratings are on a 1-5 scale)
reader = Reader(rating_scale=(1, 5))
train_dataset = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader)

# Hold out 20% of the ratings. `train_test_split` is Surprise's splitter here
# (the import at the top of this cell rebinds the scikit-learn name).
trainset, testset = train_test_split(train_dataset, test_size=0.2)

In [None]:
# Candidate hyperparameter values for SVD; the search tries every combination
# (3 * 3 * 2 * 2 = 36 settings).
param_grid = dict(
    n_factors=[50, 100, 150],
    reg_all=[0.1, 0.2, 0.3],
    lr_all=[0.002, 0.005],
    n_epochs=[20, 30],
)

# Exhaustive search with 3-fold cross-validation, scoring both RMSE and MAE
grid_search = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
grid_search.fit(train_dataset)


In [None]:
# SVD instance configured with the hyperparameters that scored the best
# cross-validated MAE in the grid search
best_svd = grid_search.best_estimator['mae']

# Refit that model on every available rating
trainset = train_dataset.build_full_trainset()
best_svd.fit(trainset)

# Score the fitted model on the very ratings it was trained on.
# This is a *training* error: it measures how well the model fits data it has
# already seen and will be optimistic compared with the error on unseen
# ratings. It must not be reported as a test score.
test_dataset = train_dataset  # same ratings; name kept for cells that refer to it
testset = trainset.build_testset()
predictions = best_svd.test(testset)

# Evaluate MAE
mae = accuracy.mae(predictions)
print(f"Training MAE: {mae}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# cv_results holds one entry per hyperparameter combination, so every
# n_factors value appears many times (once per reg_all / lr_all / n_epochs
# setting). Plotting those rows directly would stack several points on each
# x position and join them with a meaningless zigzag line.
results = grid_search.cv_results
all_factors = np.asarray(results['param_n_factors'])
all_mean_mae = np.asarray(results['mean_test_mae'], dtype=float)
all_std_mae = np.asarray(results['std_test_mae'], dtype=float)

# Collapse to one point per n_factors: the combination with the lowest mean
# cross-validated MAE among those sharing that n_factors value, together with
# that same combination's standard deviation across folds.
param_factors = np.unique(all_factors)  # sorted ascending, one entry per value
rows_by_factor = [np.flatnonzero(all_factors == k) for k in param_factors]
best_rows = [rows[np.argmin(all_mean_mae[rows])] for rows in rows_by_factor]
mean_mae = all_mean_mae[best_rows]
std_mae = all_std_mae[best_rows]

# Plotting the results
plt.figure(figsize=(8, 6))
plt.errorbar(param_factors, mean_mae, yerr=std_mae, fmt='o-', capsize=5, label='Mean MAE ± StdDev', color='blue')
plt.xlabel('Number of Latent Factors (n_factors)', fontsize=12)
plt.ylabel('Mean Absolute Error (MAE)', fontsize=12)
plt.title('Hyperparameter Selection for SVD: Impact of n_factors', fontsize=14)
plt.xticks(param_factors, fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(fontsize=12)
plt.tight_layout()

# Save first, then display (show() may clear the figure in some backends)
plt.savefig('hyperparameter_selection_plot.png')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Record the training MAE reached after 1, 2, ..., n_epochs epochs of SGD.
n_epochs = 50  # Example: Training with 50 epochs
# A fixed random_state makes every refit below start from the same
# initialisation, so the points form one consistent curve instead of 50
# unrelated random restarts.
svd = SVD(n_factors=100, reg_all=0.1, lr_all=0.002, n_epochs=n_epochs, random_state=0)

# The evaluation set (all ratings in the trainset) does not change between
# iterations, so build it once.
train_eval_set = trainset.build_testset()

# Track training error
train_errors = []
for epoch in range(1, n_epochs + 1):
    # SVD reads its epoch budget from `n_epochs`; assigning any other attribute
    # name is silently ignored and every refit would run the full 50 epochs.
    svd.n_epochs = epoch
    # fit() trains from scratch each time, so this loop costs
    # 1 + 2 + ... + n_epochs epochs in total.
    svd.fit(trainset)
    predictions = svd.test(train_eval_set)
    mae = accuracy.mae(predictions, verbose=False)
    train_errors.append(mae)

# Plotting the trace plot
plt.figure(figsize=(8, 6))
plt.plot(range(1, n_epochs + 1), train_errors, marker='o', color='blue', label='Training MAE')
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Mean Absolute Error (MAE)', fontsize=12)
plt.title('Training Trace Plot for SVD', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(fontsize=12)
plt.tight_layout()

# Save first, then display (show() may clear the figure in some backends)
plt.savefig('trace_plot.png')
plt.show()
