### SVD Model ###


In [None]:
import autograd.numpy as ag_np
import numpy as np
import pandas as pd
import os

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# %pip install autograd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets

# Relative path of the MovieLens-100k data directory.
# NOTE(review): no visible cell reads DATA_DIR -- the feature-engineering cell
# builds its paths from its own `data_folder` string instead. Confirm whether
# the two are meant to be the same setting.
DATA_DIR = './data_movie_lens_100k'


In [None]:
# Load the pre-split data through the project helper from
# train_valid_test_loader; it is called with no arguments and its return value
# is unpacked into the five names below.
# NOTE(review): presumably each *_tuple holds (user_id, item_id, rating) arrays
# and n_users / n_items are entity counts -- confirm against the loader. None
# of these names are read by the visible cells.
train_tuple, valid_tuple, test_tuple, n_users, n_items = load_train_valid_test_datasets()


In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import os

# Folder that holds the MovieLens-100k CSV files
data_folder = 'data_movie_lens_100k'

# Side information about users and movies
user_info = pd.read_csv(os.path.join(data_folder, 'user_info.csv'))
movie_info = pd.read_csv(os.path.join(data_folder, 'movie_info.csv'))

# --- User features -----------------------------------------------------------
# Bucket ages into labelled intervals so age can be treated as a category
age_bin_edges = [0, 18, 30, 40, 50, 100]
age_bin_labels = ["0-18", "19-30", "31-40", "41-50", "50+"]
user_info['age_group'] = pd.cut(user_info['age'], bins=age_bin_edges, labels=age_bin_labels)

# One-hot encode both user columns in a single call (dense array output);
# each input column gets its own group of indicator columns
user_feature_cols = ['age_group', 'is_male']
encoder = OneHotEncoder(sparse_output=False)
encoded_user_info = encoder.fit_transform(user_info[user_feature_cols])
user_info_encoded = pd.DataFrame(
    encoded_user_info,
    columns=encoder.get_feature_names_out(user_feature_cols),
)

# --- Movie features ----------------------------------------------------------
# Cast release year to string so it is encoded as a category
movie_feature_cols = ['release_year']
movie_info['release_year'] = movie_info['release_year'].astype(str)
encoder_movie = OneHotEncoder(sparse_output=False)
encoded_movie_info = encoder_movie.fit_transform(movie_info[movie_feature_cols])
movie_info_encoded = pd.DataFrame(
    encoded_movie_info,
    columns=encoder_movie.get_feature_names_out(movie_feature_cols),
)

# Append the indicator columns next to the original columns
user_info = pd.concat([user_info, user_info_encoded], axis=1)
movie_info = pd.concat([movie_info, movie_info_encoded], axis=1)

# Attach the user and movie features to every rating row (left joins keep
# all ratings even when a user or movie has no side information)
ratings_path = os.path.join(data_folder, "ratings_all_development_set.csv")
train_data = (
    pd.read_csv(ratings_path)
    .merge(user_info, on="user_id", how="left")
    .merge(movie_info, on="item_id", how="left")
)

# Convert to Surprise format. Only the (user_id, item_id, rating) columns are
# handed to Surprise; the merged side-information columns are not passed on.
from surprise import Reader, Dataset
reader = Reader(rating_scale=(1, 5))
train_dataset = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader)

# Hold out 20% of the ratings for evaluation.
# NOTE: this import rebinds the name `train_test_split`, replacing the
# scikit-learn function of the same name imported at the top of the notebook.
from surprise.model_selection import train_test_split
# A fixed seed keeps the split -- and therefore every metric and plot computed
# from it -- reproducible across notebook runs.
trainset, testset = train_test_split(train_dataset, test_size=0.2, random_state=42)


In [None]:
# Show the parameter combination stored under the 'mae' key of the grid search.
# NOTE(review): `grid_search` is not defined in the visible cells; the
# `best_params['mae']` access matches Surprise's GridSearchCV -- confirm.
best_mae_params = grid_search.best_params['mae']
print("Best parameters:", best_mae_params)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Extract hyperparameter results from the grid search
# NOTE(review): `grid_search` is not defined in the visible cells; the keys
# below match Surprise's GridSearchCV.cv_results layout -- confirm.
results = grid_search.cv_results
param_factors = results['param_n_factors']
mean_mae = results['mean_test_mae']
std_mae = results['std_test_mae']

# cv_results has one row per parameter combination. If the grid varies other
# hyperparameters besides n_factors, the same n_factors value appears several
# times, and plotting the raw rows would draw a zig-zag line through repeated
# x positions. Keep one point per n_factors instead: the row with the lowest
# mean MAE. For a grid over n_factors only, this simply orders the points by
# n_factors.
factors_arr = np.asarray(param_factors)
mean_arr = np.asarray(mean_mae, dtype=float)
std_arr = np.asarray(std_mae, dtype=float)
plot_factors = np.unique(factors_arr)  # sorted, de-duplicated x values
best_rows = np.array(
    [
        np.flatnonzero(factors_arr == value)[np.argmin(mean_arr[factors_arr == value])]
        for value in plot_factors
    ],
    dtype=int,  # keeps the index array usable even when the grid is empty
)
plot_mean = mean_arr[best_rows]
plot_std = std_arr[best_rows]

# Plotting the results
plt.figure(figsize=(8, 6))
plt.errorbar(plot_factors, plot_mean, yerr=plot_std, fmt='o-', capsize=5, label='Mean MAE ± StdDev', color='blue')
plt.xlabel('Number of Latent Factors (n_factors)', fontsize=12)
plt.ylabel('Mean Absolute Error (MAE)', fontsize=12)
plt.title('Hyperparameter Selection for SVD: Impact of n_factors', fontsize=14)
plt.xticks(plot_factors, fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(fontsize=12)
plt.tight_layout()

# Save first, then display: show() may clear the figure in some backends
plt.savefig('hyperparameter_selection_plot.png')
plt.show()


In [None]:
import matplotlib.pyplot as plt
from surprise import SVD, accuracy

# Build a training trace for SVD: the training-set MAE after 1, 2, ...,
# n_epochs epochs of SGD.
n_epochs = 50  # Example: Training with 50 epochs
# random_state pins the factor initialisation so that every refit in the loop
# starts from the same point; the k-epoch model is then a prefix of the
# (k+1)-epoch model and the curve is a genuine trace rather than a set of
# unrelated runs.
svd = SVD(n_factors=100, reg_all=0.1, lr_all=0.002, n_epochs=n_epochs, random_state=0)

# The training ratings do not change between iterations, so build the
# evaluation set once instead of on every pass through the loop.
train_eval_set = trainset.build_testset()

# Track training error
train_errors = []
for epoch in range(1, n_epochs + 1):
    # fit() reads its epoch budget from `n_epochs`; an attribute with any other
    # name (e.g. `epochs`) is silently ignored and every fit would run the
    # full 50 epochs.
    svd.n_epochs = epoch
    # Surprise has no incremental fit: each call retrains from scratch, so
    # this loop costs 1 + 2 + ... + n_epochs epochs in total.
    svd.fit(trainset)
    predictions = svd.test(train_eval_set)
    mae = accuracy.mae(predictions, verbose=False)
    train_errors.append(mae)

# Plotting the trace plot
plt.figure(figsize=(8, 6))
plt.plot(range(1, n_epochs + 1), train_errors, marker='o', color='blue', label='Training MAE')
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Mean Absolute Error (MAE)', fontsize=12)
plt.title('Training Trace Plot for SVD', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(fontsize=12)
plt.tight_layout()

# Save first, then display: show() may clear the figure in some backends
plt.savefig('trace_plot.png')
plt.show()
