# Competition Details
Welcome to the Amazon Reviews Recommender System Challenge! Your goal is to develop an accurate recommender system using any approach you prefer. You will train your model using Amazon product reviews and predict missing ratings for a hidden test set. Your predictions will be evaluated automatically on Kaggle, and you will be ranked based on Root Mean Squared Error (RMSE).

This is an open-ended competition where you can use any technique to improve your model’s accuracy, including collaborative filtering, matrix factorization, deep learning, or hybrid approaches.

## Data
We will use a subset of the Amazon Review dataset, which contains user-product ratings from the Electronics category.
Dataset Information:


*   UserID
*   ItemID
*   User-product ratings (1 to 5 stars)

For this challenge, you will be working with training and test datasets:


*   train_ratings.csv → Contains 80% of known ratings for training
*   test_ratings.csv → Contains 20% missing ratings, which you must predict
*   sample_submission.csv → A sample submission file to guide you

## Task Description
You are free to choose any approach to build your recommendation model, including Collaborative Filtering, Matrix Factorization, Deep Learning, and Hybrid Approaches. Feel free to use any techniques, methodologies, and approaches you want. In order to predict the missing ratings, you should train your model on train_ratings.csv and predict all missing ratings in test_ratings.csv. Then, you should also save your predictions in the required Kaggle submission format (please see the sample submission file).

## Evaluation
Evaluation is based on the final performance achieved by your best-reported model at the end of the competition.

Kaggle Link: https://www.kaggle.com/t/fb24fd522c3e43da9775a01b7979c901

# Code

In [1]:
# Amazon Reviews Recommender Systems Challenge Final Model
# Submitted by: Jeff Horowitz

# Hybrid Model Consisting of SVD, BaselineOnly, and NCF
# Test RMSE = 0.884 trained on full training dataset

# Import necessary libraries
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from surprise import SVD, Dataset, Reader, BaselineOnly
from surprise.model_selection import train_test_split
from surprise import accuracy
import random
import numpy as np

# Set seeds for reproducibility
SEED = 1392
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load data
train = pd.read_csv('train_amazon_ratings.csv')
test = pd.read_csv('test_amazon_ratings.csv')

# Load data into format for Surprice library models
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(train[['UserID', 'ItemID', 'Rating']], reader)

# Split for validation (90% train, 10% test)
#trainset, valset = train_test_split(data, test_size=0.10, random_state=SEED)

# Train on full dataset for better performance before submission
trainset = data.build_full_trainset() 

# --- SVD Model --- #
# Train the SVD model
svd_model = SVD(
    n_factors=10,
    lr_all=0.01,
    reg_all=0.2,
    n_epochs=12,
    random_state=SEED
)

svd_model.fit(trainset)

# Define the SVD prediction function
def svd_predict(user_id, item_id):
    return svd_model.predict(user_id, item_id).est

# --- NCF Model --- #
# Prepare data for PyTorch
user_ids = train['UserID'].unique()
item_ids = train['ItemID'].unique()
n_users = len(user_ids)
n_items = len(item_ids)

# Map IDs to indices
user_to_idx = {uid: idx for idx, uid in enumerate(user_ids)}
item_to_idx = {iid: idx for idx, iid in enumerate(item_ids)}

# Convert training data to tensors
train_users = torch.tensor([user_to_idx[uid] for uid in train['UserID']], dtype=torch.long)
train_items = torch.tensor([item_to_idx[iid] for iid in train['ItemID']], dtype=torch.long)
train_ratings = torch.tensor(train['Rating'].values, dtype=torch.float32)

# Define NCF model architecture
# This is a simple NCF model with two hidden layers with a ReLU activation function and dropout after each layer
# The embedding dimension is set to 8, and the model is trained with Adam optimizer and MSE loss function
class NCF(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim=8):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)

        self.fc1 = nn.Linear(embedding_dim * 2, 64)
        self.dropout1 = nn.Dropout(p=0.2)  # Dropout layer

        self.fc2 = nn.Linear(64, 32)
        self.dropout2 = nn.Dropout(p=0.2)  # Dropout layer

        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, users, items):
        user_emb = self.user_embedding(users)
        item_emb = self.item_embedding(items)
        x = torch.cat([user_emb, item_emb], dim=-1)

        x = self.relu(self.fc1(x))
        x = self.dropout1(x)

        x = self.relu(self.fc2(x))
        x = self.dropout2(x)

        x = self.output(x)
        return x.squeeze()

# Initialize and train NCF
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ncf_model = NCF(n_users, n_items, embedding_dim=8).to(device)
optimizer = optim.Adam(ncf_model.parameters(), lr=0.002, weight_decay=1e-5)
criterion = nn.MSELoss()

# Training loop
n_epochs = 15
batch_size = 64
ncf_model.train()
for epoch in range(n_epochs):
    for i in range(0, len(train_users), batch_size):
        batch_users = train_users[i:i+batch_size].to(device)
        batch_items = train_items[i:i+batch_size].to(device)
        batch_ratings = train_ratings[i:i+batch_size].to(device)
        optimizer.zero_grad()
        predictions = ncf_model(batch_users, batch_items)
        loss = criterion(predictions, batch_ratings)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}")

# NCF prediction function
def ncf_predict(user_id, item_id):
    ncf_model.eval()
    with torch.no_grad():
        user_idx = torch.tensor([user_to_idx.get(user_id, 0)], dtype=torch.long).to(device)
        item_idx = torch.tensor([item_to_idx.get(item_id, 0)], dtype=torch.long).to(device)
        pred = ncf_model(user_idx, item_idx).cpu().item()
    return pred

# --- BaselineOnly Model --- #
# Train the BaselineOnly model
baseline_model = BaselineOnly(bsl_options={
    'method': 'als',
    'n_epochs': 20,
    'reg_u': 0.3,
    'reg_i': 0.3
})

baseline_model.fit(trainset)

# Define the Baseline prediction function
def baseline_predict(user_id, item_id):
    return baseline_model.predict(user_id, item_id).est

# --- Hybrid Prediction Function with Capping --- #
# Define the model weights
weight_svd = 0.9
weight_ncf = 0.05
weight_baseline = 0.05

# Define the hybrid prediction function
def hybrid_predict(user_id, item_id):
    svd_pred = svd_predict(user_id, item_id)
    ncf_pred = ncf_predict(user_id, item_id)
    baseline_pred = baseline_predict(user_id, item_id)
    hybrid_rating = (weight_svd * svd_pred) + (weight_ncf * ncf_pred) + (weight_baseline * baseline_pred)
    # Cap predictions between 1 and 5
    return max(1, min(5, hybrid_rating))

# Uncomment the entire block below to run the hybrid model on the validation set without submission to Kaggle
# This block is commented out to avoid running it during the submission process
# This block calculates validation RMSE to allow for hyperparameter tuning and model evaluation
# ---------------------------------------------------------------------------------------------------------------------------------- #
# --- Validation RMSEs --- #
# #SVD predictions on validation set
# svd_predictions = svd_model.test(valset)
# baseline_predictions = baseline_model.test(valset)

# # NCF predictions on validation set
# ncf_model.eval()
# val_users = torch.tensor([user_to_idx.get(uid, 0) for uid, _, _ in valset], dtype=torch.long).to(device)
# val_items = torch.tensor([item_to_idx.get(iid, 0) for _, iid, _ in valset], dtype=torch.long).to(device)
# with torch.no_grad():
#     ncf_ratings = ncf_model(val_users, val_items).cpu().numpy()

# # Hybrid predictions on validation set with capping
# hybrid_predictions = []
# for svd_pred, ncf_rating, baseline_pred in zip(svd_predictions, ncf_ratings, baseline_predictions):
#     hybrid_rating = (weight_svd * svd_pred.est) + (weight_ncf * ncf_rating) + (weight_baseline * baseline_pred.est)
#     # Cap hybrid predictions between 1 and 5
#     capped_hybrid_rating = max(1, min(5, hybrid_rating))
#     hybrid_pred = svd_pred._replace(est=capped_hybrid_rating)
#     hybrid_predictions.append(hybrid_pred)

# # Calculate and print RMSEs
# print("\nValidation Results:")
# hybrid_rmse = accuracy.rmse(hybrid_predictions, verbose=False)
# print(f"Hybrid RMSE: {hybrid_rmse}")
# ---------------------------------------------------------------------------------------------------------------------------------- #

Epoch 1/15, Loss: 1.1390
Epoch 2/15, Loss: 1.2778
Epoch 3/15, Loss: 1.1693
Epoch 4/15, Loss: 0.9178
Epoch 5/15, Loss: 0.7088
Epoch 6/15, Loss: 0.6887
Epoch 7/15, Loss: 0.6373
Epoch 8/15, Loss: 0.6106
Epoch 9/15, Loss: 0.5280
Epoch 10/15, Loss: 0.4506
Epoch 11/15, Loss: 0.3895
Epoch 12/15, Loss: 0.4268
Epoch 13/15, Loss: 0.4582
Epoch 14/15, Loss: 0.2756
Epoch 15/15, Loss: 0.4089
Estimating biases using als...


In [2]:
# Use this code to generate a submission file for Kaggle in the correct format
# Define file generation function
def generate_submission_file(model_function, test_data, filename="submission.csv"):
    """Generate a submission file for Kaggle."""
    submission_data = []

    for _, row in test_data.iterrows():
        id, user_id, item_id = row['id'], row['UserID'], row['ItemID']
        predicted_rating = model_function(user_id, item_id)
        submission_data.append([id, predicted_rating])

    submission_df = pd.DataFrame(submission_data, columns=["id", "PredictedRating"])
    submission_df.to_csv(filename, index=False)
    print(f"✅ Submission file saved as {filename}")

# Call function to generate submission file
generate_submission_file(hybrid_predict, test, filename="submission_SVD_NCF_Baseline.csv")

✅ Submission file saved as submission_SVD_NCF_Baseline.csv
