# CS/INFO 5304 Assignment 3: Recommender Systems (Part D)

**Author**: Yufan Zhang (yz2894)

---

In [10]:
import pandas as pd
import numpy as np
from scipy.linalg import svd

# Read the raw CSV inputs. None of the files carries a header row.
# business.csv is read as a single column that we label 'business'.
businesses = pd.read_csv('data/business.csv', names=['business'], header=None)

# The two user-business matrices are read with default integer column labels.
# Rows are indexed by user and columns by business in the cells that follow.
ratings, ratings_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('data/user-business.csv', 'data/user-business_test.csv')
)


In [18]:
from a2 import (
    user_user_predictor,
    item_item_predictor,
    latent_factor_predictor,
)

In [20]:
def convert_predictions_to_binary_by_threshold(predictions, threshold):
    """
    Binarize continuous prediction scores against a cutoff.

    :param predictions: Scores supporting elementwise ``>`` and ``.astype``
        (e.g. a NumPy array)
    :param threshold: Cutoff value; only scores strictly greater than it map to 1
    :return: Integer result of the same shape: 1 where score > threshold, else 0
    """
    above_cutoff = predictions > threshold
    return above_cutoff.astype(int)


def convert_predictions_to_binary_by_top_k(predictions, k):
    """
    Convert continuous predictions into binary predictions (1s and 0s)
    by marking the k entries with the highest predicted scores.

    :param predictions: 1-D array-like of continuous scores
    :param k: Number of entries to mark with 1. ``k == 0`` marks nothing;
        ``k`` larger than the number of entries marks everything.
    :return: Integer NumPy array shaped like ``predictions`` with 1 at the
        top-k positions and 0 elsewhere
    :raises ValueError: If ``k`` is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    binary_predictions = np.zeros_like(predictions, dtype=int)

    # Guard k == 0 explicitly: the slice [-0:] is [0:], which would select
    # every index and mark *all* entries instead of none.
    if k == 0:
        return binary_predictions

    top_k_indices = np.argsort(predictions)[-k:]
    binary_predictions[top_k_indices] = 1
    return binary_predictions


def _min_max_normalize(values):
    """Rescale ``values`` to [0, 1]; a constant vector maps to all zeros."""
    lowest = values.min()
    span = values.max() - lowest
    # When every score is identical the span is 0. Dividing by it would fill
    # the result with NaN, so divide by 1 instead (the numerator is already 0).
    return (values - lowest) / (span if span != 0 else 1)


def ensemble_predictor(user_index, business_indices, ratings_train, weights=None):
    """
    Ensemble predictor that combines the user-user, item-item and
    latent-factor predictors into one score per business.

    Each predictor's output is min-max normalized, the three are combined as
    a weighted sum, and the sum is min-max normalized again.

    :param user_index: Index of the user for whom predictions are made
    :param business_indices: Businesses to predict for; forwarded unchanged to
        the underlying predictors (callers in this notebook pass an int)
    :param ratings_train: Training ratings DataFrame
    :param weights: Three weights, in the order (user-user, item-item,
        latent-factor). If None, equal weighting is used.
    :return: Combined scores rescaled to [0, 1] (all zeros if the combined
        scores are constant)
    """
    prediction_user_user = user_user_predictor(user_index, ratings_train, business_indices)
    prediction_item_item = item_item_predictor(user_index, ratings_train, business_indices)
    # NOTE(review): the literal 10 is presumably the number of latent factors;
    # confirm against latent_factor_predictor's signature in a2.
    prediction_latent_factor = latent_factor_predictor(user_index, ratings_train, 10, business_indices)

    # Put all three predictors on a common [0, 1] scale before mixing them
    prediction_user_user = _min_max_normalize(prediction_user_user)
    prediction_item_item = _min_max_normalize(prediction_item_item)
    prediction_latent_factor = _min_max_normalize(prediction_latent_factor)

    # If no weights provided, default to equal weighting
    if weights is None:
        weights = [1/3, 1/3, 1/3]

    # Combine predictions
    combined_prediction = (
        weights[0] * prediction_user_user +
        weights[1] * prediction_item_item +
        weights[2] * prediction_latent_factor
    )

    # Normalize combined prediction
    return _min_max_normalize(combined_prediction)


In [21]:
from sklearn.metrics import f1_score

# Pick a handful of users to sanity-check the ensemble on.
# NOTE(review): this is NOT a true hold-out split. ratings_train is the full
# ratings frame, so the labels scored below are also visible to the
# predictors and the reported F1 is likely optimistic.
# A wider sample would be: range(5, len(ratings), 1000)
validation_indices = [5, 6, 7, 8, 9]

ratings_train = ratings
ratings_validation = ratings.iloc[validation_indices]


# Evaluate the ensemble predictor on the selected users
business_indices = 100
threshold = 0.5
top_k = 10
f1_scores = []

for user_index in validation_indices:
    predictions = ensemble_predictor(user_index, business_indices, ratings_train)

    predictions = convert_predictions_to_binary_by_top_k(predictions, top_k)
    # Alternative binarization:
    # predictions = convert_predictions_to_binary_by_threshold(predictions, threshold)

    # validation_indices are positional row numbers (see the .iloc above), so
    # fetch the labels positionally too instead of relying on the row labels
    # happening to equal the row positions.
    labels = ratings.iloc[user_index, :business_indices].values
    f1 = f1_score(labels, predictions)

    f1_scores.append(f1)

    print(f"Predictions for user {user_index}:")
    print(predictions)
    print(f"Labels for user {user_index}:")
    print(labels)
    print(f"F1 Score for user {user_index}:", f1)
    print("\n")


print("\nAverage F1 Score:", np.mean(f1_scores))

Predictions for user 5:
[0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Labels for user 5:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
F1 Score for user 5: 0.18181818181818182


Predictions for user 6:
[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
Labels for user 6:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
F1 Score for user 6: 0.5714285714285714


Predictions f

In [22]:
# Stack the rows of `ratings` on top of the rows of `ratings_test` and
# renumber them 0..n-1 so every row has a unique index.
ratings_all = pd.concat([ratings, ratings_test], ignore_index=True)

ratings_all

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
14398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
14399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Row indices of the last five rows of the combined frame.
# NOTE(review): presumably these are exactly the rows that came from
# ratings_test; confirm that file holds five users.
test_user_indices = ratings_all.index[-5:].tolist()
test_user_indices

[14397, 14398, 14399, 14400, 14401]

In [24]:
# Use the combined frame (training rows + test rows) as predictor input.
ratings_train = ratings_all

# Produce binary top-k predictions for each test user
business_indices = 100
threshold = 0.5  # not used in this cell; kept alongside the other settings
top_k = 10
test_predictions = []

for user_index in test_user_indices:
    predictions = convert_predictions_to_binary_by_top_k(
        ensemble_predictor(user_index, business_indices, ratings_train),
        top_k,
    )
    test_predictions.append(predictions)


# Write the submission file: one comma-separated line of 0/1 values per user
with open('bonus_submission.csv', 'w') as f:
    f.writelines(
        ','.join(map(str, user_row)) + '\n' for user_row in test_predictions
    )