# EmoRecSys - Hybrid

[EmoRecSys Survey](https://emorecsys.pt/)

###### Imports

In [3]:
# ---- INSTALLATIONS ---- #
# !pip install scikit-surprise

In [4]:
# ---- IMPORTS ---- #
from ipywidgets import HBox, VBox, Image as WidgetImage
from sklearn.model_selection import ParameterGrid
from surprise import Reader, Dataset
from collections import defaultdict
from IPython.display import display
import pandas as pd
import numpy as np
import random

# ---- IMPORTS CF ---- #
from surprise import NMF

# ---- IMPORTS Demographic ---- #
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from scipy import stats
import warnings
import math

warnings.filterwarnings("ignore", message="Exited. *not reaching the requested tolerance.*", category=UserWarning)
warnings.filterwarnings("ignore", message="X does not have valid feature names, but KMeans was fitted with feature names", category=UserWarning)

In [5]:
## EDA are in the correspondent notebooks
dataset_demo = pd.read_csv("../data/csvs/demographic.csv")
dataset_cf = pd.read_csv("../data/csvs/ratings.csv")

## 1. Preprocessing

In [6]:
## preprocessing for Demographic dataset
user_profiles = dataset_demo.iloc[:, -27:].copy()

## one-hot encoder
for col in ['hobby_other', 'country_residence', 'city', 'education', 'gender', 'populational_aff', 'age']:
  dummies = pd.get_dummies(dataset_demo[col], prefix=col)
  user_profiles = pd.concat([dummies, user_profiles], axis=1)

user_profiles.dropna(inplace=True)
user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)

## applying PCA
pca = PCA(n_components=50)
pca.fit(user_profiles)
tve = 0 # total variance explained
for i, ve in enumerate(pca.explained_variance_ratio_):
  tve += ve
  print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i+1, ve, tve))

## keep 32 principal components, since we get a total explained variance of 90.13%.
X_pca = pca.transform(user_profiles)[:, :32]

## see dataframe
user_profiles_train = pd.DataFrame(data=X_pca, columns=['PCA'+str(i) for i in range(1, X_pca.shape[1]+1)], index=dataset_demo['id_survey'])

  user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)


PC1 - Variance explained:  0.1056 - Total Variance:  0.1056
PC2 - Variance explained:  0.0998 - Total Variance:  0.2053
PC3 - Variance explained:  0.0698 - Total Variance:  0.2751
PC4 - Variance explained:  0.0646 - Total Variance:  0.3397
PC5 - Variance explained:  0.0605 - Total Variance:  0.4002
PC6 - Variance explained:  0.0487 - Total Variance:  0.4488
PC7 - Variance explained:  0.0457 - Total Variance:  0.4946
PC8 - Variance explained:  0.0417 - Total Variance:  0.5363
PC9 - Variance explained:  0.0354 - Total Variance:  0.5717
PC10 - Variance explained:  0.0330 - Total Variance:  0.6048
PC11 - Variance explained:  0.0303 - Total Variance:  0.6351
PC12 - Variance explained:  0.0272 - Total Variance:  0.6624
PC13 - Variance explained:  0.0253 - Total Variance:  0.6877
PC14 - Variance explained:  0.0229 - Total Variance:  0.7106
PC15 - Variance explained:  0.0201 - Total Variance:  0.7308
PC16 - Variance explained:  0.0182 - Total Variance:  0.7490
PC17 - Variance explained:  0.016

### Splitting

In [7]:
def check_items(trainset, testset):
  """
  Checks if all items present in testset are also present in the trainset.
  
  Args:
    trainset (list of tuples): Each tuple represents a user-item interaction in the trainset. 
      Each tuple is expected to have the form (user, item, rating).
    testset (list of tuples): Each tuple represents a user-item interaction in the testset. 
      Each tuple is expected to have the form (user, item, rating).
    
  Returns:
    bool: True if all items in the testset are present in the trainset, otherwise returns False
  
  """
  trainset_items = set(list(item for _, item, _ in trainset))
  testset_items = set(list(item for _, item, _ in testset))

  items_unknown = list(testset_items - trainset_items)

  return len(items_unknown) == 0

def split_emorecsys(dataset, train=0.8, test=0.2):
  """
  Splits the dataset into training and testing sets, ensuring that each user's ratings are split accordingly.
  The function guarantees that each testset contains at least one relevant rating (liked item) and ensures
  that all items in the testset also appear in the trainset.

  Args:
    dataset (pandas.DataFrame): Dataframe containing user ratings. The dataframe must have columns `id_survey` (user ID),
      `id_photo` (item ID), and `like_bool` (binary rating indicating whether the user likes the item or not).
    train (float): The proportion of the dataset to include in the trainset. Default if 0.8.
    test (float): The proportion of the dataset to include in the testset. Default if 0.2.

  Returns:
    surprise_train (surprise.Trainset): A trainset formatted for use with the Surprise library.
    testset (list of tuples): A list of tuples representing the testset, where each tuple has the form (user, item, rating).
  
  """
  trainset, testset = list(), list()

  items_ratings = dataset.groupby('id_photo').size().to_dict() # get number of rating for each id_photo
  items_one_rating = [id_photo for id_photo, size in items_ratings.items() if size == 1] # get the ones with only 1 rating
  # print(items_one_rating)


  for user in list(set(dataset['id_survey'])):
    user_ratings = dataset[dataset['id_survey'] == user]
    all_ratings = list((user, id_photo, like_bool) for id_photo, like_bool in zip(user_ratings['id_photo'], user_ratings['like_bool']))
    size_ratings = len(all_ratings)

    user_trainset = list(rating for rating in all_ratings if rating[1] in items_one_rating) # start trainset with items with only one rating

    relevant = list(x for x in all_ratings if x[2] == 1)
    if relevant: # making sure that we have at least one relevant rating in the testset
      new_rating = random.sample(relevant, 1)
      user_testset = new_rating if new_rating[0] not in user_trainset else list()
    else:
      raise ValueError('The user ' + str(user) + ' did not liked any photo')

    while len(user_testset) < size_ratings*test: # add random ratings until fullfil the size of testset
      new_rating = random.choice(all_ratings)
      if new_rating not in user_testset and new_rating not in user_trainset:
        user_testset.append(new_rating)

    # adding remaining ratings to trainset
    user_trainset.extend(new_rating for new_rating in all_ratings if new_rating not in user_testset and new_rating not in user_trainset)

    assert len(user_trainset) == size_ratings*train
    assert len(user_testset) == size_ratings*test

    # print(user_trainset)
    # print(user_testset)

    trainset.extend(user_trainset)
    testset.extend(user_testset)

  if check_items(trainset, testset):
    train_df = pd.DataFrame(trainset, columns=['id_survey', 'id_photo', 'like_bool']) # like_bool
  else:
    return split_emorecsys(dataset, train, test)

  reader = Reader(rating_scale=(0,1)) # like_bool
  surprise_train = Dataset.load_from_df(train_df, reader).build_full_trainset()

  return surprise_train, testset

### Ratings Matrix

In [8]:
## now we will be creating a ratings_matrix, in this case using the `like_bool` as the rating
ratings_matrix_like = dataset_cf.pivot_table(index='id_survey', columns='id_photo', values='like_bool').reset_index(drop=True)

## 2. Modelling and Evaluating

In [9]:
cf_params = {'n_epochs': 20, 'n_factors': 100, 'reg_pu': 0.001, 'reg_qi': 0.01} # nmf

demo_params = {'affinity': 'nearest_neighbors', 'assign_labels': 'kmeans', 'eigen_solver': 'lobpcg', 'gamma': 2.9, 'n_clusters': 20, 'n_neighbors': 30}

##### Evaluation

In [10]:
def similar_users(user, model, n=5, model_dist=None, Kmeans=False):
  """
  Identifies the most similar users to a given test user based on their profiles. It can use clustering models and distance
  measures to find similar users.

  Args:
    user (int): The ID of the test user for whom similar users are to be found.
    model (object): A clustering model that has been fitted to user profiles.
    n (int): The number of similar users to return. Default is 5.
    model_dist (array, optional): Distance matrix for cluster centers. Required if Kmeans is True.
    Kmeans (bool): A flag indicating whether to use KMeans clustering for finding similar users. Default is False.

  Returns:
    users_similar (list of tuples): A list of tuples where each tuple contains a user ID and the similarity score. The list 
    is sorted in descendin order of similarity.

  """
  ## 1. buscar o cluster do user test
  test_user = user_profiles_train.loc[user]
  user_idx = user_profiles_train.index.get_loc(user)

  ## 2. buscar os users do mesmo cluster do user test
  user_cluster = model.labels_[user_idx]
  users = [id for id, cluster in enumerate(model.labels_) if cluster == user_cluster and id != user_idx]

  if len(users) < 5: # 2.5. vamos buscar mais users
    if Kmeans:
      user_dist = model_dist[user_profiles_train.index.get_loc(user)]
      sim_cluster = sorted(range(len(user_dist)), key=lambda x: user_dist[x])[1:2][0] # vamos só buscar 1 cluster extra
      users.extend([id for id, cluster in enumerate(model.labels_) if cluster == sim_cluster])

    else:
      dist = euclidean_distances([test_user], user_profiles_train.values)[0] # aqui vamos ver todos os users
      sorted_users = sorted(range(len(dist)), key=lambda x: dist[x])

      # filtrar para não haver repetição de users, ou inserção do test_user
      sorted_users_new = [x for x in sorted_users if x not in users]

      for user in sorted_users_new:
        if len(users) < 5 and user != user_idx:
          users.append(user)

  ## 3. calcular semelhanças entre o user test e os users dos clusters mais próximos
  users_similar = []
  for user_id in users:
      user_profile = user_profiles_train.iloc[user_id]
      similarity_score = 1 / (1 + euclidean_distances([test_user], [user_profile])[0])
      users_similar.append((user_id, similarity_score))

  ## 4. ordem decrescente para o valor de semelhança
  users_similar = sorted(users_similar, key=lambda x: x[1], reverse=True)

  return users_similar[:n]

In [11]:
def recommend_items_from_users(user, model, ratings_matrix, k=10):
  """
  Recommends items for a specific user based on the preferences of similar users. The recommendation is made by aggregating
  the ratings of items from users who are similar to the target user, weighted by their similarity.

  Args:
    user (int): The ID of the user for whom recommendations are to be made.
    model (object): A model object used to find similar users.
    ratings_matrix (pandas.DataFrame): A matrix where rows represent users and columns represent items, with ratings as values.
    k (int): The number of top items to recommended. Default is 10.

  Returns:
    recommended_items (list of tuples): A list of tuples, where each tuple contains an item ID and its corresponding score.
      The list contains the top k recommended items.
  
  """
  rec_users = similar_users(user, model, n=5)
  weights = [1 / (i+1) for i, _ in enumerate(rec_users)]

  users = list(user[0] for user in rec_users)
  rec_users_items = ratings_matrix.iloc[users] # items of similar users
  item_count = rec_users_items.count(axis=0) # number of ratings of each item within the similar user

  weighted_avg_rating = rec_users_items.multiply(weights, axis=0).mean(axis=0)
  weighted_scores = weighted_avg_rating * item_count # calculate a weighted score using the weighted ratings and the number of ratings per item

  recommended_items = list(weighted_scores.nlargest(k).items())

  return recommended_items

def recommend_items_from_cf(predictions, threshold=0):
  """
  Recommends items based on collaborative filtering predictions. The functions sorts predictions by estimated rating, and
  filters them based on a given threshold.

  Args:
    predictions (list of tuples): A list of predictions tuples, where each tuple contains a user ID, item ID, actual rating,
      estimated rating, and an additional information.
    threshold (float): Minimum estimated rating to consider an item for recommendation. Default is 0.

  Returns:
    recommended_items (list of tuples): A list of tuples where each tuple contains an item ID and its estimated rating. The 
      list is sorted in descending order of the estimated ratings.
  
  """
  preds = list()
  for _, iid, _, est, _ in predictions:
    preds.append((iid, est))

  preds.sort(key=lambda x: x[1], reverse=True)
  recommended_items = [(iid, est) for (iid, est) in preds if est >= threshold]

  return recommended_items

def evaluation(ratings_matrix, recommendation, relevant, k=10):
  """
  Evaluates the quality of recommendations using prediction, recall, and F1 score metrics.

  Args:
    ratings_matrix (pandas.DataFrame): A matrix where rows represent users and columns represent items, with ratings as values.
    recommendation (list of tuples): The list of recommended items, where each item is represented by a tuple containing an item ID and its score.
    relevant (list of tuples): The list of relevant items, where each item is represented by a tuple containing a user ID, item ID, and a binary 
      indicator of relevance (like_bool).
    k (int): The nuber of top items to consider for evalaution. Default is 10.

  Returns:
    precisions (list): Precision values for different values of k.
    recalls (list): Recall values for different values of k.
    f1scores (list): F1 score values for different values of k.
  
  """
  precisions, recalls, f1scores = [], [], []

  for i in range(1, k+1):
  # for i in range(k, k+1):
    precision, recall, f1score = precision_recall_at_k(recommendation, relevant, i)
    # print("K =", i, "- Precision:", precision, ", Recall:", recall, ", F1 Score:", f1score)

    precisions.append(precision)
    recalls.append(recall)
    f1scores.append(f1score)

  return precisions, recalls, f1scores

def precision_recall_at_k(recommendation, relevant, k=10):
  """
  Calculates precision, recall and F1 score at a given value of k for the recommended items compared to the relevant items.

  Args:
    recommendation (list of tuples): The list of recommended items, where each item is represented by a tuple containing an item Id and its score.
    relevant (list of tuples): The list of relevant items, where each item is represented by a tuple containing a user ID, item ID, and a binary 
      indicator of relevance (like_bool).
    k (int): The number of top items to consider for precision, recall and F1 score calculation.

  Returns:
    precision (float): Precision at k
    recall (float): Recall at k
    f1score (float): F1 score at k

  """
  relevant_items = list(item[1] for item in relevant if item[2] > 0)
  # print("ITENS RELEVANTES:", relevant_items)


  # for i in range(k):
  #   print(f"K = {i+1} - ", recommendation[:i+1])
  # print()

  rel = len(relevant_items) # total number of relevant items to the user
  rel_rec = np.sum(np.isin(recommendation[:k], relevant_items)) # number of relevant items recommended to the user

  # k is the total number of recommended items to the user
  precision = rel_rec / k # number of relevant items recommended to the user / total number of recommended items to the user
  recall = rel_rec / rel if rel != 0 else 1  # number of relevant items recommended to the user / total number of relevant items to the user
  f1score = (2*precision*recall) / (precision+recall) if (precision+recall) != 0 else 0.0

  return precision, recall, f1score

def avg_metrics(precisions, recalls, f1scores, k=10):
  """
  Calculates the average precision, recall, and F1 score across multiple users or scenarios for different values of k.

  Args:
    precisions (list of lists): Precision values for different splits.
    recalls (list of lists): Recall values for different splits.
    f1scores (list of lists): F1 scores for different splits.
    k (int): The maximum value of k to consider. Default is 10.

  Returns:
    precisions_avg (list): Average precision values for different values of k.
    recall_avg (list): Average recall values for different values of k.
    f1score_avg (list): Average F1 score values for different values of k.
  
  """
  precisions_avg, recall_avg, f1score_avg = [], [], []
  for i in range(k):
    precision, recall, f1score = [], [], []

    for prec, rec, f1 in zip(precisions, recalls, f1scores):
      precision.append(prec[i])
      recall.append(rec[i])
      f1score.append(f1[i])

    precisions_avg.append(np.round(np.mean(precision), 4))
    recall_avg.append(np.round(np.mean(recall), 4))
    f1score_avg.append(np.round(np.mean(f1score), 4))

  return precisions_avg, recall_avg, f1score_avg

##### Hybrid System

In [12]:
def hybrid_system(dataset, ratings_matrix, k=10):
  """
  Evaluates a hybrid recommendation system by combining collaborative filtering and demographic-based recommendations. It computes the 
  precision, recall, and F1 score metrics for the recommendations generated by this hybrid approach.

  Args:
    dataset (pandas.DataFrame): Dataframe containing user ratings. The dataframe must have columns `id_survey` (user ID),
      `id_photo` (item ID), and `like_bool` (binary rating indicating whether the user likes the item or not).
    ratings_matrix (pandas.DataFrame): A matrix where rows represent users and columns represent items, with ratings as values.
    k (int): The number of top items to consider for evaluation. Default is 10.
  
  """
  precisions_final, recalls_final, f1scores_final = [], [], []
  for i in range(5):
    trainset, testset = split_emorecsys(dataset)

    ## CF
    model_cf = NMF(**cf_params)
    # model_cf = CoClustering(**cf_params2)
    model_cf.fit(trainset)

    ## Demographic-based
    model_demo = SpectralClustering(**demo_params)
    model_demo.fit(user_profiles_train)

    test_users = list(set(item[0] for item in testset))
    for user in test_users:

      user_relevant = list(item for item in testset if item[0] == user) # relevant items for evaluation
      recommended_items_by_users = recommend_items_from_users(user=user, model=model_demo, ratings_matrix=ratings_matrix, k=20)

      # print("USER", user)
      # print(recommended_items_by_users)

      predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_users] # predicting for the items suggested by demographic-based system
      recommended_items_by_cf =  recommend_items_from_cf(predictions_cf, threshold=0.5)
      # print(recommended_items_by_cf)
      # print()

      weighted_scores = []
      for demo_score, cf_score in zip(recommended_items_by_users, recommended_items_by_cf):
        weighted_score = (min(demo_score[1], 1)*0.5) + (min(cf_score[1], 1)*0.5)
        weighted_scores.append((demo_score[0], weighted_score))

      weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort
      # print(weighted_scores)

      precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)

      precisions_final.append(precisions)
      recalls_final.append(recalls)
      f1scores_final.append(f1scores)

  precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

  for i in range(k):
    print(f"K = {i+1} - Precision: {precision_avg[i]}, Recall: {recall_avg[i]}, F1 Score: {f1score_avg[i]}")

## 2.1. Results

In [13]:
hybrid_system(dataset_cf, ratings_matrix_like) # nmf

K = 1 - Precision: 0.0172, Recall: 0.0092, F1 Score: 0.0115
K = 2 - Precision: 0.0196, Recall: 0.019, F1 Score: 0.0186
K = 3 - Precision: 0.0196, Recall: 0.028, F1 Score: 0.0224
K = 4 - Precision: 0.0199, Recall: 0.0389, F1 Score: 0.0255
K = 5 - Precision: 0.0211, Recall: 0.0505, F1 Score: 0.029
K = 6 - Precision: 0.0207, Recall: 0.0626, F1 Score: 0.0302
K = 7 - Precision: 0.021, Recall: 0.073, F1 Score: 0.0318
K = 8 - Precision: 0.0201, Recall: 0.0785, F1 Score: 0.0313
K = 9 - Precision: 0.0199, Recall: 0.0881, F1 Score: 0.0318
K = 10 - Precision: 0.0193, Recall: 0.0945, F1 Score: 0.0314


## 3. Recommending

In [14]:
dataset_photos = pd.read_csv("../data/csvs/photos.csv")
dataset_photos.set_index('id', inplace=True)

In [15]:
def display_image(show_list, n_show=3):
  """
  Displays a list of images in a grid format. It reads images from the specified file paths and displays them using widgets.

  Args:
    show_list (list of str): A list of item IDs representing the images to be displayed.
    n_show (int): The number of images to display per row. Default is 3.

  """
  relevant_images_widgets = []
  for item in show_list:
    image = dataset_photos.loc[item]['file_name']
    ext = dataset_photos.loc[item]['ext']
    image_path = f'../data/photos/{image}.{ext}'

    with open(image_path, "rb") as file:
      img = file.read()

    img_widget = WidgetImage(value=img, format='jpg', width=200, height=200)
    relevant_images_widgets.append(img_widget)

  display(VBox([HBox(relevant_images_widgets[i:i+n_show]) for i in range(0, len(relevant_images_widgets), n_show)]))

In [16]:
def recommending_hf(model_cf, model_demo, dataset):
  """
  Demonstrates a hybrid recommendation system that combines collaborative filtering and demographic-based recommendations. It provides a list of top recommended 
  items based on the hybrid approach.

  Args:
    model_cf: The collaborative filtering model.
    model_demo: The demographic-based model.
    dataset (pandas.DataFrame): Dataframe containing user ratings. The dataframe must have columns `id_survey` (user ID),
      `id_photo` (item ID), and `like_bool` (binary rating indicating whether the user likes the item or not).
  
  """
  trainset, testset = split_emorecsys(dataset)

  test_users = list(set(item[0] for item in testset))
  user = random.choice(test_users)
  items_relevant = list(item[1] for item in testset if item[0] == user and item[2] > 0)

  print('RELEVANT IMAGES:')
  display_image(items_relevant, len(items_relevant))

  model_demo.fit(user_profiles_train)
  recommended_items_by_users = recommend_items_from_users(user=user, model=model_demo, ratings_matrix=ratings_matrix_like, k=20)

  model_cf.fit(trainset)
  predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_users] # predicting for the items suggested by demographic-based system
  recommended_items_by_cf =  recommend_items_from_cf(predictions_cf, threshold=0.5)

  weighted_scores = []
  for demo_score, cf_score in zip(recommended_items_by_users, recommended_items_by_cf):
    weighted_score = (min(demo_score[1], 1)*0.5) + (min(cf_score[1], 1)*0.5)
    weighted_scores.append((demo_score[0], weighted_score))

  weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort

  print('TOP 5 IMAGES RECOMMENDED:')
  ratings_5 = list(iid for iid, _ in weighted_scores[:5])
  display_image(ratings_5, 5)

In [17]:
recommending_hf(NMF(**cf_params), SpectralClustering(**demo_params), dataset_cf)

RELEVANT IMAGES:


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…

TOP 5 IMAGES RECOMMENDED:


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…