# EmoRecSys - Demographic

[EmoRecSys Survey](https://emorecsys.pt/)

###### Imports

In [1]:
# ---- INSTALLATIONS ---- #
# !pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357252 sha256=49f37c57f30c6d4c58bec5da0b3fe647c4cf72c64eca75adef4a8e3618afbfae
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [1]:
# ---- IMPORTS ---- #
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN
from sklearn.metrics.pairwise import euclidean_distances
from ipywidgets import HBox, VBox, Image as WidgetImage
from sklearn.model_selection import ParameterGrid
from sklearn.decomposition import PCA
from surprise import Reader, Dataset
from IPython.display import display
import pandas as pd
import numpy as np
import warnings
import random
import math

warnings.filterwarnings("ignore", message="Exited. *not reaching the requested tolerance.*", category=UserWarning)
warnings.filterwarnings("ignore", message="X does not have valid feature names, but KMeans was fitted with feature names", category=UserWarning)

## 1. Preprocessing

In [2]:
dataset_demo = pd.read_csv("../data/csvs/demographic.csv")

In [3]:
user_profiles = dataset_demo.iloc[:, -27:].copy()

# one-hot encoder
for col in ['hobby_other', 'country_residence', 'city', 'education', 'gender', 'populational_aff', 'age']:
  dummies = pd.get_dummies(dataset_demo[col], prefix=col)
  user_profiles = pd.concat([dummies, user_profiles], axis=1)

user_profiles.dropna(inplace=True)
user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)

user_profiles

  user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)


Unnamed: 0,age_18 - 24,age_25 - 30,age_31 - 40,age_41 - 50,age_51 - 60,age_61 - 70,age_71 - 80,populational_aff_Asian,populational_aff_Black,populational_aff_Hispanic,...,disease_10,disease_11,visual_acuity_Myopia,visual_acuity_Hyperopia,visual_acuity_Astigmatism,visual_acuity_Amblyopia,visual_acuity_Strabismus,visual_acuity_Age-Related Macular Degeneration,visual_acuity_Dry Eye Syndrome,visual_acuity_Keratoconus
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
159,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
160,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
161,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
## applying PCA
pca = PCA(n_components=50)
pca.fit(user_profiles)
tve = 0 # total variance explained
for i, ve in enumerate(pca.explained_variance_ratio_):
  tve += ve
  print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i+1, ve, tve))

## keep 32 principal components, since we get a total explained variance of 90.13%.
X_pca = pca.transform(user_profiles)[:, :32]

## see dataframe
user_profiles_train = pd.DataFrame(data=X_pca, columns=['PCA'+str(i) for i in range(1, X_pca.shape[1]+1)], index=dataset_demo['id_survey'])
user_profiles_train

PC1 - Variance explained:  0.1393 - Total Variance:  0.1393
PC2 - Variance explained:  0.1176 - Total Variance:  0.2569
PC3 - Variance explained:  0.0929 - Total Variance:  0.3499
PC4 - Variance explained:  0.0830 - Total Variance:  0.4329
PC5 - Variance explained:  0.0707 - Total Variance:  0.5035
PC6 - Variance explained:  0.0571 - Total Variance:  0.5606
PC7 - Variance explained:  0.0469 - Total Variance:  0.6075
PC8 - Variance explained:  0.0391 - Total Variance:  0.6466
PC9 - Variance explained:  0.0303 - Total Variance:  0.6769
PC10 - Variance explained:  0.0267 - Total Variance:  0.7036
PC11 - Variance explained:  0.0259 - Total Variance:  0.7294
PC12 - Variance explained:  0.0222 - Total Variance:  0.7517
PC13 - Variance explained:  0.0179 - Total Variance:  0.7696
PC14 - Variance explained:  0.0170 - Total Variance:  0.7866
PC15 - Variance explained:  0.0141 - Total Variance:  0.8007
PC16 - Variance explained:  0.0121 - Total Variance:  0.8128
PC17 - Variance explained:  0.010

Unnamed: 0_level_0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,PCA30,PCA31,PCA32
id_survey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.153152,-0.599326,-0.297103,-0.656924,0.106662,-0.807051,0.200322,-0.346233,-0.122735,0.006693,...,0.028252,0.014009,-0.004937,0.030204,-0.022138,-0.063420,-0.037033,-0.017280,-0.028723,0.003108
2,0.917802,0.127539,0.465011,0.055031,-0.471792,-0.681998,0.279596,-0.101780,0.141960,0.190922,...,0.080143,0.025656,0.083958,-0.105954,-0.060551,-0.045500,0.063525,-0.006774,0.020591,-0.042799
3,0.220945,-0.231837,0.346949,0.676086,-1.112158,0.057912,-0.094794,0.191934,0.699260,-0.026775,...,-0.032648,-0.037164,0.022284,-0.103991,-0.063957,-0.040833,0.039925,-0.034033,-0.083407,0.023878
4,-1.282244,0.372396,-0.223030,-0.013619,-0.642279,0.157351,0.315037,0.138742,0.458979,-0.085344,...,-0.079857,-0.037483,-0.024093,-0.065666,-0.021904,-0.018063,-0.013020,-0.080277,-0.099982,0.013756
5,-0.281642,0.536341,0.414843,-0.200954,0.416523,1.316066,1.057269,0.167470,1.068499,0.276127,...,0.132255,0.095409,0.040455,0.422172,0.209598,0.112753,0.027823,0.240734,0.010275,-0.033530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,0.492337,-0.226739,-0.294046,1.074239,0.322534,0.107655,1.103926,0.061226,-0.138016,-0.108945,...,-0.020411,0.021645,-0.045325,-0.019578,-0.079345,-0.002040,-0.030022,0.019536,-0.036367,0.010985
160,1.020553,0.499311,-0.993567,-1.135355,-0.354438,0.233279,0.055506,-0.279940,-0.335394,-0.221631,...,0.030498,-0.038430,0.021405,0.034207,-0.049248,0.045481,-0.140274,-0.006318,-0.061752,-0.296797
161,-0.529743,1.114764,-0.518206,-0.495568,0.376998,-0.274644,0.032133,0.949303,0.062039,-0.627970,...,-0.038501,-0.053301,-0.068844,-0.417189,0.035108,-0.129412,0.079626,0.851386,0.020441,-0.042268
162,1.558690,0.689319,0.110442,-0.581656,-0.496117,0.240368,-0.188080,-0.128530,-0.095288,-0.148128,...,-0.114579,-0.392569,0.140943,0.315676,-0.209576,-0.158986,-0.562053,0.032600,-0.139919,-0.132553


### Splitting

In [52]:
def check_items(trainset, testset):
  """
  Checks if all items present in testset are also present in the trainset.
  
  Args:
    trainset (list of tuples): Each tuple represents a user-item interaction in the trainset. 
      Each tuple is expected to have the form (user, item, rating).
    testset (list of tuples): Each tuple represents a user-item interaction in the testset. 
      Each tuple is expected to have the form (user, item, rating).
    
  Returns:
    bool: True if all items in the testset are present in the trainset, otherwise returns False
  
  """
  trainset_items = set(list(item for _, item, _ in trainset))
  testset_items = set(list(item for _, item, _ in testset))

  items_unknown = list(testset_items - trainset_items)

  return len(items_unknown) == 0

def split_emorecsys(dataset, train=0.8, test=0.2):
  """
  Splits the dataset into training and testing sets, ensuring that each user's ratings are split accordingly.
  The function guarantees that each testset contains at least one relevant rating (liked item) and ensures
  that all items in the testset also appear in the trainset.

  Args:
    dataset (pandas.DataFrame): Dataframe containing user ratings. The dataframe must have columns `id_survey` (user ID),
      `id_photo` (item ID), and `like_bool` (binary rating indicating whether the user likes the item or not).
    train (float): The proportion of the dataset to include in the trainset. Default if 0.8.
    test (float): The proportion of the dataset to include in the testset. Default if 0.2.

  Returns:
    surprise_train (surprise.Trainset): A trainset formatted for use with the Surprise library.
    testset (list of tuples): A list of tuples representing the testset, where each tuple has the form (user, item, rating).
  
  """
  trainset, testset = list(), list()

  items_ratings = dataset.groupby('id_photo').size().to_dict() # get number of rating for each id_photo
  items_one_rating = [id_photo for id_photo, size in items_ratings.items() if size == 1] # get the ones with only 1 rating
  # print(items_one_rating)


  for user in list(set(dataset['id_survey'])):
    user_ratings = dataset[dataset['id_survey'] == user]
    all_ratings = list((user, id_photo, like_bool) for id_photo, like_bool in zip(user_ratings['id_photo'], user_ratings['like_bool']))
    size_ratings = len(all_ratings)

    user_trainset = list(rating for rating in all_ratings if rating[1] in items_one_rating) # start trainset with items with only one rating

    relevant = list(x for x in all_ratings if x[2] == 1)
    if relevant: # making sure that we have at least one relevant rating in the testset
      new_rating = random.sample(relevant, 1)
      user_testset = new_rating if new_rating[0] not in user_trainset else list()
    else:
      raise ValueError('The user ' + str(user) + ' did not liked any photo')

    while len(user_testset) < size_ratings*test: # add random ratings until fullfil the size of testset
      new_rating = random.choice(all_ratings)
      if new_rating not in user_testset and new_rating not in user_trainset:
        user_testset.append(new_rating)

    # adding remaining ratings to trainset
    user_trainset.extend(new_rating for new_rating in all_ratings if new_rating not in user_testset and new_rating not in user_trainset)

    assert len(user_trainset) == size_ratings*train
    assert len(user_testset) == size_ratings*test

    # print(user_trainset)
    # print(user_testset)

    trainset.extend(user_trainset)
    testset.extend(user_testset)

  if check_items(trainset, testset):
    train_df = pd.DataFrame(trainset, columns=['id_survey', 'id_photo', 'like_bool']) # like_bool
  else:
    return split_emorecsys(dataset, train, test)

  reader = Reader(rating_scale=(0,1)) # like_bool
  surprise_train = Dataset.load_from_df(train_df, reader).build_full_trainset()

  return surprise_train, train_df, testset

### Ratings Matrix

In [6]:
dataset_cf = pd.read_csv("../data/csvs/ratings.csv")
dataset_cf.head()

Unnamed: 0,id,id_photo,id_survey,like_bool,anger,fear,disgust,sadness,happiness,surprise,neutral,valence,arousal,dominance
0,1,1754,1,1,0,0,0,0,0,0,5,3,1,2
1,2,1785,1,1,0,0,0,2,0,0,0,2,2,3
2,3,123,1,0,0,0,0,4,0,0,0,1,3,1
3,4,2315,1,0,0,0,0,2,0,0,0,2,1,3
4,5,1548,1,0,0,0,0,0,0,0,5,3,1,3


In [7]:
dataset_cf = dataset_cf[dataset_cf['id_survey'].isin(user_profiles_train.index.tolist())]
dataset_cf

Unnamed: 0,id,id_photo,id_survey,like_bool,anger,fear,disgust,sadness,happiness,surprise,neutral,valence,arousal,dominance
0,1,1754,1,1,0,0,0,0,0,0,5,3,1,2
1,2,1785,1,1,0,0,0,2,0,0,0,2,2,3
2,3,123,1,0,0,0,0,4,0,0,0,1,3,1
3,4,2315,1,0,0,0,0,2,0,0,0,2,1,3
4,5,1548,1,0,0,0,0,0,0,0,5,3,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2440,2441,2546,163,0,0,5,0,5,0,0,0,1,5,1
2441,2442,2097,163,0,0,0,3,0,5,0,0,2,2,1
2442,2443,1898,163,0,0,0,3,5,0,1,0,1,4,1
2443,2444,1871,163,0,0,0,0,5,0,0,0,1,1,1


In [8]:
ratings_matrix_like = dataset_cf.pivot_table(index='id_survey', columns='id_photo', values='like_bool').reset_index(drop=True)

## 2. Modelling and Evaluating

In [14]:
models_grid = {'KMeans': {'n_clusters': np.arange(5, int(math.ceil(user_profiles_train.shape[0]/6))+1, 2),
                          'init': ['k-means++', 'random'],
                          'n_init': np.arange(5, int(math.ceil(user_profiles_train.shape[0]/6))+1, 2),
                          'algorithm': ['lloyd', 'elkan']},

               'AgglomerativeClustering': {'n_clusters': np.arange(5, int(math.ceil(user_profiles_train.shape[0]/6))+1, 2),
                                           'metric': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'],
                                           'linkage': ['ward', 'complete', 'average']},

               'SpectralClustering': {'n_clusters': np.arange(5, 31, 5),
                                      'affinity': ['rbf', 'nearest_neighbors'],
                                      'gamma': np.arange(0.4, 3.1, 0.5),
                                      'n_neighbors': np.arange(5, 31, 5),
                                      'eigen_solver': ['arpack', 'lobpcg'],
                                      'assign_labels': ['kmeans', 'discretize']},

               'DBSCAN': {'eps': np.arange(0.007, 0.1, 0.007), # np.linspace(0.001, 0.1, num=100),
                          'min_samples': np.arange(1, 5),
                          'metric': ['euclidean', 'manhattan']}}

##### Evaluation Functions

In [12]:
def similar_users(user, model, n=5, model_dist=None, Kmeans=False):
  """
  Identifies the most similar users to a given test user based on their profiles. It can use clustering models and distance
  measures to find similar users.

  Args:
    user (int): The ID of the test user for whom similar users are to be found.
    model (object): A clustering model that has been fitted to user profiles.
    n (int): The number of similar users to return. Default is 5.
    model_dist (array, optional): Distance matrix for cluster centers. Required if Kmeans is True.
    Kmeans (bool): A flag indicating whether to use KMeans clustering for finding similar users. Default is False.

  Returns:
    users_similar (list of tuples): A list of tuples where each tuple contains a user ID and the similarity score. The list 
    is sorted in descendin order of similarity.

  """
  ## 1. buscar o cluster do user test
  test_user = user_profiles_train.loc[user]
  user_idx = user_profiles_train.index.get_loc(user)

  ## 2. buscar os users do mesmo cluster do user test
  user_cluster = model.labels_[user_idx]
  users = [id for id, cluster in enumerate(model.labels_) if cluster == user_cluster and id != user_idx]

  if len(users) < 5: # 2.5. vamos buscar mais users
    if Kmeans:
      user_dist = model_dist[user_profiles_train.index.get_loc(user)]
      sim_cluster = sorted(range(len(user_dist)), key=lambda x: user_dist[x])[1:2][0] # vamos só buscar 1 cluster extra
      users.extend([id for id, cluster in enumerate(model.labels_) if cluster == sim_cluster])

    else:
      dist = euclidean_distances([test_user], user_profiles_train.values)[0] # aqui vamos ver todos os users
      sorted_users = sorted(range(len(dist)), key=lambda x: dist[x])

      # filtrar para não haver repetição de users, ou inserção do test_user
      sorted_users_new = [x for x in sorted_users if x not in users]

      for user in sorted_users_new:
        if len(users) < 5 and user != user_idx:
          users.append(user)

  ## 3. calcular semelhanças entre o user test e os users dos clusters mais próximos
  users_similar = []
  for user_id in users:
      user_profile = user_profiles_train.iloc[user_id]
      similarity_score = 1 / (1 + euclidean_distances([test_user], [user_profile])[0])
      users_similar.append((user_id, similarity_score))

  ## 4. ordem decrescente para o valor de semelhança
  users_similar = sorted(users_similar, key=lambda x: x[1], reverse=True)

  return users_similar[:n]

In [None]:
def precision_recall_at_k(recommended, user_test, k=10):
  """
  Calculates the precision, recall and F1 scores at specified cutoff k for a given set of recommended items compared to user's
  test set.

  Args:
    recommended (list): A list of recommended item IDs.
    user_test (list of tuples): A list of tuples representing user0s test interaction, where each tuple contains (user, item, rating).
    k (int): The cutoff for the number of top recommendations to consider. Default is 10.
    
  Returns:
    precision (float): The precision score at k.
    recall (float): The recall score at k.
    f1score (float): The F1 score at k.

  """
  relevant_items = list(item[1] for item in user_test if item[2] > 0)

  rel = len(relevant_items) # total number of relevant items to the user
  rel_rec = np.sum(np.isin(recommended[:k], relevant_items)) # number of relevant items recommended to the user

  # k is the total number of recommended items to the user
  precision = rel_rec / k # number of relevant items recommended to the user / total number of recommended items to the user
  recall = rel_rec / rel if rel != 0 else 1  # number of relevant items recommended to the user / total number of relevant items to the user
  f1score = (2*precision*recall) / (precision+recall) if (precision+recall) != 0 else 0.0

  return precision, recall, f1score

def evaluation(recommended_items, user_test, k=10): # corrigir documentação!!
  """
  Evaluates the recommendation model by computing precision, recall, and F1 scores for a given user.

  Args:
    ratngs_matrix (pandas.DataFrame): A matrix where rows represent users and columns represent items, with ratings as values.
    user (int): The ID of the user for whom the recommendations are evaluated.
    rec_users (list of tuples): A list of tuples where each tuple contains a similar user's ID and their similarity score.
    user_test (list of tuples):  A list of tuples representing the user's test interactions, where each tuple contains (user, item, rating).
    k (int): The number of top recommendations to consider. Default is 10.

  Returns:
    precisions (list of floats): Precision scores at each k.
    recalls (list of floats): Recall scores at each k.
    f1scores (list of floats): F1 scores at each k.

  """
  precisions, recalls, f1scores = [], [], []

  # for i in range(k, k+1):
  for i in range(1, k+1):
    precision, recall, f1score = precision_recall_at_k(recommended_items, user_test, i)
    # print("K =", i, "- Precision:", precision, ", Recall:", recall, ", F1 Score:", f1score)

    precisions.append(precision)
    recalls.append(recall)
    f1scores.append(f1score)

  return precisions, recalls, f1scores

def avg_metrics(precisions, recalls, f1scores, k):
  """
  Calculates the average precision, recall, and F1 scores across multiple runs for different values of k.

  Args:
    precisions (list of lists): A list of precisions scores for each run.
    recalls (list of lists): A list of recalls scores for each run.
    f1scores (list of lists): A list of F1 scores for each run.
    k (int): The maximum value of k for which the metrics are calculated.

  Returns:
    precisions_avg (list of floats): Average precision scores for each k.
    recalls_avg (list of floats): Average recall scores for each k.
    f1scores_avg (list of floats): Average F1 scores for each k.

  """
  precisions_avg, recall_avg, f1score_avg = [], [], []
  
  for i in range(k):
    precision, recall, f1score = [], [], []

    for prec, rec, f1 in zip(precisions, recalls, f1scores):
      precision.append(prec[i])
      recall.append(rec[i])
      f1score.append(f1[i])

    precisions_avg.append(np.round(np.mean(precision), 4))
    recall_avg.append(np.round(np.mean(recall), 4))
    f1score_avg.append(np.round(np.mean(f1score), 4))

  return precisions_avg, recall_avg, f1score_avg

def gridsearch_recommendation(model, param_grid, ratings_matrix, dataset, k=10):
  """
  Performs a grid search over hyperparameters for a given recommendation model to find the best parameters based on F1 score at k.

  Args:
    model (class): The recommendation model class to be used.
    param_grid (dict): A dictionary with hyperparameters and their respective values to be searched.
    ratings_matrix (pandas.DataFrame): A matrix where rows represent users and columns represent items, with ratings as values.
    dataset (pandas.DataFrame): The dataset containing user-item interaction with columns `id_survey`, `id_photo`, and `like_bool`.
    k (int): The number of top recommendations to consider. Default is 10.
  
  """
  best_score = -1
  best_params = None

  for params in list(ParameterGrid(param_grid)):
    if (AgglomerativeClustering == model) and (params['linkage'] == 'ward' and not params['metric'] == 'euclidean'):
      pass
    else:
      algo = model(**params)
      model_dist = None

      if KMeans == model:
        model_dist = algo.fit_transform(user_profiles_train)
      else:
        algo.fit(user_profiles_train)

      precisions_final, recalls_final, f1scores_final = [], [], []
      for i in range(5):
        _, _, testset = split_emorecsys(dataset)
        test_users = list(set(item[0] for item in testset))

        for user in test_users:
          user_relevant = list(item for item in testset if item[0] == user)
          rec_users = similar_users(user=user, model=algo, model_dist=model_dist, Kmeans=(KMeans == model))

          weights = [1 / (i+1) for i, _ in enumerate(rec_users)]
          users = list(user[0] for user in rec_users)
          rec_users_items = ratings_matrix.loc[users] # items of similar users

          item_count = rec_users_items.astype(bool).sum(axis=0) # number of ratings of each item within the similar users
          item_count = item_count.replace(0, 1) # if any item has 0 rating within the similar users, it will have count 1 to avoid zero divisions

          # for each line (each user) it's applied the weighted attributed to that user, so top 1 > top 2 > top 3 > etc.
          weighted_scores = rec_users_items.multiply(weights, axis=0).sum(axis=0) / item_count

          recommended_items = weighted_scores.nlargest(k).index


          precisions, recalls, f1scores = evaluation(recommended_items, user_relevant, k=k)

          precisions_final.append(precisions)
          recalls_final.append(recalls)
          f1scores_final.append(f1scores)

      precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

      if f1score_avg[-1] > best_score:
        best_score = f1score_avg[-1]
        best_params = params

        # keeping the precision, recall and f1score values for the best model
        best_precisions, best_recalls, best_f1scores = precision_avg, recall_avg, f1score_avg

  print(f'Best parameters: {best_params}')
  print(f'Best F1 Score at K = 10: {best_score}\n')

  for i in range(k):
    print(f"K = {i+1} - Precision: {best_precisions[i]}, Recall: {best_recalls[i]}, F1 Score: {best_f1scores[i]}")

### 2.1. KMeans

In [None]:
gridsearch_recommendation(KMeans, models_grid['KMeans'], ratings_matrix_like, dataset_cf, k=10)

Best parameters: {'algorithm': 'elkan', 'init': 'random', 'n_clusters': 19, 'n_init': 19}
Best F1 Score at K = 10: 0.0352

K = 1 - Precision: 0.0245, Recall: 0.0121, F1 Score: 0.0145
K = 2 - Precision: 0.0227, Recall: 0.0211, F1 Score: 0.0205
K = 3 - Precision: 0.0266, Recall: 0.0368, F1 Score: 0.0296
K = 4 - Precision: 0.0264, Recall: 0.0485, F1 Score: 0.0329
K = 5 - Precision: 0.0263, Recall: 0.0593, F1 Score: 0.0352
K = 6 - Precision: 0.0243, Recall: 0.0656, F1 Score: 0.0345
K = 7 - Precision: 0.0244, Recall: 0.0783, F1 Score: 0.0362
K = 8 - Precision: 0.0225, Recall: 0.0824, F1 Score: 0.0346
K = 9 - Precision: 0.0228, Recall: 0.0955, F1 Score: 0.036
K = 10 - Precision: 0.0218, Recall: 0.101, F1 Score: 0.0352


### 2.2. AgglomerativeClustering

In [None]:
gridsearch_recommendation(AgglomerativeClustering, models_grid['AgglomerativeClustering'], ratings_matrix_like, dataset_cf, k=10)

Best parameters: {'linkage': 'average', 'metric': 'euclidean', 'n_clusters': 21}
Best F1 Score at K = 10: 0.0335

K = 1 - Precision: 0.0147, Recall: 0.0072, F1 Score: 0.0084
K = 2 - Precision: 0.0172, Recall: 0.0155, F1 Score: 0.0155
K = 3 - Precision: 0.018, Recall: 0.0235, F1 Score: 0.0197
K = 4 - Precision: 0.0209, Recall: 0.0391, F1 Score: 0.0263
K = 5 - Precision: 0.0211, Recall: 0.0489, F1 Score: 0.0286
K = 6 - Precision: 0.0211, Recall: 0.0607, F1 Score: 0.0305
K = 7 - Precision: 0.02, Recall: 0.0671, F1 Score: 0.0301
K = 8 - Precision: 0.0207, Recall: 0.0777, F1 Score: 0.032
K = 9 - Precision: 0.0213, Recall: 0.089, F1 Score: 0.0337
K = 10 - Precision: 0.0207, Recall: 0.0961, F1 Score: 0.0335


### 2.3. SpectralClustering

In [None]:
gridsearch_recommendation(SpectralClustering, models_grid['SpectralClustering'], ratings_matrix_like, dataset_cf, k=10)

[9.63123883e-14 5.25913712e-07 5.00441298e-08 7.59276151e-07
 5.31056338e-07 2.46494188e-06]
not reaching the requested tolerance 2.428889274597168e-06.
  _, diffusion_map = lobpcg(


Best parameters: {'affinity': 'nearest_neighbors', 'assign_labels': 'kmeans', 'eigen_solver': 'lobpcg', 'gamma': 2.9, 'n_clusters': 20, 'n_neighbors': 30}
Best F1 Score at K = 10: 0.0378

K = 1 - Precision: 0.0196, Recall: 0.0125, F1 Score: 0.0129
K = 2 - Precision: 0.0202, Recall: 0.0219, F1 Score: 0.0193
K = 3 - Precision: 0.0225, Recall: 0.0344, F1 Score: 0.0258
K = 4 - Precision: 0.0242, Recall: 0.0497, F1 Score: 0.0312
K = 5 - Precision: 0.0258, Recall: 0.064, F1 Score: 0.0355
K = 6 - Precision: 0.0252, Recall: 0.0734, F1 Score: 0.0364
K = 7 - Precision: 0.0235, Recall: 0.0785, F1 Score: 0.0352
K = 8 - Precision: 0.0244, Recall: 0.092, F1 Score: 0.0377
K = 9 - Precision: 0.0239, Recall: 0.1006, F1 Score: 0.0378
K = 10 - Precision: 0.0233, Recall: 0.1092, F1 Score: 0.0378


### 2.4. DBSCAN

In [None]:
gridsearch_recommendation(DBSCAN, models_grid['DBSCAN'], ratings_matrix_like, dataset_cf, k=10)

Best parameters: {'eps': 0.056, 'metric': 'euclidean', 'min_samples': 1}
Best F1 Score at K = 10: 0.0346

K = 1 - Precision: 0.0221, Recall: 0.0104, F1 Score: 0.0129
K = 2 - Precision: 0.0209, Recall: 0.0186, F1 Score: 0.0186
K = 3 - Precision: 0.0196, Recall: 0.0262, F1 Score: 0.0215
K = 4 - Precision: 0.0233, Recall: 0.0423, F1 Score: 0.0291
K = 5 - Precision: 0.0231, Recall: 0.0505, F1 Score: 0.0309
K = 6 - Precision: 0.0235, Recall: 0.062, F1 Score: 0.0333
K = 7 - Precision: 0.0228, Recall: 0.0722, F1 Score: 0.0338
K = 8 - Precision: 0.0218, Recall: 0.0798, F1 Score: 0.0335
K = 9 - Precision: 0.0213, Recall: 0.0869, F1 Score: 0.0335
K = 10 - Precision: 0.0215, Recall: 0.0982, F1 Score: 0.0346


## 3. Recommending

In [14]:
dataset_photos = pd.read_csv("../data/csvs/photos.csv")
dataset_photos.set_index('id', inplace=True)

In [None]:
def display_image(show_list, n_show=3):
  """
  Displays a list of images in a grid format. It reads images from the specified file paths and displays them using widgets.

  Args:
    show_list (list of str): A list of item IDs representing the images to be displayed.
    n_show (int): The number of images to display per row. Default is 3.

  """
  relevant_images_widgets = []
  for item in show_list:
    image = dataset_photos.loc[item]['file_name']
    ext = dataset_photos.loc[item]['ext']
    image_path = f'../data/photos/{image}.{ext}'

    # print(image)

    with open(image_path, "rb") as file:
      img = file.read()

    img_widget = WidgetImage(value=img, format='jpg', width=200, height=200)
    relevant_images_widgets.append(img_widget)

  display(VBox([HBox(relevant_images_widgets[i:i+n_show]) for i in range(0, len(relevant_images_widgets), n_show)]))

In [None]:
def recommending_demo(model, params, dataset):
  """
  Demonstrates a recommendation system by selecting a random user from the testset, displaying their
  relevant images, and showing the top 5 recommended images based on the similarity with other users.

  Args:
    model (class): The recommendation model class to be used.
    params (dict): A dictionary of parameters to initialize the model.
    dataset (pandas.DataFrame): The dataset containing user-item interaction with columns `id_survey`, `id_photo`, 
      and `like_bool`.
  
  """
  _, trainset, testset = split_emorecsys(dataset)
  # trainset = list(trainset.itertuples(index=False, name=None))

  test_users = list(set(item[0] for item in testset))
  user = 161 #random.choice(test_users) 
  # print('LIKED IMAGES:')
  # items_liked = list(item[1] for item in trainset if item[0] == user and item[2] > 0)
  # display_image(items_liked, len(items_liked))

  items_relevant = list(item[1] for item in testset if item[0] == user and item[2] > 0)

  print('RELEVANT IMAGES:')
  display_image(items_relevant, len(items_relevant))

  algo = model(**params)
  algo.fit(user_profiles_train)

  rec_users = similar_users(user=user, model=algo, model_dist=None, Kmeans=False)
  weights = [1 / (i+1) for i, _ in enumerate(rec_users)]

  users = list(user[0] for user in rec_users)
  rec_users_items_df = ratings_matrix_like.loc[users]

  # rec_users_items = []
  # for user_id, row in rec_users_items_df.iterrows():
  #   liked = row[row == 1].index.to_list()
  #   rec_users_items.append((user_id, liked))
  # print(rec_users_items)

  item_count = rec_users_items_df.apply(lambda x: x.eq(1).sum(), axis=0)
  weighted_scores = rec_users_items_df.multiply(weights, axis=0).sum(axis=0) / item_count

  recommended_items = weighted_scores.nlargest(5).index

  print('TOP 5 IMAGES RECOMMENDED:')
  display_image(recommended_items, 5)

In [85]:
recommending_demo(KMeans, {'algorithm': 'elkan', 'init': 'random', 'n_clusters': 19, 'n_init': 19}, dataset_cf)

RELEVANT IMAGES:
pexels-photo-4764311
pexels-photo-5273211


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…

[(153, [39, 136, 186, 791, 1554, 1644, 1721, 1775, 1939, 2234, 2604]), (151, [39, 1644, 1721, 2234]), (146, [184, 414, 421, 1194, 1881, 2319, 2668, 2717, 2796, 2940]), (5, [546, 681, 698, 1251, 1287, 1308, 1442, 1513, 1627, 1730, 2143, 2416, 2581, 2940, 3054]), (135, [476, 835, 1341, 1352, 1424, 1435, 1712, 1872, 1892, 2447, 2536, 2804, 2877])]
TOP 5 IMAGES RECOMMENDED:
iaps_mikels_annotated-2791
iaps_mikels_annotated-5470
pexels-photo-6243761
pexels-photo-1042441
pexels-photo-8964120


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff…