# EmoRecSys - Recommending for User Studies

[EmoRecSys Survey](https://emorecsys.pt/)

###### Imports

In [1]:
# ---- INSTALLATIONS ---- #
# !pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357241 sha256=d82ccf0a55c844d6287a5e9d211797ea6a5bece0fe5559172a84275b15600271
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [2]:
# ---- IMPORTS ---- #
from sklearn.metrics.pairwise import euclidean_distances
from ipywidgets import HBox, VBox, Image as WidgetImage
from sklearn.cluster import SpectralClustering
from surprise import Reader, Dataset, NMF
from sklearn.decomposition import PCA
from IPython.display import display
import pandas as pd
import numpy as np
import warnings
import random

# Silence two specific UserWarnings (matched by message) so they do not clutter the cell outputs.
# NOTE(review): the first pattern presumably targets the eigen-solver convergence warning emitted while
# fitting SpectralClustering with eigen_solver='lobpcg' -- confirm against the actual warning text.
warnings.filterwarnings("ignore", message="Exited. *not reaching the requested tolerance.*", category=UserWarning)
# NOTE(review): no KMeans model is fitted with feature names in the cells visible here; this filter may be
# a leftover from an earlier experiment -- confirm before removing.
warnings.filterwarnings("ignore", message="X does not have valid feature names, but KMeans was fitted with feature names", category=UserWarning)

Mounted at /content/drive


## PreProcessing

In [3]:
def _add_membership_flags(survey, links, id_column, prefix):
  """
  Adds one 0/1 indicator column per distinct value of `id_column` found in `links`.

  Args:
    survey (pandas.DataFrame): Survey rows; must contain an 'id_survey' column.
    links (pandas.DataFrame): Long-format table with 'id_survey' and `id_column` (one row per selected option).
    id_column (str): Name of the option-id column in `links`.
    prefix (str): Prefix of the generated indicator columns (the option id is appended as-is).

  Returns:
    pandas.DataFrame: A new frame with the indicator columns appended, in ascending option-id order.

  """
  # Collapse the long table into one list of selected option ids per survey.
  grouped = links.groupby('id_survey')[id_column].apply(list).reset_index()
  merged = pd.merge(survey, grouped, on='id_survey', how='left')

  # sorted() makes the column order deterministic; plain set iteration order is not guaranteed to be sorted.
  for value in sorted(set(links[id_column].tolist())):
    # Surveys without any selection carry NaN after the left merge, hence the isinstance check.
    merged[prefix + str(value)] = merged[id_column].apply(
      lambda selected, value=value: 1 if isinstance(selected, list) and value in selected else 0
    )

  return merged.drop(columns=[id_column])


def processing_userstudies_newdata(participant_number):
  """
  Processes data related to a participant's survey, ratings, hobbies, diseases, and visual acuities.
  It transforms the data to facilitate further analysis or modeling.

  Five CSV files are read from `./participants_data/participant_<participant_number>/`: the ratings, the survey
  answers, and three long-format tables (hobbies, diseases, visual acuities) that are turned into 0/1 indicator
  columns named `hobby_<id>`, `disease_<id>` and `visual_acuity<id>` (no underscore in the last prefix).

  Args:
    participant_number (int): The participant number used to locate their specific data files.

  Returns:
    new_survey (pandas.DataFrame): The transformed survey data including hobby, disease, and visual acuity features.
    new_ratings (pandas.DataFrame): The new ratings data for the participant.

  """
  main_path = f'./participants_data/participant_{participant_number}'
  new_ratings = pd.read_csv(f"{main_path}/new_ratings{participant_number}.csv")

  new_survey = pd.read_csv(f"{main_path}/new_surveys{participant_number}.csv", encoding="utf-8")
  # The file's own header is replaced positionally, so it must contain exactly these ten columns in this order.
  new_survey.columns = ['id_survey', 'age', 'populational_aff', 'gender', 'education', 'city',
        'country_residence', 'date_survey', 'consented', 'hobby_other']

  # (file stem, option-id column, indicator prefix) for each multi-select question.
  multi_select_sources = (
    ('new_survey_hobbies', 'id_hobby', 'hobby_'),
    ('new_survey_diseases', 'id_disease', 'disease_'),
    ('new_survey_visual_acuities', 'id_visual_acuities', 'visual_acuity'),
  )
  for file_stem, id_column, prefix in multi_select_sources:
    links = pd.read_csv(f"{main_path}/{file_stem}{participant_number}.csv")
    new_survey = _add_membership_flags(new_survey, links, id_column, prefix)

  return new_survey, new_ratings

In [4]:
# Existing demographic and ratings tables (presumably collected from earlier participants -- confirm).
dataset_demo = pd.read_csv("../data/csvs/demographic.csv", encoding="utf-8")
dataset_cf = pd.read_csv("../data/csvs/ratings.csv")

# Load participant 0 and align the survey columns with the existing demographic table:
# columns missing for this participant are created and filled with 0, and columns that
# dataset_demo does not have are dropped by the reindex.
new_demo, new_cf = processing_userstudies_newdata(participant_number=0)
new_demo = new_demo.reindex(columns=dataset_demo.columns, fill_value=0)

# Append the participant's rows after the existing ones; ignore_index renumbers the rows from 0.
new_dataset_demo = pd.concat([dataset_demo, new_demo], ignore_index=True)
new_dataset_cf = pd.concat([dataset_cf, new_cf], ignore_index=True)

In [5]:
## preprocessing for Demographic dataset
# The last 27 columns are used as they are.
# NOTE(review): presumably these are the already-encoded hobby/disease/visual-acuity flags -- confirm.
user_profiles = new_dataset_demo.iloc[:, -27:].copy()

## one-hot encoder (each new group of dummy columns is prepended to the profile table)
for col in ['hobby_other', 'country_residence', 'city', 'education', 'gender', 'populational_aff', 'age']:
  dummies = pd.get_dummies(new_dataset_demo[col], prefix=col)
  user_profiles = pd.concat([dummies, user_profiles], axis=1)

## drop incomplete profiles (new frame instead of inplace, so re-running the cell stays predictable)
user_profiles = user_profiles.dropna()

## recent pandas returns boolean dummies; cast them to 0/1 integers column by column
## (replaces the deprecated element-wise DataFrame.applymap)
user_profiles = user_profiles.apply(
  lambda column: column.astype(int) if pd.api.types.is_bool_dtype(column) else column
)

## applying PCA
pca = PCA(n_components=50)
pca.fit(user_profiles)
tve = 0 # total variance explained
for i, ve in enumerate(pca.explained_variance_ratio_):
  tve += ve
  print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i+1, ve, tve))

## keep 32 principal components, since we get a total explained variance of 90.13%.
X_pca = pca.transform(user_profiles)[:, :32]

## see dataframe
## The index must come from the rows that survived dropna(); using the full id_survey column would
## fail (length mismatch) or mislabel rows as soon as a single profile is dropped.
user_profiles_train = pd.DataFrame(
  data=X_pca,
  columns=['PCA'+str(i) for i in range(1, X_pca.shape[1]+1)],
  index=new_dataset_demo.loc[user_profiles.index, 'id_survey'],
)

PC1 - Variance explained:  0.1048 - Total Variance:  0.1048
PC2 - Variance explained:  0.0991 - Total Variance:  0.2038
PC3 - Variance explained:  0.0701 - Total Variance:  0.2739
PC4 - Variance explained:  0.0645 - Total Variance:  0.3384
PC5 - Variance explained:  0.0615 - Total Variance:  0.3999
PC6 - Variance explained:  0.0486 - Total Variance:  0.4485
PC7 - Variance explained:  0.0455 - Total Variance:  0.4940
PC8 - Variance explained:  0.0415 - Total Variance:  0.5354
PC9 - Variance explained:  0.0361 - Total Variance:  0.5715
PC10 - Variance explained:  0.0331 - Total Variance:  0.6046
PC11 - Variance explained:  0.0301 - Total Variance:  0.6347
PC12 - Variance explained:  0.0271 - Total Variance:  0.6618
PC13 - Variance explained:  0.0252 - Total Variance:  0.6870
PC14 - Variance explained:  0.0228 - Total Variance:  0.7098
PC15 - Variance explained:  0.0202 - Total Variance:  0.7300
PC16 - Variance explained:  0.0181 - Total Variance:  0.7480
PC17 - Variance explained:  0.016

In [6]:
## keep only the ratings of users that still have a demographic profile after preprocessing
new_dataset_cf = new_dataset_cf[new_dataset_cf['id_survey'].isin(user_profiles_train.index.tolist())]

## now we will be creating a ratings_matrix, in this case using the `like_bool` as the rating
## pivot_table orders its rows by id_survey and omits users without any rating, while the demographic
## functions address this matrix by row POSITION taken from user_profiles_train. Reindexing to
## user_profiles_train.index before dropping the labels guarantees that row i of the matrix belongs to
## row i of user_profiles_train (users without ratings become all-NaN rows).
ratings_matrix_like = (
  new_dataset_cf
  .pivot_table(index='id_survey', columns='id_photo', values='like_bool')
  .reindex(user_profiles_train.index)
  .reset_index(drop=True)
)

## Modelling

#### Demographic-based Functions

In [7]:
def similar_users(user, model, n=5, model_dist=None, Kmeans=False):
  """
  Identifies the most similar users to a given test user based on their profiles. It can use clustering models and distance
  measures to find similar users.

  Candidates are the other members of the test user's cluster. When that yields fewer than max(n, 5) candidates the
  pool is topped up: with `Kmeans=True` by adding every member of the closest other cluster, otherwise by adding the
  globally nearest users until the pool is full.

  Args:
    user (int): The ID of the test user (a label of `user_profiles_train.index`) for whom similar users are to be found.
    model (object): A clustering model that has been fitted to user profiles (its `labels_` attribute is read).
    n (int): The number of similar users to return. Default is 5.
    model_dist (array, optional): Distances from every user (rows) to every cluster center (columns). Required if
      Kmeans is True and the user's own cluster is too small.
    Kmeans (bool): A flag indicating whether to use KMeans clustering for finding similar users. Default is False.

  Returns:
    users_similar (list of tuples): A list of at most n tuples `(row_position, similarity)`, where `row_position` is the
    positional index of the similar user in `user_profiles_train` (not the user ID) and `similarity` is the scalar
    1 / (1 + euclidean distance). The list is sorted in descending order of similarity.

  Raises:
    ValueError: If the pool has to be extended with `Kmeans=True` but no `model_dist` was given.

  """
  ## 1. locate the test user's profile and row position
  test_user = user_profiles_train.loc[user]
  user_idx = user_profiles_train.index.get_loc(user)

  ## 2. collect the other users that share the test user's cluster
  user_cluster = model.labels_[user_idx]
  users = [idx for idx, cluster in enumerate(model.labels_) if cluster == user_cluster and idx != user_idx]

  ## 2.5. too few candidates: fetch more users
  # Never less than the historical pool size of 5, and never less than the number of users requested.
  min_pool = max(n, 5)
  if len(users) < min_pool:
    if Kmeans:
      if model_dist is None:
        raise ValueError("model_dist is required when Kmeans=True")

      # only one extra cluster is added: the closest one that is not the user's own
      user_dist = model_dist[user_idx]
      clusters_by_distance = sorted(range(len(user_dist)), key=lambda x: user_dist[x])
      sim_cluster = next((c for c in clusters_by_distance if c != user_cluster), None)
      if sim_cluster is not None:
        users.extend([idx for idx, cluster in enumerate(model.labels_) if cluster == sim_cluster])

    else:
      # here every user is considered, nearest first
      dist = euclidean_distances([test_user], user_profiles_train.values)[0]

      # skip users that are already selected, and the test user itself
      excluded = set(users)
      excluded.add(user_idx)
      for candidate in sorted(range(len(dist)), key=lambda x: dist[x]):
        if len(users) >= min_pool:
          break
        if candidate not in excluded:
          users.append(candidate)

  ## 3. similarity between the test user and every candidate
  users_similar = []
  for user_id in users:
    user_profile = user_profiles_train.iloc[user_id]
    # [0][0] extracts the scalar from the 1x1 distance matrix so the score is a number, not an array
    distance = euclidean_distances([test_user], [user_profile])[0][0]
    users_similar.append((user_id, 1 / (1 + distance)))

  ## 4. descending order of similarity
  users_similar = sorted(users_similar, key=lambda x: x[1], reverse=True)

  return users_similar[:n]

In [8]:
def recommend_items_from_users(user, model, ratings_matrix, k=10):
  """
  Scores items for `user` from the ratings of the five most similar users and returns the k best-scoring items.

  Each neighbour's ratings are scaled by a rank weight (1 for the most similar neighbour, then 1/2, 1/3, ...). The
  per-item mean of those scaled ratings is multiplied by the number of neighbours that rated the item.

  Args:
    user (int): The ID of the user for whom recommendations are to be made.
    model (object): A model object used to find similar users.
    ratings_matrix (pandas.DataFrame): One row per user (addressed by row position) and one column per item.
    k (int): The number of top items to recommend. Default is 10.

  Returns:
    recommended_items (list of tuples): Up to k `(item ID, score)` tuples, highest score first.

  """
  neighbours = similar_users(user, model, n=5)

  # similar_users returns (row position, similarity); only the position and the rank are needed here
  neighbour_rows = [neighbour[0] for neighbour in neighbours]
  rank_weights = [1 / rank for rank in range(1, len(neighbours) + 1)]

  neighbour_ratings = ratings_matrix.iloc[neighbour_rows]
  votes_per_item = neighbour_ratings.count(axis=0) # non-missing ratings per item among the neighbours

  mean_weighted_rating = neighbour_ratings.multiply(rank_weights, axis=0).mean(axis=0)
  item_scores = mean_weighted_rating * votes_per_item

  return list(item_scores.nlargest(k).items())

#### CF Functions

In [10]:
def recommend_items_from_cf(predictions, threshold=0):
  """
  Ranks collaborative-filtering predictions by estimated rating and keeps those that reach a threshold.

  Args:
    predictions (list of tuples): 5-element prediction tuples; the second element is the item ID and the fourth is
      the estimated rating. The remaining elements are ignored.
    threshold (float): Minimum estimated rating to consider an item for recommendation. Default is 0.

  Returns:
    recommended_items (list of tuples): `(item ID, estimated rating)` tuples whose estimate is >= threshold, sorted in
      descending order of the estimate (ties keep their input order).

  """
  # Rank first, filter second, so the ordering is decided on the full list of predictions.
  ranked = sorted(
    ((item_id, estimate) for _, item_id, _, estimate, _ in predictions),
    key=lambda pair: pair[1],
    reverse=True,
  )

  return [pair for pair in ranked if pair[1] >= threshold]

#### Hybrid Model

In [12]:
# Photo metadata, indexed by the photo's 'id' column so a photo can be looked up with .loc[id].
dataset_photos = pd.read_csv("../data/csvs/photos.csv").set_index('id')

In [13]:
def display_image(show_list, n_show=3):
  """
  Displays a list of images in a grid format. It reads images from the specified file paths and displays them using widgets.

  For every item the file name and extension are looked up in `dataset_photos`, and the file
  `../data/photos/<file_name>.<ext>` is read and wrapped in a 200x200 image widget.

  Args:
    show_list (iterable): Item IDs (labels of `dataset_photos.index`) representing the images to be displayed.
    n_show (int): The number of images to display per row. Default is 3.

  """
  relevant_images_widgets = []
  for item in show_list:
    photo = dataset_photos.loc[item] # single lookup instead of one per field
    ext = photo['ext']
    image_path = f"../data/photos/{photo['file_name']}.{ext}"

    with open(image_path, "rb") as file:
      img = file.read()

    # Declare the file's real format instead of always claiming 'jpg'.
    img_widget = WidgetImage(value=img, format=str(ext).lower(), width=200, height=200)
    relevant_images_widgets.append(img_widget)

  # One HBox per row of n_show images, stacked vertically.
  display(VBox([HBox(relevant_images_widgets[i:i+n_show]) for i in range(0, len(relevant_images_widgets), n_show)]))

In [11]:
def recommending_userstudies(model_cf, model_demo, dataset, user_test):
  """
  Provide a hybrid recommendation for a user based on their previously liked items and demographic information.

  The demographic model proposes 20 candidate items, the collaborative-filtering model estimates a rating for each
  candidate, and both scores are averaged with equal weight. The five best candidates are displayed.

  Side effects: `model_demo` is fitted on `user_profiles_train`, `model_cf` is fitted on `dataset`, and the liked and
  recommended images are printed/displayed. Nothing is returned.

  Args:
    model_cf (object): A collaborative filtering model used for generating recommendations based on user-item interactions.
    model_demo (object): A demographic-based model used for generating recommendations based on user profile similarities.
    dataset (pandas.DataFrame): A DataFrame containing the survey data with columns such as id_survey, id_photo, and like_bool.
    user_test (int or str): The user ID for whom recommendations are to be generated.

  """
  # Boolean mask instead of iterrows(): keeps id_photo in its own dtype and preserves the row order of the likes.
  liked_mask = (dataset['id_survey'] == user_test) & (dataset['like_bool'] == 1)
  liked_items = dataset.loc[liked_mask, 'id_photo'].drop_duplicates().tolist()

  print('PREVIOUSLY LIKED IMAGES:')
  display_image(liked_items, 5)

  model_demo.fit(user_profiles_train)
  recommended_items_by_users = recommend_items_from_users(user=user_test, model=model_demo, ratings_matrix=ratings_matrix_like, k=20)

  reader = Reader(rating_scale=(0,1)) # like_bool
  new_dataset = Dataset.load_from_df(dataset[['id_survey', 'id_photo', 'like_bool']], reader).build_full_trainset()

  model_cf.fit(new_dataset)
  # predicting only for the items suggested by the demographic-based system
  predictions_cf = [model_cf.predict(user_test, iid) for iid, _ in recommended_items_by_users]

  # Equal-weight blend of the demographic score and the CF estimate; both lists share the same item order.
  weighted_scores = [
    (iid, (demo_score*0.5) + (cf_score.est*0.5))
    for (iid, demo_score), cf_score in zip(recommended_items_by_users, predictions_cf)
  ]
  weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort

  print('\nHERE IS YOUR RECOMMENDATIONS:')
  recommendations = [iid for iid, _ in weighted_scores[:5]]
  display_image(recommendations, 5)

In [14]:
# Keyword arguments unpacked into the NMF (collaborative filtering) constructor.
cf_params = dict(n_epochs=20, n_factors=100, reg_pu=0.001, reg_qi=0.01)
# Keyword arguments unpacked into the SpectralClustering (demographic) constructor.
demo_params = dict(
  affinity='nearest_neighbors',
  assign_labels='kmeans',
  eigen_solver='lobpcg',
  gamma=2.9,
  n_clusters=20,
  n_neighbors=30,
)

#### Results

In [17]:
# Fit fresh NMF and SpectralClustering models and display the liked photos and the hybrid
# recommendations for the user whose id_survey is 148.
# NOTE(review): neither model receives a random_state here, so the output may differ between runs -- confirm
# whether reproducibility matters for the study.
recommending_userstudies(NMF(**cf_params), SpectralClustering(**demo_params), new_dataset_cf, user_test=148)

PREVIOUSLY LIKED IMAGES:


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…


HERE IS YOUR RECOMMENDATIONS:


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff…