# EmoRecSys - Hybrid

[EmoRecSys Survey](https://emorecsys.pt/)

###### Imports

In [1]:
# ---- INSTALLATIONS ---- #
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357248 sha256=eb98950d1f2381a2b67b21dcb4b3d91fb3157ecac5b89a8eaa67952287208d24
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [2]:
# ---- IMPORTS ---- #
from ipywidgets import HBox, VBox, Image as WidgetImage
from sklearn.model_selection import ParameterGrid
from surprise import Reader, Dataset
from collections import defaultdict
from IPython.display import display
from google.colab import drive
import pandas as pd
import numpy as np
import random

# ---- IMPORTS CF ---- #
from surprise import NMF

# ---- IMPORTS Demographic ---- #
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from scipy import stats
import warnings
import math

# Silence two specific UserWarnings (matched by their message text) rather than all warnings.
warnings.filterwarnings("ignore", message="Exited. *not reaching the requested tolerance.*", category=UserWarning)
warnings.filterwarnings("ignore", message="X does not have valid feature names, but KMeans was fitted with feature names", category=UserWarning)

# Mount Google Drive at /content/drive; the CSV files read by this notebook live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
## The exploratory data analysis (EDA) lives in the corresponding notebooks.
## dataset_demo: demographic survey answers (one `id_survey` per row).
dataset_demo = pd.read_csv(
  "/content/drive/MyDrive/THESIS/EmoRecSys/CSVs/demographic.csv"
)
## dataset_cf: the ratings table (`id_survey`, `id_photo`, `like_bool`, ...).
dataset_cf = pd.read_csv(
  "/content/drive/MyDrive/THESIS/EmoRecSys/CSVs/ratings.csv"
)

## 1. Preprocessing

In [4]:
## For the CF dataset, the only preprocessing needed would be on the final emotion score;
## for now, however, only the `like_bool` feature is used, so nothing is done here.

In [5]:
## preprocessing for Demographic dataset
## start from the last 27 columns of the demographic table
user_profiles = dataset_demo.iloc[:, -27:].copy()

## one-hot encoder: each categorical column is expanded and prepended to the profile
for col in ['hobby_other', 'country_residence', 'city', 'education', 'gender', 'populational_aff', 'age']:
  dummies = pd.get_dummies(dataset_demo[col], prefix=col)
  user_profiles = pd.concat([dummies, user_profiles], axis=1)

## drop incomplete profiles, then turn boolean cells into 1/0 (other values are left untouched)
user_profiles = user_profiles.dropna()
user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)

## applying PCA
pca = PCA(n_components=50)
pca.fit(user_profiles)
tve = 0 # total variance explained
for i, ve in enumerate(pca.explained_variance_ratio_):
  tve += ve
  print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i+1, ve, tve))

## keep 32 principal components, since we get a total explained variance of 90.13%.
X_pca = pca.transform(user_profiles)[:, :32]

## see dataframe
## The index is taken only from the rows that survived dropna(): `user_profiles` still carries
## dataset_demo's row labels, so looking them up keeps every PCA row paired with its own id_survey
## (using the full id_survey column would mismatch as soon as one row had been dropped).
user_profiles_train = pd.DataFrame(
  data=X_pca,
  columns=['PCA'+str(i) for i in range(1, X_pca.shape[1]+1)],
  index=dataset_demo.loc[user_profiles.index, 'id_survey'],
)

PC1 - Variance explained:  0.1056 - Total Variance:  0.1056
PC2 - Variance explained:  0.0998 - Total Variance:  0.2053
PC3 - Variance explained:  0.0698 - Total Variance:  0.2751
PC4 - Variance explained:  0.0646 - Total Variance:  0.3397
PC5 - Variance explained:  0.0605 - Total Variance:  0.4002
PC6 - Variance explained:  0.0487 - Total Variance:  0.4488
PC7 - Variance explained:  0.0457 - Total Variance:  0.4946
PC8 - Variance explained:  0.0417 - Total Variance:  0.5363
PC9 - Variance explained:  0.0354 - Total Variance:  0.5717
PC10 - Variance explained:  0.0330 - Total Variance:  0.6048
PC11 - Variance explained:  0.0303 - Total Variance:  0.6351
PC12 - Variance explained:  0.0272 - Total Variance:  0.6624
PC13 - Variance explained:  0.0253 - Total Variance:  0.6877
PC14 - Variance explained:  0.0229 - Total Variance:  0.7106
PC15 - Variance explained:  0.0201 - Total Variance:  0.7308
PC16 - Variance explained:  0.0182 - Total Variance:  0.7490
PC17 - Variance explained:  0.016

### Splitting

In [6]:
def check_items(trainset, testset):
  """Tell whether every item rated in `testset` is also rated in `trainset`.

  Both arguments are iterables of 3-element ratings whose second element is
  the item id. Returns True when the test set contains no item that is
  unknown to the train set, False otherwise.
  """
  seen_in_train = {item for _, item, _ in trainset}
  seen_in_test = {item for _, item, _ in testset}

  # the test items must form a subset of the train items
  return seen_in_test <= seen_in_train

def _split_user_ratings(all_ratings, pinned_items, test):
  """Split one user's ratings into (train, test) lists.

  all_ratings  -- list of (user, id_photo, like_bool) tuples of a single user
  pinned_items -- set of id_photo values that must stay in the train set
  test         -- fraction of the user's ratings to hold out
  """
  pinned_train = [rating for rating in all_ratings if rating[1] in pinned_items]
  candidates = [rating for rating in all_ratings if rating[1] not in pinned_items]

  # integer hold-out size, capped by what can actually be held out, so the sampling below
  # always terminates even when most of the user's items are pinned to the train set
  n_test = min(int(round(len(all_ratings) * test)), len(candidates))

  user_testset = []
  liked = [pos for pos, rating in enumerate(candidates) if rating[2] == 1]
  if n_test > 0 and liked:
    # guarantee one relevant rating in the test set whenever one can be held out
    user_testset.append(candidates.pop(random.choice(liked)))

  # fill the rest of the test set by sampling without replacement
  held_out = set(random.sample(range(len(candidates)), n_test - len(user_testset)))
  user_testset.extend(rating for pos, rating in enumerate(candidates) if pos in held_out)
  user_trainset = pinned_train + [rating for pos, rating in enumerate(candidates) if pos not in held_out]

  assert len(user_trainset) + len(user_testset) == len(all_ratings)
  return user_trainset, user_testset

def split_emorecsys(dataset, train=0.8, test=0.2):
  """Per-user random train/test split of the ratings, using `like_bool` as the rating.

  dataset -- DataFrame with the columns `id_survey`, `id_photo` and `like_bool`
  train   -- kept for backward compatibility; the train share is whatever is not held out
  test    -- fraction of each user's ratings to hold out (rounded to a whole number of ratings)

  Photos that were rated only once always go to the train set. The split is redrawn
  until every photo of the test set also appears in the train set.

  Returns (surprise_train, testset): a Surprise trainset built with rating scale (0, 1)
  and a list of (id_survey, id_photo, like_bool) tuples.

  Raises ValueError if some user has no rating with like_bool == 1.
  """
  items_ratings = dataset.groupby('id_photo').size() # number of ratings of each id_photo
  items_one_rating = set(items_ratings[items_ratings == 1].index) # the ones with only 1 rating

  # collect each user's ratings once; only the random part is repeated on a retry
  ratings_by_user = {}
  for user in set(dataset['id_survey']):
    user_ratings = dataset[dataset['id_survey'] == user]
    all_ratings = [(user, id_photo, like_bool) for id_photo, like_bool in zip(user_ratings['id_photo'], user_ratings['like_bool'])]

    if not any(rating[2] == 1 for rating in all_ratings):
      raise ValueError('The user ' + str(user) + ' did not liked any photo')

    ratings_by_user[user] = all_ratings

  # redraw (iteratively, so the retries cannot hit the recursion limit) until no test item is unknown to the train set
  while True:
    trainset, testset = [], []
    for all_ratings in ratings_by_user.values():
      user_trainset, user_testset = _split_user_ratings(all_ratings, items_one_rating, test)
      trainset.extend(user_trainset)
      testset.extend(user_testset)

    if check_items(trainset, testset):
      break

  train_df = pd.DataFrame(trainset, columns=['id_survey', 'id_photo', 'like_bool']) # like_bool

  reader = Reader(rating_scale=(0,1)) # like_bool
  surprise_train = Dataset.load_from_df(train_df, reader).build_full_trainset()

  return surprise_train, testset

### Ratings Matrix

In [7]:
## Build the ratings matrix (one row per user, one column per photo), using `like_bool` as the rating.
## NOTE(review): reset_index(drop=True) discards the id_survey labels, so rows are addressed by position only;
## recommend_items_from_users selects rows with .iloc using positions taken from user_profiles_train, which
## presumably relies on both tables listing the users in the same order -- confirm.
ratings_matrix_like = (
  dataset_cf
  .pivot_table(index='id_survey', columns='id_photo', values='like_bool')
  .reset_index(drop=True)
)

## 2. Modelling and Evaluating

In [8]:
# Hyper-parameters forwarded to surprise.NMF (collaborative-filtering model).
cf_params = {
  'n_epochs': 20,
  'n_factors': 100,
  'reg_pu': 0.001,
  'reg_qi': 0.01,
} # nmf

# Hyper-parameters forwarded to sklearn's SpectralClustering (demographic model).
# NOTE(review): per the scikit-learn documentation `gamma` is ignored when
# affinity='nearest_neighbors' -- confirm whether it is still meant to be here.
demo_params = {
  'affinity': 'nearest_neighbors',
  'assign_labels': 'kmeans',
  'eigen_solver': 'lobpcg',
  'gamma': 2.9,
  'n_clusters': 20,
  'n_neighbors': 30,
}

##### Evaluation

In [9]:
def similar_users(user, model, n=5, model_dist=None, Kmeans=False):
  """Return the users most similar to `user`, starting from the user's own cluster.

  user       -- index label of the target user in `user_profiles_train`
  model      -- fitted clustering model exposing `labels_` (one label per row of `user_profiles_train`)
  n          -- maximum number of similar users to return
  model_dist -- only used when Kmeans=True: per-user distances to every cluster,
                indexed as model_dist[row position][cluster]
  Kmeans     -- when True, a too-small cluster is enlarged with the second-closest cluster
                instead of with the globally nearest users

  Returns a list of at most `n` tuples (row position, similarity), sorted by decreasing
  similarity, where similarity = 1 / (1 + euclidean distance) is a plain float.

  Raises ValueError if Kmeans=True and `model_dist` is not given.
  """
  # the candidate pool is enlarged until it holds at least this many users
  # (never fewer than 5, and enough to be able to return `n` users)
  min_pool = max(n, 5)

  ## 1. profile and row position of the target user
  test_user = user_profiles_train.loc[user]
  target_vec = test_user.values.reshape(1, -1)
  user_idx = user_profiles_train.index.get_loc(user)

  ## 2. users that share the target user's cluster
  labels = np.asarray(model.labels_)
  user_cluster = labels[user_idx]
  users = [int(pos) for pos in np.flatnonzero(labels == user_cluster) if pos != user_idx]

  if len(users) < min_pool: # 2.5. the cluster is too small: fetch more users
    already = set(users)

    if Kmeans:
      if model_dist is None:
        raise ValueError('model_dist is required when Kmeans=True')

      user_dist = model_dist[user_idx]
      # only one extra cluster is fetched: the second closest one
      # (the closest one is assumed to be the user's own cluster)
      sim_cluster = int(np.argsort(user_dist, kind='stable')[1])
      users.extend(
        int(pos) for pos in np.flatnonzero(labels == sim_cluster)
        if pos != user_idx and int(pos) not in already
      )

    else:
      # look at every user, nearest first, skipping the target user and users already selected
      dist = euclidean_distances(target_vec, user_profiles_train.values)[0]
      for pos in np.argsort(dist, kind='stable'):
        if len(users) >= min_pool:
          break
        pos = int(pos)
        if pos != user_idx and pos not in already:
          users.append(pos)

  if not users:
    return []

  ## 3. similarity between the target user and every selected user (one vectorised call)
  distances = euclidean_distances(target_vec, user_profiles_train.iloc[users].values)[0]
  users_similar = [(user_id, float(1 / (1 + distance))) for user_id, distance in zip(users, distances)]

  ## 4. decreasing order of similarity
  users_similar.sort(key=lambda pair: pair[1], reverse=True)

  return users_similar[:n]

In [10]:
def recommend_items_from_users(user, model, ratings_matrix, k=10):
  """Recommend the `k` best-scoring items among those rated by the 5 users most similar to `user`.

  user           -- target user, forwarded to similar_users
  model          -- fitted clustering model, forwarded to similar_users
  ratings_matrix -- DataFrame with one row per user and one column per item; rows are
                    selected by position (presumably aligned with the positions returned
                    by similar_users -- verify against the caller)
  k              -- number of items to return

  Returns a list of (item, score) pairs, highest score first.
  """
  neighbours = similar_users(user, model, n=5)

  # the i-th most similar user (1-based) gets weight 1/i
  rank_weights = [1 / rank for rank in range(1, len(neighbours) + 1)]
  neighbour_rows = [position for position, _ in neighbours]

  neighbour_ratings = ratings_matrix.iloc[neighbour_rows] # ratings given by the similar users
  raters_per_item = neighbour_ratings.count(axis=0) # how many of them rated each item

  # score = mean of the rank-weighted ratings, scaled by the number of ratings of the item
  weighted_mean = neighbour_ratings.multiply(rank_weights, axis=0).mean(axis=0)
  item_scores = weighted_mean * raters_per_item

  return list(item_scores.nlargest(k).items())

def recommend_items_from_cf(predictions, threshold=0):
  """Turn collaborative-filtering predictions into a ranked recommendation list.

  predictions -- iterable of 5-element predictions whose 2nd element is the item id
                 and whose 4th element is the estimated rating
  threshold   -- minimum estimated rating for an item to be recommended

  Returns a list of (item id, estimate) pairs with estimate >= threshold,
  sorted by decreasing estimate.
  """
  above_threshold = [(iid, est) for _, iid, _, est, _ in predictions if est >= threshold]

  return sorted(above_threshold, key=lambda pair: pair[1], reverse=True)

def evaluation(ratings_matrix, recommendation, relevant, k=10):
  """Compute precision, recall and F1 score at every cut-off from 1 to `k`.

  ratings_matrix -- not used by this function
  recommendation -- ranked recommendation list, forwarded to precision_recall_at_k
  relevant       -- the user's test ratings, forwarded to precision_recall_at_k
  k              -- largest cut-off to evaluate

  Returns three lists (precisions, recalls, f1scores), each with one value per cut-off.
  """
  per_cutoff = [
    precision_recall_at_k(recommendation, relevant, cutoff)
    for cutoff in range(1, k + 1)
  ]

  precisions = [metrics[0] for metrics in per_cutoff]
  recalls = [metrics[1] for metrics in per_cutoff]
  f1scores = [metrics[2] for metrics in per_cutoff]

  return precisions, recalls, f1scores

def precision_recall_at_k(recommendation, relevant, k=10):
  """Precision, recall and F1 score of the first `k` recommended items.

  recommendation -- ranked list of recommendations; each entry is either an item id
                    or an (item id, score) pair
  relevant       -- the user's test ratings as (user, item id, rating) tuples;
                    an item is relevant when its rating is > 0
  k              -- cut-off; also the denominator of the precision

  Returns (precision, recall, f1score). Recall is 1 when the user has no relevant item.

  Raises ValueError if k < 1.
  """
  if k < 1:
    raise ValueError('k must be a positive integer')

  relevant_items = {item[1] for item in relevant if item[2] > 0}

  # Compare item ids only: when the entries are (item id, score) pairs, the score must not take
  # part in the membership test, otherwise a score equal to a relevant item id counts as a hit.
  top_k_items = [
    entry[0] if isinstance(entry, (tuple, list)) else entry
    for entry in recommendation[:k]
  ]

  rel = len(relevant_items) # total number of relevant items to the user
  rel_rec = sum(1 for iid in top_k_items if iid in relevant_items) # number of relevant items recommended to the user

  # k is the total number of recommended items to the user
  precision = rel_rec / k # number of relevant items recommended to the user / total number of recommended items to the user
  recall = rel_rec / rel if rel != 0 else 1  # number of relevant items recommended to the user / total number of relevant items to the user
  f1score = (2*precision*recall) / (precision+recall) if (precision+recall) != 0 else 0.0

  return precision, recall, f1score

def avg_metrics(precisions, recalls, f1scores, k):
  """Average the per-user metrics at each of the first `k` cut-offs.

  precisions, recalls, f1scores -- lists with one entry per evaluated user, each entry
                                   being a sequence with one value per cut-off
  k                             -- number of cut-offs to average

  Returns three lists (precision, recall, F1) with `k` means rounded to 4 decimals.
  """
  # only the users present in all three lists are taken into account
  n_users = min(len(precisions), len(recalls), len(f1scores))

  def _mean_at(per_user, cutoff):
    values = [row[cutoff] for row in per_user[:n_users]]
    return np.round(np.mean(values), 4)

  precisions_avg = [_mean_at(precisions, cutoff) for cutoff in range(k)]
  recall_avg = [_mean_at(recalls, cutoff) for cutoff in range(k)]
  f1score_avg = [_mean_at(f1scores, cutoff) for cutoff in range(k)]

  return precisions_avg, recall_avg, f1score_avg

##### Hybrid System

In [11]:
def hybrid_system(dataset, ratings_matrix, k=10):
  """Evaluate the hybrid (demographic + collaborative filtering) recommender.

  dataset        -- ratings DataFrame, forwarded to split_emorecsys
  ratings_matrix -- user x item ratings matrix used by the demographic recommender
  k              -- largest cut-off for precision / recall / F1

  The experiment is repeated on 5 random splits. For every test user the demographic
  recommender proposes 20 items, each of which is re-scored as the 50/50 average of its
  demographic score and the NMF estimate; the re-ranked list is then evaluated.
  Prints the metrics averaged over all users and splits; returns nothing.
  """
  precisions_final, recalls_final, f1scores_final = [], [], []
  for _ in range(5):
    trainset, testset = split_emorecsys(dataset)

    ## CF
    model_cf = NMF(**cf_params)
    # alternative CF model: CoClustering(**cf_params2) -- neither name is imported/defined in the cells shown
    model_cf.fit(trainset)

    ## Demographic-based
    model_demo = SpectralClustering(**demo_params)
    model_demo.fit(user_profiles_train)

    # group the test ratings by user once instead of rescanning the test set for every user
    testset_by_user = defaultdict(list)
    for rating in testset:
      testset_by_user[rating[0]].append(rating)

    for user, user_relevant in testset_by_user.items():
      recommended_items_by_users = recommend_items_from_users(user=user, model=model_demo, ratings_matrix=ratings_matrix, k=20)

      # re-score the items suggested by the demographic-based system with the CF estimate
      weighted_scores = []
      for iid, demo_score in recommended_items_by_users:
        cf_est = model_cf.predict(user, iid).est
        weighted_scores.append((iid, (demo_score*0.5) + (cf_est*0.5)))

      weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort

      # evaluate on item ids only, so that score values can never be mistaken for item ids
      ranked_items = [iid for iid, _ in weighted_scores]
      precisions, recalls, f1scores = evaluation(ratings_matrix, ranked_items, user_relevant, k=k)

      precisions_final.append(precisions)
      recalls_final.append(recalls)
      f1scores_final.append(f1scores)

  precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

  for i in range(k):
    print(f"K = {i+1} - Precision: {precision_avg[i]}, Recall: {recall_avg[i]}, F1 Score: {f1score_avg[i]}")

## 2.1. Results

In [None]:
# Evaluate the hybrid recommender with NMF as the collaborative-filtering model.
hybrid_system(dataset_cf, ratings_matrix_like) # nmf

K = 1 - Precision: 0.0245, Recall: 0.0125, F1 Score: 0.0162
K = 2 - Precision: 0.0252, Recall: 0.0245, F1 Score: 0.0241
K = 3 - Precision: 0.0241, Recall: 0.0344, F1 Score: 0.0277
K = 4 - Precision: 0.0245, Recall: 0.046, F1 Score: 0.0314
K = 5 - Precision: 0.025, Recall: 0.0583, F1 Score: 0.0343
K = 6 - Precision: 0.0249, Recall: 0.0697, F1 Score: 0.0361
K = 7 - Precision: 0.0235, Recall: 0.0763, F1 Score: 0.0353
K = 8 - Precision: 0.0224, Recall: 0.0822, F1 Score: 0.0347
K = 9 - Precision: 0.0221, Recall: 0.0906, F1 Score: 0.035
K = 10 - Precision: 0.0216, Recall: 0.0975, F1 Score: 0.0349


In [None]:
# NOTE(review): this call is identical to the NMF run; the results below were presumably produced
# after manually switching hybrid_system to its commented-out CoClustering model -- confirm.
hybrid_system(dataset_cf, ratings_matrix_like) # coclustering

K = 1 - Precision: 0.0209, Recall: 0.0121, F1 Score: 0.0139
K = 2 - Precision: 0.0196, Recall: 0.0217, F1 Score: 0.0195
K = 3 - Precision: 0.0225, Recall: 0.0335, F1 Score: 0.0259
K = 4 - Precision: 0.0227, Recall: 0.0446, F1 Score: 0.0291
K = 5 - Precision: 0.0216, Recall: 0.0519, F1 Score: 0.0297
K = 6 - Precision: 0.0229, Recall: 0.0665, F1 Score: 0.0333
K = 7 - Precision: 0.0223, Recall: 0.0753, F1 Score: 0.0336
K = 8 - Precision: 0.0213, Recall: 0.0818, F1 Score: 0.0332
K = 9 - Precision: 0.0209, Recall: 0.09, F1 Score: 0.0332
K = 10 - Precision: 0.02, Recall: 0.0957, F1 Score: 0.0325


## 3. Recommending

In [12]:
## Photo metadata, indexed by the `id` column so that a photo can be looked up with .loc[id].
dataset_photos = pd.read_csv(
  "/content/drive/MyDrive/THESIS/EmoRecSys/CSVs/photos.csv"
).set_index('id')

In [13]:
def display_image(show_list, n_show=3):
  """Display the photos of `show_list` as a grid of image widgets.

  show_list -- photo ids, looked up in `dataset_photos` to get the file name and extension
  n_show    -- number of images per row; values below 1 are treated as 1
  """
  # a row width of 0 (e.g. when called with len() of an empty list) would make range() raise
  n_show = max(1, n_show)

  relevant_images_widgets = []
  for item in show_list:
    photo = dataset_photos.loc[item]
    image = photo['file_name']
    ext = photo['ext']
    image_path = f'/content/drive/MyDrive/THESIS/EmoRecSys/emorecsys_survey_photos/{image}.{ext}'

    with open(image_path, "rb") as file:
      img = file.read()

    # NOTE(review): the widget format is always 'jpg', whatever the file extension is -- confirm this is intended
    img_widget = WidgetImage(value=img, format='jpg', width=200, height=200)
    relevant_images_widgets.append(img_widget)

  # one HBox per row of `n_show` images, stacked vertically
  display(VBox([HBox(relevant_images_widgets[i:i+n_show]) for i in range(0, len(relevant_images_widgets), n_show)]))

In [38]:
def recommending_hf(model_cf, model_demo, dataset):
  """Show, for one random test user, the liked test photos and the hybrid system's top-5 photos.

  model_cf   -- unfitted collaborative-filtering model (fitted here on a fresh train split)
  model_demo -- unfitted clustering model (fitted here on `user_profiles_train`)
  dataset    -- ratings DataFrame, forwarded to split_emorecsys

  Prints two captions and displays the corresponding images; returns nothing.
  """
  trainset, testset = split_emorecsys(dataset)

  test_users = list(set(item[0] for item in testset))
  user = random.choice(test_users)
  items_relevant = list(item[1] for item in testset if item[0] == user and item[2] > 0)

  print('RELEVANT IMAGES:')
  # all relevant images on a single row; the row width must stay >= 1 even with no relevant image
  display_image(items_relevant, max(1, len(items_relevant)))

  model_demo.fit(user_profiles_train)
  recommended_items_by_users = recommend_items_from_users(user=user, model=model_demo, ratings_matrix=ratings_matrix_like, k=20)

  model_cf.fit(trainset)

  # Each item suggested by the demographic-based system is combined with the CF estimate of that
  # same item (both capped at 1). Pairing the demographic list with a separately sorted/filtered
  # CF list would mix scores of different items and silently drop candidates.
  weighted_scores = []
  for iid, demo_score in recommended_items_by_users:
    cf_est = model_cf.predict(user, iid).est
    weighted_score = (min(demo_score, 1)*0.5) + (min(cf_est, 1)*0.5)
    weighted_scores.append((iid, weighted_score))

  weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort

  print('TOP 5 IMAGES RECOMMENDED:')
  ratings_5 = list(iid for iid, _ in weighted_scores[:5])
  display_image(ratings_5, 5)

In [44]:
# Demo: show one random test user's liked photos next to the hybrid system's top-5 recommendations.
recommending_hf(NMF(**cf_params), SpectralClustering(**demo_params), dataset_cf)

RELEVANT IMAGES:


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…

TOP 5 IMAGES RECOMMENDED:


VBox(children=(HBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff…