In [19]:
# Import libraries
import pandas as pd
import numpy as np
import random
from surprise import Dataset, Reader
from surprise import NMF
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA

from collections import defaultdict

In [20]:
# Data Loading
ratings_df = pd.read_csv('C:/Users/lmir/Desktop/Tese/Results_sim/emorecsys_data/CSVs_8_7_2024/csvs_survey/ratings.csv', sep=',', encoding='latin-1',low_memory=False)
photos_df = pd.read_csv('C:/Users/lmir/Desktop/Tese/Results_sim/emorecsys_data/CSVs_8_7_2024/csvs_survey/photos.csv', sep=',', encoding='latin-1',low_memory=False)

demo_df = pd.read_csv('C:/Users/lmir/Desktop/Tese/Results_sim/emorecsys_data/CSVs_8_7_2024/csvs_survey/demographic.csv')

pixel_sim_resnet152_df = pd.read_csv('pixel_sim_resnet152.csv')

In [21]:
pixel_sim_resnet152_df

Unnamed: 0,Target ID,Comparison ID,Similarity Score
0,193.0,193.0,1.000001
1,2677.0,2677.0,1.000001
2,2135.0,2135.0,1.000001
3,1816.0,1816.0,1.000001
4,1341.0,1341.0,1.000001
...,...,...,...
153176,2901.0,1223.0,0.159288
153177,1858.0,652.0,0.156662
153178,2922.0,1223.0,0.149955
153179,1858.0,1223.0,0.149714


In [22]:
def check_items(trainset, testset):
  trainset_items = set(list(item for _, item, _ in trainset))
  testset_items = set(list(item for _, item, _ in testset))

  items_unknown = list(testset_items - trainset_items)

  return len(items_unknown) == 0

def split_emorecsys(dataset, train=0.8, test=0.2):
  trainset, testset = list(), list()

  items_ratings = dataset.groupby('id_photo').size().to_dict() # get number of rating for each id_photo
  items_one_rating = [id_photo for id_photo, size in items_ratings.items() if size == 1] # get the ones with only 1 rating
  # print(items_one_rating)


  for user in list(set(dataset['id_survey'])):
    user_ratings = dataset[dataset['id_survey'] == user]
    all_ratings = list((user, id_photo, like_bool) for id_photo, like_bool in zip(user_ratings['id_photo'], user_ratings['like_bool']))
    size_ratings = len(all_ratings)

    user_trainset = list(rating for rating in all_ratings if rating[1] in items_one_rating) # start trainset with items with only one rating

    relevant = list(x for x in all_ratings if x[2] == 1)
    if relevant: # making sure that we have at least one relevant rating in the testset
      new_rating = random.sample(relevant, 1)
      user_testset = new_rating if new_rating[0] not in user_trainset else list()
    else:
      raise ValueError('The user ' + str(user) + ' did not liked any photo')

    while len(user_testset) < size_ratings*test: # add random ratings until fullfil the size of testset
      new_rating = random.choice(all_ratings)
      if new_rating not in user_testset and new_rating not in user_trainset:
        user_testset.append(new_rating)

    # adding remaining ratings to trainset
    user_trainset.extend(new_rating for new_rating in all_ratings if new_rating not in user_testset and new_rating not in user_trainset)

    assert len(user_trainset) == size_ratings*train
    assert len(user_testset) == size_ratings*test

    # print(user_trainset)
    # print(user_testset)

    trainset.extend(user_trainset)
    testset.extend(user_testset)

  if check_items(trainset, testset):
    train_df = pd.DataFrame(trainset, columns=['id_survey', 'id_photo', 'like_bool']) # like_bool
  else:
    return split_emorecsys(dataset, train, test)

  reader = Reader(rating_scale=(0,1)) # like_bool
  surprise_train = Dataset.load_from_df(train_df, reader).build_full_trainset()

  return surprise_train, testset

In [23]:
surprise_train, testset = split_emorecsys(ratings_df)

In [24]:
print(surprise_train)
print(testset)

<surprise.trainset.Trainset object at 0x000001F42B30B610>
[(1, 2315, 0), (1, 1507, 1), (1, 1738, 1), (2, 1621, 1), (2, 2234, 0), (2, 2121, 1), (3, 1747, 1), (3, 675, 1), (3, 2184, 1), (4, 2933, 1), (4, 766, 1), (4, 2323, 1), (5, 2717, 1), (5, 1717, 1), (5, 1251, 1), (6, 546, 1), (6, 1730, 1), (6, 1308, 1), (7, 2789, 1), (7, 2098, 1), (7, 2877, 0), (8, 1078, 1), (8, 1925, 0), (8, 750, 0), (9, 3018, 1), (9, 1984, 1), (9, 2387, 1), (10, 55, 1), (10, 321, 0), (10, 2894, 1), (11, 2621, 1), (11, 2985, 0), (11, 2285, 1), (12, 2448, 1), (12, 400, 1), (12, 2955, 0), (13, 1775, 1), (13, 227, 0), (13, 1280, 0), (14, 2736, 0), (14, 1091, 1), (14, 1279, 1), (15, 1312, 1), (15, 2848, 1), (15, 2583, 1), (16, 2435, 1), (16, 2259, 0), (16, 60, 1), (17, 2804, 1), (17, 530, 1), (17, 80, 0), (18, 1502, 1), (18, 2868, 0), (18, 1737, 1), (19, 1801, 1), (19, 3004, 1), (19, 98, 1), (20, 2546, 0), (20, 381, 0), (20, 1013, 0), (21, 1365, 1), (21, 2412, 1), (21, 1388, 1), (22, 890, 1), (22, 2162, 1), (22, 365, 0

In [25]:
## preprocessing for Demographic dataset
user_profiles = demo_df.iloc[:, -27:].copy()

## one-hot encoder
for col in ['hobby_other', 'country_residence', 'city', 'education', 'gender', 'populational_aff', 'age']:
  dummies = pd.get_dummies(demo_df[col], prefix=col)
  user_profiles = pd.concat([dummies, user_profiles], axis=1)

user_profiles.dropna(inplace=True)
user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)

## applying PCA
pca = PCA(n_components=50)
pca.fit(user_profiles)
tve = 0 # total variance explained
for i, ve in enumerate(pca.explained_variance_ratio_):
  tve += ve
  # print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i+1, ve, tve))

## keep 32 principal components, since we get a total explained variance of 90.13%.
X_pca = pca.transform(user_profiles)[:, :32]

## see dataframe
user_profiles_train = pd.DataFrame(data=X_pca, columns=['PCA'+str(i) for i in range(1, X_pca.shape[1]+1)], index=demo_df['id_survey'])

  user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)


In [26]:
## now we will be creating a ratings_matrix, in this case using the `like_bool` as the rating
ratings_matrix_like = ratings_df.pivot_table(index='id_survey', columns='id_photo', values='like_bool').reset_index(drop=True)

In [27]:
ratings_matrix_like

id_photo,2,6,14,25,32,38,39,40,46,55,...,3032,3037,3041,3042,3047,3054,3057,3059,3062,3068
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,0.0,,,,1.0,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,,,,,,,,,,,...,,,,,,,,,,
159,,,,,,,,,,,...,,,,,,,,,,
160,,,,,,,,,,,...,,,,,,,,,,
161,,,,,1.0,,,,,,...,,,,,,,,,,


In [28]:
cf_params = {'n_epochs': 20, 'n_factors': 100, 'reg_pu': 0.001, 'reg_qi': 0.01} # nmf

demo_params = {'affinity': 'nearest_neighbors', 'assign_labels': 'kmeans', 'eigen_solver': 'lobpcg', 'gamma': 2.9, 'n_clusters': 20, 'n_neighbors': 30}

In [29]:
def similar_users(user, model, n=5, model_dist=None, Kmeans=False):

  ## 1. buscar o cluster do user test
  test_user = user_profiles_train.loc[user]
  user_idx = user_profiles_train.index.get_loc(user)

  ## 2. buscar os users do mesmo cluster do user test
  user_cluster = model.labels_[user_idx]
  users = [id for id, cluster in enumerate(model.labels_) if cluster == user_cluster and id != user_idx]

  if len(users) < 5: # 2.5. vamos buscar mais users
    if Kmeans:
      user_dist = model_dist[user_profiles_train.index.get_loc(user)]
      sim_cluster = sorted(range(len(user_dist)), key=lambda x: user_dist[x])[1:2][0] # vamos só buscar 1 cluster extra
      users.extend([id for id, cluster in enumerate(model.labels_) if cluster == sim_cluster])

    else:
      dist = euclidean_distances([test_user], user_profiles_train.values)[0] # aqui vamos ver todos os users
      sorted_users = sorted(range(len(dist)), key=lambda x: dist[x])

      # filtrar para não haver repetição de users, ou inserção do test_user
      sorted_users_new = [x for x in sorted_users if x not in users]

      for user in sorted_users_new:
        if len(users) < 5 and user != user_idx:
          users.append(user)

  ## 3. calcular semelhanças entre o user test e os users dos clusters mais próximos
  users_similar = []
  for user_id in users:
      user_profile = user_profiles_train.iloc[user_id]
      similarity_score = 1 / (1 + euclidean_distances([test_user], [user_profile])[0])
      users_similar.append((user_id, similarity_score))

  ## 4. ordem decrescente para o valor de semelhança
  users_similar = sorted(users_similar, key=lambda x: x[1], reverse=True)

  return users_similar[:n]

In [31]:
def recommend_items_from_users(user, model, ratings_matrix, k=10):
  rec_users = similar_users(user, model, n=5)
  weights = [1 / (i+1) for i, _ in enumerate(rec_users)]

  users = list(user[0] for user in rec_users)
  rec_users_items = ratings_matrix.iloc[users] # items of similar users
  item_count = rec_users_items.count(axis=0) # number of ratings of each item within the similar user

  weighted_avg_rating = rec_users_items.multiply(weights, axis=0).mean(axis=0)
  weighted_scores = weighted_avg_rating * item_count # calculate a weighted score using the weighted ratings and the number of ratings per item

  recommended_items = list(weighted_scores.nlargest(k).items())
  # print(recommended_items)

  return recommended_items

def recommend_items_from_cf(predictions, threshold=0):
  preds = list()
  for _, iid, _, est, _ in predictions:
    preds.append((iid, est))

  preds.sort(key=lambda x: x[1], reverse=True)
  recommended_items = [(iid, est) for (iid, est) in preds if est >= threshold]
  # print(recommended_items)

  return recommended_items

def evaluation(ratings_matrix, recommendation, relevant, k=10):
  precisions, recalls, f1scores = [], [], []

  for i in range(1, k+1):
  # for i in range(k, k+1):
    precision, recall, f1score = precision_recall_at_k(recommendation, relevant, i)
    # print("K =", i, "- Precision:", precision, ", Recall:", recall, ", F1 Score:", f1score)

    precisions.append(precision)
    recalls.append(recall)
    f1scores.append(f1score)

  return precisions, recalls, f1scores

def precision_recall_at_k(recommendation, relevant, k=10):
  relevant_items = list(item[1] for item in relevant if item[2] > 0)
  # print("ITENS RELEVANTES:", relevant_items)


  # for i in range(k):
  #   print(f"K = {i+1} - ", recommendation[:i+1])
  # print()

  rel = len(relevant_items) # total number of relevant items to the user
  rel_rec = np.sum(np.isin(recommendation[:k], relevant_items)) # number of relevant items recommended to the user

  # k is the total number of recommended items to the user
  precision = rel_rec / k # number of relevant items recommended to the user / total number of recommended items to the user
  recall = rel_rec / rel if rel != 0 else 1  # number of relevant items recommended to the user / total number of relevant items to the user
  f1score = (2*precision*recall) / (precision+recall) if (precision+recall) != 0 else 0.0

  return precision, recall, f1score

def avg_metrics(precisions, recalls, f1scores, k):
  precisions_avg, recall_avg, f1score_avg = [], [], []
  for i in range(k):
    precision, recall, f1score = [], [], []

    for prec, rec, f1 in zip(precisions, recalls, f1scores):
      precision.append(prec[i])
      recall.append(rec[i])
      f1score.append(f1[i])

    precisions_avg.append(np.round(np.mean(precision), 4))
    recall_avg.append(np.round(np.mean(recall), 4))
    f1score_avg.append(np.round(np.mean(f1score), 4))

  return precisions_avg, recall_avg, f1score_avg

In [32]:
def hybrid_system(dataset, ratings_matrix, k=10):

  precisions_final, recalls_final, f1scores_final = [], [], []
  for i in range(5):
    trainset, testset = split_emorecsys(dataset)

    ## CF
    model_cf = NMF(**cf_params)
    # model_cf = CoClustering(**cf_params2)
    model_cf.fit(trainset)

    ## Demographic-based
    model_demo = SpectralClustering(**demo_params)
    model_demo.fit(user_profiles_train)

    test_users = list(set(item[0] for item in testset))
    for user in test_users:

      user_relevant = list(item for item in testset if item[0] == user) # relevant items for evaluation
      recommended_items_by_users = recommend_items_from_users(user=user, model=model_demo, ratings_matrix=ratings_matrix, k=20)
      # print(recommend_items_from_users)

      # print("USER", user)
      # print(recommended_items_by_users)

      predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_users] # predicting for the items suggested by demographic-based system
      recommended_items_by_cf =  recommend_items_from_cf(predictions_cf, threshold=0.5)
      # print(recommended_items_by_cf)
      # print()

      weighted_scores = []
      for demo_score, cf_score in zip(recommended_items_by_users, recommended_items_by_cf):
        weighted_score = (min(demo_score[1], 1)*0.5) + (min(cf_score[1], 1)*0.5)
        weighted_scores.append((demo_score[0], weighted_score))

      weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort
      # print(weighted_scores)

      precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)

      precisions_final.append(precisions)
      recalls_final.append(recalls)
      f1scores_final.append(f1scores)

  precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

  for i in range(k):
    print(f"K = {i+1} - Precision: {precision_avg[i]}, Recall: {recall_avg[i]}, F1 Score: {f1score_avg[i]}")

In [33]:
hybrid_system(ratings_df, ratings_matrix_like) # nmf



K = 1 - Precision: 0.0184, Recall: 0.0108, F1 Score: 0.0123
K = 2 - Precision: 0.0239, Recall: 0.026, F1 Score: 0.0234
K = 3 - Precision: 0.0225, Recall: 0.0339, F1 Score: 0.0258
K = 4 - Precision: 0.0215, Recall: 0.0454, F1 Score: 0.028
K = 5 - Precision: 0.0213, Recall: 0.0554, F1 Score: 0.0298
K = 6 - Precision: 0.0217, Recall: 0.0683, F1 Score: 0.0319
K = 7 - Precision: 0.0205, Recall: 0.0746, F1 Score: 0.0313
K = 8 - Precision: 0.0195, Recall: 0.0804, F1 Score: 0.0306
K = 9 - Precision: 0.0196, Recall: 0.0885, F1 Score: 0.0314
K = 10 - Precision: 0.0185, Recall: 0.0922, F1 Score: 0.0302


In [58]:
def recommend_top_n_items(trainset, testset, similarity_df, n):
    from collections import defaultdict

    test_users = set(item[0] for item in testset)
    recommendations = {}

    train_df = pd.DataFrame(trainset.all_ratings(), columns=['uid', 'iid', 'rating'])
    train_df['uid'] = train_df['uid'].apply(trainset.to_raw_uid)
    train_df['iid'] = train_df['iid'].apply(trainset.to_raw_iid)

    test_df = pd.DataFrame(testset, columns=['uid', 'iid', 'rating'])

    for user in test_users:
        user_train_df = train_df[train_df['uid'] == user]
        items_seen_by_user = set(user_train_df['iid'])
        liked_items_train = set(user_train_df[user_train_df['rating'] > 0]['iid'])

        user_test_df = test_df[(test_df['uid'] == user) & (test_df['rating'] > 0)]
        relevant_items = set(user_test_df['iid'])

        all_sim_items = set()
        for item in liked_items_train:
            top_sim_items = similar_items(item, similarity_df, items_seen_by_user, n)
            all_sim_items.update(top_sim_items)

        average_similarity_scores = get_average_similarity_scores(relevant_items, all_sim_items, similarity_df)
        
        # Ensure we return the top N items in the correct format: [(item_id, score), ...]
        top_n_items = get_top_n_items(average_similarity_scores, n)
        
        # Convert top_n_items to a list of tuples with (item_id, similarity_score)
        recommendations[user] = [(item, average_similarity_scores[item]) for item in top_n_items]

    return recommendations

def get_average_similarity_scores(liked_items_test, all_sim_items, similarity_df):
    similarity_dict = similarity_df.set_index(['Target ID', 'Comparison ID'])['Similarity Score'].to_dict()
    similarity_sums = defaultdict(float)
    similarity_counts = defaultdict(int)

    for liked_item in liked_items_test:
        for sim_item in all_sim_items:
            score = similarity_dict.get((liked_item, sim_item)) or similarity_dict.get((sim_item, liked_item))
            if score is not None:
                similarity_sums[sim_item] += score
                similarity_counts[sim_item] += 1

    average_similarity_scores = {item: similarity_sums[item] / similarity_counts[item] for item in similarity_sums}
    return dict(sorted(average_similarity_scores.items(), key=lambda x: x[1], reverse=True))

def get_top_n_items(average_similarity_scores, n):
    # Sort by score in descending order and return the top N items
    return list(average_similarity_scores.keys())[:n]


In [59]:
def hybrid_system_cb_cf(dataset, ratings_matrix, similarity_df, k=10):

    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df, n=20)

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))
        recommendations_cf = {}

        for user in test_users:
            user_relevant = list(item for item in testset if item[0] == user) # Relevant items for evaluation

            # Get content-based recommendations
            recommended_items_by_cb = recommendations_cb.get(user, [])
            
            # Predict ratings for items suggested by content-based system
            predictions_cf = [model_cf.predict(user, iid) for iid in recommended_items_by_cb]
            recommended_items_by_cf = recommend_items_from_cf(predictions_cf, threshold=0.5)

            # Combine recommendations
            weighted_scores = []
            for cb_score, cf_score in zip(recommended_items_by_cb, recommended_items_by_cf):
                weighted_score = (min(cb_score[1], 1) * 0.5) + (min(cf_score[1], 1) * 0.5)
                weighted_scores.append((cb_score[0], weighted_score))

            weighted_scores.sort(key=lambda x: x[1], reverse=True) # Descending sort

            # Evaluate hybrid recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)
            
            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")


In [60]:
hybrid_system_cb_cf(ratings_df, ratings_matrix_like, pixel_sim_resnet152_df, k=10)

K = 1 - Precision: 0.4282, Recall: 0.1875, F1 Score: 0.2536
K = 2 - Precision: 0.2607, Recall: 0.2225, F1 Score: 0.2338
K = 3 - Precision: 0.1767, Recall: 0.2254, F1 Score: 0.1932
K = 4 - Precision: 0.1325, Recall: 0.2254, F1 Score: 0.1631
K = 5 - Precision: 0.1060, Recall: 0.2254, F1 Score: 0.1412
K = 6 - Precision: 0.0883, Recall: 0.2254, F1 Score: 0.1245
K = 7 - Precision: 0.0759, Recall: 0.2258, F1 Score: 0.1116
K = 8 - Precision: 0.0664, Recall: 0.2258, F1 Score: 0.1010
K = 9 - Precision: 0.0590, Recall: 0.2258, F1 Score: 0.0922
K = 10 - Precision: 0.0531, Recall: 0.2258, F1 Score: 0.0848
