In [3]:
# Import libraries
import pandas as pd
import numpy as np
import random
from surprise import Dataset, Reader
from surprise import NMF
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA

from collections import defaultdict

In [4]:
# Data Loading
ratings_df = pd.read_csv("../data/csvs/ratings.csv", sep=',', encoding='latin-1',low_memory=False)
#photos_df = pd.read_csv('C:/Users/lmir/Desktop/Tese/Results_sim/emorecsys_data/CSVs_8_7_2024/csvs_survey/photos.csv', sep=',', encoding='latin-1',low_memory=False)

demo_df = pd.read_csv('../data/csvs/demographic.csv')

pixel_sim_resnet152_df = pd.read_csv('../data/csvs/pixel_sim_resnet152.csv')

In [5]:
pixel_sim_resnet152_df

Unnamed: 0,Target ID,Comparison ID,Similarity Score
0,193.0,193.0,1.000001
1,2677.0,2677.0,1.000001
2,2135.0,2135.0,1.000001
3,1816.0,1816.0,1.000001
4,1341.0,1341.0,1.000001
...,...,...,...
153176,2901.0,1223.0,0.159288
153177,1858.0,652.0,0.156662
153178,2922.0,1223.0,0.149955
153179,1858.0,1223.0,0.149714


In [6]:
def check_items(trainset, testset):
  trainset_items = set(list(item for _, item, _ in trainset))
  testset_items = set(list(item for _, item, _ in testset))

  items_unknown = list(testset_items - trainset_items)

  return len(items_unknown) == 0

def split_emorecsys(dataset, train=0.8, test=0.2):
  trainset, testset = list(), list()

  items_ratings = dataset.groupby('id_photo').size().to_dict() # get number of rating for each id_photo
  items_one_rating = [id_photo for id_photo, size in items_ratings.items() if size == 1] # get the ones with only 1 rating
  # print(items_one_rating)


  for user in list(set(dataset['id_survey'])):
    user_ratings = dataset[dataset['id_survey'] == user]
    all_ratings = list((user, id_photo, like_bool) for id_photo, like_bool in zip(user_ratings['id_photo'], user_ratings['like_bool']))
    size_ratings = len(all_ratings)

    user_trainset = list(rating for rating in all_ratings if rating[1] in items_one_rating) # start trainset with items with only one rating

    relevant = list(x for x in all_ratings if x[2] == 1)
    if relevant: # making sure that we have at least one relevant rating in the testset
      new_rating = random.sample(relevant, 1)
      user_testset = new_rating if new_rating[0] not in user_trainset else list()
    else:
      raise ValueError('The user ' + str(user) + ' did not liked any photo')

    while len(user_testset) < size_ratings*test: # add random ratings until fullfil the size of testset
      new_rating = random.choice(all_ratings)
      if new_rating not in user_testset and new_rating not in user_trainset:
        user_testset.append(new_rating)

    # adding remaining ratings to trainset
    user_trainset.extend(new_rating for new_rating in all_ratings if new_rating not in user_testset and new_rating not in user_trainset)

    assert len(user_trainset) == size_ratings*train
    assert len(user_testset) == size_ratings*test

    # print(user_trainset)
    # print(user_testset)

    trainset.extend(user_trainset)
    testset.extend(user_testset)

  if check_items(trainset, testset):
    train_df = pd.DataFrame(trainset, columns=['id_survey', 'id_photo', 'like_bool']) # like_bool
  else:
    return split_emorecsys(dataset, train, test)

  reader = Reader(rating_scale=(0,1)) # like_bool
  surprise_train = Dataset.load_from_df(train_df, reader).build_full_trainset()

  return surprise_train, testset

In [7]:
surprise_train, testset = split_emorecsys(ratings_df)

In [8]:
print(surprise_train)
print(testset)

<surprise.trainset.Trainset object at 0x000001BCD58FE410>
[(1, 1785, 1), (1, 1507, 1), (1, 948, 0), (2, 2370, 1), (2, 342, 0), (2, 1898, 0), (3, 2184, 1), (3, 1747, 1), (3, 655, 1), (4, 1816, 1), (4, 1082, 1), (4, 2933, 1), (5, 749, 1), (5, 2067, 1), (5, 2717, 1), (6, 681, 1), (6, 546, 1), (6, 2581, 1), (7, 429, 1), (7, 2789, 1), (7, 2195, 1), (8, 426, 1), (8, 1925, 0), (8, 38, 1), (9, 2193, 0), (9, 2668, 1), (9, 1712, 1), (10, 55, 1), (10, 321, 0), (10, 732, 1), (11, 1821, 1), (11, 1681, 1), (11, 2951, 1), (12, 1721, 1), (12, 2876, 0), (12, 1838, 1), (13, 2787, 1), (13, 142, 0), (13, 1089, 1), (14, 612, 1), (14, 1342, 1), (14, 2097, 1), (15, 1312, 1), (15, 2848, 1), (15, 407, 1), (16, 2410, 1), (16, 993, 0), (16, 2135, 1), (17, 791, 1), (17, 2811, 0), (17, 2804, 1), (18, 2536, 1), (18, 1550, 1), (18, 474, 1), (19, 1801, 1), (19, 1535, 1), (19, 1891, 1), (20, 2796, 1), (20, 2444, 1), (20, 1013, 0), (21, 137, 1), (21, 1012, 1), (21, 2792, 1), (22, 1428, 1), (22, 2369, 1), (22, 1910, 1),

In [9]:
## preprocessing for Demographic dataset
user_profiles = demo_df.iloc[:, -27:].copy()

## one-hot encoder
for col in ['hobby_other', 'country_residence', 'city', 'education', 'gender', 'populational_aff', 'age']:
  dummies = pd.get_dummies(demo_df[col], prefix=col)
  user_profiles = pd.concat([dummies, user_profiles], axis=1)

user_profiles.dropna(inplace=True)
user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)

## applying PCA
pca = PCA(n_components=50)
pca.fit(user_profiles)
tve = 0 # total variance explained
for i, ve in enumerate(pca.explained_variance_ratio_):
  tve += ve
  # print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i+1, ve, tve))

## keep 32 principal components, since we get a total explained variance of 90.13%.
X_pca = pca.transform(user_profiles)[:, :32]

## see dataframe
user_profiles_train = pd.DataFrame(data=X_pca, columns=['PCA'+str(i) for i in range(1, X_pca.shape[1]+1)], index=demo_df['id_survey'])

  user_profiles = user_profiles.applymap(lambda x: 1 if x is True else 0 if x is False else x)


In [10]:
## now we will be creating a ratings_matrix, in this case using the `like_bool` as the rating
ratings_matrix_like = ratings_df.pivot_table(index='id_survey', columns='id_photo', values='like_bool').reset_index(drop=True)

In [11]:
ratings_matrix_like

id_photo,2,6,14,25,32,38,39,40,46,55,...,3032,3037,3041,3042,3047,3054,3057,3059,3062,3068
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,0.0,,,,1.0,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,,,,,,,,,,,...,,,,,,,,,,
159,,,,,,,,,,,...,,,,,,,,,,
160,,,,,,,,,,,...,,,,,,,,,,
161,,,,,1.0,,,,,,...,,,,,,,,,,


In [12]:
cf_params = {'n_epochs': 20, 'n_factors': 100, 'reg_pu': 0.001, 'reg_qi': 0.01} # nmf

demo_params = {'affinity': 'nearest_neighbors', 'assign_labels': 'kmeans', 'eigen_solver': 'lobpcg', 'gamma': 2.9, 'n_clusters': 20, 'n_neighbors': 30}

In [13]:
def similar_users(user, model, n=5, model_dist=None, Kmeans=False):

  ## 1. buscar o cluster do user test
  test_user = user_profiles_train.loc[user]
  user_idx = user_profiles_train.index.get_loc(user)

  ## 2. buscar os users do mesmo cluster do user test
  user_cluster = model.labels_[user_idx]
  users = [id for id, cluster in enumerate(model.labels_) if cluster == user_cluster and id != user_idx]

  if len(users) < 5: # 2.5. vamos buscar mais users
    if Kmeans:
      user_dist = model_dist[user_profiles_train.index.get_loc(user)]
      sim_cluster = sorted(range(len(user_dist)), key=lambda x: user_dist[x])[1:2][0] # vamos só buscar 1 cluster extra
      users.extend([id for id, cluster in enumerate(model.labels_) if cluster == sim_cluster])

    else:
      dist = euclidean_distances([test_user], user_profiles_train.values)[0] # aqui vamos ver todos os users
      sorted_users = sorted(range(len(dist)), key=lambda x: dist[x])

      # filtrar para não haver repetição de users, ou inserção do test_user
      sorted_users_new = [x for x in sorted_users if x not in users]

      for user in sorted_users_new:
        if len(users) < 5 and user != user_idx:
          users.append(user)

  ## 3. calcular semelhanças entre o user test e os users dos clusters mais próximos
  users_similar = []
  for user_id in users:
      user_profile = user_profiles_train.iloc[user_id]
      similarity_score = 1 / (1 + euclidean_distances([test_user], [user_profile])[0])
      users_similar.append((user_id, similarity_score))

  ## 4. ordem decrescente para o valor de semelhança
  users_similar = sorted(users_similar, key=lambda x: x[1], reverse=True)

  return users_similar[:n]

In [14]:
def recommend_items_from_users(user, model, ratings_matrix, k=10):
  rec_users = similar_users(user, model, n=5)
  weights = [1 / (i+1) for i, _ in enumerate(rec_users)]

  users = list(user[0] for user in rec_users)
  rec_users_items = ratings_matrix.iloc[users] # items of similar users
  item_count = rec_users_items.count(axis=0) # number of ratings of each item within the similar user

  weighted_avg_rating = rec_users_items.multiply(weights, axis=0).mean(axis=0)
  weighted_scores = weighted_avg_rating * item_count # calculate a weighted score using the weighted ratings and the number of ratings per item

  recommended_items = list(weighted_scores.nlargest(k).items())
  # print(recommended_items)

  return recommended_items

def recommend_items_from_cf(predictions, threshold=0):
  preds = list()
  for _, iid, _, est, _ in predictions:
    preds.append((iid, est))

  preds.sort(key=lambda x: x[1], reverse=True)
  recommended_items = [(iid, est) for (iid, est) in preds if est >= threshold]
  # print(recommended_items)

  return recommended_items

def evaluation(ratings_matrix, recommendation, relevant, k=10):
  precisions, recalls, f1scores = [], [], []

  for i in range(1, k+1):
  # for i in range(k, k+1):
    precision, recall, f1score = precision_recall_at_k(recommendation, relevant, i)
    # print("K =", i, "- Precision:", precision, ", Recall:", recall, ", F1 Score:", f1score)

    precisions.append(precision)
    recalls.append(recall)
    f1scores.append(f1score)

  return precisions, recalls, f1scores

def precision_recall_at_k(recommendation, relevant, k=10):
  relevant_items = list(item[1] for item in relevant if item[2] > 0)
  # print("ITENS RELEVANTES:", relevant_items)


  # for i in range(k):
  #   print(f"K = {i+1} - ", recommendation[:i+1])
  # print()

  rel = len(relevant_items) # total number of relevant items to the user
  rel_rec = np.sum(np.isin(recommendation[:k], relevant_items)) # number of relevant items recommended to the user

  # k is the total number of recommended items to the user
  precision = rel_rec / k # number of relevant items recommended to the user / total number of recommended items to the user
  recall = rel_rec / rel if rel != 0 else 1  # number of relevant items recommended to the user / total number of relevant items to the user
  f1score = (2*precision*recall) / (precision+recall) if (precision+recall) != 0 else 0.0

  return precision, recall, f1score

def avg_metrics(precisions, recalls, f1scores, k):
  precisions_avg, recall_avg, f1score_avg = [], [], []
  for i in range(k):
    precision, recall, f1score = [], [], []

    for prec, rec, f1 in zip(precisions, recalls, f1scores):
      precision.append(prec[i])
      recall.append(rec[i])
      f1score.append(f1[i])

    precisions_avg.append(np.round(np.mean(precision), 4))
    recall_avg.append(np.round(np.mean(recall), 4))
    f1score_avg.append(np.round(np.mean(f1score), 4))

  return precisions_avg, recall_avg, f1score_avg

In [44]:
def hybrid_system(dataset, ratings_matrix, k=10):

  precisions_final, recalls_final, f1scores_final = [], [], []
  for i in range(5):
    trainset, testset = split_emorecsys(dataset)

    ## CF
    model_cf = NMF(**cf_params)
    # model_cf = CoClustering(**cf_params2)
    model_cf.fit(trainset)

    ## Demographic-based
    model_demo = SpectralClustering(**demo_params)
    model_demo.fit(user_profiles_train)

    test_users = list(set(item[0] for item in testset))
    for user in test_users:

      user_relevant = list(item for item in testset if item[0] == user) # relevant items for evaluation
      recommended_items_by_users = recommend_items_from_users(user=user, model=model_demo, ratings_matrix=ratings_matrix, k=20)

      # print("USER", user)
      # print(recommended_items_by_users)

      predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_users] # predicting for the items suggested by demographic-based system
      recommended_items_by_cf =  recommend_items_from_cf(predictions_cf, threshold=0.5)
      # print(recommended_items_by_cf)
      # print()

      weighted_scores = []
      for demo_score, cf_score in zip(recommended_items_by_users, recommended_items_by_cf):
        # print(demo_score)
        # print(cf_score)
        weighted_score = (min(demo_score[1], 1)*0.5) + (min(cf_score[1], 1)*0.5)
        weighted_scores.append((demo_score[0], weighted_score))

      weighted_scores.sort(key=lambda x: x[1], reverse=True) # descending sort
      # print(weighted_scores)

      precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)

      precisions_final.append(precisions)
      recalls_final.append(recalls)
      f1scores_final.append(f1scores)

  precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

  for i in range(k):
    print(f"K = {i+1} - Precision: {precision_avg[i]}, Recall: {recall_avg[i]}, F1 Score: {f1score_avg[i]}")

In [45]:
hybrid_system(ratings_df, ratings_matrix_like) # nmf



K = 1 - Precision: 0.0209, Recall: 0.0112, F1 Score: 0.0131
K = 2 - Precision: 0.0196, Recall: 0.0186, F1 Score: 0.018
K = 3 - Precision: 0.0209, Recall: 0.0288, F1 Score: 0.0232
K = 4 - Precision: 0.0215, Recall: 0.0407, F1 Score: 0.0271
K = 5 - Precision: 0.0216, Recall: 0.0521, F1 Score: 0.0295
K = 6 - Precision: 0.0223, Recall: 0.0673, F1 Score: 0.0324
K = 7 - Precision: 0.021, Recall: 0.073, F1 Score: 0.0317
K = 8 - Precision: 0.0206, Recall: 0.0798, F1 Score: 0.0318
K = 9 - Precision: 0.0206, Recall: 0.0881, F1 Score: 0.0326
K = 10 - Precision: 0.02, Recall: 0.0947, F1 Score: 0.0323


In [17]:
def recommend_top_n_items(trainset, testset, similarity_df, n=86, threshold=0.5):

    test_users = set(item[0] for item in testset)
    recommendations = {}

    train_df = pd.DataFrame(trainset.all_ratings(), columns=['uid', 'iid', 'rating'])
    train_df['uid'] = train_df['uid'].apply(trainset.to_raw_uid)
    train_df['iid'] = train_df['iid'].apply(trainset.to_raw_iid)

    test_df = pd.DataFrame(testset, columns=['uid', 'iid', 'rating'])

    for user in test_users:
        user_train_df = train_df[train_df['uid'] == user]
        items_seen_by_user = set(user_train_df['iid'])
        liked_items_train = set(user_train_df[user_train_df['rating'] > 0]['iid'])

        user_test_df = test_df[(test_df['uid'] == user) & (test_df['rating'] > 0)]
        relevant_items = set(user_test_df['iid'])

        all_sim_items = set()
        for item in liked_items_train:
            top_sim_items = similar_items(item, similarity_df, items_seen_by_user, n)
            all_sim_items.update(top_sim_items)

        average_similarity_scores = get_average_similarity_scores(relevant_items, all_sim_items, similarity_df, threshold)
        
        top_n_items = get_top_n_items(average_similarity_scores, n)
        
        # Convert top_n_items to a list of tuples with (item_id, similarity_score)
        recommendations[user] = [(item, average_similarity_scores[item]) for item in top_n_items]

    return recommendations

def get_average_similarity_scores(liked_items_test, all_sim_items, similarity_df, threshold):
    similarity_dict = similarity_df.set_index(['Target ID', 'Comparison ID'])['Similarity Score'].to_dict()
    similarity_sums = defaultdict(float)
    similarity_counts = defaultdict(int)

    for liked_item in liked_items_test:
        for sim_item in all_sim_items:
            score = similarity_dict.get((liked_item, sim_item)) or similarity_dict.get((sim_item, liked_item))
            if score is not None and score >= threshold:
                similarity_sums[sim_item] += score
                similarity_counts[sim_item] += 1

    average_similarity_scores = {item: similarity_sums[item] / similarity_counts[item] for item in similarity_sums}
    return dict(sorted(average_similarity_scores.items(), key=lambda x: x[1], reverse=True))

def get_top_n_items(average_similarity_scores, n):
    # Sort by score in descending order and return the top N items
    return list(average_similarity_scores.keys())[:n]

def similar_items(item_id, similarity_df, items_seen_by_user, top_n):
    # Filtrando diretamente com pandas e ordenando
    similar_items_df = similarity_df[similarity_df['Target ID'] == item_id].nlargest(top_n, 'Similarity Score')
    
    seen_items = set(items_seen_by_user)
    similar_item_ids = set()

    for comparison_id in similar_items_df['Comparison ID']:
        if comparison_id != item_id and comparison_id not in seen_items and comparison_id not in similar_item_ids:
            similar_item_ids.add(comparison_id)
        if len(similar_item_ids) >= top_n:
            break
    
    return list(similar_item_ids)[:top_n]

# 1. Weighted

In [18]:
def hybrid_system_cb_cf(dataset, ratings_matrix, similarity_df, k=10):

    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df)

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))

        for user in test_users:
            user_relevant = list(item for item in testset if item[0] == user) # Relevant items for evaluation

            # Get content-based recommendations
            recommended_items_by_cb = recommendations_cb.get(user, [])
            # print('recommended_items_by_cb')
            # print(recommended_items_by_cb)
            # print(len(recommended_items_by_cb))

            predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_cb] # predicting for the items suggested by content-based system
            recommended_items_by_cf =  recommend_items_from_cf(predictions_cf, threshold=0.5)
            # print('recommended_items_by_cf')
            # print(recommended_items_by_cf)
            # print(len(recommended_items_by_cf))
            # print() #Dá sempre 1 mega bom!

            # Combine recommendations
            weighted_scores = []
            for cb_score, cf_score in zip(recommended_items_by_cb, recommended_items_by_cf):
                # print(cb_score, cf_score)
                if cb_score[0] != cf_score[0]: #os items recomendados nem sempre eram iguais então achei que assim fazia sentido
                    if cb_score[1] > cf_score[1]:
                        weighted_scores.append((cb_score[0], cb_score[1]))
                    else:
                        weighted_scores.append((cf_score[0], cf_score[1]))
                else:
                    weighted_score = (min(cb_score[1], 1) * 0.5) + (min(cf_score[1], 1) * 0.5)
                    weighted_scores.append((cb_score[0], weighted_score))

            weighted_scores.sort(key=lambda x: x[1], reverse=True) # Descending sort

            # Evaluate hybrid recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)
            
            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")


In [19]:
hybrid_system_cb_cf(ratings_df, ratings_matrix_like, pixel_sim_resnet152_df, k=10)

K = 1 - Precision: 0.3239, Recall: 0.1577, F1 Score: 0.2037
K = 2 - Precision: 0.1847, Recall: 0.1763, F1 Score: 0.1735
K = 3 - Precision: 0.1239, Recall: 0.1771, F1 Score: 0.1407
K = 4 - Precision: 0.0929, Recall: 0.1771, F1 Score: 0.1180
K = 5 - Precision: 0.0744, Recall: 0.1771, F1 Score: 0.1017
K = 6 - Precision: 0.0620, Recall: 0.1771, F1 Score: 0.0893
K = 7 - Precision: 0.0531, Recall: 0.1771, F1 Score: 0.0797
K = 8 - Precision: 0.0465, Recall: 0.1771, F1 Score: 0.0720
K = 9 - Precision: 0.0413, Recall: 0.1771, F1 Score: 0.0656
K = 10 - Precision: 0.0372, Recall: 0.1771, F1 Score: 0.0602


In [20]:
def hybrid_system_cb_cf(dataset, ratings_matrix, similarity_df, k=10):

    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df)

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))

        for user in test_users:
            user_relevant = list(item for item in testset if item[0] == user) # Relevant items for evaluation

            # Obtenha as recomendações baseadas em conteúdo
            recommended_items_by_cb = recommendations_cb.get(user, [])

            # Preveja para os itens sugeridos pelo sistema baseado em conteúdo
            predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_cb]
            recommended_items_by_cf_dict = {iid: score for (iid, score) in recommend_items_from_cf(predictions_cf, threshold=0.5)}

            # Inicialize a lista de weighted_scores
            weighted_scores = []

            # Itere sobre as recomendações de conteúdo e combine com as de CF se existir
            for cb_score in recommended_items_by_cb:
                iid_cb, score_cb = cb_score
                # print(iid_cb)
                if iid_cb in recommended_items_by_cf_dict:
                    score_cf = recommended_items_by_cf_dict[iid_cb]
                    weighted_score = (min(score_cb, 1) * 0.5) + (min(score_cf, 1) * 0.5)
                    weighted_scores.append((iid_cb, weighted_score))
                else:
                    weighted_scores.append((iid_cb, score_cb))  # Caso não exista cf põe o item com o score do cb

            weighted_scores.sort(key=lambda x: x[1], reverse=True) # sort descending

            # Evaluate hybrid recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)
            
            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")

In [21]:
hybrid_system_cb_cf(ratings_df, ratings_matrix_like, pixel_sim_resnet152_df, k=10)

K = 1 - Precision: 0.7558, Recall: 0.3507, F1 Score: 0.4636
K = 2 - Precision: 0.5442, Recall: 0.4796, F1 Score: 0.4952
K = 3 - Precision: 0.3845, Recall: 0.5012, F1 Score: 0.4236
K = 4 - Precision: 0.2887, Recall: 0.5016, F1 Score: 0.3575
K = 5 - Precision: 0.2317, Recall: 0.5029, F1 Score: 0.3101
K = 6 - Precision: 0.1930, Recall: 0.5029, F1 Score: 0.2733
K = 7 - Precision: 0.1655, Recall: 0.5029, F1 Score: 0.2443
K = 8 - Precision: 0.1448, Recall: 0.5029, F1 Score: 0.2209
K = 9 - Precision: 0.1288, Recall: 0.5035, F1 Score: 0.2018
K = 10 - Precision: 0.1160, Recall: 0.5035, F1 Score: 0.1856


In [22]:
def hybrid_system_cb_cf(dataset, ratings_matrix, similarity_df, k=10):

    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df)

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))

        for user in test_users:
            user_relevant = list(item for item in testset if item[0] == user) # Relevant items for evaluation

            # Obtenha as recomendações baseadas em conteúdo
            recommended_items_by_cb = recommendations_cb.get(user, [])
            recommended_items_by_cb_dict = {item: score for item, score in recommended_items_by_cb}

            # Preveja para os itens sugeridos pelo sistema baseado em conteúdo
            predictions_cf = [model_cf.predict(user, iid) for iid, _ in recommended_items_by_cb]
            recommended_items_by_cf =  recommend_items_from_cf(predictions_cf, threshold=0.5)

            # Inicialize a lista de weighted_scores
            weighted_scores = []

            # Itere sobre as recomendações de conteúdo e combine com as de CF se existir
            for cf_score in recommended_items_by_cf:
                iid_cf, score_cf = cf_score
                if iid_cf in recommended_items_by_cb_dict:
                    score_cb = recommended_items_by_cb_dict[iid_cf]
                    weighted_score = (min(score_cf, 1) * 0.5) + (min(score_cb, 1) * 0.5)
                    weighted_scores.append((iid_cf, weighted_score))
                else:
                    weighted_scores.append((iid_cf, score_cf))  # Caso não exista cf põe o item com o score do cb

            weighted_scores.sort(key=lambda x: x[1], reverse=True) # sort descending

            # Evaluate hybrid recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)
            
            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")

In [23]:
hybrid_system_cb_cf(ratings_df, ratings_matrix_like, pixel_sim_resnet152_df, k=10)

K = 1 - Precision: 0.7374, Recall: 0.3448, F1 Score: 0.4542
K = 2 - Precision: 0.5092, Recall: 0.4499, F1 Score: 0.4628
K = 3 - Precision: 0.3587, Recall: 0.4691, F1 Score: 0.3948
K = 4 - Precision: 0.2696, Recall: 0.4699, F1 Score: 0.3335
K = 5 - Precision: 0.2157, Recall: 0.4699, F1 Score: 0.2885
K = 6 - Precision: 0.1798, Recall: 0.4699, F1 Score: 0.2542
K = 7 - Precision: 0.1541, Recall: 0.4699, F1 Score: 0.2273
K = 8 - Precision: 0.1348, Recall: 0.4699, F1 Score: 0.2055
K = 9 - Precision: 0.1200, Recall: 0.4706, F1 Score: 0.1878
K = 10 - Precision: 0.1080, Recall: 0.4706, F1 Score: 0.1727


## Hibrido sem dependencias

In [40]:
def hybrid_system_cb_cf(dataset, ratings_matrix, similarity_df, k=10):

    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)
        # print(testset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df)

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))

        for user in test_users:
            user_relevant = list(item for item in testset if item[0] == user) # Relevant items for evaluation

            # print("USER", user)

            # Get content-based recommendations
            recommended_items_by_cb = recommendations_cb.get(user, [])
            # print('recommend_items_by_cb')
            # print(recommended_items_by_cb)
            
            # Predict ratings for items suggested 
            predictions_cf = [model_cf.predict(user, item[1]) for item in user_relevant]
            recommended_items_by_cf_dict = {iid: score for (iid, score) in recommend_items_from_cf(predictions_cf, threshold=0.5)}
            
            # print('recommend_items_by_cf')
            # print(recommended_items_by_cf_dict)
            # print()

            # Combine recommendations
            weighted_scores = []
            for cb_score in recommended_items_by_cb:
                iid_cb, score_cb = cb_score
                # print(iid_cb)
                if iid_cb in recommended_items_by_cf_dict:
                    score_cf = recommended_items_by_cf_dict[iid_cb]
                    weighted_score = (min(score_cb, 1) * 0.5) + (min(score_cf, 1) * 0.5)
                    weighted_scores.append((iid_cb, weighted_score))
                else:
                    weighted_scores.append((iid_cb, score_cb))  # Caso não exista cf põe o item com o score do cb
            weighted_scores.sort(key=lambda x: x[1], reverse=True) # Descending sort

            # Evaluate hybrid recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)
            
            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")

In [41]:
hybrid_system_cb_cf(ratings_df, ratings_matrix_like, pixel_sim_resnet152_df, k=10)

K = 1 - Precision: 0.7755, Recall: 0.3601, F1 Score: 0.4753
K = 2 - Precision: 0.5564, Recall: 0.4916, F1 Score: 0.5065
K = 3 - Precision: 0.3947, Recall: 0.5153, F1 Score: 0.4347
K = 4 - Precision: 0.2963, Recall: 0.5157, F1 Score: 0.3669
K = 5 - Precision: 0.2371, Recall: 0.5157, F1 Score: 0.3173
K = 6 - Precision: 0.1975, Recall: 0.5157, F1 Score: 0.2796
K = 7 - Precision: 0.1693, Recall: 0.5157, F1 Score: 0.2500
K = 8 - Precision: 0.1482, Recall: 0.5157, F1 Score: 0.2260
K = 9 - Precision: 0.1317, Recall: 0.5157, F1 Score: 0.2063
K = 10 - Precision: 0.1185, Recall: 0.5157, F1 Score: 0.1897


In [38]:
def hybrid_system_cb_cf(dataset, ratings_matrix, similarity_df, k=10):

    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)
        # print(testset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df, n= 86, threshold = 0 )

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))

        for user in test_users:
            user_relevant = list(item for item in testset if item[0] == user) # Relevant items for evaluation

            # print("USER", user)

            # Get content-based recommendations
            recommended_items_by_cb = recommendations_cb.get(user, [])
            recommended_items_by_cb_dict = {item: score for item, score in recommended_items_by_cb}
            # print('recommend_items_by_cb')
            # print(recommended_items_by_cb_dict)
            
            # Predict ratings for items suggested 
            predictions_cf = [model_cf.predict(user, item[1]) for item in user_relevant]
            recommended_items_by_cf = recommend_items_from_cf(predictions_cf, threshold=0.5) 
            
            # print('recommend_items_by_cf')
            # print(recommended_items_by_cf)
            # print()

            # Combine recommendations
            weighted_scores = []

            for cf_score in recommended_items_by_cf:
                iid_cf, score_cf = cf_score
                if iid_cf in recommended_items_by_cb_dict:
                    score_cb = recommended_items_by_cb_dict[iid_cf]
                    weighted_score = (min(score_cf, 1) * 0.5) + (min(score_cb, 1) * 0.5)
                    weighted_scores.append((iid_cf, weighted_score))
                else:
                    weighted_scores.append((iid_cf, score_cf))  # Caso não exista cf põe o item com o score do cb
            weighted_scores.sort(key=lambda x: x[1], reverse=True) # Descending sort

            # Evaluate hybrid recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, weighted_scores, user_relevant, k=k)
            
            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")

In [39]:
hybrid_system_cb_cf(ratings_df, ratings_matrix_like, pixel_sim_resnet152_df, k=10) 

K = 1 - Precision: 0.8638, Recall: 0.4094, F1 Score: 0.5339
K = 2 - Precision: 0.8049, Recall: 0.7080, F1 Score: 0.7299
K = 3 - Precision: 0.7284, Recall: 0.9370, F1 Score: 0.7981
K = 4 - Precision: 0.5463, Recall: 0.9370, F1 Score: 0.6735
K = 5 - Precision: 0.4371, Recall: 0.9370, F1 Score: 0.5828
K = 6 - Precision: 0.3642, Recall: 0.9370, F1 Score: 0.5138
K = 7 - Precision: 0.3122, Recall: 0.9370, F1 Score: 0.4595
K = 8 - Precision: 0.2732, Recall: 0.9370, F1 Score: 0.4156
K = 9 - Precision: 0.2428, Recall: 0.9370, F1 Score: 0.3794
K = 10 - Precision: 0.2185, Recall: 0.9370, F1 Score: 0.3490


# 2. Switching

In [49]:
def hybrid_system_switching(dataset, ratings_matrix, similarity_df, k=10, positive_interaction_threshold = 8):
    
    precisions_final, recalls_final, f1scores_final = [], [], []

    for i in range(5):
        trainset, testset = split_emorecsys(dataset)

        # Content-Based Recommendations
        recommendations_cb = recommend_top_n_items(trainset, testset, similarity_df)

        # Collaborative Filtering Recommendations
        model_cf = NMF(**cf_params)
        model_cf.fit(trainset)

        test_users = list(set(item[0] for item in testset))
        recommendations_final = {}

        # Convert trainset to a suitable format for interaction counting
        interactions = defaultdict(int)
        for user, _, rating in trainset.all_ratings():
            # print(rating)
            if rating > 0.5:
                interactions[user] += 1
        
        for user in test_users:
            user_relevant = [item for item in testset if item[0] == user]  # Relevant items for evaluation
            user_interactions = interactions.get(user, 0)
            # print(user_interactions)

            if user_interactions < positive_interaction_threshold:
                # Use Content-Based for users with few interactions
                recommendations = recommendations_cb.get(user, [])
            else:
                # Use Collaborative Filtering for users with sufficient interactions
                predictions_cf = [model_cf.predict(user, item[1]) for item in user_relevant]
                recommendations = recommend_items_from_cf(predictions_cf, threshold=0.5) 

            # Update recommendations_final with the final recommendations for the user
            recommendations_final[user] = recommendations
            # print(recommendations_final[user])

            # Evaluate switching recommendations
            precisions, recalls, f1scores = evaluation(ratings_matrix, recommendations_final[user], user_relevant, k=k)

            precisions_final.append(precisions)
            recalls_final.append(recalls)
            f1scores_final.append(f1scores)

    precision_avg, recall_avg, f1score_avg = avg_metrics(precisions_final, recalls_final, f1scores_final, k)

    for i in range(k):
        print(f"K = {i+1} - Precision: {precision_avg[i]:.4f}, Recall: {recall_avg[i]:.4f}, F1 Score: {f1score_avg[i]:.4f}")

In [50]:
hybrid_system_switching(ratings_df, ratings_matrix_like,pixel_sim_resnet152_df ,k=10)