# **Hummus - Community Based Recommendations**
Notebook for the first project for the Machine Learning Complements course (CAC).

### Imports

In [None]:
import pandas as pd
import utils as ut
import numpy as np
import warnings
warnings.simplefilter(action='ignore')
import networkx as nx
import os
from networkx.algorithms.community import greedy_modularity_communities, girvan_newman, label_propagation_communities
import matplotlib.pyplot as plt
from surprise.model_selection import train_test_split
from surprise import Dataset, Reader, KNNBasic, NormalPredictor, SVD, accuracy

### Constants

In [None]:
# When True, the cells below print progress/debug messages (e.g. every relation update).
VERBOSE = True
# Number of review rows read from 'pp_reviews.csv' when USE_SAMPLES is False.
SAMPLES = 10000
# True: load the previously saved '*_sampled.csv' files.
# False: re-sample from the full CSVs and overwrite the '*_sampled.csv' files.
USE_SAMPLES = True

### Load Data

In [None]:
if not USE_SAMPLES:
    # Rebuild the sample: full members/recipes tables, first SAMPLES review rows.
    df_members = pd.read_csv('pp_members.csv')
    df_recipes = pd.read_csv('pp_recipes.csv')
    df_reviews = pd.read_csv('pp_reviews.csv', nrows=SAMPLES)

    # Keep only members who wrote, and recipes that received, one of the sampled reviews.
    has_review = df_members['member_id'].isin(df_reviews['member_id'])
    was_reviewed = df_recipes['recipe_id'].isin(df_reviews['recipe_id'])
    df_members = df_members[has_review]
    df_recipes = df_recipes[was_reviewed]

    # Persist the sample so later runs can take the USE_SAMPLES shortcut.
    for frame, path in (
        (df_members, 'pp_members_sampled.csv'),
        (df_recipes, 'pp_recipes_sampled.csv'),
        (df_reviews, 'pp_reviews_sampled.csv'),
    ):
        frame.to_csv(path, index=False)
else:
    # Fast path: reuse the sample written by a previous run.
    df_members = pd.read_csv('pp_members_sampled.csv')
    df_recipes = pd.read_csv('pp_recipes_sampled.csv')
    df_reviews = pd.read_csv('pp_reviews_sampled.csv')

#### Initial Observation - Members dataset

In [None]:
# First-look summary of the members table, produced by the project helper in utils.py
# (what exactly it reports is defined there, not in this notebook).
ut.initial_obs(df_members)

In [None]:
# pandas summary statistics for the members table, shown as the cell output.
df_members.describe()

#### Initial Observation - Recipes dataset

In [None]:
# Same utils.py first-look helper, applied to the recipes table.
ut.initial_obs(df_recipes)

#### Initial Observation - Reviews dataset

In [None]:
# Same utils.py first-look helper, applied to the reviews table.
ut.initial_obs(df_reviews)

#### Plot amount of reviews over rating

In [None]:
# Chart drawn by the utils.py helper; presumably the number of reviews per rating value,
# as the section heading says — confirm in utils.plot_reviews_rating.
ut.plot_reviews_rating(df_reviews)

#### Plot amount of users over amount of reviews

In [None]:
# Chart drawn by the utils.py helper; presumably how many users wrote a given number of
# reviews, as the section heading says — confirm in utils.plot_num_users_num_reviews.
ut.plot_num_users_num_reviews(df_reviews)

In [None]:
# Rank recipes by the precomputed 'average_rating' column, considering only
# recipes with more than 20 ratings so a handful of votes cannot top the list.
has_enough_ratings = df_recipes['number_of_ratings'] > 20
filtered_recipes = df_recipes[has_enough_ratings]

# Ten best-rated recipes among those that passed the filter.
ranked_recipes = filtered_recipes.sort_values(by='average_rating', ascending=False)
top_rated_recipes = ranked_recipes.head(10)

# One line per recipe: title, id, average rating and number of ratings.
print('Top-Rated Recipes:')
print('------------------')

for _, recipe in top_rated_recipes.iterrows():
    print(f"{recipe['title']} (Recipe ID: {recipe['recipe_id']}) - Average Rating: {recipe['average_rating']:.2f} ({recipe['number_of_ratings']} reviews)")

#### Initial Preparation - Create the graph for network analysis

We will create a graph with the members as nodes. Two members are connected by an edge when they reviewed the same recipe with the same attitude (both rated it above 3, or both rated it 3 or below), and the weight of the edge is the number of such shared reviews. This will allow us to use network analysis to find communities of members with similar tastes.

First we will group the reviews by recipe and evaluations, so we can extract the members that have something in common.

In [None]:
# Bucket the reviews by (recipe, liked?) where "liked" means rating > 3.
# Members that land in the same bucket agree about that recipe.
liked = df_reviews['rating'] > 3
grouped_reviews = df_reviews.groupby(['recipe_id', liked])

# (smaller member id, larger member id) -> number of recipes the pair agrees on
user_relations = {}

for (recipe_id, is_positive_rating), group in grouped_reviews:
    user_ids = group['member_id'].unique()
    if VERBOSE: print(recipe_id, is_positive_rating, user_ids)
    # Sorting gives every pair one canonical key, so (a, b) and (b, a) never both appear.
    user_ids = np.sort(user_ids)

    # Visit each unordered pair once: pair every member with the members after it.
    for next_pos, user_id1 in enumerate(user_ids, start=1):
        for user_id2 in user_ids[next_pos:]:
            pair = (user_id1, user_id2)
            if pair not in user_relations:
                if VERBOSE: print(f"Creating new relation between {user_id1} and {user_id2}")
                user_relations[pair] = 0

            # One more recipe this pair agrees on.
            user_relations[pair] = user_relations[pair] + 1
            if VERBOSE: print(f"Relation between {user_id1} and {user_id2} has been incremented to {user_relations[(user_id1, user_id2)]}")

# user_relations now holds the agreement count of every pair that agreed at least once
if VERBOSE: print("Size of user_relations:", len(user_relations))

Pairs of users with the same taste will have a high relation count, and pairs with different tastes will have a low one. Here are the strongest relations:

In [None]:
# Order the (pair, count) entries from the strongest relation to the weakest.
sorted_dict = sorted(user_relations.items(), key=lambda entry: entry[1], reverse=True)

# Show the ten pairs that agree on the most recipes.
for pair, shared_count in sorted_dict[:10]:
    print(pair, ":", shared_count)

Creating the graph...

In [None]:
# Build (or reload) the weighted member graph: one node per member, one edge per
# pair in user_relations, edge attribute 'weight' = number of shared opinions.
g = nx.Graph()
vertex_indices = {}

# NOTE: the cache is looked up by file name only. If the underlying reviews change
# (e.g. USE_SAMPLES is toggled or SAMPLES is edited), delete 'graph_file.graphml'
# so the graph is rebuilt from the current user_relations.
if os.path.exists('graph_file.graphml'):
    # Load the graph from file
    if VERBOSE: print("Loading graph from file")
    # GraphML stores node ids as text and read_graphml returns them as str by default.
    # node_type=int restores integer member ids, so a cached graph has the same node
    # type as a freshly built one.
    g = nx.read_graphml('graph_file.graphml', node_type=int)
else:
    if VERBOSE: print("Creating new graph")
    for (u,v), weight in user_relations.items():
        g.add_edge(u, v, weight = weight)
    nx.write_graphml(g, "graph_file.graphml")

if VERBOSE: print(g)
nx.draw(g, with_labels=True)

## Social Network Analysis


In [None]:
# Basic size and sparsity figures for the member graph.
node_total = g.number_of_nodes()
edge_total = g.number_of_edges()

print("Number of nodes:", node_total)
print("Number of edges:", edge_total)
# Mean number of neighbours per member.
print("Average degree:", sum(degree for _, degree in g.degree()) / node_total)
print("Graph density:", nx.density(g))

The graph itself is very sparse as the density is very low.

### Power Law Distribution
Here, we will investigate whether our network adheres to a power law distribution, which signifies a characteristic pattern in which a few nodes possess an exceptionally high number of connections, while the majority have only a few connections.

In [None]:
# Degree of every member, highest first, and how many members share each degree value.
degree_sequence = sorted((degree for _, degree in g.degree()), reverse=True)
degree_count = np.unique(degree_sequence, return_counts=True)
degree_values, user_counts = degree_count

# Log-log scatter: a power law shows up as a roughly straight, descending line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(degree_values, user_counts, marker='o', color='b', alpha=0.5)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title("Degree Distribution")
ax.set_xlabel("Degree")
ax.set_ylabel("Number of Users")
ax.grid(True, which="both", ls="--")
plt.show()

As we can see, our network does follow a power law distribution. However, there are some outliers that might appear because we're only looking at a portion of the dataset.

### Most Influential Users

In this section, we'll employ various statistical measures to extract insights about our data, particularly focusing on identifying influential users. To achieve this, we will compute different centrality metrics including degree centrality, betweenness centrality, eigenvector centrality, PageRank and closeness centrality for the top 10 users in each category.

#### Degree Centrality

A user with high degree centrality likely reviews a large number of recipes. They may be very active in providing feedback on various recipes, indicating a strong engagement with the platform or community. They might have a significant influence on others in the network, potentially influencing their choices of recipes for others to try.

In [None]:
# How many top-ranked users each centrality cell should display.
AMOUNT_USERS = 10

# (node, centrality) pairs ordered from the best-connected member down.
degree_centrality = nx.degree_centrality(g)
degree_centrality = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)

# Fix: take the top AMOUNT_USERS ids. The previous hard-coded [0:5] slice meant
# .head(AMOUNT_USERS) could never show more than five members.
# int(...) normalises node ids, which are strings when the graph was loaded from GraphML.
top_degree_ids = [int(node) for node, _ in degree_centrality[:AMOUNT_USERS]]

# Rows come back in df_members order, not in centrality order.
degree_centrality_members = df_members[df_members['member_id'].isin(top_degree_ids)]
degree_centrality_members.head(AMOUNT_USERS)

#### Closeness Centrality

This measure finds the individuals who are best placed to influence the entire network most quickly, meaning the users that are "close" to all other users in the network in terms of the shortest paths between them.

In [None]:
# NOTE(review): this cell is disabled (every line is commented out); the reason is not
# recorded in the notebook — confirm before re-enabling. Points to check if it is revived:
#   * distance='weight' uses the edge weight as a path length, but in this graph the weight
#     counts shared opinions (larger = more similar), so it probably needs inverting first.
#   * the [0:5] slice keeps five users although .head(AMOUNT_USERS) asks for AMOUNT_USERS rows.
# closeness_centrality = nx.closeness_centrality(g, distance='weight')
# closeness_centrality = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)
# closeness_centrality_members = df_members[df_members['member_id'].isin([int(x[0]) for x in closeness_centrality[0:5]])]
# closeness_centrality_members.head(AMOUNT_USERS)

#### Betweenness Centrality

This measure shows which users act as ‘bridges’ between other users in a network. It is useful for finding the individuals who influence the flow around a system.

In [None]:
# NOTE(review): this cell is disabled (every line is commented out); the reason is not
# recorded in the notebook — confirm before re-enabling. Points to check if it is revived:
#   * networkx treats `weight` as a path length when computing shortest paths here, while this
#     graph's weight counts shared opinions (larger = more similar) — verify that is intended.
#   * the [0:5] slice keeps five users although .head(AMOUNT_USERS) asks for AMOUNT_USERS rows.
#betweenness_centrality = nx.betweenness_centrality(g, weight= 'weight')
#betweenness_centrality = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)
#betweenness_centrality_members = df_members[df_members['member_id'].isin([int(x[0]) for x in betweenness_centrality[0:5]])]
#betweenness_centrality_members.head(AMOUNT_USERS)

#### EigenVector Centrality

In [None]:
# NOTE(review): this cell is disabled (every line is commented out); the reason is not
# recorded in the notebook — confirm before re-enabling. If it is revived, note that the
# [0:5] slice keeps five users although .head(AMOUNT_USERS) asks for AMOUNT_USERS rows.
# eigen_centrality = nx.eigenvector_centrality(g, weight='weight')
# eigen_centrality = sorted(eigen_centrality.items(), key=lambda x: x[1], reverse=True)
# eigen_centrality_members = df_members[df_members['member_id'].isin([int(x[0]) for x in eigen_centrality[0:5]])]
# eigen_centrality_members.head(AMOUNT_USERS)

#### Page Rank

In [None]:
# Weighted PageRank: edges with more shared opinions pass on more rank.
page_rank = nx.pagerank(g, weight='weight')
# (node, score) pairs ordered from the highest score down.
page_rank = sorted(page_rank.items(), key=lambda x: x[1], reverse=True)

# Fix: take the top AMOUNT_USERS ids. The previous hard-coded [0:5] slice meant
# .head(AMOUNT_USERS) could never show more than five members.
# int(...) normalises node ids, which are strings when the graph was loaded from GraphML.
top_page_rank_ids = [int(node) for node, _ in page_rank[:AMOUNT_USERS]]

# Rows come back in df_members order, not in PageRank order.
page_rank_members = df_members[df_members['member_id'].isin(top_page_rank_ids)]
page_rank_members.head(AMOUNT_USERS)

### Community Detection
In this part we will test different community detection algorithms and run some metrics to find out which one is better

#### Greedy Modularity Algorithm (Clauset-Newman-Moore)

In [None]:
# NOTE(review): despite the variable name, this calls networkx's greedy modularity
# maximisation (Clauset-Newman-Moore), not the Louvain method. No `weight` argument is
# passed, so the shared-opinion edge weights are not used here — confirm that is intended.
louvain_communities = greedy_modularity_communities(g)
# Report the size of each detected community, numbered from 1.
for i, community in enumerate(louvain_communities):
    print(f"Community {i + 1}: {len(community)}")

#### Label Propagation Algorithm

In [None]:
# Detect communities with networkx's label propagation and order them largest first.
label_prop_communities = sorted(
    label_propagation_communities(g),
    key=len,
    reverse=True,
)

# Report the size of each community, numbered from 1.
for i, community in enumerate(label_prop_communities):
    print(f"Community {i + 1}: {len(community)}")

### Community Filtering
Here we will be removing the communities with very few users

In [None]:
# Mean community size; communities smaller than this are dropped below.
community_sizes = [len(members) for members in louvain_communities]
average_users = sum(community_sizes) / len(louvain_communities)
print("Average Amount Users p/ Community: ", average_users)

# Keep only the communities with at least the average number of members.
filtered_communities = []
for members in louvain_communities:
    if len(members) >= average_users:
        filtered_communities.append(members)

for i, community in enumerate(filtered_communities):
    print(f"Community {i + 1}: {len(community)}")

## Recommender System

In [None]:
# Every score table has one slot per recommender, initialised to 0 until the
# corresponding cell below fills it in.
_MODEL_NAMES = ("Random Recommender", "User-Based CF", "Item-Based CF", "Model-Based CF", "Content-Based Filtering")


def _empty_scores():
    """Return a fresh {model name: 0} table, so no two tables share one dict."""
    return {name: 0 for name in _MODEL_NAMES}


# Scores averaged over the retained communities.
communities_rmse = _empty_scores()
communities_mae = _empty_scores()
communities_precision = _empty_scores()
communities_recall = _empty_scores()
# Scores from a single model fitted on all retained reviews at once.
whole_dataset_mae = _empty_scores()
whole_dataset_rmse = _empty_scores()
whole_dataset_precision = _empty_scores()
whole_dataset_recall = _empty_scores()

# model name -> predictions on the whole-data test set (used for the error histograms)
models_predictions = {}

# Flatten the retained communities into one list of integer member ids.
filtered_users = []
for community_members in filtered_communities:
    filtered_users.extend(int(user) for user in community_members)

# From here on the three frames hold only the retained members, their reviews,
# and the recipes those reviews refer to (the original frames are overwritten).
in_community = df_members['member_id'].isin(filtered_users)
df_members = df_members[in_community]
by_community_member = df_reviews['member_id'].isin(filtered_users)
df_reviews = df_reviews[by_community_member]
still_reviewed = df_recipes['recipe_id'].isin(df_reviews['recipe_id'])
df_recipes = df_recipes[still_reviewed]

print("Shape of Filtered Members:", df_members.shape)
print("Shape of Filtered Reviews:", df_reviews.shape)
print("Shape of Filtered Recipes", df_recipes.shape)

### Collaborative Filtering (Applied @ each community)

In this section, we'll be exploring recommender systems that help suggest items based on similarities between users or items. We'll dive into both user-based and item-based collaborative filtering methods. Our aim is to apply these techniques to different communities, assess how well they work for each, and then gauge their overall performance by averaging the errors. 

#### Memory-Based

##### User-Based

We will predict a user's preferences based on the preferences of similar users (users in the same community).

In [None]:
# Community-level evaluation via the utils.py helper; returns the four averaged metrics.
# Positional args: 0.25 is presumably the held-out test fraction and True presumably selects
# user-based similarity (the item-based cell passes False) — confirm in utils.collaborative_filtering.
avg_rmse, avg_mae, avg_precision, avg_recall = ut.collaborative_filtering(df_reviews, filtered_communities, 0.25, True)

In [None]:
# Print the community-averaged metrics; \033[1m ... \033[0m renders the label in bold.
print(f"\033[1m-----Overall Performance-----\033[0m")
print(f"\033[1mAverage RMSE ->\033[0m", avg_rmse)
print(f"\033[1mAverage MAE ->\033[0m", avg_mae)
print(f"\033[1mAverage Precision ->\033[0m", avg_precision)
print(f"\033[1mAverage Recall ->\033[0m", avg_recall)
print()

# Record the averages in the user-based slot of each community score table.
for score_table, score in (
    (communities_rmse, avg_rmse),
    (communities_mae, avg_mae),
    (communities_precision, avg_precision),
    (communities_recall, avg_recall),
):
    score_table["User-Based CF"] = score

##### Item-Based

This time we will use a recommendation approach that predicts a user's preferences by examining similarities between items rather than users.

In [None]:
# Same utils.py helper as the user-based cell, with the fourth argument flipped to False —
# presumably selecting item-based similarity; confirm in utils.collaborative_filtering.
avg_rmse, avg_mae, avg_precision, avg_recall = ut.collaborative_filtering(df_reviews, filtered_communities, 0.25, False)

In [None]:
# Print the community-averaged metrics; \033[1m ... \033[0m renders the label in bold.
print(f"\033[1m-----Overall Performance-----\033[0m")
print(f"\033[1mAverage RMSE ->\033[0m", avg_rmse)
print(f"\033[1mAverage MAE ->\033[0m", avg_mae)
print(f"\033[1mAverage Precision ->\033[0m", avg_precision)
print(f"\033[1mAverage Recall ->\033[0m", avg_recall)
print()

# Record the averages in the item-based slot of each community score table.
for score_table, score in (
    (communities_rmse, avg_rmse),
    (communities_mae, avg_mae),
    (communities_precision, avg_precision),
    (communities_recall, avg_recall),
):
    score_table["Item-Based CF"] = score

#### Model-Based

We will employ model-based collaborative filtering for personalized recommendations, contrasting with memory-based methods. Unlike memory-based approaches that directly compare user-item interactions, model-based methods utilize mathematical models to capture underlying patterns and relationships in the data.

In [None]:
# Same utils.py helper, with the extra 'SVD' argument presumably switching the algorithm to
# SVD (the preceding False would then be irrelevant) — confirm in utils.collaborative_filtering.
avg_rmse, avg_mae, avg_precision, avg_recall = ut.collaborative_filtering(df_reviews, filtered_communities, 0.25, False, 'SVD')

In [None]:
# Print the community-averaged metrics; \033[1m ... \033[0m renders the label in bold.
print(f"\033[1m-----Overall Performance-----\033[0m")
print(f"\033[1mAverage RMSE ->\033[0m", avg_rmse)
print(f"\033[1mAverage MAE ->\033[0m", avg_mae)
print(f"\033[1mAverage Precision ->\033[0m", avg_precision)
print(f"\033[1mAverage Recall ->\033[0m", avg_recall)
print()

# Record the averages in the model-based slot of each community score table.
for score_table, score in (
    (communities_rmse, avg_rmse),
    (communities_mae, avg_mae),
    (communities_precision, avg_precision),
    (communities_recall, avg_recall),
):
    score_table["Model-Based CF"] = score

#### Content-based Filtering 

In this section, we'll be exploring recommender systems that suggest items based on similarities between the characteristics of items. We'll delve into content-based filtering methods, which recommend items to users based on the similarity of the items' features or attributes. Our aim is to apply these techniques to different communities, assess how well they work for each community, and then evaluate their overall performance.

By vectorizing text based features we can find similar recipes.

In [None]:
# community id -> {recipe id -> {'original_title': ..., 'similar_recipes': [...]}},
# as produced by the utils.py helper and consumed by the loops below.
all_recommendations = ut.find_similars(df_reviews, df_recipes, filtered_communities)

for community_id, community_recommendations in all_recommendations.items():
    print(f"\033[1mCommunity {community_id} Recommendations:\033[0m")
    for recipe_id, recipe_data in community_recommendations.items():
        print(f"\n\033[1mOriginal Recipe: {recipe_data['original_title']} (Recipe ID: {recipe_id})\033[0m")
        print("Similar Recipes:")
        # Ids already listed for this original recipe, so duplicates print only once.
        already_listed = set()
        for similar_recipe in recipe_data['similar_recipes']:
            if similar_recipe['id'] in already_listed:
                continue
            print(f"- {similar_recipe['title']} (Recipe ID: {similar_recipe['id']}) | Similarity Score: {similar_recipe['score']:.2f}")
            already_listed.add(similar_recipe['id'])
        print()

In [None]:
# Flatten the nested recommendations into a table via the utils.py helper; the result is
# passed to ut.content_based_filtering in the next cell.
df_similar_recipes = ut.create_similar_recipes_dataframe(all_recommendations)
# Plain-text dump of the table (for large results, consider df_similar_recipes.head()).
print(df_similar_recipes)

In [None]:
# Community-level content-based evaluation via the utils.py helper; only RMSE and MAE are returned.
# 0.25 is presumably the held-out test fraction; the meaning of the trailing False is not visible
# here — confirm in utils.content_based_filtering.
avg_rmse, avg_mae = ut.content_based_filtering(df_reviews, df_similar_recipes, filtered_communities, 0.25, False)  

As can be seen, some communities have an RMSE and an MAE of 0. This happens when no recipe reviewed by a user has a similar recipe that was also reviewed by the same user. For the sake of the integrity of the average results, results from these communities will be excluded from the overall calculation.

In [None]:
# Print the community-averaged errors; \033[1m ... \033[0m renders the label in bold.
print(f"\033[1m-----Overall Performance-----\033[0m")
print(f"\033[1mAverage RMSE ->\033[0m", avg_rmse)
print(f"\033[1mAverage MAE ->\033[0m", avg_mae)
print()

# Record the errors in the content-based slot; precision and recall are not
# produced for this model, so their slots are set to 0.
for score_table, score in (
    (communities_rmse, avg_rmse),
    (communities_mae, avg_mae),
    (communities_precision, 0),
    (communities_recall, 0),
):
    score_table["Content-Based Filtering"] = score

### Popularity model (Naive Approach)

In [None]:
# Popularity model: recommend the same best-rated recipes to everyone.
# Only recipes with more than 30 ratings qualify, ordered by average rating.
is_popular = df_recipes['number_of_ratings'] > 30
df_recipes_top = df_recipes[is_popular].sort_values(by='average_rating', ascending=False)

# The ten highest-rated qualifying recipes are the recommendations.
top_n_popularity = df_recipes_top.head(10)

print("\nTop Recommendations using Popularity Model:")
for _, recipe in top_n_popularity.iterrows():
    print(f"ID: {recipe['recipe_id']}, Title: {recipe['title']}, Average Rating: {recipe['average_rating']:.2f}, Number of Ratings: {recipe['number_of_ratings']}")


### Random Recommender

In [None]:
# Build the Surprise dataset and the single train/test split shared by every
# "whole data" model below.
# NOTE: df_reviews was already restricted to members of the retained communities in the
# "Recommender System" setup cell, so "whole data" means those reviews taken as one pool.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_reviews[['member_id', 'recipe_id', 'rating']], reader)
# Fixed seed: without random_state each run draws a different split, so the
# whole-data metrics could not be reproduced after Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)



In [None]:
# Baseline on the shared split: Surprise's NormalPredictor, scored by the utils.py helper.
random_algo = NormalPredictor()
rmse, mae, predictions, precision, recall = ut.evaluate_model(random_algo, trainset, testset)

print(f"RMSE -> {rmse}")
print(f"MAE -> {mae}")
print(f"Precision -> {precision}")
print(f"Recall -> {recall}")

# Record the scores in the random-recommender slot of each whole-data table.
for score_table, score in (
    (whole_dataset_rmse, rmse),
    (whole_dataset_mae, mae),
    (whole_dataset_precision, precision),
    (whole_dataset_recall, recall),
):
    score_table["Random Recommender"] = score
# Keep the raw predictions for the error-distribution plots.
models_predictions["Random Recommender"] = predictions

In [None]:
# Random baseline evaluated per community via the same utils.py helper as the CF models;
# 'Random' presumably selects the random predictor — confirm in utils.collaborative_filtering.
avg_rmse, avg_mae, avg_precision, avg_recall = ut.collaborative_filtering(df_reviews, filtered_communities, 0.25, True, 'Random')

# Record the averages in the random-recommender slot of each community score table.
for score_table, score in (
    (communities_rmse, avg_rmse),
    (communities_mae, avg_mae),
    (communities_precision, avg_precision),
    (communities_recall, avg_recall),
):
    score_table["Random Recommender"] = score

### Collaborative Filtering (Applied @ whole data)

#### Memory-Based

##### User-Based

In [None]:
# User-based KNN fitted and scored on the shared whole-data split.
ubcf_algo = KNNBasic(sim_options={'user_based': True})
rmse, mae, pred_user, precision, recall = ut.evaluate_model(ubcf_algo, trainset, testset)

print(f"RMSE -> {rmse}")
print(f"MAE -> {mae}")
print(f"Precision -> {precision}")
print(f"Recall -> {recall}")

# Record the scores in the user-based slot of each whole-data table.
for score_table, score in (
    (whole_dataset_rmse, rmse),
    (whole_dataset_mae, mae),
    (whole_dataset_precision, precision),
    (whole_dataset_recall, recall),
):
    score_table["User-Based CF"] = score
# Keep the raw predictions for the error-distribution plots.
models_predictions["User-Based CF"] = pred_user

##### Item-Based

In [None]:
# Item-based KNN fitted and scored on the shared whole-data split.
ibcf_algo = KNNBasic(sim_options={'user_based': False})
rmse, mae, pred_item, precision, recall = ut.evaluate_model(ibcf_algo, trainset, testset)

print(f"RMSE -> {rmse}")
print(f"MAE -> {mae}")
print(f"Precision -> {precision}")
print(f"Recall -> {recall}")

# Record the scores in the item-based slot of each whole-data table.
for score_table, score in (
    (whole_dataset_rmse, rmse),
    (whole_dataset_mae, mae),
    (whole_dataset_precision, precision),
    (whole_dataset_recall, recall),
):
    score_table["Item-Based CF"] = score
# Keep the raw predictions for the error-distribution plots.
models_predictions["Item-Based CF"] = pred_item

#### Model-Based

In [None]:
# Matrix-factorisation (SVD) model fitted and scored on the shared whole-data split.
svd_algo = SVD(verbose = False)
# Fix: evaluate svd_algo. The original passed ibcf_algo here, so the "Model-Based CF"
# scores and predictions were just a second run of the item-based KNN model.
rmse, mae, pred_model, precision, recall = ut.evaluate_model(svd_algo, trainset, testset)
print(f"RMSE -> {rmse}")
print(f"MAE -> {mae}")
print(f"Precision -> {precision}")
print(f"Recall -> {recall}")

# Record the scores in the model-based slot of each whole-data table.
whole_dataset_rmse["Model-Based CF"] = rmse
whole_dataset_mae["Model-Based CF"] = mae
whole_dataset_precision["Model-Based CF"] = precision
whole_dataset_recall["Model-Based CF"] = recall
# Keep the raw predictions for the error-distribution plots.
models_predictions["Model-Based CF"] = pred_model

#### Content-Based

### Evaluation

In [None]:
# Side-by-side bars comparing each model's error when trained per community
# versus on the whole (retained) dataset: RMSE on top, MAE below.
models = ["Random Recommender", "User-Based CF", "Item-Based CF",
          "Model-Based CF", "Content-Based Filtering"]
indices = np.arange(len(models))


def _in_model_order(score_table):
    """Return the table's scores as a list ordered like `models`."""
    return [score_table[model] for model in models]


# One list per metric and scope, aligned with `models`.
communities_rmse_values = _in_model_order(communities_rmse)
communities_mae_values = _in_model_order(communities_mae)
communities_precision_values = _in_model_order(communities_precision)
communities_recall_values = _in_model_order(communities_recall)
whole_dataset_rmse_values = _in_model_order(whole_dataset_rmse)
whole_dataset_mae_values = _in_model_order(whole_dataset_mae)
whole_dataset_precision_values = _in_model_order(whole_dataset_precision)
whole_dataset_recall_values = _in_model_order(whole_dataset_recall)

# What each of the two panels shows; only RMSE and MAE are plotted.
panel_specs = (
    {
        'ylabel': 'RMSE',
        'title': 'Comparison of RMSE Values for Different Models',
        'community_values': communities_rmse_values,
        'whole_values': whole_dataset_rmse_values,
        'community_color': 'lightsalmon',
        'whole_color': 'wheat',
        'community_label': 'Communities RMSE',
        'whole_label': 'Whole Dataset RMSE',
    },
    {
        'ylabel': 'MAE',
        'title': 'Comparison of MAE Values for Different Models',
        'community_values': communities_mae_values,
        'whole_values': whole_dataset_mae_values,
        'community_color': 'cornflowerblue',
        'whole_color': 'seagreen',
        'community_label': 'Communities MAE',
        'whole_label': 'Whole Dataset MAE',
    },
)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))

for ax, spec in zip(axes, panel_specs):
    # Community bar to the left of each tick, whole-dataset bar to the right.
    ax.bar(indices - 0.2, spec['community_values'], width=0.4,
           color=spec['community_color'], alpha=0.6, label=spec['community_label'])
    ax.bar(indices + 0.2, spec['whole_values'], width=0.4,
           color=spec['whole_color'], alpha=0.6, label=spec['whole_label'])
    ax.set_xticks(indices)
    ax.set_xticklabels(models, rotation=45)
    ax.set_ylabel(spec['ylabel'])
    ax.set_title(spec['title'])
    ax.legend()

    # Leave a little headroom above the tallest bar.
    ax.set_ylim(0, max(spec['community_values'] + spec['whole_values']) + 0.2)

# Extra vertical space so the rotated tick labels do not collide with the lower title.
plt.subplots_adjust(hspace=0.5)

plt.show()

In [None]:


# Histogram of absolute prediction errors for every model that stored
# whole-data predictions (content-based filtering has none).
models = ["Random Recommender", "User-Based CF", "Item-Based CF", "Model-Based CF"]

# One stacked panel per model.
fig, axes = plt.subplots(nrows=len(models), ncols=1, figsize=(10, 15))

for ax, model in zip(axes, models):
    predictions = models_predictions[model]
    # |actual rating - estimated rating| for each test prediction.
    errors = [abs(pred.r_ui - pred.est) for pred in predictions]

    hist, bins, _ = ax.hist(errors, bins=30, color='teal', alpha=0.6)
    ax.set_title(f'Error Distribution for {model}')
    ax.set_xlabel('Error')
    ax.set_ylabel('Frequency')

    # Label each bar with its count, centred on the bar. Bin centres are derived
    # from the actual bin edges; the previous fixed `bins[a] + 0.06` offset was
    # only centred when the bins happened to be about 0.12 wide.
    bin_centers = (bins[:-1] + bins[1:]) / 2
    for center, count in zip(bin_centers, hist):
        ax.text(center, count + 0.5, int(count), ha='center', va='bottom')

plt.tight_layout()
plt.show()
