In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import importlib as imp

import src
import trecs
from trecs.models import ContentFiltering
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement

random_state = np.random.seed(42)

In [20]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [21]:
ratings_df = pd.read_csv('data/ml-100k/u.data', 
                 sep="\t", 
                 names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

## Creating RecommenderSystem on MovieLens 

### Creating Embeddings

In [24]:
from sklearn.decomposition import NMF

binary_ratings_df = ratings_df.drop(columns=['Timestamp'])
binary_ratings_df.loc[binary_ratings_df['Rating'] > 0, 'Rating'] = 1

# turn dataframe into matrix where each movie is a column and each user is a row
binary_ratings_matrix = binary_ratings_df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

In [25]:
from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# split data into train and test sets
train_interactions, test_interactions = random_train_test_split(sparse.csr_matrix(binary_ratings_matrix), test_percentage=0.2, random_state=random_state)
train_interactions = train_interactions.toarray()
test_interactions = test_interactions.toarray()

  "LightFM was compiled without OpenMP support. "


In [33]:
n_attrs=100

nmf = NMF(n_components=n_attrs, solver="mu", max_iter=500)
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(user_representation.shape, item_representation.shape)



(943, 100) (100, 1682)


## Create RS Model

In [None]:
recsys = trecs.models.ContentFiltering(
    user_representation=user_representation,
    item_representation=item_representation,
    record_base_state=True,
)

In [None]:
print("Model representation of users and items are given by:")
print(f"- An all-zeros matrix of users of dimension {recsys.predicted_user_profiles.shape}")
print(f"- A randomly generated matrix of items of dimension {recsys.predicted_item_attributes.shape}")

Model representation of users and items are given by:
- An all-zeros matrix of users of dimension (943, 100)
- A randomly generated matrix of items of dimension (100, 1682)


In [None]:
user_pairs = [(u_idx, v_idx) for u_idx in range(recsys.num_users) for v_idx in range(recsys.num_users)]

mse = MSEMeasurement(diagnostics=True)

recsys.add_metrics(InteractionMeasurement(),
                   mse,
                   InteractionSpread(),
                   RecSimilarity(pairs=user_pairs),
                   )

### Calculate MSE

In [None]:
# Calculate the mean absolute error using the recsys score function and a threshold value to decide when to recommend
from sklearn.metrics import mean_squared_error

prediction_threshold = 0.6

predicted_scores = recsys.score_fn(user_representation, item_representation)
predicted_recs = np.where(predicted_scores > prediction_threshold, 1, 0)

mse = mean_squared_error(binary_ratings_matrix, predicted_recs)

print(f'Mean squared error: {mse}.')

Mean squared error: 0.050920292587095864.


### Create Topic Clusters

In [30]:
from sklearn.cluster import KMeans

def get_topic_clusters(interaction_matrix, n_clusters:int=100, n_attrs:int=100, max_iter:int=100, nmf_solver:str="mu"):
    """
    Creates clusters of movies based on their genre.
    Inputs:
        binary_ratings_matrix: a binary matrix of users and movies
        n_attrs: number of attributes to use in NMF
        nmf_solver: solver to use in NMF
    Outputs:
        clusters: a list of cluster assignments
    """
    # Create topic clusters
    #create co-occurence matrix from binary_interaction_matrix
    co_occurence_matrix = interaction_matrix.T @ interaction_matrix
    co_occurence_matrix

    # Matrix factorize co_occurence_matrix to get embeddings
    nmf_cooc = NMF(n_components=n_attrs, solver=nmf_solver, max_iter=max_iter)
    W_topics = nmf_cooc.fit_transform(co_occurence_matrix)

    # cluster W_topics
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(W_topics)

    # assign nearest cluster to observation
    cluster_ids = kmeans.predict(W_topics)

    return cluster_ids

In [None]:

topics = get_topic_clusters(binary_ratings_matrix, n_clusters=50, n_attrs=n_attrs, nmf_solver="mu")

## Calculate Evaluation Metrics

In [None]:
# Generate example recommnedations
recs = recsys.recommend()

In [None]:
# Calculate diversity, novelty, 
imp.reload(src.evaluation_metrics)
from src.evaluation_metrics import calculate_diversity, calculate_novelty, calculate_spread, calculate_serendipity

def calculate_diversity_metrics(topics, recs):
    """
    Calculates diversity metrics for a set of recommendations.
    Inputs:
        topics: a list of cluster assignments
        recs: a list of recommendations
    Outputs:
        diversity_metrics: a dictionary of diversity metrics
    """
    diversity, serendipity, spread, novelty = 0, 0, 0, 0
    for user_id, slate in enumerate(recs):
        # novelty += calculate_novelty(slate, len(recs), binary_ratings_matrix)
        # serendipity += calculate_serendipity(slate, user_representation[user_id])
        # spread += calculate_spread(slate)
        diversity += calculate_diversity(topics, slate)

    [diversity, serendipity, spread, novelty] = np.divide([diversity, serendipity, spread, novelty], len(recs)) 

    diversity_metrics = {
        'novelty': novelty,
        'serendipity': serendipity,
        'spread': spread,
        'diversity': diversity,
    }
    return diversity_metrics

# print key and value of calcualte_diversity_metrics
diversity_metrics = calculate_diversity_metrics(topics, recs)
for key, value in diversity_metrics.items():
    print(f'{key}: {value}')

## Re-rank recommendations
There are two main approaches in this section to re-ranking the recommendations.
1. Only change the order of the top k recommendations
2. Consider an unreasonably large set of recommendations, re-order that and then select the top k

In [None]:
# Get example recommendations before re-ranking
recs = recsys.recommend()

# Calculate mean average precision
def calculate_mean_average_precision(recs, interactions_matrix):
    """
    Calculates mean average precision for a set of recommendations.
    Inputs:
        recs: a list of recommendations
        binary_ratings_matrix: a binary matrix of interactions between users and items
    Outputs:
        map: mean average precision"""
    precision = 0
    for user_id, slate in enumerate(recs):
        for item_id in slate:
            if interactions_matrix[user_id][item_id] == 1:
                precision += 1
    map = precision / (len(recs) * len(recs[0]))
    return map

In [None]:
# Calculate metrics for myopic RS
k=3
top_k_recs = recs[:, 0:k]
map = calculate_mean_average_precision(top_k_recs, binary_ratings_matrix)
print(f'Mean absolute precision: {round(map*100, 2)}%.')

# print key and value of calcualte_diversity_metrics
diversity_metrics = calculate_diversity_metrics(topics, top_k_recs)
for key, value in diversity_metrics.items():
    print(f'{key}: {np.round(value*100, 2)}%')

Mean absolute precision: 95.09%.
novelty: 0.0%
serendipity: 0.0%
spread: 0.0%
diversity: 82.15%


In [None]:
# calculate cosine similarity for items in slate
from numpy.linalg import norm

def calculate_cosine_similarities(slate, item_representation):
    """
    Calculates cosine similarity for a set of recommendations.
    Inputs:
        slate: a list of recommendations
        item_representation: a matrix of item representations
    Outputs:
        cosine_similarity: mean average precision
    """
    cosine_similarities = []
    for item_id in slate:
        cosine_similarity = 0
        for item_id_2 in slate:
            if item_id != item_id_2:
                vec_1 = item_representation[:, item_id]
                vec_2 = item_representation[:, item_id_2]
                vec_prod = np.dot(vec_1, vec_2) / (norm(vec_1) * norm(vec_2))
                cosine_similarity += vec_prod
        cosine_similarities.append(cosine_similarity)
    return cosine_similarities

cosine_similarities = calculate_cosine_similarities(recs[0], item_representation)

In [None]:
# Re-ranking scores
def re_rank_scores(item_representation, recommendations):
    """
    Re-ranks scores for a set of recommendations.
    Inputs:
        item_representation: a matrix of item representations
        recommendations: a list of recommendations
    Outputs:
        re_ranked_recommendations: a list of re-ranked recommendations
    """
    exps = [np.round(x * 0.1, 1) for x in range(0, len(recommendations[0]))][::-1]
    initial_scores = np.exp(exps)
    re_ranked_recommendations = np.zeros_like(recommendations)
    
    for i, slate in enumerate(recommendations):
        # print(f"Slate:\t\t\t{slate}")
        cosine_similarities = calculate_cosine_similarities(slate, item_representation=item_representation)
        # multiply cosine_similarities with each list in recommendations
        re_ranked_scores = initial_scores * 1/cosine_similarities
        # print(f'Initial Scores:\t\t{np.round(initial_scores, 2)}')
        # print(f'Re-ranked scores:\t{np.round(re_ranked_scores, 2)}')
        tup = list(zip(slate, re_ranked_scores))
        tup.sort(key = lambda x: x[1], reverse=True)
        # create list from second element in each tuple in tup
        re_ranked_slate = np.array([x[0] for x in tup])
        # print(f"Re-ranked Slate:\t{re_ranked_slate}")
        re_ranked_recommendations[i] = re_ranked_slate

    return re_ranked_recommendations

re_ranked_recs = re_rank_scores(item_representation, recs)

In [None]:
# Calculate metrics for myopic RS
top_k_reranked_recs = re_ranked_recs[:, 0:k]
map = calculate_mean_average_precision(top_k_reranked_recs, binary_ratings_matrix)
print(f'Mean absolute precision: {round(map*100, 2)}%.')

# print key and value of calcualte_diversity_metrics
diversity_metrics = calculate_diversity_metrics(topics, top_k_reranked_recs)
for key, value in diversity_metrics.items():
    print(f'{key}: {np.round(value*100, 2)}%')

Mean absolute precision: 93.18%.
novelty: 0.0%
serendipity: 0.0%
spread: 0.0%
diversity: 89.86%
