In [33]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pickle
import os
from sklearn.decomposition import NMF, PCA
from sklearn.cluster import KMeans
from importlib import reload

import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../t-recs/')
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users
import trecs.matrix_ops as mo
import src.globals as globals
import seaborn as sns

from wrapper.models.bubble import BubbleBurster
from src.utils import *
from src.plotting import plot_measurements 
from src.scoring_functions import cosine_sim, entropy, content_fairness
from wrapper.metrics.evaluation_metrics import *

# Seed NumPy's global RNG. np.random.seed() returns None, so keep the seed
# value itself in `random_state` rather than the return value of the call.
random_state = 42
np.random.seed(random_state)

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and deprecated the bare "seaborn" name. Prefer the new name when it is
# available and fall back to the old one on older Matplotlib versions.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Silence FutureWarning messages for the rest of the session.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Run the initialisation hook of src.globals (imported above as `globals`,
# which shadows the Python builtin of the same name inside this notebook).
globals.initialize()

  plt.style.use("seaborn")


In [34]:
# Hyper-parameters shared by the embedding and clustering cells below:
# number of latent attributes, iteration cap, and number of clusters.
n_attrs, max_iter, n_clusters = 20, 1000, 25

In [35]:
# Scoring function to plug into the model: 'cosine_sim', 'entropy' or
# 'content_fairness'. A falsy value keeps the plain 'myopic' model.
score_fn = 'entropy'
# Whether to switch on probabilistic recommendations in the model config.
probabilistic = False

# Set ALPHA on the shared src.globals module and keep a local copy, which is
# later used for the "Lambda" suffix of the output file names.
globals.ALPHA = 0.2
alpha = globals.ALPHA

# Parameters forwarded to the trecs Users component.
attention_exp = -0.8
drift = 0.05

In [36]:
# Load the MovieLens 100k ratings file through the project helper.
# NOTE(review): judging by the variable name the result is a binarised
# user-item interaction matrix — confirm against load_and_process_movielens.
binary_ratings_matrix = load_and_process_movielens(file_path='data/ml-100k/u.data')

In [37]:
# Get user and item representations using NMF.
# Users are the rows of user_representation; item_representation is laid out
# as (n_attrs, n_items), i.e. items are its columns.
# NOTE(review): the cell output "Loaded embeddings." suggests results are read
# from a cache when one exists — confirm in create_embeddings.
user_representation, item_representation = create_embeddings(binary_ratings_matrix, n_attrs=n_attrs, max_iter=max_iter)

Loaded embeddings.


In [38]:
# Cluster items and users in the embedding space. The item matrix is
# transposed so that, like the user matrix, each row is one entity.
# NOTE(review): the clustering algorithm lives in get_clusters (not shown here).
cluster_kwargs = dict(n_clusters=n_clusters, n_attrs=n_attrs, max_iter=max_iter)

item_cluster_ids, item_cluster_centers = get_clusters(
    item_representation.T,
    name='item',
    **cluster_kwargs,
)
user_cluster_ids, user_cluster_centers = get_clusters(
    user_representation,
    name='user',
    **cluster_kwargs,
)

Loaded clusters.
Loaded clusters.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [39]:
# Sanity check of the item embedding layout: (n_attrs, n_items).
item_representation.shape

(20, 1682)

In [40]:
# Population sizes: users are the rows of user_representation, items are the
# columns of item_representation.
num_users, num_items = user_representation.shape[0], item_representation.shape[1]
print(f'Number of items: {num_items}', f'Number of users: {num_users}', sep='\n')

# Simulated user population built from the user embeddings, with the drift and
# attention exponent chosen in the parameter cell and repeat interactions off.
users = Users(
    actual_user_profiles=user_representation,
    repeat_interactions=False,
    drift=drift,
    attention_exp=attention_exp,
)

Number of items: 1682
Number of users: 943


In [41]:
experiment_name = 'users_by_topic'

# Map each user onto the item-topic clusters using the item cluster centers.
user_item_cluster_mapping = user_topic_mapping(
    user_representation,
    item_cluster_centers,
)  # TODO: Remove?

# Build two sets of user pairs from that mapping.
# NOTE(review): presumably "inter" pairs users from different clusters and
# "intra" pairs users from the same cluster — confirm in create_cluster_user_pairs.
inter_cluster_user_pairs, intra_cluster_user_pairs = create_cluster_user_pairs(
    user_item_cluster_mapping
)

In [42]:
# Stand-alone MSE metric instance; it is not the one placed in the list below.
mse = MSEMeasurement()

# Metrics to register on the model, in registration order.
measurements = [InteractionMeasurement(), MSEMeasurement(), InteractionSpread()]

# Interaction and recommendation similarity, each tracked separately for the
# inter-cluster and the intra-cluster user pairs.
measurements += [
    InteractionSimilarity(pairs=inter_cluster_user_pairs, name='inter_cluster_interaction_similarity'),
    InteractionSimilarity(pairs=intra_cluster_user_pairs, name='intra_cluster_interaction_similarity'),
    RecSimilarity(pairs=inter_cluster_user_pairs, name='inter_cluster_rec_similarity'),
    RecSimilarity(pairs=intra_cluster_user_pairs, name='intra_cluster_rec_similarity'),
]

# Remaining project-defined metrics.
measurements += [
    UserMSEMeasurement(),
    SerendipityMetric(),
    DiversityMetric(),
    NoveltyMetric(),
    RecallMeasurement(),
    MeanNumberOfTopics(),
]

In [43]:
# Model
# Keyword arguments passed to BubbleBurster when the model is constructed.
config = {
    'actual_user_representation': users,
    'actual_item_representation': item_representation,
    'item_topics': item_cluster_ids,
    'num_attributes': n_attrs,
    'num_items_per_iter': 10,
    'seed': 42,
    'record_base_state': True,
}

# Selectable scoring functions: name -> (callable, requires_alpha).
# `requires_alpha` controls whether the "Lambda" suffix is appended to the
# output file names when results are saved.
score_fn_options = {
    'cosine_sim': (cosine_sim, True),
    'entropy': (entropy, True),
    'content_fairness': (content_fairness, False),
}

# Defaults describe the plain model without a custom scoring function.
model_name = 'myopic'
requires_alpha = False

if score_fn:
    if score_fn not in score_fn_options:
        # ValueError (a subclass of Exception) names the offending value and
        # the accepted ones, instead of a generic, context-free Exception.
        raise ValueError(
            f'Given score function {score_fn!r} does not exist; '
            f'expected one of {sorted(score_fn_options)}.'
        )
    config['score_fn'], requires_alpha = score_fn_options[score_fn]
    model_name = score_fn

if probabilistic:
    config['probabilistic_recommendations'] = True
    model_name += '_prob'

In [44]:
# Build the model from the keyword arguments collected in `config`.
model = BubbleBurster(**config)

# Register every metric object from the `measurements` list on the model.
model.add_metrics(*measurements)

In [45]:
# Fair Model
train_timesteps=5
model.startup_and_train(timesteps=train_timesteps)

100%|██████████| 5/5 [00:20<00:00,  4.14s/it]


In [46]:
# Simulation phase after training. The number of run timesteps is also
# embedded in the names of the files written when results are saved.
run_timesteps=20
model.run(timesteps=run_timesteps)

100%|██████████| 20/20 [02:24<00:00,  7.22s/it]


In [47]:
# Re-import src.utils so edits made to the module while the kernel is running
# are picked up (development convenience).
import src
reload(src.utils)
from src.utils import *

# Determine file name based on parameter values
parameters = f'_{train_timesteps}trainTimesteps_{run_timesteps}runTimesteps_{n_attrs}nAttrs_{n_clusters}nClusters_{drift}Drift_{attention_exp}AttentionExp'
if requires_alpha:
    parameters += f'_{alpha}Lambda'

# Output folders. Create them up front so that saving does not fail on a
# fresh checkout where the artefact directories do not exist yet.
final_preferences_dir = 'artefacts/supplementary/final_preferences/'
measurements_dir = 'artefacts/supplementary/measurements/'
for output_dir in (final_preferences_dir, measurements_dir):
    os.makedirs(output_dir, exist_ok=True)

# Save actual user preferences
file_prefix = f'{model_name}_final_preferences'
final_preferences_path = final_preferences_dir + file_prefix + parameters + '.npy'
np.save(final_preferences_path, model.users.actual_user_profiles.value, allow_pickle=True)

# Save measurements
file_prefix = f'{model_name}_measurements'
measurements_path = measurements_dir + file_prefix + parameters + '.csv'
measurements_df = load_or_create_measurements_df(model, model_name, train_timesteps, measurements_path)

# Saving interaction histogram: one row per timestep, one column per entry of
# the histogram. Array-valued cells are abbreviated when written as part of
# the main CSV, so the full arrays go to their own file.
path_interaction_histogram = f'{measurements_dir}{model_name}_interaction_histogram{parameters}.csv'
interaction_hist = measurements_df['interaction_histogram'].copy()
# The first timestep has no histogram; fill it with a NaN vector of the same
# length as the next entry so that all rows can be stacked into one 2-D array.
interaction_hist[0] = np.repeat(np.nan, interaction_hist[1].shape[0], axis=0)
interaction_hist = np.stack(interaction_hist.values)
interaction_hist_df = pd.DataFrame(interaction_hist)
interaction_hist_df.to_csv(path_interaction_histogram, index=False)

# Saving user_mse histogram: one row per timestep, stacked the same way.
path_user_mse_histogram = f'{measurements_dir}{model_name}_user_mse_histogram{parameters}.csv'
user_mse = measurements_df['user_mse'].copy()
user_mse = np.stack(user_mse.values)
user_mse_df = pd.DataFrame(user_mse)
user_mse_df.to_csv(path_user_mse_histogram, index=False)

# Saving all measurements
measurements_df.to_csv(measurements_path, index=False)
print('Measurements saved.')

Measurements saved.


In [48]:
def plot_measurements(dfs, parameters_df):
    """Plot the tracked metrics of several runs on a shared 4x3 grid of axes.

    Note: this notebook-level definition replaces the ``plot_measurements``
    imported from ``src.plotting`` at the top of the notebook.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        One measurements frame per run. Every frame must contain the columns
        ``timesteps``, ``mse``, ``user_mse`` and ``recall_at_k``; the other
        metric columns are plotted only when present.
    parameters_df : pandas.DataFrame
        Row ``i`` (looked up with ``.loc[i, ...]``) describes ``dfs[i]`` and
        must provide ``model_name`` and ``Lambda``. A missing ``Lambda``
        (NaN, None, NA) means no "(Lambda: ...)" suffix in the legend entry.

    Returns
    -------
    None
        The figure is created with ``plt.subplots`` and left to the active
        Matplotlib backend to display.

    Raises
    ------
    KeyError
        If a required column is missing from a frame or from ``parameters_df``.
    """
    # (column, panel title, y-axis label), listed in row-major panel order so
    # that the entries line up with ``ax.flat``.
    panels = [
        ('mse', 'Mean Squared Error', 'MSE'),
        ('user_mse', 'User Mean Squared Error', 'MSE'),
        ('recall_at_k', 'Recall', 'Recall'),
        ('interaction_spread', 'Interaction Spread', 'Jaccard Similarity'),
        ('inter_cluster_interaction_similarity', 'Inter Cluster Interaction Similarity', 'Jaccard Similarity'),
        ('intra_cluster_interaction_similarity', 'Intra Cluster Interaction Similarity', 'Jaccard Similarity'),
        ('diversity_metric', 'Diversity', 'Diversity'),
        ('inter_cluster_rec_similarity', 'Inter Cluster Recommendation similarity', 'Jaccard Similarity'),
        ('intra_cluster_rec_similarity', 'Intra Cluster Recommendation similarity', 'Jaccard Similarity'),
        ('serendipity_metric', 'Serendipity', 'Serendipity'),
        ('novelty_metric', 'Novelty', 'Novelty'),
        ('mean_num_topics', 'Mean Number of Topics Interacted per User', 'Mean Number of Topics Interacted per User'),
    ]
    # These columns are always plotted (fully opaque); all others are optional
    # and drawn semi-transparent.
    required_columns = {'mse', 'user_mse', 'recall_at_k'}

    fig, ax = plt.subplots(4, 3, figsize=(15, 15))
    fig.tight_layout(pad=5.0)

    legend_lines, legend_names = [], []
    for i, df in enumerate(dfs):
        ts = df['timesteps']
        name = parameters_df.loc[i, 'model_name']
        lambda_value = parameters_df.loc[i, 'Lambda']
        # pd.notna copes with None / pd.NA / non-float values, on which
        # np.isnan raises a TypeError.
        if pd.notna(lambda_value):
            name += f" (Lambda: {lambda_value})"
        legend_names.append(name)

        for axis, (column, _, _) in zip(ax.flat, panels):
            if column in required_columns:
                plotted = axis.plot(ts, df[column], label=name)
                if column == 'mse':
                    # One handle per run (taken from the MSE panel) feeds the
                    # shared figure legend.
                    legend_lines.append(plotted[0])
            elif column in df.columns:
                axis.plot(ts, df[column], label=name, alpha=0.5)

    for axis, (_, title, ylabel) in zip(ax.flat, panels):
        axis.set_xlabel('Timestep')
        axis.set_title(title)
        axis.set_ylabel(ylabel)

    # NOTE(review): this panel is drawn against `timesteps` like the others,
    # yet its x-axis is labelled 'User ID' — kept as in the original; confirm.
    ax[0, 1].set_xlabel('User ID')

    fig.legend(legend_lines, legend_names, loc='upper center', fontsize=14, frameon=False, ncol=5, bbox_to_anchor=(.5, 1.05))

In [50]:
# Fetch the recorded metric values under their own name. Rebinding
# `measurements` here would overwrite the list of metric objects that is
# registered on the model, so re-running that earlier cell would then fail.
measurement_results = model.get_measurements()
measurement_results['intra_cluster_interaction_similarity']

[None,
 0.0036580933956289244,
 0.004084617075309885,
 0.004776629984179791,
 0.005546776725527721,
 0.0062603956079672575,
 0.007494669937734605,
 0.009220792257337075,
 0.011266096739274752,
 0.013309680202130216,
 0.014998457289770822,
 0.016645454775808647,
 0.018042117382745932,
 0.01949998873780004,
 0.02084258534920795,
 0.02185158008693212,
 0.022842023720266506,
 0.023649050771219943,
 0.02433787510979452,
 0.025054520233856337,
 0.025748382771203074,
 0.026437800351867373,
 0.027093648204908283,
 0.027541286977165083,
 0.028065023402112716,
 0.02857914741053031]

In [29]:
# Load previously saved runs for comparison.
# Older files carry the DataFrame index as an 'Unnamed: 0' column, whereas
# files written with `index=False` do not. errors='ignore' drops the column
# when it exists and leaves the frame untouched otherwise, instead of raising
# a KeyError.
df_myopic = pd.read_csv(
    'artefacts/supplementary/measurements/myopic_measurements_5trainTimesteps_20runTimesteps_20nAttrs_25nClusters_0.05Drift_-0.8AttentionExp.csv'
).drop(columns='Unnamed: 0', errors='ignore')
df_cosine_sim = pd.read_csv(
    'artefacts/supplementary/measurements/cosine_sim_measurements_5trainTimesteps_20runTimesteps_20nAttrs_25nClusters_0.05Drift_-0.8AttentionExp_0.2Lambda.csv'
).drop(columns='Unnamed: 0', errors='ignore')
df_cosine_sim.head(3)

Unnamed: 0,interaction_histogram,mse,interaction_spread,inter_cluster_interaction_similarity,intra_cluster_interaction_similarity,inter_cluster_rec_similarity,intra_cluster_rec_similarity,user_mse,serendipity_metric,diversity_metric,novelty_metric,recall_at_k,mean_num_topics,timesteps,state,model
0,,0.090065,,,,,,[0.03512269 0.08176823 0.16119423 0.2149699 0...,,,,,,0,train,cosine_sim
1,[6. 0. 1. ... 0. 0. 0.],0.094148,-939.0,0.001247,0.003658,0.003169,0.005219,[0.0423986 0.08672108 0.159959 0.22921314 0...,0.899788,0.828526,,0.943796,1.0,1,train,cosine_sim
2,[1. 1. 1. ... 0. 0. 0.],0.09786,-1.0,0.001623,0.004085,0.0031,0.005164,[0.04715542 0.09561756 0.15838462 0.23577749 0...,0.91421,0.82763,,0.95228,2.0,2,train,cosine_sim


In [32]:
# Preview the first rows of the saved measurements of the myopic run.
df_myopic.head(3)

Unnamed: 0,interaction_histogram,mse,interaction_spread,inter_cluster_interaction_similarity,intra_cluster_interaction_similarity,inter_cluster_rec_similarity,intra_cluster_rec_similarity,user_mse,serendipity_metric,diversity_metric,novelty_metric,recall_at_k,mean_num_topics,timesteps,state,model
0,,0.090065,,,,,,[0.03512269 0.08176823 0.16119423 0.2149699 0...,,,,,,0,train,myopic
1,[6. 0. 1. ... 0. 0. 0.],0.094148,-939.0,0.001247,0.003658,0.003169,0.005219,[0.0423986 0.08672108 0.159959 0.22921314 0...,0.899788,0.828526,,0.943796,1.0,1,train,myopic
2,[1. 1. 1. ... 0. 0. 0.],0.09786,-1.0,0.001623,0.004085,0.0031,0.005164,[0.04715542 0.09561756 0.15838462 0.23577749 0...,0.91421,0.82763,,0.95228,2.0,2,train,myopic


In [31]:
# Timestep axis of the saved myopic run as a NumPy array, for use as the
# x-axis when plotting.
timesteps = df_myopic['timesteps'].values
timesteps

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])

In [None]:
# Compare one metric across the saved runs over time.
# NOTE(review): the original cell was an unfinished stub (missing y-argument,
# undefined `x` / `y`) and could not run. The metric chosen here is an
# assumption based on the cells above — change `metric_to_compare` as needed.
metric_to_compare = 'intra_cluster_interaction_similarity'

fig, ax = plt.subplots()
for run_label, run_df in (('myopic', df_myopic), ('cosine_sim', df_cosine_sim)):
    ax.plot(run_df['timesteps'], run_df[metric_to_compare], label=run_label)
ax.set_xlabel('Timestep')
ax.set_ylabel(metric_to_compare)
ax.set_title('Intra Cluster Interaction Similarity')
ax.legend()
plt.show()