In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.decomposition import NMF
from importlib import reload
import wrapper

import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../t-recs/')
import trecs
import os
from scipy import sparse
from trecs.models import ContentFiltering
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users

from wrapper.models.bubble import BubbleBurster
# from src.utils import get_topic_clusters
from wrapper.metrics.evaluation_metrics import SerendipityMetric, DiversityMetric, NoveltyMetric, TopicInteractionMeasurement, MeanNumberOfTopics, UserMSEMeasurement

random_state = np.random.seed(42)
plt.style.use("seaborn-paper")

  plt.style.use("seaborn-paper")


**Ref:** https://towardsdatascience.com/dimensionality-reduction-for-data-visualization-pca-vs-tsne-vs-umap-be4aa7b1cb29

In [2]:
from sklearn.cluster import KMeans

def get_topic_clusters(interaction_matrix, n_clusters:int=100, n_attrs:int=100, max_iter:int=100):
    """
    Creates clusters of movies based on their genre.
    Inputs:
        binary_ratings_matrix: a binary matrix of users and movies
        n_attrs: number of attributes to use in NMF
        nmf_solver: solver to use in NMF
    Outputs:
        clusters: a list of cluster assignments
    """
    # Create topic clusters
    #create co-occurence matrix from binary_interaction_matrix
    print('Calculating clusters...')
    co_occurence_matrix = interaction_matrix.T @ interaction_matrix

    co_occurence_matrix = interaction_matrix.T @ interaction_matrix

    # Matrix factorize co_occurence_matrix to get embeddings
    nmf_cooc = NMF(n_components=n_attrs, max_iter=max_iter, verbose=1)
    W_topics = nmf_cooc.fit_transform(co_occurence_matrix)

    # cluster W_topics
    cluster_ids = KMeans(n_clusters=n_clusters, max_iter=max_iter, random_state=random_state, verbose=1).fit_predict(W_topics)
    # np.save(file_path, cluster_ids)

    print('Calculated clusters.')

    return cluster_ids

In [3]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [4]:
ratings_df = pd.read_csv('data/ml-100k/u.data', sep="\t", names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

In [5]:
from sklearn.decomposition import NMF

binary_ratings_df = ratings_df.drop(columns=['Timestamp'])
binary_ratings_df.loc[binary_ratings_df['Rating'] > 0, 'Rating'] = 1

# turn dataframe into matrix where each movie is a column and each user is a row
binary_ratings_matrix = binary_ratings_df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# split data into train and test sets
train_interactions, test_interactions = random_train_test_split(sparse.csr_matrix(binary_ratings_matrix), test_percentage=0.2, random_state=random_state)
train_interactions = train_interactions.toarray()
test_interactions = test_interactions.toarray()

n_attrs=100
nmf = NMF(n_components=n_attrs, solver="mu")
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(user_representation.shape, item_representation.shape)

# nmf_solver='mu'
n_clusters=50
max_iter=500
n_attrs=100
item_topics = get_topic_clusters(binary_ratings_matrix, n_attrs=n_clusters, max_iter=max_iter)#, nmf_solver=nmf_solver)

users = Users(size=(943,100), repeat_interactions=False)



(943, 100) (100, 1682)
Calculating clusters...
violation: 1.0
violation: 0.0008111012211140843
violation: 0.0018579971758544973
violation: 0.002348404969455981
violation: 0.001963674297444194
violation: 0.0015236872281129985
violation: 0.0012535307370826517
violation: 0.0010656736015344352
violation: 0.0009157574897872806
violation: 0.0007991225960594281
violation: 0.000714636141551671
violation: 0.0006490381082209844
violation: 0.0005972169726710217
violation: 0.0005549932878832688
violation: 0.0005206685817139828
violation: 0.0004915790573408767
violation: 0.0004652150471906113
violation: 0.00044215158985576287
violation: 0.00042257032842413857
violation: 0.00040465988180981
violation: 0.00038809403110232906
violation: 0.0003732735237999536
violation: 0.00035958751936517996
violation: 0.00034623336152060695
violation: 0.00033458078658895094
violation: 0.0003231686127826073
violation: 0.00031319485990230906
violation: 0.0003040539872361975
violation: 0.000295455944650737
violation: 0.

# Instantiating `BubbleBurster`

In [6]:
sys.path.insert(1, '../')

from wrapper.models.bubble import BubbleBurster
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement

In [7]:
bubble = BubbleBurster(
    # num_users=number_of_users,
    # num_items=num_items,
    # num_attributes=number_of_attributes,
    item_topics=item_topics,
    user_representation=user_representation,
    item_representation=item_representation,
    actual_user_representation=users,
    record_base_state=True,
)

In [8]:
user_pairs = [(u_idx, v_idx) for u_idx in range(len(user_representation)) for v_idx in range(len(user_representation))]

bubble.add_metrics(
    MSEMeasurement(), 
    InteractionSpread(), 
    InteractionSpread(), 
    InteractionSimilarity(pairs=user_pairs), 
    RecSimilarity(pairs=user_pairs), 
    RMSEMeasurement(), 
    InteractionMeasurement()
)

print("These are the current metrics:")
print(bubble.metrics)

These are the current metrics:
[<trecs.metrics.measurement.MSEMeasurement object at 0x10643f5e0>, <trecs.metrics.measurement.InteractionSpread object at 0x15916d610>, <trecs.metrics.measurement.InteractionSpread object at 0x15916d8b0>, <trecs.metrics.measurement.InteractionSimilarity object at 0x15916dac0>, <trecs.metrics.measurement.RecSimilarity object at 0x15916db20>, <trecs.metrics.measurement.RMSEMeasurement object at 0x15916df10>, <trecs.metrics.measurement.InteractionMeasurement object at 0x158f1b730>]


# Compute user-topic mappings

In [9]:
from src.utils import user_topic_mapping

actual_user_profiles, actual_item_attributes = bubble.actual_user_profiles, bubble.actual_item_attributes

actual_user_topic_mapping = user_topic_mapping(actual_user_profiles, actual_item_attributes, item_topics)

  assert ((user_profiles.shape[1] == item_attributes.shape[0]),
  assert ((item_topics.shape == (item_attributes.shape[1],)),


In [10]:
actual_user_topic_mapping.shape

(943, 100)

In [11]:
print(actual_user_topic_mapping[1,:])

[-8.35341467e-02  5.15514104e-02  2.13418751e-01 -5.74607934e-02
  4.74594264e-02  4.71630967e-01  2.07945915e-01 -3.97229067e-01
  1.17692779e-01  1.06162687e-01  4.46434399e-02  6.55337944e-02
  4.31692657e-03  1.46125367e-01  3.33335391e-01 -2.10816639e-01
 -8.56687442e-02 -1.80205393e-01 -1.28527244e-03  1.12538303e-01
 -3.40183452e-01  3.47406730e-02  1.24052599e-01  1.44067481e-01
  1.54857598e-01  7.44290825e-02 -4.79909126e-01 -6.09175168e-02
  1.61371829e-01 -9.30690152e-02 -4.23834194e-01  1.38166949e-01
  1.83125544e-01  6.42546165e-02 -2.97908449e-03  2.11144404e-01
  1.75592838e-01  2.51079398e-01  2.65574692e-01  1.53260919e-01
 -8.93433913e-01 -4.99728569e-01  1.95009254e-01  2.46388518e-01
 -7.23814317e-01 -1.70631262e-01 -4.35620719e-02  2.65036027e-02
 -2.93415958e-01  1.54588113e-01  1.25343603e-01  1.68891256e-01
  2.75492853e-01 -3.62344549e-01 -5.62147557e-01  5.82687715e-01
 -2.36324746e-01  3.75823593e-01  1.32173138e-01 -3.71697529e-03
 -4.15301895e-01 -9.40628

In [12]:
user_clusters = get_topic_clusters(binary_ratings_matrix.T, n_attrs=n_clusters, max_iter=max_iter)#, nmf_solver=nmf_solver)
print(user_clusters.shape)
print(item_topics.shape)

Calculating clusters...
violation: 1.0
violation: 0.00023105663147641192
violation: 0.0008968555782884329
violation: 0.002797937212173178
violation: 0.00319759143004992
violation: 0.003113167357527142
violation: 0.0025264247993327677
violation: 0.002108708820660568
violation: 0.0017758168790870848
violation: 0.0015422410030184273
violation: 0.0013973782795763843
violation: 0.0013138559707382403
violation: 0.001232101569542869
violation: 0.00116297004761247
violation: 0.0010996678746988044
violation: 0.0010225828845818446
violation: 0.0009603342206569841
violation: 0.0009052762554306112
violation: 0.0008578066670110475
violation: 0.0008155155380737319
violation: 0.0007934177895929073
violation: 0.0007767893749273988
violation: 0.000751243440723186
violation: 0.0007313453770383489
violation: 0.0007198664975299297
violation: 0.0007083381562689952
violation: 0.0006964311361125571
violation: 0.0006875259660570504
violation: 0.0006786099146629198
violation: 0.0006679631629007681
violation: 0

# Visualizing user-topic mappings

In [13]:
import time

# For plotting
import plotly.io as plt_io
import plotly.graph_objects as go

In [14]:
#PCA
from sklearn.decomposition import PCA
#TSNE
from sklearn.manifold import TSNE
#UMAP
import umap
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
def plot_2d(component1, component2):
    
    fig = go.Figure(data=go.Scatter(
        x = component1,
        y = component2,
        mode='markers',
        marker=dict(
            size=20,
            color=user_clusters, #set color equal to a variable
            colorscale='Rainbow', # one of plotly colorscales
            showscale=True,
            line_width=1
        )
    ))
    fig.update_layout(margin=dict( l=100,r=100,b=100,t=100),width=750,height=450)                 
    fig.layout.template = 'plotly_dark'
    
    fig.show()

In [16]:
def plot_3d(component1, component2, component3):
    fig = go.Figure(data=[go.Scatter3d(
        x=component1,
        y=component2,
        z=component3,
        mode='markers',
        marker=dict(
            size=10,
            color=user_clusters,                # set color to an array/list of desired values
            colorscale='Rainbow',   # choose a colorscale
            opacity=1,
            line_width=1
        )
    )])
# tight layout
    fig.update_layout(margin=dict(l=50,r=50,b=50,t=50),width=900,height=500)
    fig.layout.template = 'plotly_dark'
    
    fig.show()

In [17]:
from sklearn.preprocessing import StandardScaler
## Standardizing the data
x = StandardScaler().fit_transform(actual_user_topic_mapping)

### **PCA**

In [18]:
start = time.time()
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
print('Duration: {} seconds'.format(time.time() - start))
principal = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2','principal component 3'])

Duration: 0.04532480239868164 seconds


In [19]:
plot_2d(principalComponents[:, 0],principalComponents[:, 1])

In [20]:
plot_3d(principalComponents[:, 0],principalComponents[:, 1],principalComponents[:, 2])

### **t-SNE**

In [21]:
from sklearn.decomposition import TruncatedSVD

truncated_svd = TruncatedSVD(n_components=50)
X_svd = truncated_svd.fit_transform(x)
tsne = TSNE(random_state = 42, n_components=3,verbose=0, perplexity=40, n_iter=400).fit_transform(X_svd)
print('Duration: {} seconds'.format(time.time() - start))

Duration: 2.308011054992676 seconds


In [22]:
plot_2d(tsne[:, 0],tsne[:, 1])

In [23]:
plot_3d(tsne[:, 0],tsne[:, 1],tsne[:, 2])

### **UMAP**

In [24]:
start = time.time()
reducer = umap.UMAP(random_state=42,n_components=3)
embedding = reducer.fit_transform(x)
print('Duration: {} seconds'.format(time.time() - start))

Duration: 3.412677764892578 seconds


In [25]:
plot_2d(reducer.embedding_[:, 0],reducer.embedding_[:, 1])

In [26]:
plot_3d(reducer.embedding_[:, 0],reducer.embedding_[:, 1],reducer.embedding_[:, 2])

### **LDA**

In [27]:
start = time.time()
X_LDA = LDA(n_components=3).fit_transform(x, y=user_clusters)
print('Duration: {} seconds'.format(time.time() - start))

# from sklearn.decomposition import LatentDirichletAllocation

# lda = LatentDirichletAllocation(n_components=3, random_state=0)
# lda_x = lda.fit_transform(x)
# print('Duration: {} seconds'.format(time.time() - start))

Duration: 0.07610702514648438 seconds


In [28]:
plot_2d(X_LDA[:, 0],X_LDA[:, 1])

In [29]:
plot_3d(X_LDA[:, 0],X_LDA[:, 1],X_LDA[:, 2])