**Ref:** https://towardsdatascience.com/dimensionality-reduction-for-data-visualization-pca-vs-tsne-vs-umap-be4aa7b1cb29

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.decomposition import NMF
from importlib import reload
import wrapper

import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../t-recs/')
import trecs
import os
from scipy import sparse
from trecs.models import ContentFiltering
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement
from trecs.components import Users

from wrapper.models.bubble import BubbleBurster
# from src.utils import get_topic_clusters
from wrapper.metrics.evaluation_metrics import SerendipityMetric, DiversityMetric, NoveltyMetric, TopicInteractionMeasurement, MeanNumberOfTopics, UserMSEMeasurement

random_state = np.random.seed(42)
plt.style.use("seaborn-paper")


The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.



In [53]:
def get_topic_clusters(interaction_matrix, n_clusters:int=100, n_attrs:int=100, max_iter:int=100):
    """
    Creates clusters of movies based on their genre.
    Inputs:
        binary_ratings_matrix: a binary matrix of users and movies
        n_attrs: number of attributes to use in NMF
        nmf_solver: solver to use in NMF
    Outputs:
        clusters: a list of cluster assignments
    """
    # Create topic clusters
    #create co-occurence matrix from binary_interaction_matrix
    print('Calculating clusters...')
    co_occurence_matrix = interaction_matrix.T @ interaction_matrix

    co_occurence_matrix = interaction_matrix.T @ interaction_matrix

    # Matrix factorize co_occurence_matrix to get embeddings
    nmf_cooc = NMF(n_components=n_attrs, max_iter=max_iter, verbose=1)
    W_topics = nmf_cooc.fit_transform(co_occurence_matrix)

    # cluster W_topics
    cluster_ids = KMeans(n_clusters=n_clusters, max_iter=max_iter, random_state=random_state, verbose=1).fit_predict(W_topics)
    # np.save(file_path, cluster_ids)

    print('Calculated clusters.')

    return cluster_ids

In [46]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [47]:
ratings_df = pd.read_csv('data/ml-100k/u.data', sep="\t", names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

In [54]:
from sklearn.decomposition import NMF

binary_ratings_df = ratings_df.drop(columns=['Timestamp'])
binary_ratings_df.loc[binary_ratings_df['Rating'] > 0, 'Rating'] = 1

# turn dataframe into matrix where each movie is a column and each user is a row
binary_ratings_matrix = binary_ratings_df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# split data into train and test sets
train_interactions, test_interactions = random_train_test_split(sparse.csr_matrix(binary_ratings_matrix), test_percentage=0.2, random_state=random_state)
train_interactions = train_interactions.toarray()
test_interactions = test_interactions.toarray()

n_attrs=100
nmf = NMF(n_components=n_attrs, solver="mu")
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(user_representation.shape, item_representation.shape)

# nmf_solver='mu'
n_clusters=50
max_iter=500
n_attrs=100
item_topics = get_topic_clusters(binary_ratings_matrix, n_attrs=n_clusters, max_iter=max_iter)#, nmf_solver=nmf_solver)

users = Users(size=(943,100), repeat_interactions=False)


Maximum number of iterations 200 reached. Increase it to improve convergence.



(943, 100) (100, 1682)
Calculating clusters...
violation: 1.0
violation: 0.0008655790446247651
violation: 0.0020134933102098906
violation: 0.0023178343932158054
violation: 0.0019146869632399675
violation: 0.0014784921443730527
violation: 0.0012412117188303738
violation: 0.0010566915246783518
violation: 0.0009065444452546385
violation: 0.0007915714336492692
violation: 0.0007038879063333596
violation: 0.0006321160871253133
violation: 0.0005751559328467217
violation: 0.0005331339280041813
violation: 0.0004980232856638827
violation: 0.00046830276640510315
violation: 0.00044320345104436584
violation: 0.00042135799425808035
violation: 0.0004032483521996451
violation: 0.00038764459176627696
violation: 0.0003742907574291649
violation: 0.0003622478042950487
violation: 0.00035049718741318487
violation: 0.0003392221085310889
violation: 0.00032837134105451416
violation: 0.00031825376718504076
violation: 0.0003093373833729513
violation: 0.00030072815704387004
violation: 0.0002929618660028207
violat

# Instantiating `BubbleBurster`

In [5]:
sys.path.insert(1, '../')

from wrapper.models.bubble import BubbleBurster
from trecs.metrics import MSEMeasurement, InteractionSpread, InteractionSpread, InteractionSimilarity, RecSimilarity, RMSEMeasurement, InteractionMeasurement

In [6]:
bubble = BubbleBurster(
    # num_users=number_of_users,
    # num_items=num_items,
    # num_attributes=number_of_attributes,
    item_topics=item_topics,
    user_representation=user_representation,
    item_representation=item_representation,
    actual_user_representation=users,
    record_base_state=True,
)

In [7]:
user_pairs = [(u_idx, v_idx) for u_idx in range(len(user_representation)) for v_idx in range(len(user_representation))]

bubble.add_metrics(
    MSEMeasurement(), 
    InteractionSpread(), 
    InteractionSpread(), 
    InteractionSimilarity(pairs=user_pairs), 
    RecSimilarity(pairs=user_pairs), 
    RMSEMeasurement(), 
    InteractionMeasurement()
)

print("These are the current metrics:")
print(bubble.metrics)

These are the current metrics:
[<trecs.metrics.measurement.MSEMeasurement object at 0x106b30f40>, <trecs.metrics.measurement.InteractionSpread object at 0x1602de790>, <trecs.metrics.measurement.InteractionSpread object at 0x1602de9a0>, <trecs.metrics.measurement.InteractionSimilarity object at 0x1602dec10>, <trecs.metrics.measurement.RecSimilarity object at 0x1602dec40>, <trecs.metrics.measurement.RMSEMeasurement object at 0x106c03670>, <trecs.metrics.measurement.InteractionMeasurement object at 0x106c03850>]


# Compute user-topic mappings

In [8]:
from src.utils import user_topic_mapping

actual_user_profiles, actual_item_attributes = bubble.actual_user_profiles, bubble.actual_item_attributes

actual_user_topic_mapping = user_topic_mapping(actual_user_profiles, actual_item_attributes, item_topics)

In [9]:
actual_user_topic_mapping.shape

(943, 100)

In [10]:
print(actual_user_topic_mapping[1,:])

[ 0.08351249  0.05374733  0.38896743  0.40084058 -0.05441123  0.10636736
  0.68358686  0.14120841  0.04801196  0.09928464 -0.94183602  0.42988197
  0.20727075 -1.21126307  0.1904005   0.33480345  0.0563957   0.52271338
  0.25504841  0.14438217 -0.26719914  0.05260924  0.902245    0.35849795
  0.56650092  0.6118474   0.18139817  0.15160157  0.22891245  0.11126508
  0.08693579  0.26124397  0.34932467 -0.04409482  0.53404448  0.09572678
  0.17200136  0.2395926   0.08458403  0.07724171  0.05864637  0.06742834
 -0.64895036  0.12146726  0.31521816 -0.4447371   0.21080586 -0.50270863
  0.34109149  0.53339524  0.07648019 -0.09958394  0.15605902 -0.17287361
  0.3337034   0.18780998 -0.72154692 -0.248561    0.27048558  0.26646441
  0.14355914  0.5251558   0.57865169  0.1599636   0.15568285  0.43797683
  0.28537781  0.20212463  0.10763088  0.04921253  0.09649884  0.80954037
 -0.36842094  0.26191212 -1.63232691  0.19887867  0.37228945 -0.05139817
  0.15094037  0.25784609  0.02096039  0.1469611   0

In [55]:
user_clusters = get_topic_clusters(binary_ratings_matrix.T, n_attrs=n_clusters, max_iter=max_iter)#, nmf_solver=nmf_solver)
print(user_clusters.shape)
print(item_topics.shape)

Calculating clusters...
violation: 1.0
violation: 0.00018642268610769578
violation: 0.0006218383875034578
violation: 0.0006579085888660048
violation: 0.0005590863671827319
violation: 0.0004611102900625298
violation: 0.00039148569607948524
violation: 0.0003466222495682169
violation: 0.0003143589932413479
violation: 0.0002878886488464396
violation: 0.0002671136529638555
violation: 0.0002501977234599987
violation: 0.0002352279888683451
violation: 0.0002224297049361122
violation: 0.0002111886251360171
violation: 0.00020219538614187643
violation: 0.00019393601171076964
violation: 0.000186519406285159
violation: 0.0001798608086736024
violation: 0.00017416340180910757
violation: 0.00016790349793011552
violation: 0.0001623678129941847
violation: 0.00015695116592578842
violation: 0.00015214154176973788
violation: 0.00014809432995631835
violation: 0.00014325341240895882
violation: 0.00013873053524077664
violation: 0.00013446445458153056
violation: 0.00013048038487347796
violation: 0.000126336602

# Visualizing user-topic mappings

In [11]:
import time

# For plotting
import plotly.io as plt_io
import plotly.graph_objects as go

In [12]:
#PCA
from sklearn.decomposition import PCA
#TSNE
from sklearn.manifold import TSNE
#UMAP
import umap
#LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

  from .autonotebook import tqdm as notebook_tqdm


In [56]:
def plot_2d(component1, component2):
    
    fig = go.Figure(data=go.Scatter(
        x = component1,
        y = component2,
        mode='markers',
        marker=dict(
            size=20,
            color=user_clusters, #set color equal to a variable
            colorscale='Rainbow', # one of plotly colorscales
            showscale=True,
            line_width=1
        )
    ))
    fig.update_layout(margin=dict( l=100,r=100,b=100,t=100),width=750,height=450)                 
    fig.layout.template = 'plotly_dark'
    
    fig.show()

In [57]:
def plot_3d(component1, component2, component3):
    fig = go.Figure(data=[go.Scatter3d(
        x=component1,
        y=component2,
        z=component3,
        mode='markers',
        marker=dict(
            size=10,
            color=user_clusters,                # set color to an array/list of desired values
            colorscale='Rainbow',   # choose a colorscale
            opacity=1,
            line_width=1
        )
    )])
# tight layout
    fig.update_layout(margin=dict(l=50,r=50,b=50,t=50),width=900,height=500)
    fig.layout.template = 'plotly_dark'
    
    fig.show()

In [58]:
from sklearn.preprocessing import StandardScaler
## Standardizing the data
x = StandardScaler().fit_transform(actual_user_topic_mapping)

### **PCA**

In [59]:
start = time.time()
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
print('Duration: {} seconds'.format(time.time() - start))
principal = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2','principal component 3'])

Duration: 0.02225208282470703 seconds


In [60]:
plot_2d(principalComponents[:, 0],principalComponents[:, 1])

In [61]:
plot_3d(principalComponents[:, 0],principalComponents[:, 1],principalComponents[:, 2])

### **t-SNE**

In [62]:
from sklearn.decomposition import TruncatedSVD

truncated_svd = TruncatedSVD(n_components=50)
X_svd = truncated_svd.fit_transform(x)
tsne = TSNE(random_state = 42, n_components=3,verbose=0, perplexity=40, n_iter=400).fit_transform(X_svd)
print('Duration: {} seconds'.format(time.time() - start))

Duration: 19.42277717590332 seconds


In [63]:
plot_2d(tsne[:, 0],tsne[:, 1])

In [64]:
plot_3d(tsne[:, 0],tsne[:, 1],tsne[:, 2])

### **UMAP**

In [65]:
start = time.time()
reducer = umap.UMAP(random_state=42,n_components=3)
embedding = reducer.fit_transform(x)
print('Duration: {} seconds'.format(time.time() - start))

Duration: 1.5953857898712158 seconds


In [66]:
plot_2d(reducer.embedding_[:, 0],reducer.embedding_[:, 1])

In [67]:
plot_3d(reducer.embedding_[:, 0],reducer.embedding_[:, 1],reducer.embedding_[:, 2])

### **LDA**

In [68]:
start = time.time()
X_LDA = LDA(n_components=3).fit_transform(x, y=user_clusters)
print('Duration: {} seconds'.format(time.time() - start))

# from sklearn.decomposition import LatentDirichletAllocation

# lda = LatentDirichletAllocation(n_components=3, random_state=0)
# lda_x = lda.fit_transform(x)
# print('Duration: {} seconds'.format(time.time() - start))

Duration: 0.05242919921875 seconds


In [69]:
plot_2d(X_LDA[:, 0],X_LDA[:, 1])

In [70]:
plot_3d(X_LDA[:, 0],X_LDA[:, 1],X_LDA[:, 2])