# Setup

In [1]:
# some_file.py
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../../t-recs/')
from trecs.models import ContentFiltering
from trecs.metrics import *
from trecs.random import Generator
from trecs.components import Users

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx

# Single source of truth for reproducibility.
# np.random.seed() returns None, so its result must never be stored and passed
# around as a `random_state`: doing so silently hands `None` (i.e. "unseeded")
# to every consumer further down the notebook.
SEED = 42
# Seed NumPy's global RNG for code that falls back to it when given no random_state.
np.random.seed(SEED)
# Explicit RNG object handed to the `random_state=` arguments used later in the notebook.
random_state = np.random.RandomState(SEED)

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# MovieLens-100k ratings: tab-separated, no header row, so column names are supplied here.
ratings_df = pd.read_csv('../../ml-100k/u.data', 
    sep="\t", 
    names=['UserID', 'MovieID', 'Rating', 'Timestamp']
)

# Column names for u.item: movie metadata followed by one indicator column per genre.
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# u.item is pipe-separated and not UTF-8, hence the explicit encoding.
movies_df = pd.read_csv('../../ml-100k/u.item', sep="|", names=movie_cols, encoding='latin')

# display(movies_df.head(2))
# print(movies_df.shape)

In [2]:
from sklearn.cluster import KMeans

def get_topic_clusters(binary_ratings_matrix, n_attrs:int=100, nmf_solver:str="mu", n_clusters:int=100):
    """
    Creates clusters of movies based on which users interacted with them.

    The movie-by-movie co-occurrence matrix (``binary_ratings_matrix.T @ binary_ratings_matrix``)
    is factorized with NMF, and the resulting per-movie embeddings are grouped with KMeans.

    Inputs:
        binary_ratings_matrix: a binary matrix of users (rows) and movies (columns)
        n_attrs: number of attributes (components) to use in NMF
        nmf_solver: solver to use in NMF
        n_clusters: number of KMeans clusters (defaults to 100, the value that was
            previously hard-coded)
    Outputs:
        cluster_ids: one cluster assignment per movie (i.e. per column of the input)
    Notes:
        Relies on the notebook-level names ``NMF``, ``KMeans`` and ``random_state``
        being defined before the function is called.
    """
    # Create the co-occurrence matrix from the binary interaction matrix:
    # entry (i, j) counts the users who interacted with both movie i and movie j.
    co_occurence_matrix = binary_ratings_matrix.T @ binary_ratings_matrix

    # Matrix factorize co_occurence_matrix to get embeddings
    nmf_cooc = NMF(n_components=n_attrs, solver=nmf_solver)
    W_topics = nmf_cooc.fit_transform(co_occurence_matrix)

    # cluster W_topics
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(W_topics)

    # assign nearest cluster to observation
    cluster_ids = kmeans.predict(W_topics)

    return cluster_ids

In [3]:
from sklearn.decomposition import NMF

# Binarize the ratings: any positive rating becomes 1 ("the user interacted with the movie").
binary_ratings_df = ratings_df.drop(columns=['Timestamp'])
binary_ratings_df.loc[binary_ratings_df['Rating'] > 0, 'Rating'] = 1

# turn dataframe into matrix where each movie is a column and each user is a row
# (user/movie pairs without a rating are filled with 0)
binary_ratings_matrix = binary_ratings_df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# split data into train and test sets
# NOTE(review): the factorization below is fit on the full `binary_ratings_matrix`, not on
# `train_interactions`, and the split is not used by any cell visible here - confirm intent.
train_interactions, test_interactions = random_train_test_split(sparse.csr_matrix(binary_ratings_matrix), test_percentage=0.2, random_state=random_state)
train_interactions = train_interactions.toarray()
test_interactions = test_interactions.toarray()

# Factorize the user-by-movie matrix: rows of `user_representation` are user embeddings,
# columns of `item_representation` are movie embeddings.
n_attrs=100
nmf = NMF(n_components=n_attrs, solver="mu")
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(user_representation.shape, item_representation.shape)

num_topics = None
item_topics = get_topic_clusters(binary_ratings_matrix, n_attrs=n_attrs, nmf_solver="mu")
user_topic_history = None
item_count = None

# Size the user component from the learned embeddings instead of repeating the
# literal (943, 100), so it cannot drift from the data or from `n_attrs`.
users = Users(size=user_representation.shape, repeat_interactions=False)



(943, 100) (100, 1682)


# Using `next_k_myopic_scoring` as the `score_fn` scoring function

In [4]:
# Add the parent directory to the module search path (index 1 keeps sys.path[0] untouched);
# presumably this is where the project-local `wrapper` and `src` packages live.
sys.path.insert(1, '../')

from wrapper.models.bubble import BubbleBurster
from src.scoring_functions import next_k_myopic_scoring

In [5]:
# Instantiate the project's BubbleBurster model from the NMF factors computed earlier,
# the per-movie cluster ids, and the `Users` component, scoring with next_k_myopic_scoring.
bubble = BubbleBurster(
    # learned representations
    user_representation=user_representation,
    item_representation=item_representation,
    # user component and per-item topic (cluster) ids
    actual_user_representation=users,
    item_topics=item_topics,
    # behaviour switches
    score_fn=next_k_myopic_scoring,
    record_base_state=True,
    # Arguments left unset in this notebook:
    # num_users=number_of_users,
    # num_items=num_items,
    # num_attributes=number_of_attributes,
)

In [6]:
from wrapper.metrics import UserMSEMeasurement

# Attach two measurements to the model: MSEMeasurement (from trecs.metrics) and the
# project's UserMSEMeasurement, then list what is currently registered.
bubble.add_metrics(MSEMeasurement(), UserMSEMeasurement())
print("These are the current metrics:", bubble.metrics, sep="\n")

These are the current metrics:
[<trecs.metrics.measurement.MSEMeasurement object at 0x1379373a0>, <wrapper.metrics.evaluation_metrics.UserMSEMeasurement object at 0x15128d670>]


In [7]:
import trecs.matrix_ops as mo
from numpy.random import RandomState

# Dedicated, explicitly seeded RNG passed to mo.top_k_indices in the cells below.
rs = RandomState(seed=42)

In [8]:
# Recompute the score matrix outside the model from its current user/item representations.
pred_scores = mo.inner_product(bubble.predicted_user_profiles, bubble.predicted_item_attributes)

k = 10
top_k_idxs = mo.top_k_indices(matrix=pred_scores, k=k, random_state=rs)
# Work on a copy: np.put_along_axis writes in place, so a plain assignment would
# also overwrite `pred_scores` (both names would point at the same array).
re_ranked_scores = pred_scores.copy()
# Zero out, for every row, the columns selected by top_k_idxs.
np.put_along_axis(arr=re_ranked_scores, indices=top_k_idxs, values=0, axis=1)

# np.where returns (row_indices, col_indices) in row-major order; x[1] therefore holds
# the column of every zero entry, grouped by row.
x = np.where(re_ranked_scores==0)
# One row of zeroed columns per user. Reshaping to top_k_idxs.shape ties the layout to
# the indices that were actually zeroed, rather than to a model attribute that merely
# has to match `k`. This assumes the only zeros in each row are the ones written above;
# scores that were already exactly 0 would make the reshape fail.
zero_elem = x[1].reshape(top_k_idxs.shape)
# x

In [9]:
# Row-by-row check that the zeroed columns are exactly the top-k columns.
# Rows are compared as sets, so the order of indices within a row does not matter.
count = 0
for i, top_k_row in enumerate(top_k_idxs):
    top_k = set(top_k_row)
    zero_e = set(zero_elem[i])
    if top_k != zero_e:
        # report the offending row and the top-k indices that were not zeroed
        print(i)
        print(top_k-zero_e)
        count += 1
# number of rows where the two index sets disagree
print(count)

0


In [10]:
# Display the re-ranked score matrix (the top-k entries of each row were set to 0 above).
re_ranked_scores

array([[4.10224447e-01, 4.01335839e-01, 3.37221744e-01, ...,
        2.15729699e-05, 3.15573376e-05, 2.97179692e-04],
       [1.70370450e+00, 9.52845031e-02, 3.71216258e-01, ...,
        1.06496320e-02, 2.44146229e-09, 4.42048040e-04],
       [4.55731007e-01, 6.27871819e-03, 1.19531389e-02, ...,
        6.53266824e-02, 2.74191702e-05, 8.97444964e-04],
       ...,
       [0.00000000e+00, 9.10585615e-02, 9.18170361e-02, ...,
        1.93075765e-23, 4.76003124e-19, 6.32614176e-12],
       [5.46091837e-01, 8.98424146e-02, 4.53552350e-03, ...,
        1.21473295e-02, 2.67239974e-16, 3.60955815e-19],
       [8.35350317e-01, 1.30434578e+00, 5.90299106e-01, ...,
        2.54408344e-14, 8.21993885e-02, 6.32758435e-02]])

In [11]:
# Run the model for a single timestep, then keep whatever get_measurements() reports.
bubble.run(timesteps=1)
measurements = bubble.get_measurements()

100%|██████████| 1/1 [00:03<00:00,  3.92s/it]


In [12]:
# Grab the model's own predicted-score matrix after the run; it is compared further down
# against a manual re-computation.
model_pred_scores = bubble.predicted_scores.value
model_pred_scores

array([[1.43675451e-001, 1.54836704e-001, 1.16594108e-001, ...,
        0.00000000e+000, 7.62976632e-046, 2.58129567e-311],
       [7.07251039e-002, 4.66969401e-002, 7.05649075e-002, ...,
        1.22373629e-002, 5.45910454e-005, 1.56904062e-030],
       [3.72396417e-001, 3.79174922e-002, 4.49083767e-002, ...,
        2.27327067e-002, 1.20231179e-072, 1.35664286e-004],
       ...,
       [5.10899486e-001, 4.49580960e-002, 1.09342294e-004, ...,
        0.00000000e+000, 1.02981676e-002, 4.54274083e-003],
       [4.90115762e-002, 2.39067417e-002, 1.67523881e-009, ...,
        2.34742052e-002, 7.17532715e-004, 2.90717983e-054],
       [1.32512079e+000, 6.14827250e-001, 1.70867463e-014, ...,
        0.00000000e+000, 1.08420185e-004, 7.73468349e-019]])

In [15]:
# Repeat the manual re-ranking after the model has run one timestep, so the result can be
# compared with the scores stored on the model.
pred_scores = mo.inner_product(bubble.predicted_user_profiles, bubble.predicted_item_attributes)

k = 10
top_k_idxs = mo.top_k_indices(matrix=pred_scores, k=k, random_state=rs)
# Work on a copy: np.put_along_axis writes in place, so a plain assignment would
# also overwrite `pred_scores` (both names would point at the same array).
re_ranked_scores = pred_scores.copy()
# Zero out, for every row, the columns selected by top_k_idxs.
np.put_along_axis(arr=re_ranked_scores, indices=top_k_idxs, values=0, axis=1)

print(re_ranked_scores)

[[1.43675451e-001 1.54836704e-001 1.16594108e-001 ... 0.00000000e+000
  7.62976632e-046 2.58129567e-311]
 [7.07251039e-002 4.66969401e-002 7.05649075e-002 ... 1.22373629e-002
  5.45910454e-005 1.56904062e-030]
 [3.72396417e-001 3.79174922e-002 4.49083767e-002 ... 2.27327067e-002
  1.20231179e-072 1.35664286e-004]
 ...
 [5.10899486e-001 4.49580960e-002 1.09342294e-004 ... 0.00000000e+000
  1.02981676e-002 4.54274083e-003]
 [4.90115762e-002 2.39067417e-002 1.67523881e-009 ... 2.34742052e-002
  7.17532715e-004 2.90717983e-054]
 [1.32512079e+000 6.14827250e-001 1.70867463e-014 ... 0.00000000e+000
  1.08420185e-004 7.73468349e-019]]


In [16]:
# True when the manually re-ranked scores match the model's predicted scores element for element.
np.array_equal(model_pred_scores, re_ranked_scores)

True