# Setup

In [3]:
# Make the local t-recs checkout importable before importing `trecs`
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../../t-recs/')
from trecs.models import ContentFiltering
from trecs.metrics import *
from trecs.random import Generator
from trecs.components import Users
import trecs.matrix_ops as mo

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx

# np.random.seed() returns None, so assigning its result left `random_state`
# as None and every later `random_state=random_state` argument received None
# instead of the seed. Keep the integer seed itself and seed NumPy's global
# generator as a separate step.
random_state = 42
np.random.seed(random_state)

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# ml-100k ratings: tab-separated; column names are supplied explicitly.
ratings_df = pd.read_csv(
    '../data/ml-100k/u.data',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    sep="\t",
)

# Column names for u.item: the descriptive fields first, then the genre columns.
movie_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.item is pipe-separated and read with the 'latin' encoding.
movies_df = pd.read_csv(
    '../data/ml-100k/u.item',
    names=movie_cols,
    sep="|",
    encoding='latin',
)

# display(movies_df.head(2))
# print(movies_df.shape)

In [4]:
from sklearn.cluster import KMeans
# NMF is used inside get_topic_clusters, so import it in the defining cell
# instead of relying on a later cell having imported it first.
from sklearn.decomposition import NMF

def get_topic_clusters(binary_ratings_matrix, n_attrs:int=100, nmf_solver:str="mu", n_clusters:int=100):
    """
    Creates clusters of items (movies) from how they co-occur in the
    interaction matrix.

    The item-by-item co-occurrence matrix is factorized with NMF and the
    resulting item embeddings are grouped with KMeans.

    Inputs:
        binary_ratings_matrix: a binary matrix of users (rows) and movies (columns)
        n_attrs: number of attributes (components) to use in NMF
        nmf_solver: solver to use in NMF
        n_clusters: number of KMeans clusters (defaults to 100, the value
            that was previously hard-coded)
    Outputs:
        cluster_ids: array with one cluster label per row of the
            co-occurrence matrix, i.e. one label per item

    Note: KMeans is seeded from the notebook-level `random_state` variable.
    """
    # Item-by-item co-occurrence matrix built from the interaction matrix
    co_occurrence_matrix = binary_ratings_matrix.T @ binary_ratings_matrix

    # Matrix factorize the co-occurrence matrix to get item embeddings
    nmf_cooc = NMF(n_components=n_attrs, solver=nmf_solver)
    item_embeddings = nmf_cooc.fit_transform(co_occurrence_matrix)

    # Cluster the embeddings
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(item_embeddings)

    # Assign the nearest cluster to each item
    cluster_ids = kmeans.predict(item_embeddings)

    return cluster_ids

In [6]:
from sklearn.decomposition import NMF

# Binarise the ratings: every rating greater than 0 becomes 1.
binary_ratings_df = ratings_df.drop(columns=['Timestamp'])
binary_ratings_df.loc[binary_ratings_df['Rating'] > 0, 'Rating'] = 1

# turn dataframe into matrix where each movie is a column and each user is a row
binary_ratings_matrix = binary_ratings_df.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0).to_numpy()

from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# split data into train and test sets
train_interactions, test_interactions = random_train_test_split(sparse.csr_matrix(binary_ratings_matrix), test_percentage=0.2, random_state=random_state)
train_interactions = train_interactions.toarray()
test_interactions = test_interactions.toarray()

# Factorize the full binary matrix: the transform output is used as the user
# representation and the NMF components as the item representation.
n_attrs=100
nmf = NMF(n_components=n_attrs, solver="mu")
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(user_representation.shape, item_representation.shape)

# NOTE(review): n_clusters is not passed to get_topic_clusters below, so it
# has no effect on the clustering in this cell.
n_clusters=50

num_topics = None
item_topics = get_topic_clusters(binary_ratings_matrix, n_attrs=n_attrs)
user_topic_history = None
item_count = None

# Size the Users component from the factorization output rather than
# hard-coding (943, 100), so it follows the data and n_attrs.
users = Users(size=user_representation.shape, repeat_interactions=False)



(943, 100) (100, 1682)


# Instantiating `BubbleBurster`

In [5]:
sys.path.insert(1, '../')

from wrapper.models.bubble import BubbleBurster

In [6]:
# Build the model from the topic clusters, the NMF user/item factors and the
# Users component created in the earlier cells.
bubble = BubbleBurster(
    item_topics=item_topics,
    user_representation=user_representation,
    item_representation=item_representation,
    actual_user_representation=users,
    record_base_state=True,
)

# Testing `user_topic_mapping`

In [7]:
# The mapping is tested on the model's *predicted* profiles and attributes;
# swap in bubble.actual_user_profiles / bubble.actual_item_attributes to test
# against the actual ones instead.
user_profiles = bubble.predicted_user_profiles
item_attributes = bubble.predicted_item_attributes

In [8]:
# Reference implementation of the user-topic mapping: for every user, the mean
# user-item score over the items belonging to each topic.
topics = np.unique(item_topics)
user_item_scores = mo.inner_product(user_profiles, item_attributes)

test_user_topic_mapping = np.zeros((user_profiles.shape[0], topics.size))

# np.unique returns the distinct labels in sorted order. Index the output
# column by position rather than by the label itself, so the loop does not
# depend on the labels being exactly 0..k-1 (e.g. if a cluster id is unused).
# When the labels are contiguous from 0 the two are identical.
for topic_col, topic_i in enumerate(topics):
    topic_idx = np.where(item_topics == topic_i)[0]
    test_user_topic_mapping[:, topic_col] = np.mean(user_item_scores[:, topic_idx], axis=1)

print(test_user_topic_mapping.shape)

(943, 100)


In [9]:
from src.utils import user_topic_mapping

# Compare the library implementation against the reference mapping from the previous cell.
result_user_topic_mapping = user_topic_mapping(user_profiles, item_attributes, item_topics)

# Check shapes explicitly, then compare with a floating-point tolerance:
# exact equality is brittle for values produced by averaging floats, and
# assert_allclose reports where and by how much the arrays differ on failure.
assert result_user_topic_mapping.shape == test_user_topic_mapping.shape, "user-topic mapping shapes differ"
np.testing.assert_allclose(result_user_topic_mapping, test_user_topic_mapping)

In [10]:
# Display whether the reference and library mappings are element-wise identical.
np.array_equal(test_user_topic_mapping, result_user_topic_mapping)

True

In [11]:
# Show both mapping shapes, one per line (reference first, then library result).
print(test_user_topic_mapping.shape, result_user_topic_mapping.shape, sep="\n")

(943, 100)
(943, 100)


In [12]:
# topics, topic_counts = np.unique(item_topics, return_counts=True)
# user_item_scores = mo.inner_product(user_profiles, item_attributes)

# user_topic_mapping = np.zeros((user_profiles.shape[0], topics.size))
# # temp = np.zeros((user_profiles.shape[0], topics.size))
# # print(user_topic_mapping.shape)

# count = 0
# for topic_i in topics:
    
#     topic_idx = np.where(item_topics == topic_i)[0]
#     # print(topic_idx.shape)
#     assert (len(topic_idx) == topic_counts[topic_i]), "number of topic indices is not equal to number of topic_i instances in item_topics ):"
    
#     topic_i_user_scores = np.sum(user_item_scores[:, topic_idx], axis=1)
#     assert (np.sum(user_item_scores[0,topic_idx]).round(decimals=10) == topic_i_user_scores[0].round(decimals=10)), f"{count}"
#     count += 1
    
#     # temp[:,topic_i] = topic_i_user_scores
#     user_topic_mapping[:,topic_i] = topic_i_user_scores / topic_idx.size 
#     # print(topic_user_mapping.shape)
#     # break

# print(count)
# print(user_topic_mapping.shape)

# topics, topic_counts = np.unique(item_topics, return_counts=True)
# user_item_scores = mo.inner_product(user_profiles, item_attributes)

# temp = np.zeros((user_profiles.shape[0], topics.size))
# # temp = np.zeros((user_profiles.shape[0], topics.size))
# # print(user_topic_mapping.shape)

# count = 0
# for topic_i in topics:
    
#     topic_idx = np.where(item_topics == topic_i)[0]
#     # print(topic_idx.shape)
#     assert (len(topic_idx) == topic_counts[topic_i]), "number of topic indices is not equal to number of topic_i instances in item_topics ):"
    
#     # topic_i_user_scores = np.sum(user_item_scores[:, topic_idx], axis=1)
#     # assert (np.sum(user_item_scores[0,topic_idx]).round(decimals=10) == topic_i_user_scores[0].round(decimals=10)), f"{count}"
#     temp2 = np.mean(user_item_scores[:, topic_idx], axis=1)
#     # print(temp.shape)
#     count += 1
#     # break
#     temp[:,topic_i] = temp2
#     # temp[:,topic_i] = topic_i_user_scores
#     # user_topic_mapping[:,topic_i] = topic_i_user_scores / topic_idx.size 
#     # print(topic_user_mapping.shape)
#     # break

# print(count)
# print(temp.shape)

# count = 0
# for i in range(len(topics)):
#     topic_idx = np.where(item_topics == i)[0]
#     assert(np.array_equal(bubble.actual_user_item_scores[0, np.where(item_topics == i)].round(decimals=10), np.expand_dims(user_item_scores[0, topic_idx].round(decimals=10), axis=0)))
#     # assert(np.array_equal(topic_user_mapping[:,i], ))
# # asse