In [1]:
# Make the local t-recs checkout importable before importing from `trecs`.
import sys
# caution: path[0] is reserved for script path (or '' in REPL), hence index 1
sys.path.insert(1, '../t-recs/')
from trecs.models import ContentFiltering
# NOTE(review): wildcard import; later cells use MSEMeasurement, InteractionSpread
# and AverageFeatureScoreRange, which are expected to come from here.
from trecs.metrics import *
from trecs.random import Generator
from trecs.components import Users

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx

# `np.random.seed` returns None, so its return value cannot serve as a seed object.
# Seed NumPy's global RNG as before, and keep a dedicated RandomState instance
# to pass explicitly wherever a `random_state=` argument is accepted.
SEED = 42
np.random.seed(SEED)
random_state = np.random.RandomState(SEED)

In [3]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Read the tab-separated ratings file; column names are supplied explicitly.
ratings_df = pd.read_csv(
    '../ml-100k/u.data',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    sep="\t",
)

In [5]:
# Column layout of the pipe-separated item file: five descriptive fields
# followed by one indicator column per genre.
movie_cols = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies_df = pd.read_csv(
    '../ml-100k/u.item',
    names=movie_cols,
    sep="|",
    encoding='latin',
)

In [6]:
from sklearn.cluster import KMeans

def get_topic_clusters(binary_ratings_matrix, n_attrs:int=100, nmf_solver:str="mu", n_clusters:int=100):
    """
    Groups items (movies) into topic clusters based on which users interacted
    with them.

    The item-item co-occurrence matrix is factorized with NMF and the resulting
    item embeddings are clustered with KMeans.

    Inputs:
        binary_ratings_matrix: a binary matrix of users (rows) and movies (columns)
        n_attrs: number of attributes (components) to use in NMF
        nmf_solver: solver to use in NMF
        n_clusters: number of clusters for KMeans; defaults to 100, the value
            that used to be hard-coded
    Outputs:
        cluster_ids: one cluster assignment per movie (column of the input)
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having imported NMF before it is called.
    from sklearn.decomposition import NMF

    # Create the item-item co-occurrence matrix from the binary interaction matrix
    co_occurrence_matrix = binary_ratings_matrix.T @ binary_ratings_matrix

    # Matrix-factorize the co-occurrence matrix to get one embedding per item
    nmf_cooc = NMF(n_components=n_attrs, solver=nmf_solver)
    item_embeddings = nmf_cooc.fit_transform(co_occurrence_matrix)

    # Cluster the embeddings and assign each item to its nearest cluster.
    # `random_state` is the notebook-level seed object defined in the setup cell.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_ids = kmeans.fit_predict(item_embeddings)

    return cluster_ids

In [7]:
from sklearn.decomposition import NMF

# Binarize the ratings: drop the timestamp and set every positive rating to 1.
binary_ratings_df = ratings_df.drop(columns=['Timestamp']).assign(
    Rating=lambda frame: frame['Rating'].mask(frame['Rating'] > 0, 1)
)

# Pivot to a dense matrix with one row per user and one column per movie;
# user/movie pairs without a rating become 0.
binary_ratings_matrix = (
    binary_ratings_df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
    .to_numpy()
)

In [8]:
from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# Split the interactions 80/20 into train and test sets. The matrix is passed
# as CSR and both resulting splits are converted back to dense arrays.
train_interactions, test_interactions = (
    split.toarray()
    for split in random_train_test_split(
        sparse.csr_matrix(binary_ratings_matrix),
        test_percentage=0.2,
        random_state=random_state,
    )
)



In [9]:
# Factorize the user-item matrix: the transformed output is used as the user
# representation and the fitted components as the item representation.
n_attrs = 100
nmf = NMF(solver="mu", n_components=n_attrs)
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(f"{user_representation.shape} {item_representation.shape}")

(943, 100) (100, 1682)




In [10]:
# Size the simulated users from the learned user representation instead of a
# hard-coded (943, 100), so this stays correct if the data or n_attrs change.
users = Users(size=user_representation.shape, repeat_interactions=False)

In [11]:
# Show the shapes of both factor matrices, one per line.
print(user_representation.shape, item_representation.shape, sep="\n")

(943, 100)
(100, 1682)


In [12]:
from wrapper.models.bubble import BubbleBurster

In [13]:
# Only the item-to-topic assignment is computed here; the other three names
# are left as None placeholders.
item_topics = get_topic_clusters(
    binary_ratings_matrix,
    n_attrs=n_attrs,
    nmf_solver="mu",
)
num_topics = user_topic_history = item_count = None

In [14]:
# Build the recommender from the NMF factors, the simulated users and the
# item-topic assignments. num_users / num_items / num_attributes are not
# passed explicitly.
recsys = BubbleBurster(
    user_representation=user_representation,
    item_representation=item_representation,
    actual_user_representation=users,
    item_topics=item_topics,
    record_base_state=True,
)

In [15]:
# Inspect the topic-related state of the freshly constructed recommender.
print(
    recsys.num_topics,
    recsys.item_topics.shape,
    recsys.user_topic_history.shape,
    np.unique(recsys.user_topic_history),
    recsys.item_count.shape,
    sep="\n",
)

100
(1682,)
(943, 100)
[0.]
(1, 1682)


In [16]:
# Attach the three metrics to the recommender, then list what is registered.
recsys.add_metrics(
    MSEMeasurement(),
    InteractionSpread(),
    AverageFeatureScoreRange(),
)
print("These are the current metrics:", recsys.metrics, sep="\n")

These are the current metrics:
[<trecs.metrics.measurement.MSEMeasurement object at 0x15c50c700>, <trecs.metrics.measurement.InteractionSpread object at 0x15c50c7f0>, <trecs.metrics.measurement.AverageFeatureScoreRange object at 0x15c50c910>]


In [17]:
# Advance the simulation by one timestep, then fetch the recorded measurements.
recsys.run(timesteps=1)
measurements = recsys.get_measurements()

100%|██████████| 1/1 [00:03<00:00,  3.44s/it]


**-> Model successfully runs for 1 timestep**

In [18]:
# List the names of all attributes stored on the recommender instance.
keys = vars(recsys).keys()
vals = vars(recsys).values()

for attr_name in keys:
    print(attr_name)

all_interactions
_logger
metrics
users_hat
items_hat
score_fn
interleaving_fn
predicted_scores
probabilistic_recommendations
items_shown
interactions
users
items
creators
_system_state
num_users
num_items
expand_items_per_iter
num_items_per_iter
random_state
indices
item_topics
num_topics
user_topic_history
item_count


In [25]:
def state_update(recommender, item_count, user_topic_history, item_topics):
    """
    Recomputes the item-exposure and user-topic counters from the slates most
    recently shown by the recommender.

    Inputs:
        recommender: object exposing `items_shown`, a 2-D integer array with
            one row (slate) per user
        item_count: array of shape (1, num_items); updated in place
        user_topic_history: array of shape (num_users, num_topics); updated in place
        item_topics: array mapping each item index to its topic id
    Outputs:
        (item_count, user_topic_history): the same arrays after the update

    If a slate contains a repeated item, a message is printed and processing
    stops after that user's row has been applied.
    """
    items_shown = recommender.items_shown
    slate_size = items_shown.shape[1]
    for user_idx in range(items_shown.shape[0]):
        items_shown_val, items_shown_count = np.unique(items_shown[user_idx, :], return_counts=True)
        # every distinct item in the slate counts as one exposure
        item_count[0, items_shown_val] += 1
        topics_shown = item_topics[items_shown_val]
        topics_shown_val, topics_shown_count = np.unique(topics_shown, return_counts=True)
        user_topic_history[user_idx, topics_shown_val] += topics_shown_count
        # The counts always sum to the slate length, so duplicates are detected
        # by comparing the number of distinct items with the slate length
        # (instead of comparing the sum of counts with a hard-coded 10).
        if items_shown_val.size != slate_size:
            print("DUPLICATE ITEMS IN SLATE", items_shown_count)
            break
    return item_count, user_topic_history

In [26]:
# Start from all-zero counters and feed the recommender's latest slates
# through state_update to obtain independently computed reference values.
test_item_count, test_user_topic_history = state_update(
    recsys,
    np.zeros((1, recsys.num_items)),
    np.zeros((recsys.num_users, recsys.num_topics)),
    item_topics,
)

In [30]:
# Compare the recommender's own counters with the recomputed reference values.
print(
    np.array_equal(recsys.item_count, test_item_count),
    np.array_equal(recsys.user_topic_history, test_user_topic_history),
    sep="\n",
)

True
True


**-> The model's `item_count` and `user_topic_history` both match the independently recomputed values after 1 iteration**

In [31]:
# Advance the simulation by one more timestep and refresh the recorded measurements.
recsys.run(timesteps=1)
measurements = recsys.get_measurements()

100%|██████████| 1/1 [00:03<00:00,  3.92s/it]


In [32]:
# Accumulate the newest slates into the reference counters, then compare them
# with the recommender's own counters again.
test_item_count, test_user_topic_history = state_update(
    recsys, test_item_count, test_user_topic_history, item_topics
)
print(
    np.array_equal(recsys.item_count, test_item_count),
    np.array_equal(recsys.user_topic_history, test_user_topic_history),
    sep="\n",
)

True
True


**-> `item_count` and `user_topic_history` still match the independently recomputed values after a second iteration**

In [33]:
# Print each recorded metric name followed by its series of values.
for metric_name, metric_values in measurements.items():
    print(metric_name, metric_values)

mse [0.22339414046796585, 0.1738726227590423, 0.16824752880109448]
interaction_spread [None, -905.5, -19.5]
afsr [None, 11.748918789814644, 9.868651018668231]
timesteps [0 1 2]


In [36]:
# Second recommender built from the same inputs as `recsys`.
# num_users / num_items / num_attributes are not passed explicitly.
bubble = BubbleBurster(
    user_representation=user_representation,
    item_representation=item_representation,
    actual_user_representation=users,
    item_topics=item_topics,
    record_base_state=True,
)

In [37]:
# Attach the metrics to `bubble` and list what is registered on it.
bubble.add_metrics(MSEMeasurement(), InteractionSpread(), AverageFeatureScoreRange())
print("These are the current metrics:")
# Print the metrics of `bubble` (the model just configured), not of `recsys`.
print(bubble.metrics)

These are the current metrics:
[<trecs.metrics.measurement.MSEMeasurement object at 0x15c50c700>, <trecs.metrics.measurement.InteractionSpread object at 0x15c50c7f0>, <trecs.metrics.measurement.AverageFeatureScoreRange object at 0x15c50c910>]


In [38]:
# NOTE(review): the MSE printed by the following cell is identical at every
# timestep; confirm that `startup_and_train` (rather than `run`, as used for
# `recsys`) is the intended call here.
bubble.startup_and_train(timesteps=100)

100%|██████████| 100/100 [00:07<00:00, 13.20it/s]


In [40]:
# Fetch the measurements once instead of calling get_measurements() again for
# every key, then print each metric name with its recorded values.
bubble_measurements = bubble.get_measurements()
for key, values in bubble_measurements.items():
    print(key, values)

mse [0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772018, 0.1652112923772