# Setup

In [1]:
# Make the local t-recs checkout importable before importing trecs
import sys
sys.path.insert(1, '../../t-recs/')
from trecs.metrics import *
from trecs.random import Generator

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx

# Keep the seed itself in `random_state`: np.random.seed() returns None, so
# assigning its return value would hand random_state=None to every consumer
# further down (KMeans, random_train_test_split).
random_state = 42
np.random.seed(random_state)

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# Ratings file from the ml-100k directory: tab-separated, column names supplied here
rating_cols = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings_df = pd.read_csv('../../ml-100k/u.data', sep="\t", names=rating_cols)

# Item file from the same directory: pipe-separated, latin-encoded.
# The column list is split into descriptive fields and the remaining
# per-movie indicator columns (presumably genre flags, judging by the names).
movie_info_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
movie_flag_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movie_cols = movie_info_cols + movie_flag_cols

movies_df = pd.read_csv('../../ml-100k/u.item', sep="|", names=movie_cols, encoding='latin')

from sklearn.cluster import KMeans

def get_topic_clusters(binary_ratings_matrix, n_attrs:int=100, nmf_solver:str="mu", n_clusters:int=100):
    """
    Creates clusters of items based on how often they co-occur in the same
    rows of the interaction matrix (not on genre metadata).

    The item-item co-occurrence matrix is factorized with NMF to obtain one
    embedding per item, and those embeddings are clustered with KMeans.

    Inputs:
        binary_ratings_matrix: a binary matrix with one row per user and one
            column per item
        n_attrs: number of components to use in NMF
        nmf_solver: solver to use in NMF
        n_clusters: number of KMeans clusters (defaults to 100, the value
            that was previously hard-coded)
    Outputs:
        cluster_ids: array with one cluster assignment per column (item) of
            binary_ratings_matrix

    Note: the KMeans seed is read from the module-level `random_state`.
    """
    # Item-item co-occurrence: entry (i, j) counts the rows in which both
    # column i and column j are set.
    co_occurence_matrix = binary_ratings_matrix.T @ binary_ratings_matrix

    # Matrix factorize co_occurence_matrix to get one embedding row per item
    nmf_cooc = NMF(n_components=n_attrs, solver=nmf_solver)
    W_topics = nmf_cooc.fit_transform(co_occurence_matrix)

    # cluster the item embeddings
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(W_topics)

    # assign nearest cluster to each item
    cluster_ids = kmeans.predict(W_topics)

    return cluster_ids

from sklearn.decomposition import NMF

# Collapse the ratings to a binary signal: every rating above 0 becomes 1
binary_ratings_df = ratings_df.drop(columns=['Timestamp'])
is_positive = binary_ratings_df['Rating'] > 0
binary_ratings_df.loc[is_positive, 'Rating'] = 1

# Reshape to a dense matrix with one row per user and one column per movie;
# user/movie pairs without a rating are filled with 0
user_by_movie = binary_ratings_df.pivot(index='UserID', columns='MovieID', values='Rating')
binary_ratings_matrix = user_by_movie.fillna(0).to_numpy()

from lightfm.cross_validation import random_train_test_split
from scipy import sparse

# Split the interactions into train and test sets (test_percentage=0.2),
# then convert both sparse results back to dense arrays
sparse_ratings = sparse.csr_matrix(binary_ratings_matrix)
train_interactions, test_interactions = (
    part.toarray()
    for part in random_train_test_split(sparse_ratings, test_percentage=0.2, random_state=random_state)
)

# Factorize the user-by-movie matrix: fit_transform yields the per-user factors,
# components_ holds the matching per-movie factors
n_attrs = 100
nmf = NMF(n_components=n_attrs, solver="mu")
user_representation = nmf.fit_transform(binary_ratings_matrix)
item_representation = nmf.components_
print(*(factor.shape for factor in (user_representation, item_representation)))

# One cluster id per movie, computed from the same binary matrix
item_topics = get_topic_clusters(binary_ratings_matrix, n_attrs=n_attrs, nmf_solver="mu")

# Left unset at this point in the notebook
num_topics = user_topic_history = item_count = None



(943, 100) (100, 1682)


# Testing MyopicExcludeK

In [11]:
from models.excl_k_myopic import MyopicExcludeK

In [12]:
# Instantiate the model from the NMF factors and the per-item topic assignments
recsys = MyopicExcludeK(
    user_representation=user_representation,
    item_representation=item_representation,
    item_topics=item_topics,
    excludeK=10,
    record_base_state=True,
)

In [13]:
# Inspect the state the freshly constructed model exposes, one value per line
for state_value in (
    recsys.num_topics,
    recsys.item_topics.shape,
    recsys.user_topic_history.shape,
    np.unique(recsys.user_topic_history),
    recsys.item_count.shape,
    recsys.excludeK,
):
    print(state_value)

100
(1682,)
(943, 100)
[0.]
(1682,)
10


In [14]:
# Register the metrics on the model, then show what it is now tracking
tracked_metrics = [MSEMeasurement(), InteractionSpread(), AverageFeatureScoreRange()]
recsys.add_metrics(*tracked_metrics)
print("These are the current metrics:", recsys.metrics, sep="\n")

These are the current metrics:
[<trecs.metrics.measurement.MSEMeasurement object at 0x15d786850>, <trecs.metrics.measurement.InteractionSpread object at 0x15d807910>, <trecs.metrics.measurement.AverageFeatureScoreRange object at 0x15d786d30>]


In [15]:
# Run the model for a single timestep, then fetch its measurements
n_timesteps = 1
recsys.run(timesteps=n_timesteps)
measurements = recsys.get_measurements()

100%|██████████| 1/1 [00:03<00:00,  3.32s/it]
