In [1]:
import time
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
import scipy.sparse as sp
from scipy.sparse import csr_matrix

In [2]:
# Load the three interaction splits, reading only the two columns used
# downstream (user id and re-indexed item id) instead of loading every column
# and discarding the rest afterwards.
# NOTE: DATA_DIR is a machine-specific absolute path; change it here only.
DATA_DIR = '/home/yx1750/final-project-group20/data/'
INTERACTION_COLUMNS = ['user_id', 'reindex_int']

df_train = pd.read_parquet(DATA_DIR + 'interactions_train_small_rating.parquet/', columns=INTERACTION_COLUMNS)
df_val = pd.read_parquet(DATA_DIR + 'interactions_val_small_rating.parquet/', columns=INTERACTION_COLUMNS)
df_test = pd.read_parquet(DATA_DIR + 'interactions_test_rating.parquet/', columns=INTERACTION_COLUMNS)

In [3]:
# Stack the train and test interaction rows into a single frame; this is the
# frame the full user-item matrix is built from.
df_combined = pd.concat([df_train, df_test])

In [4]:
def df_to_matrix(df):
    """Build a user-item interaction matrix from an interactions frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id`` and ``reindex_int`` (the item
        id); each row is one observed interaction.

    Returns
    -------
    interactions : scipy.sparse.csr_matrix
        float64 matrix of shape ``(n_unique_users, n_unique_items)`` holding
        1.0 for every row of ``df``. Repeated (user, item) rows are summed
        by the COO -> CSR conversion, so such cells hold values above 1.
    user_id_map : dict
        Original ``user_id`` -> row index, numbered in order of first
        appearance in ``df``.
    item_id_map : dict
        Original ``reindex_int`` -> column index, numbered the same way.
    """
    # factorize() yields, in one vectorised pass, the dense 0..n-1 code of
    # every row plus the unique ids in order of first appearance.
    user_codes, user_ids = pd.factorize(df['user_id'])
    item_codes, item_ids = pd.factorize(df['reindex_int'])

    user_id_map = dict(zip(user_ids, range(len(user_ids))))
    item_id_map = dict(zip(item_ids, range(len(item_ids))))

    values = np.ones(len(df))

    # The shape is passed explicitly so an empty frame produces an empty
    # matrix instead of failing when scipy tries to infer the dimensions.
    interactions = sp.coo_matrix(
        (values, (user_codes, item_codes)),
        shape=(len(user_id_map), len(item_id_map)),
        dtype=np.float64,
    )

    return interactions.tocsr(), user_id_map, item_id_map

In [5]:
# Full interaction matrix over df_combined, plus the id -> index lookups
# needed later to address individual cells.
interactions, user_id_map, item_id_map = df_to_matrix(df=df_combined)

In [6]:
# Show the sparse-matrix summary (shape, dtype, stored-element count).
interactions

<7821x3039419 sparse matrix of type '<class 'numpy.float64'>'
	with 7937704 stored elements in Compressed Sparse Row format>

In [7]:
def subset_to_matrix(interactions, user_id_map, item_id_map, subset, full_df=None):
    """Return a copy of ``interactions`` keeping only pairs present in ``subset``.

    Every (user_id, reindex_int) pair that occurs in ``full_df`` but not in
    ``subset`` is removed from the matrix; all other stored entries are left
    untouched. The input matrix is not modified.

    Parameters
    ----------
    interactions : scipy.sparse matrix
        Full user-item matrix whose rows/columns are indexed through
        ``user_id_map`` / ``item_id_map``.
    user_id_map, item_id_map : dict
        Original id -> matrix index lookups.
    subset : pandas.DataFrame
        Frame with ``user_id`` and ``reindex_int`` columns listing the pairs
        to keep.
    full_df : pandas.DataFrame, optional
        Frame the full matrix was built from. Defaults to the notebook-level
        ``df_combined`` so existing four-argument calls keep working.

    Returns
    -------
    scipy.sparse.csr_matrix
        Same shape as ``interactions``, with the removed pairs dropped from
        storage (no explicit zeros).

    Raises
    ------
    KeyError
        If a pair to remove has an id missing from the lookup dicts.

    Notes
    -----
    Prints the frame of removed pairs as a side effect.
    """
    if full_df is None:
        full_df = df_combined

    # The outer merge joins on the shared columns; 'left_only' rows are the
    # pairs that exist in the full frame but not in the requested subset.
    diff = full_df.merge(subset, how='outer', indicator=True).loc[lambda x: x['_merge'] == 'left_only']
    print(diff)

    # De-duplicate so the mask below is strictly 0/1 after COO -> CSR summing.
    to_drop = diff[['user_id', 'reindex_int']].drop_duplicates()
    rows = to_drop['user_id'].map(user_id_map)
    cols = to_drop['reindex_int'].map(item_id_map)
    if rows.isna().any() or cols.isna().any():
        raise KeyError('id to remove is missing from user_id_map / item_id_map')

    full = interactions.tocsr()
    mask = sp.csr_matrix(
        (
            np.ones(len(to_drop), dtype=full.dtype),
            (rows.to_numpy(dtype=np.int64), cols.to_numpy(dtype=np.int64)),
        ),
        shape=full.shape,
    )

    # Subtracting the masked entries zeroes exactly the cells to remove in one
    # vectorised step, instead of one LIL assignment per pair.
    sub_mat = (full - full.multiply(mask)).tocsr()
    sub_mat.eliminate_zeros()

    return sub_mat

In [8]:
# Training matrix: the full matrix restricted to the pairs listed in df_train.
train = subset_to_matrix(
    interactions=interactions,
    user_id_map=user_id_map,
    item_id_map=item_id_map,
    subset=df_train,
)

         user_id  reindex_int     _merge
5222667       34       948391  left_only
5222668       34       738926  left_only
5222669       34       881158  left_only
5222670       34      9576131  left_only
5222671       34      5959258  left_only
...          ...          ...        ...
8887559    22187      1410789  left_only
8887560    22187     16873952  left_only
8887561    22187      7470930  left_only
8887562    22187     16437507  left_only
8887563    22187     16952627  left_only

[3664897 rows x 3 columns]


In [9]:
# Evaluation matrix: the full matrix restricted to the pairs listed in df_val.
# NOTE(review): this is named `test` but is built from df_val, while the full
# matrix comes from df_combined (df_train + df_test) - confirm the intended split.
test = subset_to_matrix(
    interactions=interactions,
    user_id_map=user_id_map,
    item_id_map=item_id_map,
    subset=df_val,
)

         user_id  reindex_int     _merge
0             34     17504245  left_only
1             34     13652040  left_only
2             34     17428983  left_only
3             34      7736006  left_only
4             34     15663704  left_only
...          ...          ...        ...
8887559    22187      1410789  left_only
8887560    22187     16873952  left_only
8887561    22187      7470930  left_only
8887562    22187     16437507  left_only
8887563    22187     16952627  left_only

[8649570 rows x 3 columns]


In [10]:
# Show the training matrix summary (shape, dtype, stored-element count).
train

<7821x3039419 sparse matrix of type '<class 'numpy.float64'>'
	with 4272807 stored elements in Compressed Sparse Row format>

In [11]:
# Show the evaluation matrix summary (shape, dtype, stored-element count).
test

<7821x3039419 sparse matrix of type '<class 'numpy.float64'>'
	with 237994 stored elements in Compressed Sparse Row format>

In [12]:
# Initialize the model
model = LightFM(loss='warp')

# Train the  model
start = time.process_time()

model.fit(train, epochs=3)
end = time.process_time()
elapsed_time = end - start
print("Time to fit model: ", elapsed_time)

Time to fit model:  24.249038502000005


In [13]:
# Evaluate the fitted model on the `test` matrix: precision@100 averaged over
# the scores returned by precision_at_k. Timing is CPU time (process_time).
# NOTE(review): no `train_interactions` argument is passed, so pairs seen
# during training are not excluded from the ranking - confirm this is intended.
start = time.process_time()
precision_scores = precision_at_k(model, test, k=100)
test_precision = precision_scores.mean()
end = time.process_time()
elapsed_time = end - start
print("Time to calculate test precision: ", elapsed_time)

print(test_precision)

Time to calculate test precision:  1056.8419021920001
0.0063928394
