In [1]:
import time
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
import scipy.sparse as sp
from scipy.sparse import csr_matrix



In [4]:
df_train = pd.read_parquet('user_recording_rating_normalized_small_random.parquet')
df_val = pd.read_parquet('user_recording_reindex_val_small_random.parquet')
df_train = df_train[df_train['scaled_rating'] != 0]
df_train = df_train[['user_id', 'reindex_int']]
df_val = df_val[['user_id', 'reindex_int']]

In [5]:
df_combined = pd.concat([df_train, df_val], axis=0)

In [6]:
def df_to_matrix(df):
    user_id_map = {}
    for i, user_id in enumerate(df['user_id'].unique()):
        user_id_map[user_id] = i
    
    item_id_map = {}
    for i, item_id in enumerate(df['reindex_int'].unique()):
        item_id_map[item_id] = i
    
    df_copy = df.copy()
    df_copy['user_id'] = df_copy['user_id'].apply(lambda x: user_id_map[x])
    df_copy['reindex_int'] = df_copy['reindex_int'].apply(lambda x: item_id_map[x])
    
    num_users = len(user_id_map)
    num_items = len(item_id_map)

    row = df_copy['user_id'].values
    col = df_copy['reindex_int'].values
    V = np.ones(len(row))

    interactions = sp.coo_matrix((V, (row, col)), dtype=np.float64)
    interactions = interactions.tocsr()
    
    return interactions, user_id_map, item_id_map

In [7]:
interactions, user_id_map, item_id_map = df_to_matrix(df_combined)

In [8]:
interactions

<6668x2228541 sparse matrix of type '<class 'numpy.float64'>'
	with 6532411 stored elements in Compressed Sparse Row format>

In [9]:
def subset_to_matrix(interactions, user_id_map, item_id_map, subset):
    diff = df_combined.merge(subset, how = 'outer' ,indicator=True).loc[lambda x : x['_merge'] == 'left_only']
    print(diff)
    user_list = diff['user_id'].values
    item_list = diff['reindex_int'].values

    sub_mat = interactions.copy().tolil()
    
    for user, item in zip(user_list, item_list):
        uidx = user_id_map[user]
        midx = item_id_map[item]

        sub_mat[uidx, midx] = 0.
    
    return sub_mat.tocsr()

In [10]:
train = subset_to_matrix(interactions, user_id_map, item_id_map, df_train)

          user_id  reindex_int     _merge
9847046      8551         2976  left_only
9847047      2056        23582  left_only
9847048      4956     16296509  left_only
9847049      4717        26078  left_only
9847050      7877       403632  left_only
...           ...          ...        ...
12048839      223      8580063  left_only
12048840    19774     18967586  left_only
12048841    17825     18967586  left_only
12048842    19477     18967586  left_only
12048843     5306       418584  left_only

[2201798 rows x 3 columns]


In [11]:
val = subset_to_matrix(interactions, user_id_map, item_id_map, df_val)

          user_id  reindex_int     _merge
8              34     14091143  left_only
13             34      5519085  left_only
14             34       942919  left_only
65             34      8024679  left_only
66             34      9487167  left_only
...           ...          ...        ...
54113345    22187      9187283  left_only
54113346    22187      5049056  left_only
54113353    22187      4348390  left_only
54113358    22187      9898403  left_only
54113359    22187     10087012  left_only

[2273004 rows x 3 columns]


In [12]:
train

<6668x2228541 sparse matrix of type '<class 'numpy.float64'>'
	with 4556972 stored elements in Compressed Sparse Row format>

In [13]:
val

<6668x2228541 sparse matrix of type '<class 'numpy.float64'>'
	with 4259407 stored elements in Compressed Sparse Row format>

In [14]:
# Initialize the model
model = LightFM(loss='warp')

# Train the  model
start = time.process_time()

model.fit(train, epochs=3)
end = time.process_time()
elapsed_time = end - start
print("Time to fit model: ", elapsed_time)

Time to fit model:  22.597467999999992


In [15]:
start = time.process_time()
test_precision = precision_at_k(model, val, k=100).mean()
end = time.process_time()
elapsed_time = end - start
print("Time to calculate test precision: ", elapsed_time)
print(test_precision)

Time to calculate test precision:  6490.66727
0.057763945
