In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import heapq
import os
import time
import math

In [27]:
# Experiment selection: the dataset to score and the algorithm whose
# embeddings are used. Both names are interpolated into the paths below.
dataset, algorithm = 'last-fm', 'kgat'

In [28]:
# Anchor every project directory at the current working directory.
_root = os.getcwd()

ds_path = os.path.join(_root, f'Data/{dataset}')
embed_path = os.path.join(_root, f'embeds/{dataset}/{algorithm}')
plot_path = os.path.join(_root, f'plots/{dataset}')
scores_path = os.path.join(_root, f'scores/{dataset}/{algorithm}')

# Echo the resolved locations so a wrong working directory is spotted early.
for _label, _location in (
    ("Dataset path:", ds_path),
    ("Embed path:", embed_path),
    ("Plots path:", plot_path),
    ("Score path:", scores_path),
):
    print(_label, _location)

Dataset path: /Users/johnhannebery/gitrepos/recsys/Data/last-fm
Embed path: /Users/johnhannebery/gitrepos/recsys/embeds/last-fm/kgat
Plots path: /Users/johnhannebery/gitrepos/recsys/plots/last-fm
Score path: /Users/johnhannebery/gitrepos/recsys/scores/last-fm/kgat


In [29]:
# For 'kgat' the item-side vectors are read from entity_embed.csv; every other
# algorithm reads item_embed.csv.
# NOTE(review): get_scores() below drops an 'item_id' column from this frame -
# confirm entity_embed.csv carries that column and that every row in it is
# something that should be ranked.
_item_file = 'entity_embed.csv' if algorithm == 'kgat' else 'item_embed.csv'

user_embed = pd.read_csv(os.path.join(embed_path, 'user_embed.csv'))
item_embed = pd.read_csv(os.path.join(embed_path, _item_file))

In [30]:
# Distinct, non-null user ids in first-appearance order.
# The batch count is derived from nunique() of this same column, so the list
# that gets sliced into batches must contain exactly those ids: with repeated
# ids left in, the batches would stop before the end of the list and silently
# skip the trailing users.
user_list = user_embed['user_id'].dropna().unique().tolist()

In [31]:
# Distinct user count (missing ids are not counted); it sizes the batching below.
n_users = user_embed.loc[:, 'user_id'].nunique(dropna=True)
print(n_users)

23566


In [32]:
# Number of users scored per call to get_scores().
batch_size = 1_000

In [33]:
# Ceiling division: enough batches to cover every user, with the last batch
# possibly partial.
n_batches = -(-n_users // batch_size)
print(n_batches)

24


In [34]:
def get_scores(user_embed, item_embed, K=20):
    """Return the K highest-scoring items for every user.

    The score of a (user, item) pair is the dot product of the user's and the
    item's embedding vectors.

    Parameters
    ----------
    user_embed : pd.DataFrame
        One row per user: a 'user_id' column plus the embedding columns.
        'user_id' values are expected to be unique; ranking is done per row.
    item_embed : pd.DataFrame
        One row per item: an 'item_id' column plus the embedding columns
        (same number of embedding columns as ``user_embed``).
    K : int, default 20
        Number of items kept per user. Values larger than the number of items
        keep every item; values <= 0 yield an empty frame.

    Returns
    -------
    pd.DataFrame
        Columns 'user_id', 'item_id', 'score', 'rank' (1 = best for that user),
        ordered by 'score' descending. The index holds each pair's position in
        the row-major users x items grid (user_pos * n_items + item_pos).
        Equal scores are ranked in the order the items appear in ``item_embed``.
    """
    u_e = user_embed.drop(columns=['user_id']).values
    i_e = item_embed.drop(columns=['item_id']).values

    # (n_users, n_items) matrix of dot-product scores.
    dot = np.matmul(u_e, i_e.T)
    n_users, n_items = dot.shape
    k = int(max(0, min(K, n_items)))

    # Pick the top-k columns per user directly on the score matrix instead of
    # materialising and sorting a full users x items DataFrame. Negating the
    # scores turns the ascending stable sort into a descending one whose ties
    # keep item order, so the result is deterministic.
    top_cols = np.argsort(-dot, axis=1, kind='stable')[:, :k].ravel()
    top_rows = np.repeat(np.arange(n_users), k)

    scores = pd.DataFrame(
        {
            'user_id': user_embed['user_id'].values[top_rows],
            'item_id': item_embed['item_id'].values[top_cols],
            'score': dot[top_rows, top_cols],
            'rank': np.tile(np.arange(1, k + 1, dtype=np.int64), n_users),
        },
        # Same labels a row-major cross join of users and items would carry.
        index=top_rows * n_items + top_cols,
    )

    # Best scores first across all users; the stable sort keeps the
    # per-user rank order among equal scores.
    return scores.sort_values('score', ascending=False, kind='stable')

In [35]:
# Score the users one batch at a time and collect each batch's top-K frame.
score_df_list = []
for b in range(n_batches):
    print(b)

    # Ids belonging to batch b (the final batch may be shorter).
    _batch_slice = slice(b * batch_size, (b + 1) * batch_size)
    user_list_batch = user_list[_batch_slice]

    # Embedding rows for exactly those ids.
    _in_batch = user_embed['user_id'].isin(user_list_batch)
    user_embed_batch = user_embed.loc[_in_batch]

    scores_batch = get_scores(user_embed_batch, item_embed)
    score_df_list.append(scores_batch)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


In [36]:
# Stack the per-batch results row-wise into one frame; each batch keeps its
# own index labels (they are not renumbered).
scores_df = pd.concat(score_df_list, axis=0, ignore_index=False)

In [37]:
# Write the per-user top-K scores to disk.
# Create the target directory first: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here after all the
# scoring work has already been done.
os.makedirs(scores_path, exist_ok=True)
scores_df.to_csv(os.path.join(scores_path,'scores_df.csv'))