In [1]:
import numpy as np
import pickle as pkl
import pandas as pd
import json
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm
2025-10-12 23:05:17.253277: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-12 23:05:17.383218: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-12 23:05:18.032334: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-10-12 23:05:18.032399: W tensorflow/compiler/xla/s

In [2]:
# Sentence encoder used by get_text_emb_from_index to embed course texts.
# Loading by hub name may download the weights on first use.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [3]:
# Embedding matrices loaded from the BPR .npy files; rows are looked up by raw
# user id / raw item id further down (`user_bpr_embedding[x, :]`, `item_bpr_embedding[x, :]`).
user_bpr_embedding = np.load('bpr_user_matrix_128_01.npy')
item_bpr_embedding = np.load('bpr_item_matrix_128_01.npy')

# Graph node embeddings: users are read at row `user_id`, items at row
# `item_id + n_users` (see the graph-embedding cells below).
node_embeddings = np.load('node_embeddings_gcf_64_transe.npy')

In [4]:
# Inspect the dimensions of the item matrix (rows are indexed by raw item id below).
item_bpr_embedding.shape

(196, 128)

In [4]:
# Offset added to a raw item index to obtain its graph-node / entity id; used as
# `x + n_users` throughout the notebook.
# NOTE(review): hard-coded — presumably the number of users; confirm it matches
# the loaded user matrix before reusing this notebook with other data.
n_users = 6863

In [8]:
# Lookup consumed by get_text_emb_from_index as `course_texts[entity]`.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('pickles/course_texts.pkl', 'rb') as f:
    course_texts = pkl.load(f)

In [9]:
# Lookup consumed by get_text_emb_from_index as `id2entity[index + n_users]`.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('id2entity.pkl', 'rb') as f:
    id2entity = pkl.load(f)

# Train

In [5]:
# Training triples, part 1/3: the user of each (user, preferred, not preferred) triple.
with open('train_users_bpr_4.pkl', 'rb') as f:
    users = pkl.load(f)

In [6]:
# Training triples, part 2/3: the preferred (positive) item of each triple.
with open('train_items_preferred_bpr_4.pkl', 'rb') as f:
    items_preferred = pkl.load(f)

In [7]:
# Training triples, part 3/3: the non-preferred (negative) item of each triple.
with open('train_items_not_preferred_bpr_4.pkl', 'rb') as f:
    items_not_preferred = pkl.load(f)

In [10]:
# Sanity check: the three triple lists must have the same length, because they are
# zipped row-by-row into `df` below.
len(users)

11924

In [11]:
# Sanity check: should equal len(users).
len(items_preferred)

11924

In [12]:
# Sanity check: should equal len(users).
len(items_not_preferred)

11924

In [13]:
# One row per (user, positive item, negative item) triple, in the order of the
# loaded lists.
df = pd.DataFrame(
    list(zip(users, items_preferred, items_not_preferred)),
    columns=['user_id', 'pos_item_id', 'neg_item_id'],
)

In [15]:
def cumulative_list(series, offset=None):
    """Return, for each position, the list of shifted values seen so far.

    Element ``i`` of the result is a fresh list holding ``x + offset`` for every
    ``x`` in ``series[0..i]`` — i.e. the prefix *including* the current element.

    Parameters
    ----------
    series : iterable of numbers
        Values in the order they should be accumulated (e.g. the positive item
        ids of one user when used with ``groupby(...).transform``).
    offset : number, optional
        Amount added to every value. Defaults to the notebook-level ``n_users``
        so existing ``transform(cumulative_list)`` calls behave as before.

    Returns
    -------
    list of list
        One independent list per input element; empty input gives ``[]``.
    """
    if offset is None:
        # Resolved at call time so the notebook-level constant is only needed
        # when the caller does not supply an explicit offset.
        offset = n_users
    shifted = [x + offset for x in series]
    # Slicing yields a new list per prefix, so results never alias each other.
    return [shifted[:end] for end in range(1, len(shifted) + 1)]

In [16]:
# Per row: the offset ids of the positive items the same user has in the frame up
# to and including this row (row order within each user group).
df['item_seq'] = df.groupby('user_id')['pos_item_id'].transform(cumulative_list)

In [17]:
# Encoded texts keyed by raw item index. The same item indices recur across many
# rows, so encoding each one once avoids repeated model forward passes.
# Re-running this cell resets the cache (do so after changing `model`,
# `course_texts`, `id2entity` or `n_users`).
_TEXT_EMBEDDING_CACHE = {}


def get_text_emb_from_index(index):
    """Return the text embedding of the item with raw index ``index``.

    The item is resolved as ``id2entity[index + n_users]``, its text is read from
    ``course_texts`` and encoded with ``model.encode``. Results are memoised per
    index.

    Parameters
    ----------
    index : int
        Raw (un-offset) item index; must be hashable.

    Returns
    -------
    The value produced by ``model.encode`` for the item's text. A copy is
    returned so callers never share (or mutate) the cached array.

    Raises
    ------
    KeyError / IndexError
        Propagated from the ``id2entity`` / ``course_texts`` lookups.
    """
    embedding = _TEXT_EMBEDDING_CACHE.get(index)
    if embedding is None:
        entity = id2entity[index + n_users]
        text = course_texts[entity]
        embedding = model.encode(text)
        _TEXT_EMBEDDING_CACHE[index] = embedding
    return embedding.copy()

In [18]:
# Text embedding of each row's positive item (ids are still raw here; the
# `+ n_users` shift is applied inside get_text_emb_from_index).
df['pos_item_text_embedding'] = df['pos_item_id'].apply(get_text_emb_from_index)

In [19]:
# Text embedding of each row's negative item.
df['neg_item_text_embedding'] = df['neg_item_id'].apply(get_text_emb_from_index)

In [20]:
# One row per user: the mean of that user's positive-item text embeddings.
# NOTE(review): this averages a column whose cells are arrays; how pandas handles
# that may depend on the pandas version — confirm when upgrading.
df_user_vectors = df[['user_id', 'pos_item_text_embedding']].groupby('user_id', as_index=False).mean()

In [21]:
# Rename the averaged column and attach it to every row of the same user.
# NOTE: `df` is reassigned in place, so re-running this cell merges a second time
# (producing suffixed duplicate columns).
df_user_vectors = df_user_vectors.rename(columns={'pos_item_text_embedding': 'user_text_embedding'})
df = df.merge(df_user_vectors, on='user_id')

In [22]:
def cumulative_user_embedding(series):
    """Return the running element-wise sums of the arrays in ``series``.

    Entry ``i`` of the result is the sum of ``series[0..i]`` (the current element
    included). The accumulator starts as zeros shaped like the first element, and
    every entry is an independent copy.

    Parameters
    ----------
    series : pandas.Series of array-like
        Must be non-empty (``series.iloc[0]`` is read to size the accumulator).

    Returns
    -------
    list
        One array per input element.
    """
    def _running_sums():
        partial = np.zeros_like(series.iloc[0])
        for vector in series:
            # Rebind rather than add in place, exactly as a plain `a = a + b`.
            partial = partial + vector
            yield partial.copy()

    return list(_running_sums())

In [24]:
# Running sum of the user's positive-item text embeddings, plus the 0-based
# position of the row within its user group (used as the divisor in the next cell).
# NOTE(review): the running sum includes the current row's own positive item —
# confirm downstream models are not meant to see only strictly earlier items.
df['user_cum_text_embedding'] = df.groupby('user_id')['pos_item_text_embedding'].transform(cumulative_user_embedding)
df['user_cum_count'] = df.groupby('user_id')['pos_item_text_embedding'].cumcount()

In [27]:
# Turn the running sum into a running mean: cumcount is 0-based, hence the `+ 1`.
# NOTE: not idempotent — the helper column is dropped, so this cell cannot be
# re-run without re-running the previous one.
df['user_cum_text_embedding'] = df['user_cum_text_embedding'] / (df['user_cum_count'] + 1)
df = df.drop(columns='user_cum_count')

In [28]:
# Attach BPR vectors by row lookup on the raw (not yet offset) ids.
df['pos_item_bpr_embedding'] = df['pos_item_id'].apply(lambda item: item_bpr_embedding[item])
df['neg_item_bpr_embedding'] = df['neg_item_id'].apply(lambda item: item_bpr_embedding[item])
df['user_bpr_embedding'] = df['user_id'].apply(lambda user: user_bpr_embedding[user])

In [29]:
# Attach graph node embeddings: items live at row `item_id + n_users`, users at
# row `user_id`. Must run before the ids are shifted in the next cell.
df['pos_item_graph_embedding'] = df['pos_item_id'].apply(lambda x: node_embeddings[x + n_users, :])
df['neg_item_graph_embedding'] = df['neg_item_id'].apply(lambda x: node_embeddings[x + n_users, :])
df['user_graph_embedding'] = df['user_id'].apply(lambda x: node_embeddings[x, :])

In [30]:
# Convert raw item indices to offset ids (same convention as `item_seq`).
# NOTE: not idempotent — re-running shifts the ids again, and every lookup cell
# above expects the raw indices, so this must run exactly once and last.
df['pos_item_id'] = df['pos_item_id'] + n_users
df['neg_item_id'] = df['neg_item_id'] + n_users

In [31]:
# Preview the assembled training frame.
df.head()

Unnamed: 0,user_id,pos_item_id,neg_item_id,item_seq,pos_item_text_embedding,neg_item_text_embedding,user_text_embedding,user_cum_text_embedding,pos_item_bpr_embedding,neg_item_bpr_embedding,user_bpr_embedding,pos_item_graph_embedding,neg_item_graph_embedding,user_graph_embedding
0,0,6897,7001,[6897],"[0.00029639987, 0.014771659, 0.014058901, -0.0...","[-0.021708943, -0.07313048, 0.0040764194, 0.01...","[0.00029639987, 0.014771659, 0.014058901, -0.0...","[0.00029639987, 0.014771659, 0.014058901, -0.0...","[-0.004581114, 0.0626458, -0.02941907, -0.0164...","[0.02627153, -0.042739343, 0.0686576, 0.071189...","[-0.16485311, 0.13305487, -0.109800704, -0.159...","[-0.019909197, 0.015652379, -0.18591776, -0.19...","[-0.09981072, -0.17273627, -0.06508206, -0.185...","[-0.21868221, 0.107340455, 0.20054282, 0.15184..."
1,1,6920,6998,[6920],"[-0.001356997, -0.027733184, 0.010143433, -0.0...","[0.06441419, 0.036574744, 0.0004071652, -0.046...","[-0.001356997, -0.027733184, 0.010143433, -0.0...","[-0.001356997, -0.027733184, 0.010143433, -0.0...","[-0.031264078, 0.065558866, 0.002417838, -0.05...","[0.11860577, -0.10629058, 0.09773831, 0.059424...","[-0.15681404, 0.14787957, -0.14133789, -0.1282...","[0.21109915, 0.16339235, 0.031736396, -0.01513...","[-0.033010166, -0.1609984, -0.0942061, -0.1157...","[0.16515407, 0.18278724, -0.07278056, -0.06131..."
2,2,6937,6992,[6937],"[0.009646171, 0.015768941, 0.0102583, 0.011016...","[0.030553505, -0.08276028, 0.021069285, -0.022...","[0.01137045, -0.021448795, 0.0008535719, 0.005...","[0.009646171, 0.015768941, 0.0102583, 0.011016...","[-0.048642207, 0.019007254, -0.033599593, -0.0...","[0.10899715, -0.09564237, 0.10501876, 0.085825...","[-0.12063337, 0.13376932, -0.13144864, -0.0975...","[-0.07782954, 0.19538878, 0.08653913, -0.04023...","[0.041632142, 0.070655555, -0.13588788, -0.160...","[-0.21761492, 0.1274874, 0.19495715, -0.179376..."
3,2,6961,7043,"[6937, 6961]","[0.03704412, -0.011753682, 0.0050762245, 0.021...","[-0.007348519, -0.056608632, -0.011322977, 0.0...","[0.01137045, -0.021448795, 0.0008535719, 0.005...","[0.023345144, 0.0020076297, 0.007667262, 0.016...","[-0.014944556, -0.028715644, 0.0095228795, 0.0...","[0.120994814, -0.089827366, 0.10192869, 0.0929...","[-0.12063337, 0.13376932, -0.13144864, -0.0975...","[0.16374268, -0.07315293, 0.10547099, 0.158405...","[0.03657927, -0.060817167, -0.195846, -0.21245...","[-0.21761492, 0.1274874, 0.19495715, -0.179376..."
4,2,6895,7040,"[6937, 6961, 6895]","[-0.031121682, -0.08368267, 8.339915e-05, 0.06...","[-0.004670808, -0.05079265, 0.0073155914, -0.0...","[0.01137045, -0.021448795, 0.0008535719, 0.005...","[0.0051895357, -0.026555805, 0.0051393076, 0.0...","[-0.073091574, 0.10878877, -0.07272676, -0.079...","[0.09851998, -0.10024271, 0.09955378, 0.079989...","[-0.12063337, 0.13376932, -0.13144864, -0.0975...","[0.18731615, 0.13930675, 0.13596186, 0.1819212...","[-0.022520965, 0.16019231, -0.16499598, -0.088...","[-0.21761492, 0.1274874, 0.19495715, -0.179376..."


In [17]:
# Persist the pairwise training frame; the test section reloads this file as
# `df_train`.
with open('train_bpr_all_vectors_128_01_transe.pkl', 'wb') as f:
    pkl.dump(df, f)

In [33]:
# Positive examples for the binary task: keep the user columns, rename the
# `pos_item_*` columns to generic `item_*` names and label the rows 1.
df_binary_pos = (
    df[['user_id', 'pos_item_id', 'item_seq', 'pos_item_text_embedding', 'user_text_embedding', 'user_cum_text_embedding', 'pos_item_bpr_embedding', 'user_bpr_embedding', 'pos_item_graph_embedding', 'user_graph_embedding']]
    .rename(columns={
        'pos_item_id': 'item_id',
        'pos_item_text_embedding': 'item_text_embedding',
        'pos_item_bpr_embedding': 'item_bpr_embedding',
        'pos_item_graph_embedding': 'item_graph_embedding',
    })
    .assign(label=1)
)

In [34]:
# Negative examples for the binary task: same layout as the positive frame, built
# from the `neg_item_*` columns and labelled 0.
df_binary_neg = (
    df[['user_id', 'neg_item_id', 'item_seq', 'neg_item_text_embedding', 'user_text_embedding', 'user_cum_text_embedding', 'neg_item_bpr_embedding', 'user_bpr_embedding', 'neg_item_graph_embedding', 'user_graph_embedding']]
    .rename(columns={
        'neg_item_id': 'item_id',
        'neg_item_text_embedding': 'item_text_embedding',
        'neg_item_bpr_embedding': 'item_bpr_embedding',
        'neg_item_graph_embedding': 'item_graph_embedding',
    })
    .assign(label=0)
)

In [35]:
# Stack positives on top of negatives.
# NOTE(review): both halves keep `df`'s index, so every index label appears twice
# in the result; label-based access (`.loc`) will return two rows. Consider
# `ignore_index=True` if no consumer relies on the shared labels.
df_binary = pd.concat([df_binary_pos, df_binary_neg])

In [18]:
# Persist the binary-label training frame.
with open('train_binary_all_vectors_128_01_transe.pkl', 'wb') as f:
    pkl.dump(df_binary, f)

# Test

In [5]:
# Test triples, part 1/3. NOTE: reuses the name `users`, replacing the training list.
with open('test_users_bpr_4.pkl', 'rb') as f:
    users = pkl.load(f)

In [6]:
# Test triples, part 2/3: preferred (positive) items; replaces the training list.
with open('test_items_preferred_bpr_4.pkl', 'rb') as f:
    items_preferred = pkl.load(f)

In [7]:
# Test triples, part 3/3: non-preferred (negative) items; replaces the training list.
with open('test_items_not_preferred_bpr_4.pkl', 'rb') as f:
    items_not_preferred = pkl.load(f)

In [10]:
# Sanity check: number of test triples.
# NOTE(review): the recorded output (11924) is identical to the training split's —
# confirm the test pickles were the ones in memory when this was executed.
len(users)

11924

In [11]:
# Sanity check: should equal len(users).
len(items_preferred)

11924

In [12]:
# Sanity check: should equal len(users).
len(items_not_preferred)

11924

In [13]:
# One row per (user, positive item, negative item) test triple. This rebinds `df`,
# replacing the training frame.
df = pd.DataFrame(
    list(zip(users, items_preferred, items_not_preferred)),
    columns=['user_id', 'pos_item_id', 'neg_item_id'],
)

In [16]:
# Per row: offset ids of the user's positive test items up to and including this row.
df['item_seq'] = df.groupby('user_id')['pos_item_id'].transform(cumulative_list)

In [18]:
# Text embedding of each row's positive item (raw ids; offset applied in the helper).
df['pos_item_text_embedding'] = df['pos_item_id'].apply(get_text_emb_from_index)

In [19]:
# Text embedding of each row's negative item.
df['neg_item_text_embedding'] = df['neg_item_id'].apply(get_text_emb_from_index)

In [20]:
# Per-user mean of the *test* positive-item text embeddings.
# NOTE: the resulting `user_text_embedding` column is overwritten further down with
# vectors mapped from the training frame (`df['user_id'].map(user_embs)`).
df_user_vectors = df[['user_id', 'pos_item_text_embedding']].groupby('user_id', as_index=False).mean()

In [21]:
# Rename the averaged column and attach it to every row of the same user.
# NOTE: re-running this cell merges a second time (suffixed duplicate columns).
df_user_vectors = df_user_vectors.rename(columns={'pos_item_text_embedding': 'user_text_embedding'})
df = df.merge(df_user_vectors, on='user_id')

In [24]:
# Running sum of the user's positive-item text embeddings within the test frame,
# plus the 0-based row position inside each user group.
# NOTE(review): the running sum includes the current row's own positive item and is
# built from test rows only — confirm this is the intended test-time feature.
df['user_cum_text_embedding'] = df.groupby('user_id')['pos_item_text_embedding'].transform(cumulative_user_embedding)
df['user_cum_count'] = df.groupby('user_id')['pos_item_text_embedding'].cumcount()

In [27]:
# Running sum -> running mean (cumcount is 0-based, hence `+ 1`); then drop the helper.
df['user_cum_text_embedding'] = df['user_cum_text_embedding'] / (df['user_cum_count'] + 1)
df = df.drop(columns='user_cum_count')

In [28]:
# Attach BPR vectors by row lookup on the raw (not yet offset) ids.
df['pos_item_bpr_embedding'] = df['pos_item_id'].apply(lambda item: item_bpr_embedding[item])
df['neg_item_bpr_embedding'] = df['neg_item_id'].apply(lambda item: item_bpr_embedding[item])
df['user_bpr_embedding'] = df['user_id'].apply(lambda user: user_bpr_embedding[user])

In [29]:
# Attach graph node embeddings: items at row `item_id + n_users`, users at row
# `user_id`. Must run before the ids are shifted in the next cell.
df['pos_item_graph_embedding'] = df['pos_item_id'].apply(lambda x: node_embeddings[x + n_users, :])
df['neg_item_graph_embedding'] = df['neg_item_id'].apply(lambda x: node_embeddings[x + n_users, :])
df['user_graph_embedding'] = df['user_id'].apply(lambda x: node_embeddings[x, :])

In [30]:
# Convert raw item indices to offset ids.
# NOTE: not idempotent — run exactly once, after all raw-index lookups above.
df['pos_item_id'] = df['pos_item_id'] + n_users
df['neg_item_id'] = df['neg_item_id'] + n_users

In [33]:
# Positive examples for the binary task: rename `pos_item_*` to `item_*`, label 1.
df_binary_pos = (
    df[['user_id', 'pos_item_id', 'item_seq', 'pos_item_text_embedding', 'user_text_embedding', 'user_cum_text_embedding', 'pos_item_bpr_embedding', 'user_bpr_embedding', 'pos_item_graph_embedding', 'user_graph_embedding']]
    .rename(columns={
        'pos_item_id': 'item_id',
        'pos_item_text_embedding': 'item_text_embedding',
        'pos_item_bpr_embedding': 'item_bpr_embedding',
        'pos_item_graph_embedding': 'item_graph_embedding',
    })
    .assign(label=1)
)

In [34]:
# Negative examples for the binary task: rename `neg_item_*` to `item_*`, label 0.
df_binary_neg = (
    df[['user_id', 'neg_item_id', 'item_seq', 'neg_item_text_embedding', 'user_text_embedding', 'user_cum_text_embedding', 'neg_item_bpr_embedding', 'user_bpr_embedding', 'neg_item_graph_embedding', 'user_graph_embedding']]
    .rename(columns={
        'neg_item_id': 'item_id',
        'neg_item_text_embedding': 'item_text_embedding',
        'neg_item_bpr_embedding': 'item_bpr_embedding',
        'neg_item_graph_embedding': 'item_graph_embedding',
    })
    .assign(label=0)
)

In [35]:
# Stack positives on top of negatives.
# NOTE(review): both halves keep `df`'s index, so every index label appears twice
# in the result; consider `ignore_index=True` if no consumer relies on the labels.
df_binary = pd.concat([df_binary_pos, df_binary_neg])

In [7]:
# Reload the pairwise training frame (same filename the train section writes) so
# that train-derived user features can be attached to the test rows.
with open('train_bpr_all_vectors_128_01_transe.pkl', 'rb') as f:
    df_train = pkl.load(f)

In [11]:
# user_id -> mean of that user's `user_text_embedding` rows in the training frame.
user_embs = (
    df_train
    .groupby('user_id')['user_text_embedding']
    .mean()
    .to_dict()
)

In [14]:
# Overwrite the test-derived user vectors with the ones taken from the training
# frame. Users absent from `user_embs` become NaN (Series.map with a dict).
df['user_text_embedding'] = df['user_id'].map(user_embs)

In [15]:
# Same replacement for the binary frame, which was built before `df` was updated.
df_binary['user_text_embedding'] = df_binary['user_id'].map(user_embs)

In [24]:
# user_id -> the `item_seq` of that user's last row in the training frame.
user_seqs = (
    df_train
    .groupby('user_id')['item_seq']
    .last()
    .to_dict()
)

In [28]:
# Attach each user's training-time item sequence to the test rows; users absent
# from `user_seqs` get NaN.
df['item_seq_train'] = df['user_id'].map(user_seqs)
df_binary['item_seq_train'] = df_binary['user_id'].map(user_seqs)

In [29]:
# Persist the pairwise test frame.
with open('test_bpr_all_vectors_128_01_transe.pkl', 'wb') as f:
    pkl.dump(df, f)

In [30]:
# Persist the binary-label test frame.
with open('test_binary_all_vectors_128_01_transe.pkl', 'wb') as f:
    pkl.dump(df_binary, f)

In [31]:
# Preview the assembled test frame (now including `item_seq_train`).
df.head()

Unnamed: 0,user_id,pos_item_id,neg_item_id,item_seq,pos_item_text_embedding,neg_item_text_embedding,user_text_embedding,user_cum_text_embedding,pos_item_bpr_embedding,neg_item_bpr_embedding,user_bpr_embedding,pos_item_graph_embedding,neg_item_graph_embedding,user_graph_embedding,item_seq_train
0,0,6897,7001,[6897],"[0.00029639987, 0.014771659, 0.014058901, -0.0...","[-0.021708943, -0.07313048, 0.0040764194, 0.01...","[-0.022005824, -0.033391304, -0.013403211, -0....","[0.00029639987, 0.014771659, 0.014058901, -0.0...","[-0.004581114, 0.0626458, -0.02941907, -0.0164...","[0.02627153, -0.042739343, 0.0686576, 0.071189...","[-0.16485311, 0.13305487, -0.109800704, -0.159...","[-0.019909197, 0.015652379, -0.18591776, -0.19...","[-0.09981072, -0.17273627, -0.06508206, -0.185...","[-0.21868221, 0.107340455, 0.20054282, 0.15184...","[6863, 6864]"
1,1,6920,6998,[6920],"[-0.001356997, -0.027733184, 0.010143433, -0.0...","[0.06441419, 0.036574744, 0.0004071652, -0.046...","[0.034128785, -0.054559205, 0.0015806267, -0.0...","[-0.001356997, -0.027733184, 0.010143433, -0.0...","[-0.031264078, 0.065558866, 0.002417838, -0.05...","[0.11860577, -0.10629058, 0.09773831, 0.059424...","[-0.15681404, 0.14787957, -0.14133789, -0.1282...","[0.21109915, 0.16339235, 0.031736396, -0.01513...","[-0.033010166, -0.1609984, -0.0942061, -0.1157...","[0.16515407, 0.18278724, -0.07278056, -0.06131...","[6865, 6866]"
2,2,6937,6992,[6937],"[0.009646171, 0.015768941, 0.0102583, 0.011016...","[0.030553505, -0.08276028, 0.021069285, -0.022...","[0.017988527, -0.038172964, -0.008762296, -0.0...","[0.009646171, 0.015768941, 0.0102583, 0.011016...","[-0.048642207, 0.019007254, -0.033599593, -0.0...","[0.10899715, -0.09564237, 0.10501876, 0.085825...","[-0.12063337, 0.13376932, -0.13144864, -0.0975...","[-0.07782954, 0.19538878, 0.08653913, -0.04023...","[0.041632142, 0.070655555, -0.13588788, -0.160...","[-0.21761492, 0.1274874, 0.19495715, -0.179376...","[6867, 6868, 6869, 6870, 6871, 6872, 6873, 687..."
3,2,6961,7043,"[6937, 6961]","[0.03704412, -0.011753682, 0.0050762245, 0.021...","[-0.007348519, -0.056608632, -0.011322977, 0.0...","[0.017988527, -0.038172964, -0.008762296, -0.0...","[0.023345144, 0.0020076297, 0.007667262, 0.016...","[-0.014944556, -0.028715644, 0.0095228795, 0.0...","[0.120994814, -0.089827366, 0.10192869, 0.0929...","[-0.12063337, 0.13376932, -0.13144864, -0.0975...","[0.16374268, -0.07315293, 0.10547099, 0.158405...","[0.03657927, -0.060817167, -0.195846, -0.21245...","[-0.21761492, 0.1274874, 0.19495715, -0.179376...","[6867, 6868, 6869, 6870, 6871, 6872, 6873, 687..."
4,2,6895,7040,"[6937, 6961, 6895]","[-0.031121682, -0.08368267, 8.339915e-05, 0.06...","[-0.004670808, -0.05079265, 0.0073155914, -0.0...","[0.017988527, -0.038172964, -0.008762296, -0.0...","[0.0051895357, -0.026555805, 0.0051393076, 0.0...","[-0.073091574, 0.10878877, -0.07272676, -0.079...","[0.09851998, -0.10024271, 0.09955378, 0.079989...","[-0.12063337, 0.13376932, -0.13144864, -0.0975...","[0.18731615, 0.13930675, 0.13596186, 0.1819212...","[-0.022520965, 0.16019231, -0.16499598, -0.088...","[-0.21761492, 0.1274874, 0.19495715, -0.179376...","[6867, 6868, 6869, 6870, 6871, 6872, 6873, 687..."
