# Init

In [2]:
import itertools
import numpy as np
import os
import pandas as pd
import pathlib
import pickle
import random
import re
import scipy.sparse
import string

from ast import literal_eval
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

tqdm.pandas()

In [3]:
# CATEGORY = 'Beauty'
# CATEGORY = 'DVDs'
# CATEGORY = 'Food & Drink'
CATEGORY = 'Games'
# CATEGORY = 'Internet'

signal = 'like'
# signal = 'write'
# signal = 'both'

In [4]:
DATA_PATH = '/sise/bshapira-group/lilachzi/models/magnn/data'
save_prefix = os.path.join(DATA_PATH, f'preprocessed/Ciao{CATEGORY.replace(" & ", "_")}_{signal}_processed/')
raw_prefix = os.path.join(DATA_PATH, f'raw/Ciao{CATEGORY.replace(" & ", "_")}_{signal}/')

if not os.path.exists(save_prefix):
    os.mkdir(save_prefix)
if not os.path.exists(raw_prefix):
    os.mkdir(raw_prefix)

save_prefix, raw_prefix

('/sise/bshapira-group/lilachzi/models/magnn/data/preprocessed/CiaoGames_like_processed/',
 '/sise/bshapira-group/lilachzi/models/magnn/data/raw/CiaoGames_like/')

# Data Loading

In [5]:
CSVS_PATH = '/sise/bshapira-group/lilachzi/csvs/'
train_set = pd.read_csv(os.path.join(CSVS_PATH, 'train_set.csv'), converters={'y_true': literal_eval})
train_set = train_set[train_set['category'] == CATEGORY]
train_set.reset_index(inplace=True)

val_set = pd.read_csv(os.path.join(CSVS_PATH, 'valid_set.csv'), converters={'y_true': literal_eval})
val_set = val_set[val_set['category'] == CATEGORY]
val_set.reset_index(inplace=True)

test_set = pd.read_csv(os.path.join(CSVS_PATH, 'test_set.csv'), converters={'y_true': literal_eval})
test_set = test_set[test_set['category'] == CATEGORY]
test_set.reset_index(inplace=True)

In [6]:
ciao = pd.read_csv(os.path.join(CSVS_PATH, 'ciao_4core.tsv'), sep='\t')
ciao = ciao[ciao['category'] == CATEGORY]

# Preprocess data

In [7]:
# get unique user_ids and product_ids from *all* sets, because some ids exist only in some of the sets
users = set(set(train_set['voter_id'].unique()) | set(val_set['voter_id'].unique()) | set(test_set['voter_id'].unique()))
products = set(set(train_set['product_id'].unique()) | set(val_set['product_id'].unique()) | set(test_set['product_id'].unique()))

reviews = []
for row in train_set['y_true']:
    reviews.extend(list(row.keys()))
for row in val_set['y_true']:
    reviews.extend(list(row.keys()))
for row in test_set['y_true']:
    reviews.extend(list(row.keys()))
reviews = set(reviews)

In [8]:
num_users = len(users)
num_reviews = len(reviews)
num_products = len(products)

print(f'Users: {num_users}')
print(f'Reviews: {num_reviews}')
print(f'Products: {num_products}')

Users: 4269
Reviews: 8919
Products: 909


In [9]:
def node_mapping(nodes):
    # id2idx = mapping betweem the node original id and its sorted index
    # idx2id = mapping between the sorted index and the corresponsing original id
    id2idx, idx2id = {}, {}

    for idx, node in enumerate(nodes):
        id2idx[node] = idx
        idx2id[idx] = node

    return id2idx, idx2id

users_id2idx, users_idx2id = node_mapping(users)
reviews_id2idx, reviews_idx2id = node_mapping(reviews)
products_id2idx, products_idx2id = node_mapping(products)

In [10]:
mappings = {
    'umap': users_idx2id,
    'rmap': reviews_idx2id,
    'pmap': products_idx2id
}

with open(os.path.join(save_prefix, 'mappings.pickle'), 'wb') as f:
    pickle.dump(mappings, f)

## Build user-review interactions and negative samples

### User-review interaction 

#### Liked interactions

In [46]:
user_review, review_product, user_product = [], set(), set()
idx_dict = {'train': [], 'valid': [], 'test': []}
positive_vote = [3, 4, 5]

train_set['voter_idx'] = train_set['voter_id'].map(users_id2idx)
train_set['product_idx'] = train_set['product_id'].map(products_id2idx)

count = 0
for _, row in tqdm(train_set.iterrows(), total=len(train_set), desc='like train interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
    
    # if the user liked one of the reviews in the product, add user-product interaction
    if any([r in row['y_true'].values() for r in positive_vote]):
        user_product.add((user_idx, product_idx))
        
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        review_product.add((review_idx, product_idx))
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['train'].append(count)
            count += 1
            
val_set['voter_idx'] = val_set['voter_id'].map(users_id2idx)
val_set['product_idx'] = val_set['product_id'].map(products_id2idx)

for _, row in tqdm(val_set.iterrows(), total=len(val_set), desc='valid interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
        
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['valid'].append(count)
            count += 1
            
test_set['voter_idx'] = test_set['voter_id'].map(users_id2idx)
test_set['product_idx'] = test_set['product_id'].map(products_id2idx)

for _, row in tqdm(test_set.iterrows(), total=len(test_set), desc='test interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
    
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['test'].append(count)
            count += 1

like train interactions: 100%|██████████| 242500/242500 [00:12<00:00, 19694.98it/s]
valid interactions: 100%|██████████| 81173/81173 [00:03<00:00, 23758.28it/s]
test interactions: 100%|██████████| 81402/81402 [00:03<00:00, 23839.27it/s]


#### Write interactions

In [54]:
user_review, review_product, user_product = [], set(), set()
idx_dict = {'train': [], 'valid': [], 'test': []}
positive_vote = [3, 4, 5]

ciao['user_idx'] = ciao['user_id'].map(users_id2idx)
ciao['review_idx'] = ciao['review_id'].map(reviews_id2idx)
ciao['product_idx'] = ciao['product_id'].map(products_id2idx)

count = 0
for _, row in tqdm(ciao.iterrows(), total=len(ciao), desc='write train interactions'):
    if row['user_id'] not in list(users):
        continue
        
    user_idx = row['user_idx']
    review_idx = row['review_idx']
    product_idx = row['product_idx']
    
    user_review.append((user_idx, review_idx))
    review_product.add((review_idx, product_idx))
    user_product.add((user_idx, product_idx))
    idx_dict['train'].append(count)
    count += 1
    
val_set['voter_idx'] = val_set['voter_id'].map(users_id2idx)
val_set['product_idx'] = val_set['product_id'].map(products_id2idx)

for _, row in tqdm(val_set.iterrows(), total=len(val_set), desc='valid interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
    
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['valid'].append(count)
            count += 1
            
test_set['voter_idx'] = test_set['voter_id'].map(users_id2idx)
test_set['product_idx'] = test_set['product_id'].map(products_id2idx)

for _, row in tqdm(test_set.iterrows(), total=len(test_set), desc='test interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
        
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['test'].append(count)
            count += 1

write train interactions: 100%|██████████| 27964/27964 [00:06<00:00, 4404.06it/s]
valid interactions: 100%|██████████| 81173/81173 [00:03<00:00, 23384.65it/s]
test interactions: 100%|██████████| 81402/81402 [00:03<00:00, 23296.65it/s]


#### Both interactions

In [54]:
user_review, author_review, review_product, user_product = [], [], set(), set()
idx_dict = {'train': [], 'valid': [], 'test': []}
positive_vote = [3, 4, 5]

count = 0
# Like interactions
train_set['voter_idx'] = train_set['voter_id'].map(users_id2idx)
train_set['product_idx'] = train_set['product_id'].map(products_id2idx)
for _, row in tqdm(train_set.iterrows(), total=len(train_set), desc='like train interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
    
    # if the user liked one of the reviews in the product, add user-product interaction
    if any([r in row['y_true'].values() for r in positive_vote]):
        user_product.add((user_idx, product_idx))
        
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        review_product.add((review_idx, product_idx))
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['train'].append(count)
            count += 1
# Write interactions
ciao['user_idx'] = ciao['user_id'].map(users_id2idx)
ciao['review_idx'] = ciao['review_id'].map(reviews_id2idx)
ciao['product_idx'] = ciao['product_id'].map(products_id2idx)
for _, row in tqdm(ciao.iterrows(), total=len(ciao), desc='write train interactions'):
    if row['user_id'] not in list(users):
        continue
        
    user_idx = row['user_idx']
    review_idx = row['review_idx']
    product_idx = row['product_idx']
    
    author_review.append((user_idx, review_idx))
    review_product.add((review_idx, product_idx))
    user_product.add((user_idx, product_idx))
    
val_set['voter_idx'] = val_set['voter_id'].map(users_id2idx)
val_set['product_idx'] = val_set['product_id'].map(products_id2idx)
    
for _, row in tqdm(val_set.iterrows(), total=len(val_set), desc='valid interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
        
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['valid'].append(count)
            count += 1
            
test_set['voter_idx'] = test_set['voter_id'].map(users_id2idx)
test_set['product_idx'] = test_set['product_id'].map(products_id2idx)

for _, row in tqdm(test_set.iterrows(), total=len(test_set), desc='test interactions'):
    user_idx = row['voter_idx']
    product_idx = row['product_idx']
        
    for review_id, vote in row['y_true'].items():
        review_idx = reviews_id2idx[review_id]
        
        if vote in positive_vote:    
            user_review.append((user_idx, review_idx))
            idx_dict['test'].append(count)
            count += 1

like train interactions: 100%|██████████| 145090/145090 [00:07<00:00, 19170.95it/s]
write train interactions: 100%|██████████| 12084/12084 [00:02<00:00, 5960.60it/s]
valid interactions: 100%|██████████| 48240/48240 [00:02<00:00, 23348.60it/s]
test interactions: 100%|██████████| 48588/48588 [00:02<00:00, 23269.41it/s]


### Negative samples

In [55]:
user_review = np.array(user_review).astype(int)
review_product = list(review_product)
user_product = list(user_product)

train_idx = idx_dict['train']
val_idx = idx_dict['valid']
test_idx = idx_dict['test']

In [56]:
review_product_map = ciao[['review_id', 'product_id']].set_index('review_id')

In [57]:
def get_negative_reviews(user_idx, y_true, pos_candidates):
    y_true = {reviews_id2idx[r_id]: rate for r_id, rate in y_true.items()}
    neg_y_true = {r_idx: rate for r_idx, rate in y_true.items() if (user_idx, r_idx) not in pos_candidates}
    
    return neg_y_true

if signal == 'like':
    train_pos_candidates = set(map(tuple, user_review[train_idx]))
    val_pos_candidates = set(map(tuple, user_review[val_idx]))
    test_pos_candidates = set(map(tuple, user_review[test_idx]))
elif signal == 'write':
    train_set['voter_idx'] = train_set['voter_id'].map(users_id2idx)
    
    train_pos_candidates = set(map(tuple, user_review[train_idx]))
    val_pos_candidates = set(map(tuple, user_review[val_idx]))
    test_pos_candidates = set(map(tuple, user_review[test_idx]))
elif signal == 'both':
    train_pos_candidates = np.concatenate([user_review[train_idx], np.array(author_review).astype(int)])
    train_pos_candidates = set(map(tuple, train_pos_candidates))
    val_pos_candidates = set(map(tuple, user_review[val_idx]))
    test_pos_candidates = set(map(tuple, user_review[test_idx]))
    
neg_train_set = train_set.copy()
neg_train_set['y_true'] = neg_train_set.progress_apply(
    lambda r: get_negative_reviews(r['voter_idx'], r['y_true'], train_pos_candidates), axis=1
)

neg_val_set = val_set.copy()
neg_val_set['y_true'] = neg_val_set.progress_apply(
    lambda r: get_negative_reviews(r['voter_idx'], r['y_true'], val_pos_candidates), axis=1
)

neg_test_set = test_set.copy()
neg_test_set['y_true'] = neg_test_set.progress_apply(
    lambda r: get_negative_reviews(r['voter_idx'], r['y_true'], test_pos_candidates), axis=1
)

100%|██████████| 145090/145090 [00:01<00:00, 88613.39it/s] 
100%|██████████| 48240/48240 [00:00<00:00, 86159.01it/s]
100%|██████████| 48588/48588 [00:00<00:00, 85739.53it/s]


#### Product review set negative sampling

For each user-review positive interaction, we will take random negative interaction from the same product set review, related to relevant review_id. 

In [58]:
train_neg_candidates = []
new_train_idx = []
for idx, (user_idx, review_idx) in tqdm(enumerate(train_pos_candidates), total=len(train_pos_candidates)):
    review_id = reviews_idx2id[review_idx]
    product_id = review_product_map.loc[review_id]['product_id']
    
    try:
        neg_reviews = neg_train_set[
            (neg_train_set['voter_idx'] == user_idx) & 
            (neg_train_set['product_id'] == product_id)
        ]['y_true'].values[0]
        
        r_idx = random.choice(list(neg_reviews.keys()))
        train_neg_candidates.append((user_idx, r_idx))
        new_train_idx.append(idx)
    except:
        if signal == 'write':
            neg_reviews = neg_train_set[
                (neg_train_set['voter_idx'] == user_idx) & 
                (neg_train_set['product_id'] == product_id)
            ]
            if neg_reviews.empty:
                neg_reviews = neg_train_set[(neg_train_set['product_id'] == product_id)]['y_true'].values[0]
                r_idx = random.choice(list(neg_reviews.keys()))
                train_neg_candidates.append((user_idx, r_idx))
                new_train_idx.append(idx)
        
        continue
        
val_neg_candidates = []
for idx, row in tqdm(neg_val_set.iterrows(), total=len(neg_val_set)):
    user_idx = row['voter_idx']
    neg_reviews = row['y_true']
    
    for r_idx in neg_reviews.keys():
        val_neg_candidates.append((user_idx, r_idx))
        
test_neg_candidates = []
for idx, row in tqdm(neg_test_set.iterrows(), total=len(neg_test_set)):
    user_idx = row['voter_idx']
    neg_reviews = row['y_true']
    
    for r_idx in neg_reviews.keys():
        test_neg_candidates.append((user_idx, r_idx))
        
np.savez(save_prefix + 'train_val_test_neg_user_review.npz',
         train_neg_user_review=train_neg_candidates,
         val_neg_user_review=val_neg_candidates,
         test_neg_user_review=test_neg_candidates)
np.savez(save_prefix + 'train_val_test_pos_user_review.npz',
         train_pos_user_review=user_review[new_train_idx],
         val_pos_user_review=user_review[val_idx],
         test_pos_user_review=user_review[test_idx])

100%|██████████| 219026/219026 [01:49<00:00, 2008.10it/s]
100%|██████████| 48240/48240 [00:01<00:00, 26147.65it/s]
100%|██████████| 48588/48588 [00:01<00:00, 26187.39it/s]


## Build adjacency matrix and metapaths

In [59]:
user_review = pd.DataFrame(user_review, columns=['user_idx', 'review_idx'])
user_review = user_review.loc[new_train_idx].drop_duplicates().reset_index(drop=True)

if signal == 'both':
    author_review = pd.DataFrame(np.array(list(author_review)).astype(int), columns=['author_idx', 'review_idx'])
review_product = pd.DataFrame(np.array(review_product).astype(int), columns=['review_idx', 'product_idx'])
user_product = pd.DataFrame(np.array(user_product).astype(int), columns=['user_idx', 'product_idx'])

In [60]:
# build the adjacency matrix
# 0 for user, 1 for review, 2 for product
dim = num_users + num_reviews + num_products

type_mask = np.zeros((dim), dtype=int)
type_mask[num_users:num_users+num_reviews] = 1
type_mask[num_users+num_reviews:] = 2

adj_mat = np.zeros((dim, dim), dtype=int)
for _, row in tqdm(user_review.iterrows(), total=len(user_review)):
    uid = row['user_idx']
    rid = num_users + row['review_idx']
    adj_mat[uid, rid] = 1
    adj_mat[rid, uid] = 1
if signal == 'both':
    for _, row in tqdm(author_review.iterrows(), total=len(author_review)):
        aid = row['author_idx']
        rid = num_users + row['review_idx']
        adj_mat[aid, rid] = 2
        adj_mat[rid, aid] = 2
for _, row in tqdm(review_product.iterrows(), total=len(review_product)):
    rid = num_users + row['review_idx']
    pid = num_users + num_reviews + row['product_idx']
    adj_mat[rid, pid] = 1
    adj_mat[pid, rid] = 1
for _, row in tqdm(user_product.iterrows(), total=len(user_product)):
    uid = row['user_idx']
    pid = num_users + num_reviews + row['product_idx']
    adj_mat[uid, pid] = 1
    adj_mat[pid, uid] = 1

100%|██████████| 208850/208850 [00:06<00:00, 33648.18it/s]
100%|██████████| 11620/11620 [00:00<00:00, 35405.72it/s]
100%|██████████| 12084/12084 [00:00<00:00, 32901.35it/s]
100%|██████████| 152874/152874 [00:04<00:00, 35329.48it/s]


In [61]:
with open(os.path.join(raw_prefix, 'user_review.dat'), 'w') as f:
    for line in tqdm(user_review.to_numpy().astype(str)):
        line = '\t'.join(line)
        f.write(f"{line}\n")
if signal == 'both':
    with open(os.path.join(raw_prefix, 'author_review.dat'), 'w+') as f:
        for line in tqdm(author_review.to_numpy().astype(str)):
            line = '\t'.join(line)
            f.write(f"{line}\n")
with open(os.path.join(raw_prefix, 'review_product.dat'), 'w') as f:
    for line in tqdm(review_product.to_numpy().astype(str)):
        line = '\t'.join(line)
        f.write(f"{line}\n")
with open(os.path.join(raw_prefix, 'user_product.dat'), 'w') as f:
    for line in tqdm(user_product.to_numpy().astype(str)):
        line = '\t'.join(line)
        f.write(f"{line}\n")

100%|██████████| 208850/208850 [00:00<00:00, 424315.57it/s]
100%|██████████| 11620/11620 [00:00<00:00, 168474.48it/s]
100%|██████████| 12084/12084 [00:00<00:00, 177994.00it/s]
100%|██████████| 152874/152874 [00:00<00:00, 249152.34it/s]


In [62]:
user_review_list = {i: adj_mat[i, num_users:num_users+num_reviews].nonzero()[0] for i in range(num_users)}
review_user_list = {i: adj_mat[num_users + i, :num_users].nonzero()[0] for i in range(num_reviews)}
review_product_list = {i: adj_mat[num_users + i, num_users+num_reviews:].nonzero()[0] for i in range(num_reviews)}
product_review_list = {i: adj_mat[num_users + num_reviews + i, num_users:num_users+num_reviews].nonzero()[0] for i in range(num_products)}
user_product_list = {i: adj_mat[i, num_users+num_reviews:].nonzero()[0] for i in range(num_users)}
product_user_list = {i: adj_mat[num_users+num_reviews+i, :num_users].nonzero()[0] for i in range(num_products)}

if signal == 'both':
    author_review_list = {i: np.where(adj_mat[i, num_users:num_users+num_reviews] == 2)[0] for i in range(num_users)}
    review_author_list = {i: np.where(adj_mat[num_users + i, :num_users] == 2)[0] for i in range(num_reviews)}

In [63]:
# 0-1-0
def create_u_r_u():
    print('Create u-r-u list')
    u_r_u = []
    for r, u_list in tqdm(review_user_list.items()):
        u_r_u.extend([(u1, r, u2) for u1 in u_list for u2 in u_list])
    u_r_u = np.array(u_r_u)
    u_r_u[:, 1] += num_users
    sorted_index = sorted(list(range(len(u_r_u))), key=lambda i : u_r_u[i, [0, 2, 1]].tolist())
    u_r_u = u_r_u[sorted_index]
    return u_r_u

# 1-2-1
def create_r_p_r():
    print('Create r-p-r list')
    r_p_r = []
    for p, r_list in tqdm(product_review_list.items()):
        r_p_r.extend([(r1, p, r2) for r1 in r_list for r2 in r_list])
    r_p_r = np.array(r_p_r)
    r_p_r += num_users
    r_p_r[:, 1] += num_reviews
    sorted_index = sorted(list(range(len(r_p_r))), key=lambda i : r_p_r[i, [0, 2, 1]].tolist())
    r_p_r = r_p_r[sorted_index]
    return r_p_r

#0-1-2-1-0
def create_u_r_p_r_u():
    print('Create u-r-p-r-u list')
    u_r_p_r_u = []
    r_p_r = create_r_p_r()
    for r1, p, r2 in tqdm(r_p_r):
        if len(review_user_list[r1 - num_users]) == 0 or len(review_user_list[r2 - num_users]) == 0:
            continue
        candidate_u1_list = np.random.choice(len(review_user_list[r1 - num_users]), int(0.2 * len(review_user_list[r1 - num_users])), replace=False)
        candidate_u1_list = review_user_list[r1 - num_users][candidate_u1_list]
        candidate_u2_list = np.random.choice(len(review_user_list[r2 - num_users]), int(0.2 * len(review_user_list[r2 - num_users])), replace=False)
        candidate_u2_list = review_user_list[r2 - num_users][candidate_u2_list]
        u_r_p_r_u.extend([(u1, r1, p, r2, u2) for u1 in candidate_u1_list for u2 in candidate_u2_list])
    u_r_p_r_u = np.array(u_r_p_r_u)
    sorted_index = sorted(list(range(len(u_r_p_r_u))), key=lambda i : u_r_p_r_u[i, [0, 4, 1, 2, 3]].tolist())
    u_r_p_r_u = u_r_p_r_u[sorted_index]
    return u_r_p_r_u

# For writers version, where for each user/review there is only one matching review/user
def create_a_r_p_r_a():
    print('Create u-r-p-r-u list')
    u_r_p_r_u = []
    r_p_r = create_r_p_r()
    for r1, p, r2 in r_p_r:
        if len(review_user_list[r1 - num_users]) == 0 or len(review_user_list[r2 - num_users]) == 0:
            continue
        candidate_u1_list = review_user_list[r1 - num_users]
        candidate_u2_list = review_user_list[r2 - num_users]
        u_r_p_r_u.extend([(u1, r1, p, r2, u2) for u1 in candidate_u1_list for u2 in candidate_u2_list])
    u_r_p_r_u = np.array(u_r_p_r_u)
    sorted_index = sorted(list(range(len(u_r_p_r_u))), key=lambda i : u_r_p_r_u[i, [0, 4, 1, 2, 3]].tolist())
    u_r_p_r_u = u_r_p_r_u[sorted_index]
    return u_r_p_r_u

# 1-0-1
def create_r_u_r():
    print('Create r-u-r list')
    r_u_r = []
    for u, r_list in tqdm(user_review_list.items()):
        r_u_r.extend([(r1, u, r2) for r1 in r_list for r2 in r_list])
    r_u_r = np.array(r_u_r)
    r_u_r[:, [0, 2]] += num_users
    sorted_index = sorted(list(range(len(r_u_r))), key=lambda i : r_u_r[i, [0, 2, 1]].tolist())
    r_u_r = r_u_r[sorted_index]
    return r_u_r

def create_r_a_r():
    print('Create r-u-r list')
    r_u_r = []
    for u, r_list in user_review_list.items():
        r_u_r.extend([(r1, u, r2) for r1 in r_list for r2 in r_list])
    r_u_r = np.array(r_u_r)
    r_u_r[:, [0, 2]] += num_users
    # sorted_index = sorted(list(range(len(r_u_r))), key=lambda i : r_u_r[i, [0, 2, 1]].tolist())
    # r_u_r = r_u_r[sorted_index]
    return r_u_r

# 0-2-0
def create_u_p_u():
    print('Create u-p-u list')
    u_p_u = []
    for p, u_list in tqdm(product_user_list.items()):
        u_p_u.extend([(u1, p, u2) for u1 in u_list for u2 in u_list])
    u_p_u = np.array(u_p_u)
    u_p_u[:, [1]] += num_users + num_reviews
    sorted_index = sorted(list(range(len(u_p_u))), key=lambda i : u_p_u[i, [0, 2, 1]].tolist())
    u_p_u = u_p_u[sorted_index]
    return u_p_u

# 2-0-2
def create_p_u_p():
    print('Create p-u-p list')
    p_u_p = []
    for u, p_list in tqdm(user_product_list.items()):
        p_u_p.extend([(p1, u, p2) for p1 in p_list for p2 in p_list])
    p_u_p = np.array(p_u_p)
    p_u_p[:, [0, 2]] += num_users + num_reviews
    sorted_index = sorted(list(range(len(p_u_p))), key=lambda i : p_u_p[i, [0, 2, 1]].tolist())
    p_u_p = p_u_p[sorted_index]
    return p_u_p

# # 1-2-0-2-1
def create_r_p_u_p_r():
    print('Create r-p-u-p-r list')
    offset = num_users+num_reviews
    r_p_u_p_r = []
    p_u_p = create_p_u_p()
    for p1, u, p2 in tqdm(p_u_p):
        if len(product_user_list[p1 - offset]) == 0 or len(product_user_list[p2 - offset]) == 0:
            continue
        candidate_r1_list = np.random.choice(len(product_review_list[p1 - offset]), int(0.2 * len(product_review_list[p1 - offset])), replace=False)
        candidate_r1_list = product_review_list[p1 - offset][candidate_r1_list]
        candidate_r2_list = np.random.choice(len(product_review_list[p2 - offset]), int(0.2 * len(product_review_list[p2 - offset])), replace=False)
        candidate_r2_list = product_review_list[p2 - offset][candidate_r2_list]
        r_p_u_p_r.extend([(r1, p1, u, p2, r2) for r1 in candidate_r1_list for r2 in candidate_r2_list])
    r_p_u_p_r = np.array(r_p_u_p_r)
    sorted_index = sorted(list(range(len(r_p_u_p_r))), key=lambda i : r_p_u_p_r[i, [0, 4, 1, 2, 3]].tolist())
    r_p_u_p_r = r_p_u_p_r[sorted_index]
    return r_p_u_p_r

Metapaths:
* (0, 1, 0) = u_r_u: two users who interacted (liked) same review
* (0, 1, 2, 1, 0) = u_r_p_r_u: two users who interacted with two reviews under same product
* (0, 2, 0) = u_p_u: two users who interacted with reviews under same product
* (1, 0, 1) = r_u_r: two reviews who got interactions from same user 
* (1, 2, 1) = r_p_r: two reviews under same product
* (1, 2, 0, 2, 1) = r_p_u_p_r: two reviews under 2 products who got interactions from same user

In [111]:
if signal == 'like':
    expected_metapaths = [
        [(0, 1, 0), (0, 1, 2, 1, 0), (0, 2, 0)],
        [(1, 0, 1), (1, 2, 1), (1, 2, 0, 2, 1)]
    ]
    # create the directories if they do not exist
    for i in range(len(expected_metapaths)):
        pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)

    metapath_indices_mapping = {(0, 1, 0): create_u_r_u,
                                (0, 1, 2, 1, 0): create_u_r_p_r_u,
                                (0, 2, 0): create_u_p_u, 
                                (1, 0, 1): create_r_u_r,
                                (1, 2, 1): create_r_p_r,
                                (1, 2, 0, 2, 1): create_r_p_u_p_r}
elif signal == 'write':
    expected_metapaths = [
        [(0, 1, 2, 1, 0), (0, 2, 0)],
        [(1, 0, 1), (1, 2, 1), (1, 2, 0, 2, 1)]
    ]
    # create the directories if they do not exist
    for i in range(len(expected_metapaths)):
        pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)

    metapath_indices_mapping = {(0, 1, 2, 1, 0): create_a_r_p_r_a,
                                (0, 2, 0): create_u_p_u, 
                                (1, 2, 1): create_r_p_r,
                                (1, 0, 1): create_r_a_r,
                                (1, 2, 0, 2, 1): create_r_p_u_p_r}
    
elif signal == 'both':
    expected_metapaths = [
    [(0, 1, 0), (0, 1, 2, 1, 0), (3, 1, 2, 1, 3), (0, 2, 0)],
        [(1, 0, 1), (3, 0, 3), (1, 2, 1), (1, 2, 0, 2, 1)]
    ]
    # create the directories if they do not exist
    for i in range(len(expected_metapaths)):
        pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)
    
    metapath_indices_mapping = {(0, 1, 0): create_u_r_u,
                                (0, 1, 2, 1, 0): create_u_r_p_r_u,
                                (3, 1, 2, 1, 3): create_a_r_p_r_a,
                                (0, 2, 0): create_u_p_u, 
                                (1, 0, 1): create_r_u_r,
                                (3, 0, 3): create_r_a_r,
                                (1, 2, 1): create_r_p_r,
                                (1, 2, 0, 2, 1): create_r_p_u_p_r}

In [112]:
# write all things
target_idx_lists = [np.arange(num_users), np.arange(num_reviews), np.arange(num_products)]
offset_list = [0, num_users, num_products]
for i, metapaths in enumerate(expected_metapaths):
    for metapath in metapaths:
        edge_metapath_idx_array = metapath_indices_mapping[metapath]()
        
        with open(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '_idx.pickle', 'wb') as out_file:
            target_metapaths_mapping = {}
            left = 0
            right = 0
            for target_idx in target_idx_lists[i]:
                while right < len(edge_metapath_idx_array) and edge_metapath_idx_array[right, 0] == target_idx + offset_list[i]:
                    right += 1
                target_metapaths_mapping[target_idx] = edge_metapath_idx_array[left:right, ::-1]
                left = right
            pickle.dump(target_metapaths_mapping, out_file)
        
        with open(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '.adjlist', 'w') as out_file:
            left = 0
            right = 0
            for target_idx in target_idx_lists[i]:
                while right < len(edge_metapath_idx_array) and edge_metapath_idx_array[right, 0] == target_idx + offset_list[i]:
                    right += 1
                neighbors = edge_metapath_idx_array[left:right, -1] - offset_list[i]
                neighbors = list(map(str, neighbors))
                if len(neighbors) > 0:
                    out_file.write('{} '.format(target_idx) + ' '.join(neighbors) + '\n')
                else:
                    out_file.write('{}\n'.format(target_idx))
                left = right

scipy.sparse.save_npz(save_prefix + 'adj_mat.npz', scipy.sparse.csr_matrix(adj_mat))
np.save(save_prefix + 'node_types.npy', type_mask)

Create u-r-u list


100%|██████████| 14091/14091 [00:01<00:00, 13188.60it/s]


Create u-r-p-r-u list
Create r-p-r list


100%|██████████| 1813/1813 [00:00<00:00, 24286.89it/s]
100%|██████████| 175081/175081 [00:04<00:00, 35272.62it/s]


Create u-r-p-r-u list
Create r-p-r list


100%|██████████| 1813/1813 [00:00<00:00, 23953.41it/s]


Create u-p-u list


100%|██████████| 1813/1813 [00:03<00:00, 553.50it/s]


Create r-u-r list


100%|██████████| 4417/4417 [00:14<00:00, 305.42it/s]


Create r-u-r list
Create r-p-r list


100%|██████████| 1813/1813 [00:00<00:00, 29275.13it/s]


Create r-p-u-p-r list
Create p-u-p list


100%|██████████| 4417/4417 [00:04<00:00, 974.53it/s] 
100%|██████████| 31703371/31703371 [14:15<00:00, 37059.25it/s]


# Features

In [14]:
clean_ciao = pd.read_csv('/sise/bshapira-group/lilachzi/csvs/ciao_4core.tsv', sep='\t')
# clean_ciao = pd.read_csv('/sise/bshapira-group/lilachzi/csvs/ciao_4core_summary.tsv', sep='\t')
clean_ciao = clean_ciao[clean_ciao['category'] == CATEGORY]

In [16]:
def split_tokens_with_punctuation(tokens):
    punctuation_pattern = re.compile(r'[^\w\s]|\s{2,}')
    new_tokens = []
    for token in tokens:
        sub_tokens = punctuation_pattern.split(token)
        for sub_token in sub_tokens:
            new_tokens.append(sub_token)
    return new_tokens

def filter_tokens(tokens):
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation) | {'*', '&'}
    alphabetic_pattern = re.compile(r'[a-zA-Z]+')
    tokens = [token.lower() for token in tokens if token not in punctuation and alphabetic_pattern.fullmatch(token)]
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

def tokenize_text(text, tokenizer='word'):
    """
    Preprocess and tokenize the text. 

    The preprocessing includes removal of punctuaion marks, numbers and stop words. 
    The return value if list of filtered tokens or sentences. 
    """
    # Remove words with repeating characters
    repeat_pattern = re.compile(r'([^\W\d_])\1{2,}')
    text = re.sub(repeat_pattern, lambda x: x.group()[0] * 2, text)
    
    # Tokenize and split tokens with punctuation
    sentences = sent_tokenize(text)
    sentences = [word_tokenize(sentence) for sentence in sentences]
    sentences = [split_tokens_with_punctuation(tokens) for tokens in sentences]

    # Remove stop words, punctuation and non-alphabetic characters
    sentences = [filter_tokens(tokens) for tokens in sentences]
    
    # Return wanted structure based on tokenizer parameter
    if tokenizer == 'word':
        return list(itertools.chain(*sentences))
    else:
        return [' '.join(sentence) for sentence in sentences]

## Word2Vec

In [50]:
w2v_model = KeyedVectors.load('../../PersonzalizedReviews/w2v_ciao_amazon/w2v_ciao_amazon.kv')

In [51]:
clean_ciao['tokens'] = clean_ciao['clean_review'].progress_apply(tokenize_text)
# clean_ciao['tokens'] = clean_ciao['summary'].progress_apply(tokenize_text)

100%|██████████| 12084/12084 [03:51<00:00, 52.26it/s]


In [52]:
def w2v_vectorize(words):
    vectors = []
    
    for word in words:
        if len(word) == 1:
            continue
        try:    
            vectors.append(w2v_model[word])
        except:
            continue
    
    return np.average(vectors, axis=0)

clean_ciao['embedding'] = clean_ciao['tokens'].progress_apply(w2v_vectorize)

100%|██████████| 12084/12084 [00:04<00:00, 2690.53it/s]


In [53]:
user_features = []
for user_idx, user_id in tqdm(users_idx2id.items()):
    embeddings = np.array(clean_ciao[clean_ciao['user_id'] == user_id]['embedding'].tolist())
    
    if len(embeddings) == 0:
        embeddings = np.zeros((300, ))
        user_features.append(embeddings)
        continue
        
    user_features.append(np.mean(embeddings, axis=0))

100%|██████████| 4410/4410 [00:01<00:00, 2414.40it/s]


In [54]:
review_features = []
for review_idx, review_id in tqdm(reviews_idx2id.items()):
    embedding = clean_ciao[clean_ciao['review_id'] == review_id]['embedding'].values[0]
    review_features.append(embedding)

100%|██████████| 12084/12084 [00:04<00:00, 2526.45it/s]


In [55]:
product_features = []
for product_idx, product_id in tqdm(products_idx2id.items()):
    embeddings = np.array(clean_ciao[clean_ciao['product_id'] == product_id]['embedding'].tolist())
    product_features.append(np.mean(embeddings, axis=0))

100%|██████████| 1318/1318 [00:00<00:00, 2229.05it/s]


In [56]:
np.savez(save_prefix + 'w2v_implicit_features.npz',
         user_features=user_features,
         review_features=review_features,
         product_features=product_features)

In [20]:
save_prefix

'/sise/bshapira-group/lilachzi/models/magnn/data/preprocessed/CiaoBeauty_like_processed/'

## User Profile

In [11]:
w2v_model = KeyedVectors.load('../../PersonzalizedReviews/w2v_ciao_amazon/w2v_ciao_amazon.kv')

In [24]:
clean_ciao['tokens'] = clean_ciao['clean_review'].progress_apply(tokenize_text)
profiles['tokens'] = profiles['profile'].progress_apply(tokenize_text)

100%|██████████| 8919/8919 [01:43<00:00, 86.37it/s] 
100%|██████████| 3805/3805 [00:16<00:00, 226.99it/s]


In [18]:
profiles = os.path.join('/sise/bshapira-group/lilachzi/models/llm/llama_cpp_python', 
                        f'ciao_{CATEGORY}', f'{signal}_profiles.csv')
profiles = pd.read_csv(profiles)

In [25]:
def w2v_vectorize(words):
    vectors = []
    
    for word in words:
        if len(word) == 1:
            continue
        try:    
            vectors.append(w2v_model[word])
        except:
            continue
    
    return np.average(vectors, axis=0)

clean_ciao['embedding'] = clean_ciao['tokens'].progress_apply(w2v_vectorize)
profiles['embedding'] = profiles['tokens'].progress_apply(w2v_vectorize)

100%|██████████| 8919/8919 [00:05<00:00, 1758.59it/s]
100%|██████████| 3805/3805 [00:00<00:00, 4191.29it/s]


In [43]:
user_features = []
for user_idx, user_id in tqdm(users_idx2id.items()):
    try:
        embeddings = profiles[profiles['user_id'] == user_id]['embedding'].values[0]
    except:
        embeddings = np.zeros((300, ))
    
    user_features.append(embeddings)

100%|██████████| 4269/4269 [00:01<00:00, 3612.03it/s]


In [21]:
review_features = []
for review_idx, review_id in tqdm(reviews_idx2id.items()):
    embedding = clean_ciao[clean_ciao['review_id'] == review_id]['embedding'].values[0]
    review_features.append(embedding)

100%|██████████| 8919/8919 [00:02<00:00, 3742.75it/s]


In [22]:
product_features = []
for product_idx, product_id in tqdm(products_idx2id.items()):
    embeddings = np.array(clean_ciao[clean_ciao['product_id'] == product_id]['embedding'].tolist())
    product_features.append(np.mean(embeddings, axis=0))

100%|██████████| 909/909 [00:00<00:00, 2738.15it/s]


In [46]:
np.savez(save_prefix + 'w2v_profiles_implicit_features.npz',
         user_features=user_features,
         review_features=review_features,
         product_features=product_features)

In [45]:
save_prefix

'/sise/bshapira-group/lilachzi/models/magnn/data/preprocessed/CiaoGames_like_processed/'

# Add info to model results

In [15]:
import re

def split_model_name(name):
    pattern = r'(?P<category>[^_]+)_(?P<sub_feats_type>\w+_)?(?P<feats_type>\d+)_(?P<lr>[^_]+)_(?P<weight_decay>[^_]+)_(?P<dropout_rate>[^_]+)_(?P<batch_size>[^_]+)(?P<signal>.+)?'
    matches = re.match(pattern, name)

    matches = matches.groupdict()
    matches['category'] = matches['category'].lstrip('Ciao')
    # matches['sub_feats_type'] = matches['sub_feats_type'].rstrip('_') if matches['sub_feats_type'] else matches['sub_feats_type']
    matches['signal'] = matches['signal'].lstrip('_') if matches['signal'] else matches['signal']
    matches['signal'] = 'like' if not matches['signal'] else matches['signal']

    if matches['sub_feats_type'] and any([sig in matches['sub_feats_type'] for sig in ['like', 'write', 'both']]):
        signal = matches['sub_feats_type'].split('_')[0]
        matches['signal'] = signal
        matches['sub_feats_type'] = matches['sub_feats_type'].replace(f'{signal}_', '')

    return matches

mr = pd.read_csv('/sise/bshapira-group/lilachzi/models/magnn/model_results.csv')
mr_info = mr['model'].apply(lambda x: pd.Series(split_model_name(x)))
mr = pd.concat([mr_info, mr], axis=1)

In [16]:
mr.to_csv('/sise/bshapira-group/lilachzi/models/magnn/model_info_results.csv', index=False)