In [2]:
import numpy as np
import pandas as pd
import pickle
import os
import random

from pathlib import Path
from tqdm import tqdm

tqdm.pandas()

In [52]:
# category = 'Beauty'
# category = 'DVDs'
# category = 'Food & Drink'
# category = 'Internet'
category = 'Games'

# signal = 'like'
signal = 'write'
# signal = 'both'

summary = False

num_candidates = 10

In [4]:
output_path = '/sise/bshapira-group/lilachzi/models/LlamaRec/data/preprocessed'

In [5]:
data_path = '/sise/bshapira-group/lilachzi/csvs/'
votes_file = os.path.join(data_path, 'votes_4core_split_60_20_20.tsv')
aspects_reviews_file = os.path.join(data_path, 'ciao_4core_gpt_aspects.csv')
summary_reviews_file = os.path.join(data_path, 'ciao_4core_summary.csv')

votes_df = pd.read_csv(votes_file, sep='\t')
if summary:
    reviews_df = pd.read_csv(summary_reviews_file)
else:
    reviews_df = pd.read_csv(aspects_reviews_file)

In [6]:
positive_vote = [3, 4, 5]

written_review = {}
liked_reviews = {}
umap = {}
rmap = {}
ridx2text = {}

for idx, row in tqdm(votes_df.iterrows(), total=len(votes_df)):
    review_id = row['review_id']
    voter_id = row['voter_id']
    vote = row['vote']
    categ = row['category']
    split = row['split']
    
    if vote in positive_vote:
        if voter_id not in liked_reviews:
            liked_reviews[voter_id] = {}
        if split not in liked_reviews[voter_id]:
            liked_reviews[voter_id][split] = []
        liked_reviews[voter_id][split].append((review_id, categ))
        
    if voter_id in umap:
        continue
        
    umap[voter_id] = len(umap)
        
for idx, row in tqdm(reviews_df.iterrows(), total=len(reviews_df)):
    review_id = row['review_id']
    user_id = row['user_id']
    if summary:
        text = row['summary']
    else:
        text = row['aspects']
    
    if user_id not in written_review:
        written_review[user_id] = []
    written_review[user_id].append(review_id)
    
    if review_id in rmap:
        continue

    rmap[review_id] = len(rmap)
    ridx2text[rmap[review_id]] = text

100%|██████████| 2755654/2755654 [01:52<00:00, 24466.81it/s]
100%|██████████| 135880/135880 [00:06<00:00, 22132.63it/s]


In [7]:
like_history = 30
write_history = 10

def load_history(voter_id, rmap, written_review, liked_reviews, split, history_type, category=None):
    history_write = [rmap[i] for i in written_review[voter_id]]
    if split in liked_reviews.get(voter_id, []):
        liked_reviews_user = liked_reviews[voter_id][split]
    else:
        liked_reviews_user = []
    if category:
        history_like = [rmap[liked_pair[0]] for liked_pair in liked_reviews_user if liked_pair[1] != category]
    else:
        history_like = [rmap[liked_pair[0]] for liked_pair in liked_reviews_user]

    if history_type == 'write':
        history = np.random.permutation(history_write)[:write_history]
    elif history_type == 'like':
        history = np.random.permutation(history_like)[:like_history]
    else:  # 'both'
        history = (list(np.random.permutation(history_write)[:write_history]) +
                   list(np.random.permutation(history_like)[:like_history]))

    history = list(history)
    return history

In [8]:
train_set = pd.read_csv(os.path.join(data_path, 'train_set.csv'), converters={'y_true': eval})
train_set = train_set[train_set['category'] == category]

In [41]:
train_seq = []
train_cand = []

for idx, row in tqdm(train_set.iterrows(), total=len(train_set)):
    pid = row['product_id']
    uidx = umap[row['voter_id']]
    candidates = {rmap[rid]: vote for rid, vote in row['y_true'].items()}
    answers = [rid for rid, vote in candidates.items() if vote in [3, 4, 5]]
    user_history = load_history(row['voter_id'], rmap, written_review, liked_reviews, 'train', signal, category)
    
    for answer in answers:
        train_seq.append(user_history + [answer])
        negative_cands = [c for c, vote in candidates.items() if vote not in [3, 4, 5]]
        negative_cands = random.sample(negative_cands, min(3, len(negative_cands)))
        train_cand.append(negative_cands)

100%|██████████| 48170/48170 [00:38<00:00, 1262.90it/s]


In [55]:
valid_set = pd.read_csv(
    os.path.join(output_path, f'ciao_{category.replace(" & ", "_")}_{signal}', f'valid_{str(num_candidates)}_candidates.csv'),
    converters={'candidates': eval}
)

In [57]:
val_seq = []
val_cand = []
val_labels = []

for idx, row in tqdm(valid_set.iterrows(), total=len(valid_set)):
    pid = row['product_id']
    uidx = umap[row['voter_id']]
    
    candidates = {rmap[rid]: vote for rid, vote in row['candidates'].items()}
    user_history = load_history(row['voter_id'], rmap, written_review, liked_reviews, 'test', signal, category)
    
    val_seq.append(user_history)
    val_cand.append([c for c in candidates.keys()])
    val_labels.append(candidates)

100%|██████████| 15829/15829 [00:05<00:00, 2821.05it/s]


In [58]:
test_set = pd.read_csv(
    os.path.join(output_path, f'ciao_{category.replace(" & ", "_")}_{signal}', f'test_{str(num_candidates)}_candidates.csv'),
    converters={'candidates': eval}
)

In [59]:
test_seq = []
test_cand = []
test_labels = []

for idx, row in tqdm(test_set.iterrows(), total=len(test_set)):
    pid = row['product_id']
    uidx = umap[row['voter_id']]
    
    candidates = {rmap[rid]: vote for rid, vote in row['candidates'].items()}
    user_history = load_history(row['voter_id'], rmap, written_review, liked_reviews, 'test', signal, category)
    
    test_seq.append(user_history)
    test_cand.append([c for c in candidates.keys()])
    test_labels.append(candidates)

100%|██████████| 15792/15792 [00:05<00:00, 2822.91it/s]


In [60]:
dataset = {
    'train_seq': train_seq,
    'train_cand': train_cand,
    'val_seq': val_seq,
    'val_cand': val_cand,
    'val_labels': val_labels,
    'test_seq': test_seq,
    'test_cand': test_cand,
    'test_labels': test_labels,
    'meta': ridx2text,
    'umap': umap,
    'smap': rmap
}

In [64]:
if summary:
    dataset_path = Path(
        f'/sise/bshapira-group/lilachzi/models/LlamaRec/data/preprocessed/ciao_{category.replace(" & ", "_")}_{signal}_summary/dataset.pkl'
    )
else:
    dataset_path = Path(
        f'/sise/bshapira-group/lilachzi/models/LlamaRec/data/preprocessed/ciao_{category.replace(" & ", "_")}_{signal}/dataset_{str(num_candidates)}.pkl'
    )
dataset_path

PosixPath('/sise/bshapira-group/lilachzi/models/LlamaRec/data/preprocessed/ciao_Games_write/dataset_10.pkl')

In [65]:
with dataset_path.open('wb') as f:
    pickle.dump(dataset, f)

In [80]:
# evaluation = {
#     'valid_labels': val_labels,
#     'test_labels': test_labels
# }

In [81]:
# if summary:
#     evaluation_path = Path(
#         f'/sise/bshapira-group/lilachzi/models/LlamaRec/data/preprocessed/ciao_{category.replace(" & ", "_")}_{signal}_summary/evaluation.pkl'
#     )
# else:
#     evaluation_path = Path(
#         f'/sise/bshapira-group/lilachzi/models/LlamaRec/data/preprocessed/ciao_{category.replace(" & ", "_")}_{signal}/evaluation.pkl'
#     )

In [82]:
# with evaluation_path.open('wb') as f:
#     pickle.dump(evaluation, f)