In [1]:
import pandas as pd
import numpy as np
import os
import polars as pl

# Load Yelp data from raw_data directory
raw_data_path = '../../../../Books/Books/raw_data/'
target_path = '../../../../Books/Books/dataset/'
filter_size = 5

In [39]:
import ijson
f = open(os.path.join(raw_data_path, 'meta_Books.json'), 'rb')
parser = ijson.parse(f, multiple_values = True)

In [19]:
import csv
with open(os.path.join(raw_data_path, 'BookIDtoBrand.csv'), "w") as csv_file:
        writer = csv.writer(csv_file, delimiter='\t')
        data = []
        for prefix, event, value in parser:
            # print(prefix)
            if prefix=="asin":
                # print(value)
                data.append(value)

            elif prefix=="brand":
                data.append(value)
                # print(value)
            # elif prefix=="business_id":
            #     data.append(value)
            # elif prefix=="date":
            #     data.append(value)
            elif event == "end_map":
                writer.writerow(data)
                data = []

In [15]:
import ijson
f = open(os.path.join(raw_data_path, 'Books_5.json'), 'rb')
parser = ijson.parse(f, multiple_values = True)

In [16]:
import csv
with open(os.path.join(raw_data_path, 'Books5Reduced.csv'), "w") as csv_file:
        writer = csv.writer(csv_file, delimiter=' ')
        data = []
        for prefix, event, value in parser:
            if prefix=="reviewerID":
                data.append(value)
            elif prefix=="asin":
                data.append(value)
            elif prefix=="unixReviewTime":
                data.append(value)
            elif event == "end_map":
                writer.writerow(data)
                data = []

In [40]:
review_df = pd.read_csv(os.path.join(raw_data_path, 'Books5Reduced.csv'), sep = ' ', header=None)
id2cat = pd.read_csv(os.path.join(raw_data_path, 'BookIDtoBrand.csv'), sep = '\t', header=None)

  id2cat = pd.read_csv(os.path.join(raw_data_path, 'BookIDtoBrand.csv'), sep = '\t', header=None)


In [41]:
review_df = review_df.rename(columns={0:'user_id', 1:'business_id', 2:'date'})
id2cat = id2cat.rename(columns={0:'category', 1:'business_id'})

In [28]:
review_df = review_df.merge(id2cat, on='business_id')

In [42]:
# Create business_id to incremental id mapping
unique_business_ids = review_df['business_id'].unique()
business_id_to_idx = {business_id: idx for idx, business_id in enumerate(unique_business_ids)}

# Create user_id to incremental id mapping
unique_user_ids = review_df['user_id'].unique()
user_id_to_idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}


In [43]:
id2cat['index'] = id2cat['business_id'].map(business_id_to_idx)

In [50]:
id2cat = id2cat.drop(['business_id'], axis=1)

In [47]:
id2cat = id2cat.dropna(axis=0)

In [52]:
id2cat['index'] = id2cat['index'].astype(int)

In [57]:

id2cat = id2cat.iloc[:, [1, 0]]

In [59]:
id2cat.to_csv(os.path.join(target_path, 'id2cat.tsv'), sep='\t', index=False)

In [60]:
# Filter items by popularity
item_counts = review_df['business_id'].value_counts()
popular_items = item_counts[item_counts >= filter_size].index.tolist()

# Filter reviews to only include popular items
filtered_reviews = review_df[review_df['business_id'].isin(popular_items)]

# Filter users by activity
user_counts = filtered_reviews['user_id'].value_counts()
active_users = user_counts[user_counts >= filter_size].index.tolist()

# Filter reviews to only include active users
filtered_reviews = filtered_reviews[filtered_reviews['user_id'].isin(active_users)]

# Get unique user IDs and shuffle them
user_ids = filtered_reviews['user_id'].unique()
np.random.shuffle(user_ids)

# Split users into train/valid/test
num_users = len(user_ids)
split_1 = int(num_users * 0.8)
split_2 = int(num_users * 0.9)
train_users = user_ids[:split_1]
valid_users = user_ids[split_1:split_2]
test_users = user_ids[split_2:]

# Process each split
train_data = filtered_reviews[filtered_reviews['user_id'].isin(train_users)]
valid_data = filtered_reviews[filtered_reviews['user_id'].isin(valid_users)]
test_data = filtered_reviews[filtered_reviews['user_id'].isin(test_users)]

In [61]:
train_data.shape[0]

7110044

In [78]:
from tqdm import tqdm
# Process validation data
valid_formatted = pd.DataFrame(columns=['user_id', 'given_user_history', 'predicting_items'])

with open(os.path.join(target_path, 'validation.tsv'), "w", newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter="\t")
    writer.writerow(["user_id", "given_user_history", "predicting_items"])
    for user_id, group in tqdm(valid_data.groupby('user_id')):
        # Sort by date first
        sorted_group = group.sort_values('date')
        business_ids = sorted_group['business_id'].tolist()
        split_idx = int(len(business_ids) * 0.8)
        given_history = [business_id_to_idx[bid] for bid in business_ids[:split_idx]]
        predicting_items = [business_id_to_idx[bid] for bid in business_ids[split_idx:]]

        writer.writerow([user_id_to_idx[user_id], given_history, predicting_items])
    
    
    # new_row = pd.DataFrame({
    #     'user_id': [user_id_to_idx[user_id]],
    #     'given_user_history': [given_history],
    #     'predicting_items': [predicting_items]
    # })
    # valid_formatted = pd.concat([valid_formatted, new_row], ignore_index=True)

# valid_formatted.to_csv(os.path.join(target_path, 'validation.tsv'), sep='\t', index=False)


100%|██████████| 60367/60367 [00:15<00:00, 3791.88it/s]


In [77]:

# Process test data
test_formatted = pd.DataFrame(columns=['user_id', 'given_user_history', 'predicting_items'])


with open(os.path.join(target_path, 'test.tsv'), "w", newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter="\t")
    writer.writerow(["user_id", "given_user_history", "predicting_items"])
    for user_id, group in tqdm(test_data.groupby('user_id')):
        # Sort by date first
        sorted_group = group.sort_values('date')
        business_ids = sorted_group['business_id'].tolist()
        split_idx = int(len(business_ids) * 0.8)
        given_history = [business_id_to_idx[bid] for bid in business_ids[:split_idx]]
        predicting_items = [business_id_to_idx[bid] for bid in business_ids[split_idx:]]

        writer.writerow([user_id_to_idx[user_id], given_history, predicting_items])
        
        # new_row = pd.DataFrame({
        #     'user_id': [user_id_to_idx[user_id]],
        #     'given_user_history': [given_history],
        #     'predicting_items': [predicting_items]
        # })
        # test_formatted = pd.concat([test_formatted, new_row], ignore_index=True)

# test_formatted.to_csv(os.path.join(target_path, 'test.tsv'), sep='\t', index=False)


100%|██████████| 60367/60367 [00:16<00:00, 3634.41it/s]


In [79]:

import csv
# Process training data
train_formatted = pd.DataFrame(columns=['uid', 'his_seq', 'next_item'])

with open(os.path.join(target_path, 'training.tsv'), "w", newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter="\t")
    writer.writerow(["uid", "his_seq", "next_item"])
    for user_id, group in tqdm(train_data.groupby('user_id')):
        # Sort by date first
        sorted_group = group.sort_values('date')
        business_ids = sorted_group['business_id'].tolist()
        
        # Create all possible sequences
        for i in range(1, len(business_ids)):
            history = [business_id_to_idx[bid] for bid in business_ids[:i]]
            target = business_id_to_idx[business_ids[i]]
            
            writer.writerow([user_id_to_idx[user_id], history, target])
#             new_row = pd.DataFrame({
#                 'uid': [user_id_to_idx[user_id]],
#                 'his_seq': [history],
#                 'next_item': [target]
#             })
#             train_formatted = pd.concat([train_formatted, new_row], ignore_index=True)

# train_formatted.to_csv(os.path.join(target_path, 'training.tsv'), sep='\t', index=False)


100%|██████████| 482934/482934 [08:23<00:00, 958.88it/s] 


In [70]:
# write to file




[user_id_to_idx[user_id], history, target]


[554874,
 [162535, 162537, 162538, 162539, 162540, 162541, 162536, 162533, 170311],
 170312]

In [80]:
with open(os.path.join(target_path, 'training.tsv'), newline='') as f:
  reader = csv.reader(f)
  row1 = next(reader)  # gets the first line
  print(row1)
  row = next(reader)
  print(row)
  row = next(reader)
  print(row)

['uid\this_seq\tnext_item']
['412749\t[46615]\t133994']
['412749\t[46615', ' 133994]\t202452']


In [72]:
row = next(reader)
print(row)

ValueError: I/O operation on closed file.

In [74]:
with open(os.path.join('../../../../Books/Books/raw_data/training.tsv'), newline='') as f:
  reader = csv.reader(f)
  row1 = next(reader)  # gets the first line
  print(row1)
  row = next(reader)
  print(row)
  row = next(reader)
  print(row)

['user_id\tuser_history\tnext_item']
['0\t[17978]\t901']
['0\t[17978', ' 901]\t97224']
