In [1]:
import pandas as pd
import numpy as np
import os
import polars as pl

# Input/output locations: the raw Yelp dump (and the reduced CSV derived from
# it) lives under raw_data_path; the generated TSV splits go to target_path.
raw_data_path, target_path = (
    '../../../../Yelp/Yelp/raw_data/',
    '../../../../Yelp/Yelp/dataset/',
)
# Minimum review count: businesses and users below this are filtered out.
filter_size = 5

In [14]:
import ijson
# Open the raw review dump in binary mode and wrap it in ijson's incremental
# event parser, which yields (prefix, event, value) tuples as it reads.
# NOTE(review): `f` is not closed here because `parser` is consumed lazily by
# the CSV-writing cell; no visible code closes the handle afterwards.
f = open(os.path.join(raw_data_path, 'yelp_academic_dataset_review.json'), 'rb')
# multiple_values=True lets the parser accept several top-level JSON values in
# one stream (presumably one review object per line — confirm against the dump).
parser = ijson.parse(f, multiple_values = True)

In [15]:
import csv

# Keys whose values are copied from each review object; all other keys are skipped.
review_fields = ("review_id", "user_id", "business_id", "date")

# newline='' is required by the csv module for files handed to csv.writer:
# without it the writer's "\r\n" row terminator is translated a second time on
# Windows and every record is followed by an empty line.
# The encoding is pinned so the output does not depend on the platform locale.
with open(os.path.join(raw_data_path, 'yelp_review_reduced.csv'), "w",
          newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=' ')
    data = []
    for prefix, event, value in parser:
        if prefix in review_fields:
            # Value of a wanted key. Column order in the row therefore follows
            # the order in which the keys occur inside the JSON object.
            data.append(value)
        elif event == "end_map":
            # End of an object: emit the collected values as one row and reset.
            writer.writerow(data)
            data = []

In [2]:
# Load the reduced, space-delimited review file. It has no header row, so the
# columns are labelled by integer position until they are renamed.
review_df = pd.read_csv(
    os.path.join(raw_data_path, 'yelp_review_reduced.csv'),
    header=None,
    sep=' ',
)

In [3]:
# Replace the positional column labels 0..3 with the field names, in the order
# the fields were written to the reduced CSV.
review_df = review_df.rename(
    columns=dict(enumerate(['review_id', 'user_id', 'business_id', 'date']))
)

In [4]:
# Dense integer ids for businesses: every distinct business_id is numbered
# 0, 1, 2, ... following the order of the array returned by unique().
unique_business_ids = review_df['business_id'].unique()
business_id_to_idx = dict(zip(unique_business_ids, range(len(unique_business_ids))))

# Same numbering scheme for users.
unique_user_ids = review_df['user_id'].unique()
user_id_to_idx = dict(zip(unique_user_ids, range(len(unique_user_ids))))


In [5]:
# Filter items by popularity: keep businesses with at least `filter_size` reviews.
item_counts = review_df['business_id'].value_counts()
popular_items = item_counts[item_counts >= filter_size].index.tolist()

# Filter reviews to only include popular items
filtered_reviews = review_df[review_df['business_id'].isin(popular_items)]

# Filter users by activity, counted on the already item-filtered reviews.
# This is a single pass: businesses are not re-checked after users are dropped.
user_counts = filtered_reviews['user_id'].value_counts()
active_users = user_counts[user_counts >= filter_size].index.tolist()

# Filter reviews to only include active users
filtered_reviews = filtered_reviews[filtered_reviews['user_id'].isin(active_users)]

# Shuffle the unique user IDs with a fixed seed so that the train/valid/test
# assignment is identical on every run (an unseeded shuffle produced a
# different split each time the cell was executed).
split_seed = 42
user_ids = np.random.default_rng(split_seed).permutation(
    filtered_reviews['user_id'].unique()
)

# Split users 80% / 10% / 10% into train / valid / test.
num_users = len(user_ids)
split_1 = int(num_users * 0.8)
split_2 = int(num_users * 0.9)
train_users = user_ids[:split_1]
valid_users = user_ids[split_1:split_2]
test_users = user_ids[split_2:]

# Each split holds every remaining review of the users assigned to it.
train_data = filtered_reviews[filtered_reviews['user_id'].isin(train_users)]
valid_data = filtered_reviews[filtered_reviews['user_id'].isin(valid_users)]
test_data = filtered_reviews[filtered_reviews['user_id'].isin(test_users)]

In [None]:
# Number of review rows that ended up in the training split.
len(train_data)

14061228

In [None]:
from tqdm import tqdm

# Process validation data: one output row per user, holding the first 80% of
# the user's date-sorted businesses as history and the remainder as targets.
valid_records = []

for user_id, group in tqdm(valid_data.groupby('user_id')):
    # Sort by date first so the history/target split follows review order
    # (assumes the date strings sort chronologically — confirm the format).
    sorted_group = group.sort_values('date')
    # Map every business id once, then slice the mapped list.
    item_idxs = [business_id_to_idx[bid] for bid in sorted_group['business_id']]
    split_idx = int(len(item_idxs) * 0.8)
    valid_records.append({
        'user_id': user_id_to_idx[user_id],
        'given_user_history': item_idxs[:split_idx],
        'predicting_items': item_idxs[split_idx:],
    })

# Build the frame once from the collected records. Appending with pd.concat
# inside the loop re-copies the whole frame on every iteration (quadratic).
# `columns=` keeps the header even when there are no users.
valid_formatted = pd.DataFrame(
    valid_records,
    columns=['user_id', 'given_user_history', 'predicting_items'],
)

valid_formatted.to_csv(os.path.join(target_path, 'validation.tsv'), sep='\t', index=False)


  0%|          | 0/28712 [00:00<?, ?it/s]


NameError: name 'writer' is not defined

In [32]:

# Imported here so this cell does not depend on another cell having run first.
from tqdm import tqdm

# Process test data: one output row per user, holding the first 80% of the
# user's date-sorted businesses as history and the remainder as targets.
test_records = []

for user_id, group in tqdm(test_data.groupby('user_id')):
    # Sort by date first so the history/target split follows review order
    # (assumes the date strings sort chronologically — confirm the format).
    sorted_group = group.sort_values('date')
    # Map every business id once, then slice the mapped list.
    item_idxs = [business_id_to_idx[bid] for bid in sorted_group['business_id']]
    split_idx = int(len(item_idxs) * 0.8)
    test_records.append({
        'user_id': user_id_to_idx[user_id],
        'given_user_history': item_idxs[:split_idx],
        'predicting_items': item_idxs[split_idx:],
    })

# Build the frame once from the collected records. Appending with pd.concat
# inside the loop re-copies the whole frame on every iteration (quadratic).
# `columns=` keeps the header even when there are no users.
test_formatted = pd.DataFrame(
    test_records,
    columns=['user_id', 'given_user_history', 'predicting_items'],
)

test_formatted.to_csv(os.path.join(target_path, 'test.tsv'), sep='\t', index=False)


100%|██████████| 28712/28712 [01:05<00:00, 436.80it/s]


In [9]:

import csv
# Imported here so this cell does not depend on another cell having run first.
from tqdm import tqdm

# Process training data
# Kept only as an empty placeholder: rows are streamed straight to disk below
# and never collected in memory, but a later cell still references this name.
train_formatted = pd.DataFrame(columns=['uid', 'his_seq', 'next_item'])

# newline='' is required by the csv module for files handed to csv.writer
# (otherwise Windows output gets an empty line after every record).
with open(os.path.join(target_path, 'training.tsv'), "w",
          newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter="\t")
    writer.writerow(["uid", "his_seq", "next_item"])
    for user_id, group in tqdm(train_data.groupby('user_id')):
        # Sort by date first so each history precedes its target in review order
        # (assumes the date strings sort chronologically — confirm the format).
        sorted_group = group.sort_values('date')

        # Loop-invariant work hoisted out of the inner loop: the user index and
        # the mapped item sequence are computed once per user.
        uid = user_id_to_idx[user_id]
        item_idxs = [business_id_to_idx[bid] for bid in sorted_group['business_id']]

        # Create all possible sequences: every non-empty prefix is a history
        # and the item right after it is the prediction target.
        for i in range(1, len(item_idxs)):
            writer.writerow([uid, item_idxs[:i], item_idxs[i]])


100%|██████████| 229692/229692 [03:41<00:00, 1035.92it/s]


In [None]:
# Preview the first rows of train_formatted; nothing is written to a file here.
# NOTE(review): train_formatted is created as an empty frame in the training
# cell and no visible code appends to it (the rows go directly to
# training.tsv), so this is expected to display only the column headers.




display(train_formatted.head())
