In [8]:
import os
import polars as pl
import csv
import numpy as np 

# Load Yelp data from raw_data directory
raw_data_path = './raw_data/'
target_path = './dataset/'
filter_size = 5

In [None]:
# Load yelp_academic_dataset_business.json using polars
business_df = pl.read_ndjson(os.path.join(raw_data_path, 'yelp_academic_dataset_business.json'))
display(business_df.head())

In [None]:
# Create business_id to category mapping
business_categories = business_df.select(['business_id', 'categories'])

# Save business_id to category mapping to TSV file
with open(os.path.join(target_path, 'businessid2category.tsv'), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    for row in business_categories.iter_rows():
        business_id, categories = row
        if categories:  # Only write if categories exist
            writer.writerow([business_id, categories])


In [None]:
# Load yelp_academic_dataset_review.json using polars
review_df = pl.read_ndjson(os.path.join(raw_data_path, 'yelp_academic_dataset_review.json'))
display(review_df.head())

review_id,user_id,business_id,stars,useful,funny,cool,text,date
str,str,str,f64,i64,i64,i64,str,str
"""KU_O5udG6zpxOg-VcAEodg""","""mh_-eMZ6K5RLWhZyISBhwA""","""XQfwVwDr-v0ZS3_CbbE5Xw""",3.0,0,0,0,"""If you decide to eat here, jus…","""2018-07-07 22:09:11"""
"""BiTunyQ73aT9WBnpR9DZGw""","""OyoGAe7OKpv6SyGZT5g77Q""","""7ATYjTIgM3jUlt4UM3IypQ""",5.0,1,0,1,"""I've taken a lot of spin class…","""2012-01-03 15:28:18"""
"""saUsX_uimxRlCVr67Z4Jig""","""8g_iMtfSiwikVnbP2etR0A""","""YjUWPpI6HXG530lwP-fb2A""",3.0,0,0,0,"""Family diner. Had the buffet. …","""2014-02-05 20:30:30"""
"""AqPFMleE6RsU23_auESxiA""","""_7bHUi9Uuf5__HHc_Q8guQ""","""kxX2SOes4o-D3ZQBkiMRfA""",5.0,1,0,1,"""Wow! Yummy, different, delic…","""2015-01-04 00:01:03"""
"""Sx8TMOWLNuJBWer-0pcmoA""","""bcjbaE6dDog4jkNY91ncLQ""","""e4Vwtrqf-wpJfwesgvdgxQ""",4.0,1,0,1,"""Cute interior and owner (?) ga…","""2017-01-14 20:54:15"""


In [7]:
# Create reviewerID to incremental id mapping
user_ids = review_df['user_id'].unique()
user_id_map = {reviewer_id: idx for idx, reviewer_id in enumerate(user_ids)}

# Create asin to incremental id mapping
business_ids = review_df['business_id'].unique()
business_id_map = {asin: idx for idx, asin in enumerate(business_ids)}

# Save mappings to files
with open(os.path.join(target_path, 'user2idx.tsv'), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    for reviewer_id, idx in user_id_map.items():
        writer.writerow([reviewer_id, idx])

with open(os.path.join(target_path, 'business2idx.tsv'), 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    for asin, idx in business_id_map.items():
        writer.writerow([asin, idx])

In [9]:
# Filter items by popularity and users by activity in one pass
filtered_reviews = (
    review_df
    .with_row_index()
    .join(
        review_df.group_by('business_id').len().filter(pl.col('len') >= filter_size),
        on='business_id'
    )
    .join(
        review_df.group_by('user_id').len().filter(pl.col('len') >= filter_size),
        on='user_id'
    )
    .drop(['index', 'len'])
)

# Get unique user IDs and create splits
user_ids = filtered_reviews['user_id'].unique()
num_users = len(user_ids)
split_1 = int(num_users * 0.8)
split_2 = int(num_users * 0.9)

# Create train/valid/test splits using polars expressions
train_users = set(user_ids[:split_1])
valid_users = set(user_ids[split_1:split_2])
test_users = set(user_ids[split_2:])

train_data = filtered_reviews.filter(pl.col('user_id').is_in(train_users))
valid_data = filtered_reviews.filter(pl.col('user_id').is_in(valid_users))
test_data = filtered_reviews.filter(pl.col('user_id').is_in(test_users))

In [12]:
# Create validation dataframe with user history and items to predict
validation_df = (
    valid_data
    .group_by('user_id')
    .agg([
        pl.col('business_id').map_elements(lambda x: [business_id_map[item] for item in x], return_dtype=pl.List(pl.Int64)).alias('given_user_history'),
        pl.col('business_id').map_elements(lambda x: [business_id_map[item] for item in x], return_dtype=pl.List(pl.Int64)).alias('predicting_items')
    ])
    .rename({'user_id': 'user_id'})
    .with_columns([
        pl.col('user_id').replace_strict(user_id_map)
    ])
)

display(validation_df.head())


user_id,given_user_history,predicting_items
i64,list[i64],list[i64]
1760170,"[11818, 80995, … 53104]","[11818, 80995, … 53104]"
1081595,"[105282, 106190, … 37822]","[105282, 106190, … 37822]"
1586128,"[44340, 92836, … 4963]","[44340, 92836, … 4963]"
184996,"[83563, 40379, … 89621]","[83563, 40379, … 89621]"
1234531,"[23707, 17940, … 111285]","[23707, 17940, … 111285]"


In [13]:
# Create validation dataframe with user history and items to predict
test_df = (
    test_data
    .group_by('user_id')
    .agg([
        pl.col('business_id').map_elements(lambda x: [business_id_map[item] for item in x], return_dtype=pl.List(pl.Int64)).alias('given_user_history'),
        pl.col('business_id').map_elements(lambda x: [business_id_map[item] for item in x], return_dtype=pl.List(pl.Int64)).alias('predicting_items')
    ])
    .rename({'user_id': 'user_id'})
    .with_columns([
        pl.col('user_id').replace_strict(user_id_map)
    ])
)

display(validation_df.head())


user_id,given_user_history,predicting_items
i64,list[i64],list[i64]
1760170,"[11818, 80995, … 53104]","[11818, 80995, … 53104]"
1081595,"[105282, 106190, … 37822]","[105282, 106190, … 37822]"
1586128,"[44340, 92836, … 4963]","[44340, 92836, … 4963]"
184996,"[83563, 40379, … 89621]","[83563, 40379, … 89621]"
1234531,"[23707, 17940, … 111285]","[23707, 17940, … 111285]"


In [22]:
# Create training sequences with user history and next item to predict
train_sequences = []
for user_id, group in train_data.sort('date').group_by('user_id'):
    history = [business_id_map[item] for item in group['business_id']]
    user_id = user_id_map[user_id[0]]
    
    # Create sequences of increasing length
    for i in range(1, len(history)):
        train_sequences.append({
            'user_id': user_id,
            'given_user_history': history[:i],
            'predicting_items': history[i]
        })

train_df = pl.DataFrame(train_sequences)
display(train_df.head())

user_id,given_user_history,predicting_items
i64,list[i64],i64
1986308,[1335],2846
1986308,"[1335, 2846]",59274
1986308,"[1335, 2846, 59274]",61237
1986308,"[1335, 2846, … 61237]",85896
1986308,"[1335, 2846, … 85896]",28402
