In [None]:
import pandas as pd
import json
import numpy as np
from matplotlib import pyplot as plt

In [3]:
def read_json_in_chunks(file_path, chunk_size=10000):
    """Read large JSON file in chunks"""
    chunks = []
    
    with open(file_path, 'r') as file:
        chunk = []
        for i, line in enumerate(file):
            chunk.append(line)
            
            if (i + 1) % chunk_size == 0:
                chunk_df = pd.read_json('\n'.join(chunk), lines=True)
                chunks.append(chunk_df)
                chunk = []  
        
        # process remaining lines
        if chunk:
            chunk_df = pd.read_json('\n'.join(chunk), lines=True)
            chunks.append(chunk_df)
    
    return pd.concat(chunks, ignore_index=True)

In [None]:
reviews_file_path = "/Users/huwenjie/Developer/bt4222 code/bt4222_project_models/GoogleMaps_Recommender/data/processed/sf/sf-reviews.json"
reviews_df = read_json_in_chunks(reviews_file_path)
restaurants_file_path = "/Users/huwenjie/Developer/bt4222 code/bt4222_project_models/GoogleMaps_Recommender/data/processed/sf/sf-restaurants.json"
restaurants_df = read_json_in_chunks(restaurants_file_path)

In [7]:
print(reviews_df.shape)
print(restaurants_df.shape)

(806675, 6)
(3721, 15)


In [9]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806675 entries, 0 to 806674
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  806675 non-null  float64
 1   name     806675 non-null  object 
 2   time     806675 non-null  int64  
 3   rating   806675 non-null  int64  
 4   text     426960 non-null  object 
 5   gmap_id  806675 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 36.9+ MB


In [10]:
sf_reviews_per_user = reviews_df["user_id"].value_counts()
sf_review_per_restaurant =  reviews_df["gmap_id"].value_counts()

print(f"Total number of reviews: {len(reviews_df):,}")

print(f"\nReviews per user:")
print(f"  Mean: {sf_reviews_per_user.mean():.2f}")
print(f"  Median: {sf_reviews_per_user.median():.2f}")

print(f"\nReviews per restaurant:")
print(f"  Mean: {sf_review_per_restaurant.mean():.2f}")
print(f"  Median: {sf_review_per_restaurant.median():.2f}")

Total number of reviews: 806,675

Reviews per user:
  Mean: 4.43
  Median: 2.00

Reviews per restaurant:
  Mean: 216.79
  Median: 115.00


In [None]:

# Step 1: randomly choose 1/3 of the unique users
unique_users = reviews_df['user_id'].unique()
half_users = np.random.choice(unique_users,
                              size=len(unique_users)//3,
                              replace=False)

# Step 2: keep only reviews from those selected users
filtered_df = reviews_df[reviews_df['user_id'].isin(half_users)]

# Step 3: for each selected user, sample at most 5 reviews
sampled_df = (
    filtered_df
    .groupby('user_id', group_keys=False)
    .apply(lambda x: x.sample(n=min(5, len(x)), random_state=42))
    .reset_index(drop=True)
)

print(sampled_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154902 entries, 0 to 154901
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user_id  154902 non-null  float64
 1   name     154902 non-null  object 
 2   time     154902 non-null  int64  
 3   rating   154902 non-null  int64  
 4   text     83567 non-null   object 
 5   gmap_id  154902 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 7.1+ MB
None


  .apply(lambda x: x.sample(n=min(5, len(x)), random_state=42))


In [None]:
import numpy as np
import pandas as pd

def sample_reviews(reviews_df, scale_by=3, max_reviews_per_user=5, random_state=42):
    """
    Randomly samples a subset of users and reviews from a reviews DataFrame.
    
    Parameters
    ----------
    reviews_df : pd.DataFrame
        The DataFrame containing reviews with a 'user_id' column.
    user_ratio : float, optional
        The fraction of unique users to include in the sample (default is 1/3).
    max_reviews_per_user : int, optional
        The maximum number of reviews to sample per selected user (default is 5).
    random_state : int, optional
        Random seed for reproducibility.
    
    Returns
    -------
    pd.DataFrame
        A DataFrame containing the sampled reviews.
    """
    user_ratio = 1/scale_by
    # Step 1: randomly choose a subset of users
    unique_users = reviews_df['user_id'].unique()
    n_users_to_sample = max(1, int(len(unique_users) * user_ratio))
    sampled_users = np.random.choice(unique_users, size=n_users_to_sample, replace=False)
    
    # Step 2: filter reviews for selected users
    filtered_df = reviews_df[reviews_df['user_id'].isin(sampled_users)]
    
    # Step 3: sample up to `max_reviews_per_user` reviews per user
    sampled_df = (
        filtered_df
        .groupby('user_id', group_keys=False)
        .apply(lambda x: x.sample(n=min(max_reviews_per_user, len(x)), random_state=random_state))
        .reset_index(drop=True)
    )
    
    return sampled_df


In [None]:
sampled_df = sample_reviews(user_ratio=0.33)

In [60]:
sampled_df.to_json('data/sampled/sampled_sf.json', orient='records', lines=True) 