In [1]:
from pathlib import Path
import nbformat

def load_notebook(notebook_path):
    """Run every code cell of another notebook in this module's global namespace.

    The notebook file is opened as UTF-8 text and parsed with ``nbformat``
    (as notebook format version 4). The sources of all cells whose type is
    ``'code'`` are joined with newlines, in notebook order, and executed as a
    single program, so the names they define become globals here.

    Args:
        notebook_path: Path of the ``.ipynb`` file to load.

    Returns:
        None.
    """
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        notebook = nbformat.read(handle, as_version=4)

    # Non-code cells (e.g. markdown) are skipped.
    sources = []
    for cell in notebook.cells:
        if cell.cell_type == 'code':
            sources.append(cell.source)

    # NOTE: exec runs arbitrary code; only load notebooks you trust.
    combined_source = '\n'.join(sources)
    exec(combined_source, globals())

In [2]:
# Root directory that holds all Yelp datasets
yelp_datasets_path = '../../Data_preprocessing/yelp_datasets/'

# Official Yelp dataset (from the official Yelp website)
yelp_offical_dataset_path = f'{yelp_datasets_path}yelp_dataset_official/'

# Business, user and review JSON files inside the official dataset
yelp_academic_dataset_business_path = f'{yelp_offical_dataset_path}yelp_academic_dataset_business.json'
yelp_academic_dataset_user_path = f'{yelp_offical_dataset_path}yelp_academic_dataset_user.json'
yelp_academic_dataset_review_path = f'{yelp_offical_dataset_path}yelp_academic_dataset_review.json'

# Official Yelp photo dataset (from the official Yelp website)
yelp_offical_photo_dataset_path = f'{yelp_datasets_path}yelp_dataset_official_photos/'

# Directory containing the photo files
photo_dataset_path = f'{yelp_offical_photo_dataset_path}photos/'

In [3]:
# Merge user data and business data into the original dataset

import pandas as pd
import json
import os

# Define a function to read JSON files and read them in chunks
def read_json_with_chunk(file_path, chunk_size):
    """Read a JSON Lines file chunk by chunk and return it as one DataFrame.

    Args:
        file_path: Path of the JSON file; it is read with ``lines=True``,
            i.e. one JSON object per line.
        chunk_size: Number of lines parsed per chunk.

    Returns:
        A single DataFrame built by concatenating every chunk in file order.
    """
    reader = pd.read_json(file_path, lines=True, chunksize=chunk_size)
    frames = [frame for frame in reader]
    return pd.concat(frames)

# Load the complete Yelp user and business tables, parsing 10000 lines per chunk
user_data = read_json_with_chunk(yelp_academic_dataset_user_path, 10000)
business_data = read_json_with_chunk(yelp_academic_dataset_business_path, 10000)

# Prefix every column with its source table name, then restore the join-key
# columns ('user_id' / 'business_id') to their unprefixed names
user_data = user_data.add_prefix('user_')
user_data = user_data.rename(columns={'user_user_id': 'user_id'})
business_data = business_data.add_prefix('business_')
business_data = business_data.rename(columns={'business_business_id': 'business_id'})

# Load the original research dataset
file_path = "research_original_set_13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars.csv"
original_data = pd.read_csv(file_path)

# Left-join the user columns, then the business columns, onto the original rows
original_data = original_data.merge(user_data, how='left', on='user_id')
original_data = original_data.merge(business_data, how='left', on='business_id')

# Persist the enriched dataset without the index column
original_data.to_csv("research_original_set_with_user_and_business_data.csv", index=False)

In [4]:
# Reload the merged dataset (original rows plus user and business columns) from CSV

file_path = 'research_original_set_with_user_and_business_data.csv'
original_data = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Bring the functions and variables defined in the shared utilities notebook
# into this notebook's namespace

parent_directory = Path('../../')
data_preprocessing_utils_path = parent_directory.joinpath('data_preprocessing_utils.ipynb')

load_notebook(data_preprocessing_utils_path)

In [6]:
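# NOTE: the k-fold split is currently disabled; the output shown below is left over from an earlier run of this cell.
# kfold_cross_validation(original_data, train_ratio=0.9, test_ratio=0.1, fold_num=10)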

Data has successfully split into training set and test set and saved to csv files !
Training data has successfully split into 10 folds and saved to csv files !


In [8]:
# Split the merged dataset into training / validation / test sets (0.8 / 0.1 / 0.1).
# NOTE(review): stratify is passed as (False, "stars"); presumably an
# (enabled flag, column name) pair, so stratification would be off. Confirm against dataset_split.
training_data, validation_data, test_data = dataset_split(
    original_data,
    train_ratio=0.8,
    valid_ratio=0.1,
    test_ratio=0.1,
    stratify=(False, "stars"),
)

Original dataset size: (13996, 44), ratio is 1.0
Training dataset size: (11196, 44), ratio is 0.7999428408116604
Validation dataset size: (1400, 44), ratio is 0.10002857959416976
Test dataset size: (1400, 44), ratio is 0.10002857959416976


In [9]:
# Write each split to its own CSV file (index column omitted)

for split_path, split_frame in (
    ("research_training_set.csv", training_data),
    ("research_validation_set.csv", validation_data),
    ("research_test_set.csv", test_data),
):
    split_frame.to_csv(split_path, index=False)