In [2]:
import os
import pandas as pd
import numpy as np

files = os.listdir('./')
user_data = pd.read_csv([file for file in files if 'user' in file and 'csv' in file][0])
print(user_data.shape)
business_data = pd.read_csv([file for file in files if 'business' in file and 'csv' in file][0])
print(business_data.shape)

(1808, 5)
(1573, 5)


In [4]:
folder_name = "../../original/"

train_data = pd.read_csv(os.path.join(folder_name, "research_training_set.csv"))
valid_data = pd.read_csv(os.path.join(folder_name, "research_validation_set.csv"))
test_data = pd.read_csv(os.path.join(folder_name, "research_test_set.csv"))

In [5]:
train_data = pd.merge(train_data, user_data, on='user_id')
train_data = pd.merge(train_data, business_data, on='business_id')
print("Train data shape:", train_data.shape)

valid_data = pd.merge(valid_data, user_data, on='user_id')
valid_data = pd.merge(valid_data, business_data, on='business_id')
print("Valid data shape:", valid_data.shape)

test_data = pd.merge(test_data, user_data, on='user_id')
test_data = pd.merge(test_data, business_data, on='business_id')
print("Test data shape:", test_data.shape)

Train data shape: (11196, 52)
Valid data shape: (1400, 52)
Test data shape: (1400, 52)


In [7]:
# load written data

from pathlib import Path
import nbformat

def load_notebook(notebook_path):
    with open(notebook_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)
    code_cells = [cell.source for cell in nb.cells if cell.cell_type == 'code']
    exec('\n'.join(code_cells), globals())

# import written function and variable

parent_directory = Path('../../../')
data_preprocessing_utils = parent_directory / 'data_preprocessing_utils.ipynb'
experiment_related_utils = parent_directory / 'experiment_related_utils.ipynb'

load_notebook(data_preprocessing_utils)
load_notebook(experiment_related_utils)

In [11]:
filtered_reviews, train_cal_results = sample_reviews_and_calculate_price_then_return_data(train_data, percent=100, 
                                                        user_reviews_num=100000,
                                                        user_comparison="max",
                                                        user_reviews_num_range=None,
                                                        business_reviews_num=100000,
                                                        business_comparison="max",
                                                        business_reviews_num_range=None,
                                                        genai="GPT-3.5 Turbo", 
                                                        sampling_method='random', column='None')
print_nested_dict(train_cal_results)

Percent: 100 %
User threshold reviews num: 100000
User reviews num comparison method: max
User threshold reviews num range: None
Business threshold reviews num: 100000
Business reviews num comparison method: max
Business threshold reviews num range: None
Generative AI model: GPT-3.5 Turbo
Sampling_method: random
Column: None


-----------------------------------------------------------


percent: 100
user_reviews_num_threshold: 100000
user_comparison_method: max
user_reviews_num_range: None
business_reviews_num_threshold: 100000
business_comparison_method: max
business_reviews_num_range: None
sampling_method: random
column: None
sampled_percent_reviews_num: 11196
filtered_reviews_num: 11196
filtered_users_count: 1808
filtered_businesses_count: 1573
min_reviews_per_user: 1
mean_reviews_per_user: 6.192477876106195
max_reviews_per_user: 61
min_reviews_per_business: 1
mean_reviews_per_business: 7.117609663064209
max_reviews_per_business: 50
genai: GPT-3.5 Turbo
model_NT_price_per_token_inp

In [17]:
filtered_reviews, valid_cal_results = sample_reviews_and_calculate_price_then_return_data(valid_data, percent=100, 
                                                        user_reviews_num=100000,
                                                        user_comparison="max",
                                                        user_reviews_num_range=None,
                                                        business_reviews_num=100000,
                                                        business_comparison="max",
                                                        business_reviews_num_range=None,
                                                        genai="GPT-3.5 Turbo", 
                                                        sampling_method='random', column='None')
print_nested_dict(valid_cal_results)

Percent: 100 %
User threshold reviews num: 100000
User reviews num comparison method: max
User threshold reviews num range: None
Business threshold reviews num: 100000
Business reviews num comparison method: max
Business threshold reviews num range: None
Generative AI model: GPT-3.5 Turbo
Sampling_method: random
Column: None


-----------------------------------------------------------


percent: 100
user_reviews_num_threshold: 100000
user_comparison_method: max
user_reviews_num_range: None
business_reviews_num_threshold: 100000
business_comparison_method: max
business_reviews_num_range: None
sampling_method: random
column: None
sampled_percent_reviews_num: 1400
filtered_reviews_num: 1400
filtered_users_count: 944
filtered_businesses_count: 918
min_reviews_per_user: 1
mean_reviews_per_user: 1.4830508474576272
max_reviews_per_user: 8
min_reviews_per_business: 1
mean_reviews_per_business: 1.5250544662309369
max_reviews_per_business: 7
genai: GPT-3.5 Turbo
model_NT_price_per_token_input: 

In [18]:
filtered_reviews, test_cal_results = sample_reviews_and_calculate_price_then_return_data(test_data, percent=100, 
                                                        user_reviews_num=100000,
                                                        user_comparison="max",
                                                        user_reviews_num_range=None,
                                                        business_reviews_num=100000,
                                                        business_comparison="max",
                                                        business_reviews_num_range=None,
                                                        genai="GPT-3.5 Turbo", 
                                                        sampling_method='random', column='None')
print_nested_dict(test_cal_results)

Percent: 100 %
User threshold reviews num: 100000
User reviews num comparison method: max
User threshold reviews num range: None
Business threshold reviews num: 100000
Business reviews num comparison method: max
Business threshold reviews num range: None
Generative AI model: GPT-3.5 Turbo
Sampling_method: random
Column: None


-----------------------------------------------------------


percent: 100
user_reviews_num_threshold: 100000
user_comparison_method: max
user_reviews_num_range: None
business_reviews_num_threshold: 100000
business_comparison_method: max
business_reviews_num_range: None
sampling_method: random
column: None
sampled_percent_reviews_num: 1400
filtered_reviews_num: 1400
filtered_users_count: 958
filtered_businesses_count: 872
min_reviews_per_user: 1
mean_reviews_per_user: 1.4613778705636744
max_reviews_per_user: 6
min_reviews_per_business: 1
mean_reviews_per_business: 1.6055045871559632
max_reviews_per_business: 11
genai: GPT-3.5 Turbo
model_NT_price_per_token_input:

In [20]:
# Model	Training	Input usage	Output usage
# gpt-3.5-turbo	$8.00 / 1M tokens	$3.00 / 1M tokens	$6.00 / 1M tokens
# davinci-002	$6.00 / 1M tokens	$12.00 / 1M tokens	$12.00 / 1M tokens
# babbage-002	$0.40 / 1M tokens	$1.60 / 1M tokens	$1.60 / 1M tokens
# For a training file with 100,000 tokens trained over 3 epochs, the expected cost would be ~$2.40 USD.

training_tokens = train_cal_results['filtered_users_count']*train_cal_results['avg_token_count_per_user'] + train_cal_results['filtered_businesses_count']*train_cal_results['avg_token_count_per_business']

epoch = 3

training_price = (240/1000000) * epoch
input_usage_price = (90/1000000)
output_usage_price = (180/1000000)

fine_tune_price = training_tokens * training_price

print("Training price:", fine_tune_price)

input_tokens = valid_cal_results['sampled_percent_reviews_num'] * (valid_cal_results['avg_token_count_per_user'] + valid_cal_results['avg_token_count_per_business'])
input_usage_price = input_tokens * input_usage_price

print("Input usage price:", input_usage_price)

Training price: 2878.28928
Input usage price: 67.98531584413428


In [23]:
# Apply the function to calculate token count for each review

user_cols = list(user_data.columns)
user_argument_text_col = [col for col in user_cols if "argument" in col][0]
business_cols = list(business_data.columns)
business_argument_text_col = [col for col in business_cols if "argument" in col][0]

user_data['token_count_nltk'] = user_data[user_argument_text_col].apply(calculate_token_count_nltk)
business_data['token_count_nltk'] = user_data[user_argument_text_col].apply(calculate_token_count_nltk)

# Calculate average token count per user and per business

total_token_count_per_user = user_data.groupby('user_id')['token_count_nltk'].sum()
avg_token_count_per_user = total_token_count_per_user.mean()

print("GPT, avg_token_count_per_user:", avg_token_count_per_user)
print("GPT, avg_token_count_per_business:", avg_token_count_per_business)

total_token_count_per_business = business_data.groupby('business_id')['token_count_nltk'].sum()
avg_token_count_per_business = total_token_count_per_business.mean()

# calculate training, using price

training_tokens = train_cal_results['filtered_users_count'] * avg_token_count_per_user + train_cal_results['filtered_businesses_count'] * avg_token_count_per_business

epoch = 3

training_price = (240/1000000) * epoch
input_usage_price = (90/1000000)
output_usage_price = (180/1000000)

fine_tune_price = training_tokens * training_price

print("Training price:", fine_tune_price)

input_tokens = valid_cal_results['sampled_percent_reviews_num'] * (avg_token_count_per_user + avg_token_count_per_business)
input_usage_price = input_tokens * input_usage_price

print("Input usage price:", input_usage_price)

model_NT_price_per_token_output = 0.000045
model_NT_price_per_token_input = (model_NT_price_per_token_output / 3)
model_NT_price_per_token_input_and_output = ((model_NT_price_per_token_input + model_NT_price_per_token_output) / 2)

input_price = input_tokens * model_NT_price_per_token_input

print("GPT predicting ratings price based on argumented results:", input_price)

input_original_tokens = valid_cal_results['sampled_percent_reviews_num'] * (valid_cal_results['avg_token_count_per_user'] + valid_cal_results['avg_token_count_per_business'])
input_original_price = input_original_tokens * model_NT_price_per_token_input

print("GPT predicting ratings price based on original results:", input_original_price)

GPT, avg_token_count_per_user: 135.05862831858408
GPT, avg_token_count_per_business: 135.15066751430388
Training price: 328.88016000000005
Input usage price: 34.04637127494389
GPT predicting ratings price based on argumented results: 5.674395212490647
GPT predicting ratings price based on original results: 11.330885974022378
