In [57]:
import pandas as pd
import numpy as np
import tiktoken
import json

In [58]:
MODEL_NAME = 'GPT-4o'
PRICES_PATH = './prices.json'
f = open(PRICES_PATH)
PRICES_DICT = json.load(f)[MODEL_NAME]
EMBEDDING_COST_PER_1M_TOKEN_SMALL = 0.02
EMBEDDING_COST_PER_1M_TOKEN_LARGE = 0.13
TOKEN_PER_PRODUCT = 120

In [59]:
comments_path = '../data/sample/comments.csv'
comments_df = pd.read_csv(comments_path)

In [60]:
comments_df.head()

Unnamed: 0,id,title,body,created_at,rate,recommendation_status,is_buyer,product_id,advantages,disadvantages,likes,dislikes,seller_title,seller_code,true_to_size_rate
0,49541440,خیلی عالیه,پیشنهاد میکنم,2 خرداد 1402,5.0,recommended,True,6505713,,,0,0,دیجی‌کالا,5A52N,
1,25614636,کتاب,تصویرسازی و کیفیت کاغذ و متن خیلی خوب هستند. ک...,30 شهریور 1400,0.0,recommended,True,1743432,['تصویرسازی و متن کتاب و کیفیت کاغذ و قیمت عالی'],['ندارد'],1,0,سیاره ی کتاب,AJ9XP,
2,19155784,کرم عش آلوئه ورا,من مدتهاست بعنوان کرم مرطوب کننده دست، از این ...,17 فروردین 1400,4.0,recommended,True,1096464,,,0,0,بازرگانی باتیس,ACN47,
3,54371206,,زخم صورتم که گود شده بود با استفاده مداوم روزا...,15 مهر 1402,5.0,recommended,True,1810442,,,0,0,شونیا,EXNF7,
4,54536587,,خوش بو و رولش راحت میچرخه,20 مهر 1402,5.0,recommended,True,3358296,,,0,0,آیرین پلاست,C9YXK,


In [63]:
def estimate_tokens(paragraph):
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(paragraph))

# Calculate tokens for each paragraph
comments_df['tokens_count'] = comments_df['body'].dropna().apply(estimate_tokens)  # Update 'paragraph_column' with the correct column name
n_product = comments_df['product_id'].nunique()

# Calculate the total number of tokens
body_total_tokens = comments_df['tokens_count'].sum()

total_cost_to_summarize = (body_total_tokens / 1000000) * PRICES_DICT['INPUT_COST_PER_1M_TOKEN'] + (TOKEN_PER_PRODUCT * n_product / 1000000) * PRICES_DICT['OUTPUT_COST_PER_1M_TOKEN']

print(f'Total number of tokens: {body_total_tokens}')
print(f'Total cost for summarizing the dataset with {MODEL_NAME}: ${total_cost_to_summarize:.2f}')

Total number of tokens: 236833.0
Total cost for summarizing the dataset with GPT-4o: $1.80


In [62]:
title_tokens = comments_df['title'].dropna().apply(estimate_tokens).sum()
n_product = comments_df['product_id'].nunique()
# suppose that every summarization has 100 tokens
body_tokens = n_product * TOKEN_PER_PRODUCT
embedding_total_tokens = body_tokens + title_tokens

total_cost_to_embedding_small = (embedding_total_tokens / 1000000) * EMBEDDING_COST_PER_1M_TOKEN_SMALL
total_cost_to_embedding_large = (embedding_total_tokens / 1000000) * EMBEDDING_COST_PER_1M_TOKEN_LARGE

print(f'Total number of tokens: {embedding_total_tokens}')
print(f'Total cost for embedding the dataset with small model: ${total_cost_to_embedding_small:.2f}')
print(f'Total cost for embedding the dataset with large model: ${total_cost_to_embedding_large:.2f}')

Total number of tokens: 76404
Total cost for embedding the dataset with small model: $0.00
Total cost for embedding the dataset with large model: $0.01
