In [1]:
import pandas as pd
import tiktoken

In [2]:
# Tokenizer shared by every token count in this notebook.
# NOTE(review): `ECONDING_NAME` looks like a misspelling of "ENCODING_NAME"; the
# name is left unchanged here in case other cells reference it.
ECONDING_NAME = 'cl100k_base'
enc = tiktoken.get_encoding(ECONDING_NAME)

# Sanity check: text with accented characters must survive an
# encode -> decode round trip unchanged.
roundtrip_sample = "mengão campeão"
assert enc.decode(enc.encode(roundtrip_sample)) == roundtrip_sample

In [3]:
# Classification instruction; <<PHRASE>> is presumably the placeholder that is
# replaced by each text before the request is sent (not shown in this notebook).
prompt = (
    'classifique a sentença <<PHRASE>> como positiva, negativa ou neutra. '
    'Responda apenas com "positiva", "negativa" ou "neutra".'
)

# Token length of the template as written (the placeholder text is counted too).
prompt_tokens = len(enc.encode(prompt))
prompt_tokens

38

In [4]:
# Load the `text` column of each dataset as an array of strings.
# Rows with a missing `text` are dropped for BOTH corpora: a NaN is not a
# string and would break tokenization in the cells below.
facebook_corpus = (
    pd.read_csv('data/processed/facebook_cachaca.csv')
    .dropna(subset=['text'])['text']
    .to_numpy()
)
twitter_corpus = (
    pd.read_csv('data/raw/twitter-dataset-CachacaOM-completo.csv')
    .dropna(subset=['text'])['text']
    .to_numpy()
)

In [5]:
def count_tokens(corpus, encoding=None, overhead_tokens=None):
    """Return the total number of input tokens needed to send every text in `corpus`.

    Each text contributes the length of its encoding plus a fixed per-text
    overhead (the token length of the prompt sent with it).

    Parameters
    ----------
    corpus : iterable of str
        Texts to be counted.
    encoding : object with an ``encode(text)`` method, optional
        Tokenizer to use. Defaults to the notebook-wide ``enc``. Pass another
        encoding to redo the count for a model with a different vocabulary.
    overhead_tokens : int, optional
        Tokens added for every text. Defaults to the notebook-wide
        ``prompt_tokens``.

    Returns
    -------
    int
        Sum over all texts of ``len(encoding.encode(text)) + overhead_tokens``;
        0 for an empty corpus.
    """
    # The globals are only looked up when no override is given, so the function
    # can also be used on its own with explicit arguments.
    encoder = enc if encoding is None else encoding
    per_text_overhead = prompt_tokens if overhead_tokens is None else overhead_tokens
    return sum(len(encoder.encode(text)) + per_text_overhead for text in corpus)

In [6]:
# Total input tokens (text + prompt) per corpus, in the order Facebook, Twitter.
facebook_corpus_tokens, twitter_corpus_tokens = (
    count_tokens(texts) for texts in (facebook_corpus, twitter_corpus)
)
facebook_corpus_tokens, twitter_corpus_tokens

(1776264, 1533719)

In [16]:
# Number of tokens that each price below refers to (prices are divided by
# BASE when a cost is computed).
BASE = 10 ** 6

# Per-model price table: model name -> {'input': price, 'output': price},
# expressed per BASE tokens.
prices = {
    model_name: {'input': input_price, 'output': output_price}
    for model_name, input_price, output_price in (
        ('gpt-4o', 5, 15),
        ('gpt-4-turbo', 10, 30),
        ('gpt-4', 30, 60),
        ('gpt-3.5-turbo-0125', 0.5, 1.5),
    )
}

In [19]:
def get_prices(corpus, tokens, output_tokens_per_text=2, price_table=None, price_unit=None):
    """Print the estimated input and output cost for every model in the price table.

    Parameters
    ----------
    corpus : sized collection
        The texts to classify; only its length is used (one response per text).
    tokens : int
        Total number of input tokens for the whole corpus.
    output_tokens_per_text : int, optional
        Assumed number of tokens in each response. Defaults to 2, the value
        previously hard-coded in the formula.
    price_table : dict, optional
        Mapping ``model -> {'input': price, 'output': price}``. Defaults to the
        notebook-wide ``prices``.
    price_unit : int, optional
        Number of tokens each price refers to. Defaults to the notebook-wide
        ``BASE``.

    Returns
    -------
    None
        The estimate is printed, one block per model, formatted to two decimals.
    """
    # Globals are only looked up when no override is supplied.
    table = prices if price_table is None else price_table
    unit = BASE if price_unit is None else price_unit
    # len() instead of .shape[0] so plain lists work as well as arrays.
    n_texts = len(corpus)

    for model, price in table.items():
        input_cost = (price['input'] * tokens) / unit
        output_cost = (price['output'] * output_tokens_per_text * n_texts) / unit
        print(f'-> {model}')
        print(f"input: {input_cost:.2f} USD")
        print(f"output: {output_cost:.2f} USD")
        print()

In [22]:
# Print the cost estimate for each corpus under its own heading.
for source_label, source_corpus, source_tokens in (
    ('Facebook:', facebook_corpus, facebook_corpus_tokens),
    ('Twitter:', twitter_corpus, twitter_corpus_tokens),
):
    print(source_label)
    get_prices(source_corpus, source_tokens)

Facebook:
-> gpt-4o
input: 8.88 USD
output: 1.04 USD

-> gpt-4-turbo
input: 17.76 USD
output: 2.08 USD

-> gpt-4
input: 53.29 USD
output: 4.15 USD

-> gpt-3.5-turbo-0125
input: 0.89 USD
output: 0.10 USD

Twitter:
-> gpt-4o
input: 7.67 USD
output: 0.53 USD

-> gpt-4-turbo
input: 15.34 USD
output: 1.07 USD

-> gpt-4
input: 46.01 USD
output: 2.13 USD

-> gpt-3.5-turbo-0125
input: 0.77 USD
output: 0.05 USD

