In [1]:
import os

# Move the working directory two levels up from where the kernel started
# (presumably the project root — the cells below import `src.*` and read
# `data/...` relative to it).
# The starting directory is recorded once per kernel session so that re-running
# this cell always lands in the same place instead of climbing two more levels
# on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, "..", ".."))

In [13]:
import pandas as pd
import tiktoken

from src.rag_pipeline.load_docs import load_docs_from_csv
from src.rag_pipeline.chunking_strategies import chunk_by_recursive_split
from src.env_loader import load_api_keys

In [14]:
# Fetch the OpenAI API key through the project's env loader instead of
# hard-coding it in the notebook.
# NOTE(review): presumably read from the environment / a .env file — confirm in src/env_loader.
openai_api_key = load_api_keys('OPENAI_API_KEY')

In [3]:

# Read the CNN/DailyMail validation subset into a DataFrame and preview its first rows
csv_path = 'data/cnn_dailymail_validation_subset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,article,highlights,id
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,a4942dd663020ca54575471657a0af38d82897d6
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,4157bc4da185971e2742f349d69a037343bc0d95
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,60736693e3b1b32d14337a317190c6606e879a85
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",2e6613d531843515bf5401286cc3e45c4df530d2


In [4]:
# Restrict the frame to its first 10 rows; everything computed from `df`
# after this point covers only these rows.
df = df.iloc[:10]

### Calculate the cost of embedding data

In [5]:
# Helper functions for estimating the cost of creating the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Count the tokens `string` occupies under a tiktoken encoding.

    Args:
        string: Text to tokenize. Falsy values (e.g. ``""`` or ``None``)
            count as 0 tokens and never reach the tokenizer.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The number of tokens produced by encoding `string`.
    """
    if string:
        tokenizer = tiktoken.get_encoding(encoding_name)
        return len(tokenizer.encode(string))
    # Nothing to tokenize
    return 0

# Helper function: calculate word length of article
def get_article_word_length(article):
    """Return how many whitespace-separated words `article` contains."""
    return len(article.split())

# Helper function: calculate cost of embedding num_tokens
# I'm using the text-embedding-ada-002 model
# Pricing here - https://openai.com/pricing
def get_embedding_cost(num_tokens, price_per_1k_tokens=0.0001):
    """Estimate the dollar cost of embedding `num_tokens` tokens.

    Args:
        num_tokens: Number of tokens to be embedded.
        price_per_1k_tokens: Price charged per 1,000 tokens. The default
            (0.0001) is the rate this notebook uses for
            text-embedding-ada-002; pass another value for a different
            model or an updated price list.

    Returns:
        The estimated cost, i.e. ``num_tokens / 1000 * price_per_1k_tokens``.
    """
    return num_tokens / 1000 * price_per_1k_tokens

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost(df: pd.DataFrame) -> float:
    """Estimate the cost of embedding every article in `df`.

    Args:
        df: DataFrame with an ``'article'`` column holding the texts to embed.

    Returns:
        The estimated cost for the combined token count of all non-null
        articles, as computed by `get_embedding_cost`.

    Raises:
        KeyError: If `df` has no ``'article'`` column.
    """
    # Walk the column's values rather than looking rows up by the labels
    # 0..n-1: label lookups raise KeyError as soon as the frame has been
    # filtered, sampled or otherwise given a non-default index.
    # Missing articles (NaN) are dropped because NaN is truthy and would
    # otherwise slip past the empty-string guard and crash the tokenizer.
    articles = df['article'].dropna()
    total_tokens = sum(num_tokens_from_string(text) for text in articles)
    return get_embedding_cost(total_tokens)



In [6]:
# Estimate what embedding every article currently in `df` would cost
total_cost = get_total_embeddings_cost(df)
print(f"estimated price to embed this content using ada-002 model = ${total_cost}")

estimated price to embed this content using ada-002 model = $0.0006951


### Load the documents from CSV

In [8]:
# Load the articles through the project's CSV loader.
# NOTE(review): no path is passed, so the file being read is decided inside
# load_docs_from_csv; as_document=True presumably yields document objects
# rather than raw rows — confirm in src/rag_pipeline/load_docs.
documents = load_docs_from_csv(as_document=True)

In [9]:
# Sanity check: how many documents were loaded
len(documents)

1000

In [10]:
# Split the loaded documents into smaller chunks.
# NOTE(review): 1000 and 200 are presumably chunk size and chunk overlap —
# confirm the argument order and units (characters vs tokens) in
# src/rag_pipeline/chunking_strategies.
split_docs = chunk_by_recursive_split(documents, 1000, 200)

--Split 1000 documents into 5030 chunks.--
