In [1]:
import os
# Move the working directory two levels up so the relative `src.` imports and
# `data/` paths used below resolve (presumably to the repository root — confirm).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up another two levels.
os.chdir("../../")

In [2]:
import pandas as pd
import tiktoken

from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from src.rag_pipeline.load_docs import load_docs_from_csv
from src.rag_pipeline.chunking_strategies import chunk_by_recursive_split
from src.env_loader import load_api_keys

In [3]:
# Fetch the OpenAI key through the project helper.
# NOTE(review): `openai_api_key` is not passed to OpenAIEmbeddings further down;
# presumably the helper (or the environment) makes the key available to the
# client — confirm.
openai_api_key = load_api_keys('OPENAI_API_KEY')

In [8]:

# Load the CNN/DailyMail validation subset CSV into a pandas DataFrame.
# NOTE(review): the preview shows an "Unnamed: 0" column — presumably the index
# written when the CSV was saved; `index_col=0` would absorb it — confirm.
df = pd.read_csv('data/cnn_dailymail_validation_subset.csv')
df.head()

Unnamed: 0,article,highlights,id
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,a4942dd663020ca54575471657a0af38d82897d6
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,4157bc4da185971e2742f349d69a037343bc0d95
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,60736693e3b1b32d14337a317190c6606e879a85
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",2e6613d531843515bf5401286cc3e45c4df530d2


In [9]:
# df = df.head(10)

### Calculate the cost of embedding data

In [10]:
# Helper functions to help us create the embeddings

# Helper func: calculate number of tokens
def num_tokens_from_string(string: str, encoding_name = "cl100k_base") -> int:
    """Count the tokens `string` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize. Any falsy value (e.g. ``""`` or ``None``)
            counts as zero tokens.
        encoding_name: Name handed to ``tiktoken.get_encoding``.

    Returns:
        The length of the token list produced by encoding `string`.
    """
    if string:
        tokens = tiktoken.get_encoding(encoding_name).encode(string)
        return len(tokens)
    # Falsy input: nothing to encode.
    return 0

# Helper function: calculate word length of article
def get_article_word_length(article):
    """Return how many whitespace-separated words `article` contains."""
    return len(article.split())

# Helper function: calculate cost of embedding num_tokens
# text-embedding-ada-002 - 0.0001 for 1000 tokens
# Using text-embedding-3-large - 0.00013 for 1000 tokens
# Pricing here - https://openai.com/pricing
def get_embedding_cost(num_tokens, price_per_1k_tokens=0.00013):
    """Estimate the cost in USD of embedding `num_tokens` tokens.

    Args:
        num_tokens: Number of tokens to embed.
        price_per_1k_tokens: Price in USD per 1000 tokens. Defaults to 0.00013
            (the text-embedding-3-large rate); pass 0.0001 to price
            text-embedding-ada-002 instead.

    Returns:
        The estimated cost in USD as a float.
    """
    # Same evaluation order as before (divide, then multiply) so the default
    # rate yields bit-identical results to the previous hard-coded version.
    return num_tokens / 1000 * price_per_1k_tokens

# Helper function: calculate total cost of embedding all content in the dataframe
def get_total_embeddings_cost(df: pd.DataFrame):
    """Estimate the total cost of embedding every article in `df`.

    Args:
        df: DataFrame with an ``article`` column holding the text to embed.

    Returns:
        The value of ``get_embedding_cost`` for the summed token count of all
        articles.
    """
    # Iterate over the column values directly instead of looking rows up with
    # df['article'][i]: that lookup is label-based, so it fails (or picks the
    # wrong rows) whenever the index is not 0..n-1, e.g. after filtering or
    # sampling the frame.
    total_tokens = sum(num_tokens_from_string(text) for text in df['article'])
    return get_embedding_cost(total_tokens)



In [11]:
# quick check on total token amount for price estimation
# NOTE(review): the message below says "ada-002", but get_embedding_cost uses the
# 0.00013 per 1000 tokens rate that the comment above it attributes to
# text-embedding-3-large — reconcile the label and the rate.
total_cost = get_total_embeddings_cost(df)
print("estimated price to embed this content using ada-002 model = $" + str(total_cost))

estimated price to embed this content using ada-002 model = $0.10470407999999999


### Load the documents from csv

In [10]:
# Load the articles through the project loader; `as_document=True` presumably
# returns LangChain `Document` objects rather than raw rows — confirm in load_docs.
documents = load_docs_from_csv(as_document=True)

In [11]:
# Sanity check: how many documents were loaded.
len(documents)

1000

In [12]:
# Split the documents into chunks with the project's recursive splitter.
# The positional 1000 and 200 are presumably chunk size and chunk overlap —
# confirm against chunk_by_recursive_split's signature.
split_docs = chunk_by_recursive_split(documents, 1000, 200)

--Split 1000 documents into 5030 chunks.--


In [13]:
# Keep only the first two chunks, presumably to keep the embedding and insert
# steps below cheap while testing.
# NOTE: this rebinds `split_docs`; re-run the chunking cell to get the full list back.
split_docs = split_docs[:2]

### Create embeddings for the documents

In [14]:
# Embedding client handed to the vector store below.
# NOTE(review): this uses text-embedding-ada-002, whereas the cost helper above
# is priced at the text-embedding-3-large rate — confirm which model is intended.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

### Initialize the vectorstore
Instructions for starting a Postgres container with pgvector enabled are in the [official LangChain docs](https://python.langchain.com/v0.2/docs/integrations/vectorstores/pgvector/).
#### 1. First approach - initialize vectorstore with documents

In [None]:
# Test connection string for the local pgvector container.
# An optional PGVECTOR_CONNECTION_STRING environment variable overrides it, so
# real credentials never have to be written into the notebook; without the
# variable the value is the same local test string as before.
connection_string = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg://langchain:langchain@localhost:5432/langchain",
)

In [15]:
# collection_name = "cnn_dailymail_validation_subset"

# vectorstore = PGVector.from_documents(
#     embedding=embeddings,
#     documents=split_docs,
#     collection_name=collection_name,
#     connection=connection_string,
#     use_jsonb=True,
# )

###### Dropping tables in pgvector
```vectorstore.drop_tables()```

#### 2. Second approach - Initialize vectorstore first and add documents later

In [20]:
# Name of the PGVector collection the chunks are stored under.
collection_name = "cnn_dailymail_validation_subset_2"

# Create the store without any documents; they are added in a later cell.
# NOTE(review): `connection_string` comes from the cell marked `In [None]` above,
# so that cell must be run first on a fresh kernel.
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection_string,
    use_jsonb=True,
)

In [23]:
# Uncomment the next line to drop the store's tables before inserting:
# vectorstore.drop_tables()
# Embed and insert the chunks; the call returns the ids of the stored entries.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# stores the same chunks again under new ids — confirm.
vectorstore.add_documents(split_docs)

['3d0bc0fc-bd13-4ed7-ae0b-d0e49aad7c22',
 '441b81cd-21c8-4347-9569-ae7be1796ada']

In [24]:
# Wrap the vector store as a retriever; no search arguments are passed, so the
# library defaults apply.
retriever = vectorstore.as_retriever()

In [25]:
# Smoke test: run a sample question through the retriever and inspect the
# returned chunks.
retriever.invoke("What was the power of Broussard's gift?")

[Document(metadata={'id': 'a4942dd663020ca54575471657a0af38d82897d6', 'source': 'cnn_dailymail', 'start_index': 0}, page_content='(CNN)Share, and your gift will be multiplied. That may sound like an esoteric adage, but when Zully Broussard selflessly decided to give one of her kidneys to a stranger, her generosity paired up with big data. It resulted in six patients receiving transplants. That surprised and wowed her. "I thought I was going to help this one person who I don\'t know, but the fact that so many people can have a life extension, that\'s pretty big," Broussard told CNN affiliate KGO. She may feel guided in her generosity by a higher power. "Thanks for all the support and prayers," a comment on a Facebook page in her name read. "I know this entire journey is much bigger than all of us. I also know I\'m just the messenger." CNN cannot verify the authenticity of the page. But the power that multiplied Broussard\'s gift was data processing of genetic profiles from donor-recipie