In [1]:
from openai import OpenAI

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os
from dotenv import load_dotenv



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

load_dotenv("../.env")


True

In [28]:

def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

doc = read_doc('documents/') 

doc 


[]

In [3]:
from datasets import load_dataset

 

In [None]:

# Healthcare
ds = load_dataset("abhinand/MedEmbed-training-triplets-v1", "default")


In [10]:

ds = load_dataset("alvanlii/finance-textbooks")
ds

DatasetDict({
    train: Dataset({
        features: ['book_text', 'book_title'],
        num_rows: 60
    })
})

In [13]:
print(ds["train"][0])
len(ds["train"])

{'book_text': 'Artificial Intelligence in Finance - AI Applications in Finance - Overview of AI in Finance\nThe integration of artificial intelligence (AI) in finance has transformed the financial services industry in recent years. AI has been applied in various aspects of finance, including investment analysis, portfolio management, risk assessment, and compliance. This section provides an overview of AI in finance, its benefits, challenges, and future prospects.\nHistorical Development of AI\nThe concept of artificial intelligence dates back to the 1950s when computer scientists like Alan Turing, Marvin Minsky, and John McCarthy pioneered the field. Initially, AI focused on developing machines that could perform tasks that typically required human intelligence, such as problem-solving, learning, and perception. In the 1980s, AI research experienced a decline due to limited computing power and lack of data. However, with advancements in computing power, data storage, and machine learn

60

In [None]:
# Healthcare
docs = []

for row in ds["train"]:
    docs.append(
        Document(
            page_content=row["pos"],
            metadata={
                "domain": "healthcare",
                "dataset": "MedEmbed",
                "query_hint": row["query"]
            }
        )
    )


KeyError: 'pos'

In [19]:
# Financial
docs = []

for row in ds["train"]:
    docs.append(
        Document(
            page_content=row["book_text"],
            metadata={
                "domain": "finance",
                "source": "alvanlii/finance-textbooks",
                "book_title": row["book_title"]
}
        )
    )


In [20]:
len(docs)


60

In [21]:
def chunk_data(docs,chunk_Size=800,chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_Size,chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(docs)
    return docs

chunked_data = chunk_data(docs)

In [22]:
from langchain_openai import OpenAIEmbeddings

embeddings =OpenAIEmbeddings(model='text-embedding-3-small',api_key=os.environ['OPENAI_API_KEY'])
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x3684fa510>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x3684fae40>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

Large amount of data so going for only 20%

In [None]:
# texts = [doc.page_content for doc in chunked_data]

# vectors = embeddings.embed_documents(texts)

# print(len(vectors))          # number of chunks
# print(len(vectors[0]))       # embedding dimension (1536)

KeyboardInterrupt: 

In [28]:
# sample_size = int(0.04* len(chunked_data))
sample_size = int(0.2* len(chunked_data))

sampled_docs = chunked_data[:sample_size]
sampled_docs
sample_size

6386

In [29]:

texts = [doc.page_content for doc in sampled_docs]
vectors = embeddings.embed_documents(texts)

In [30]:
from pinecone import Pinecone as PineconeClient
from langchain_pinecone import PineconeVectorStore as Pinecone

pc = PineconeClient(api_key=os.environ['PINECONE_API_KEY'])
index_name = 'financial'

In [31]:
print(len(vectors[0]))  

1536


In [32]:
from pinecone import Pinecone, ServerlessSpec
import os

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

if index_name not in [i["name"] for i in pc.list_indexes()]:
    print("Creating Index",index_name)
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

Creating Index financial


In [33]:
import uuid

pinecone_vectors = []

for doc, vector in zip(sampled_docs, vectors):
    pinecone_vectors.append(
        (
            str(uuid.uuid4()),     # unique ID
            vector,                # 1536-dim embedding
            doc.metadata | {
                "text": doc.page_content   # optional but recommended
            }
        )
    )


In [34]:
BATCH_SIZE = 100

for i in range(0, len(pinecone_vectors), BATCH_SIZE):
    batch = pinecone_vectors[i : i + BATCH_SIZE]
    index.upsert(vectors=batch)

In [35]:
stats = index.describe_index_stats()
print(stats)

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 6386}},
 'total_vector_count': 6386,
 'vector_type': 'dense'}
