load relevant libraries

In [None]:
from pinecone import Pinecone,ServerlessSpec
from openai import OpenAI
import numpy as np
import itertools
import pandas as pd 
import uuid
import os
from dotenv import load_dotenv


# Pull settings from a local .env file into the process environment
load_dotenv()

# Source CSV location (left blank here; set it before running)
filepath = ''
# With chunksize set, read_csv returns an iterator that yields DataFrames
# of up to 100 rows each rather than one DataFrame for the whole file
df = pd.read_csv(filepath, chunksize=100)
# OpenAI client built from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

creating the index to store and query the data

In [None]:
def create_or_connect_index(name, dimension=1536, pool_threads=30):
    """Return a handle to the Pinecone index `name`, creating it if missing.

    Args:
        name: Name of the index to connect to or create.
        dimension: Vector dimension used when the index has to be created.
            The default of 1536 matches the output size of OpenAI's
            'text-embedding-ada-002' model.
        pool_threads: Size of the thread pool the client uses for
            asynchronous (parallel) requests.

    Returns:
        The index handle obtained from `pc.Index(...)`.
    """
    # initializing connection
    pc = Pinecone(
        api_key=os.getenv('pinecone_api_key'),
        pool_threads=pool_threads,
    )

    # list_indexes() returns an IndexList; .names() yields the plain index
    # names, which is what a membership test against `name` needs.
    if name in pc.list_indexes().names():
        print("index already exists: connected successfully")
    else:
        # creating index (the keyword is `dimension`, singular)
        pc.create_index(
            name=name,
            dimension=dimension,
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1',
            ),
        )

    return pc.Index(name, pool_threads=pool_threads)

Ingesting data into the index via parallel batching

In [None]:
def ingest(df, index, namespace, embed_model='text-embedding-ada-002'):
    """Embed every chunk of `df` and upsert the vectors into `index`.

    Args:
        df: Iterable of DataFrame chunks, each with a 'text' column
            (e.g. the reader returned by `pd.read_csv(..., chunksize=...)`).
        index: Pinecone index handle to upsert into.
        namespace: Namespace within the index that receives the vectors.
        embed_model: OpenAI embedding model used to embed the texts.

    Returns:
        The string 'uploaded' once every asynchronous upsert has completed.
    """
    async_results = []
    for chunk in df:
        texts = chunk['text'].tolist()
        if not texts:
            # Nothing to embed; skip instead of sending an empty input list.
            continue
        ids = [str(uuid.uuid4()) for _ in texts]

        # creating embeddings for texts
        # The OpenAI v1 client returns a response object, not a dict, so
        # the payload is read via attributes (.data / .embedding).
        response = client.embeddings.create(
            input=texts,
            model=embed_model,
        )
        vectors = [
            (id_, item.embedding, {'id': id_, 'text': text})
            for id_, item, text in zip(ids, response.data, texts)
        ]
        # async_req=True makes upsert return a handle immediately so that
        # several batches can be in flight at once.
        async_results.append(
            index.upsert(
                vectors=vectors,
                async_req=True,
                namespace=namespace,
            )
        )

    # Wait for all async upserts to finish
    for result in async_results:
        result.get()
    return 'uploaded'

Retrieval function

In [None]:
def retrieve(index, query, namespace, top_k=5, embed_model='text-embedding-ada-002'):
    """Return the texts and ids of the `top_k` stored vectors closest to `query`.

    Args:
        index: Pinecone index handle to search.
        query: Query text to embed and search with.
        namespace: Namespace within the index to search.
        top_k: Number of matches to request.
        embed_model: OpenAI embedding model used to embed the query.

    Returns:
        A `(documents, source)` tuple of two parallel lists: the 'text' and
        the 'id' metadata fields of each match, in the order returned.
    """
    # create embeddings for the query
    # The OpenAI v1 client returns a response object, not a dict, so the
    # embedding is read via attributes rather than subscripting.
    embedding_response = client.embeddings.create(
        input=query,
        model=embed_model,
    )
    query_embedding = embedding_response.data[0].embedding

    # performing the query
    result = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )

    documents = []
    source = []
    for match in result['matches']:
        metadata = match['metadata']
        documents.append(metadata['text'])
        source.append(metadata['id'])

    return documents, source

context builder function

In [None]:
def context_builder(user_input, context_documents):
    """Assemble the prompt: instruction, retrieved context, then the question.

    Args:
        user_input: The question to be answered.
        context_documents: Iterable of context strings; they are separated
            by a blank line in the prompt.

    Returns:
        The three prompt sections joined by single spaces.
    """
    joined_documents = "\n\n".join(context_documents)
    sections = (
        "Use the following context to answer the question.",
        "\ncontext:\n" + joined_documents,
        f"\nQuestion: {user_input} \nAnswer:",
    )
    return " ".join(sections)

connecting to chat model

In [None]:
def chat(prompt, model='gpt-4o-mini', temperature=1):
    """Send `prompt` to the chat model and return the text of its reply.

    Args:
        prompt: User message content (the question plus retrieved context).
        model: Name of the OpenAI chat model to call.
        temperature: Sampling temperature passed to the API.

    Returns:
        The message content of the first completion choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant who answers questions based on the provided context. If the context does not contain the answer, respond with 'I don't know'."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
    )
    # The OpenAI v1 client returns a response object, not a dict, so the
    # reply is read via attributes rather than subscripting.
    return response.choices[0].message.content

Putting it all together

In [None]:
# Pinecone index names may only contain lowercase alphanumerics and '-',
# so the index name is all lowercase.
index = create_or_connect_index('rag-index')
ingest(df, index, 'Rag-namespace')

query = "What is RAG?"
# NOTE(review): freshly upserted vectors may take a moment to become
# queryable -- confirm whether a short wait is needed before retrieving.
documents, source = retrieve(index, query, 'Rag-namespace', top_k=5)
prompt = context_builder(query, documents)
response = chat(prompt)

# Show the generated answer, followed by the ids of the supporting documents
print(response)
for source_doc in source:
    print(f"Source Document ID: {source_doc}")
