In [13]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain import embeddings
from langchain.chains import RetrievalQA

### DOCUMENT LOADING

In [16]:
from langchain.document_loaders.csv_loader import CSVLoader

# Read the symptom/precaution CSV into LangChain documents.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
csv_path = r"C:\Users\ANITA\Downloads\Copy of symptom_precaution.csv"
loader = CSVLoader(file_path=csv_path, encoding='latin-1')
document = loader.load()


### DOCUMENT SPLITTING

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: at most 1000 characters per chunk, 200 shared between neighbours.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents and report how many chunks came out.
splits = text_splitter.split_documents(document)
print(len(splits))

41


In [27]:
# Sanity check: show the first chunk (its page_content and metadata) produced by the splitter.
print(splits[0])

page_content='Disease: Drug Reaction
Precaution: stop irritation' metadata={'source': 'C:\\Users\\ANITA\\Downloads\\Copy of symptom_precaution.csv', 'row': 0}


### CREATING EMBEDDINGS

In [18]:
from langchain import embeddings

In [30]:
from langchain.embeddings import OpenAIEmbeddings
import time

# NOTE(review): the API key is passed as a string literal; prefer reading it
# from the environment so a real key never ends up saved in the notebook.
embedding = OpenAIEmbeddings(api_key = "OPENAI_API_KEY")

# Number of chunks sent to the embedding endpoint per request
batch_size = 100
# One record per chunk: {'id', 'vector', 'text', 'metadata'}; consumed by the upsert cell
embedded_data = []

for start_index in range(0, len(splits), batch_size):
    batch_splits = splits[start_index:start_index + batch_size]

    batch_embeddings = embedding.embed_documents([split.page_content for split in batch_splits])

    # Pair every vector with the chunk it was computed from. The pairing must
    # come from `batch_splits`: `counter` is relative to the current batch, so
    # indexing the full `splits` list with it would attach the text/metadata of
    # the first batch to every later batch's vectors.
    # The loop variable is named `vector` so it does not shadow the
    # `langchain.embeddings` module imported earlier in the notebook.
    for counter, (split, vector) in enumerate(zip(batch_splits, batch_embeddings)):
        data_object = {
            'id' : start_index + counter,  # global position of the chunk in `splits`
            'vector' : vector,
            'text' : split.page_content,
            'metadata' : {"source": split.metadata['source']}
        }
        embedded_data.append(data_object)

    print(f"Processed batch {start_index // batch_size + 1}")
    time.sleep(1)  # Pause to avoid rate limiting

# Debug dump, left disabled because it prints every full vector:
# print("Embedding complete. Embedded data:")
# for data in embedded_data:
#     print(f"ID: {data['id']}, vector: {data['vector']}, metadata: {data['metadata']}")


Processed batch 1
Embedding complete. Embedded data:
ID: 0, vector: [0.0027306546491218165, 0.0019340080427187876, 0.024900146709074023, -0.0011694576073394993, -0.014497653162447144, 0.03565817079848507, -0.0027997853292957915, -0.008954045679342795, 0.0006398686317642534, 0.0035421151057510696, 0.00419720882488044, 0.011218892807043931, 0.012983366374745443, 0.0214633744347847, -0.0038219291205420264, 0.02052846623520307, 0.040214197269293005, 0.00961901560906385, 0.009151561509273034, -0.008308827239482501, -0.023003996231859884, 0.02746785156684454, 0.01016547566829772, -0.01512970363195939, 0.0065608134018853926, 0.01961989397631753, 0.00979677901781072, -0.053144892693968875, -0.021832070153949095, 0.02258263049394853, 0.0019142564364426984, -0.00643901223730315, 0.01938287609798837, -0.011574420555860276, 0.013957776622725992, 0.002312579739644213, 0.005790502503347801, -0.0075187648510841466, 0.008025721930596488, -0.008038889900944533, 0.013707590774048789, 0.00175953581165164

### CREATING PINECONE INDEX

In [21]:
! pip install pinecone-client




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
from pinecone import Pinecone
import time

# Name of the Pinecone index the vectors are written to.
index_name = "csv-data"

# NOTE(review): the API key is passed as a string literal; prefer reading it
# from the environment so a real key never ends up saved in the notebook.
pc = Pinecone(api_key="PINECONE_API_KEY")

# Handle used by the upsert cell below.
Index = pc.Index(name=index_name)

### UPSERTING VECTORS INTO PINECONE INDEX

In [35]:
batch_size = 50  # Vectors sent per upsert request; adjust as needed
num_data = len(embedded_data)

for start_index in range(0, num_data, batch_size):
    batch_data = embedded_data[start_index:start_index + batch_size]

    # Shape each record for Pinecone: string id, the vector under "values", and
    # the chunk text stored in the metadata next to its source.
    items = [
        {
            "id": str(entry["id"]),
            "values": entry["vector"],
            "metadata": {**entry["metadata"], "text": entry["text"]},
        }
        for entry in batch_data
    ]

    try:
        # Upsert batch data into the Pinecone index
        response = Index.upsert(vectors=items)
        # upsert() returns a response object, not a number; read its count field
        # so the message reports the count instead of the object's repr.
        upserted_count = response.upserted_count
        print(f"Upserted {upserted_count} items.")
    except Exception as e:
        # Best effort: report the failed batch and carry on with the next one.
        print(f"Error upserting batch: {e}")

    # Optional: Add a delay to avoid exceeding API rate limits
    time.sleep(0.1)  # Adjust the delay as needed

Upserted {'upserted_count': 41} items.


In [36]:
! pip install langchain_openai
! pip install langchain_pinecone




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
import os

# Export both service keys through the process environment.
# NOTE(review): the values are string literals; prefer setting these variables
# outside the notebook so a real key never ends up saved in it.
os.environ.update({
    "OPENAI_API_KEY": "OPENAI_API_KEY",
    "PINECONE_API_KEY": "PINECONE_API_KEY",
})

from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Wrap the existing Pinecone index as a LangChain vector store.
index_name = "csv-data"
embeddings = OpenAIEmbeddings()
vectordb = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

### SEMANTIC SEARCH

In [39]:
# Direct similarity search against the vector store: fetch the 3 closest chunks.
query = "what are precaution of migrane"
# Despite its name, `retriever` holds the list of matching documents.
retriever = vectordb.similarity_search(query=query, k=3)
retriever

[Document(metadata={'source': 'C:\\Users\\ANITA\\Downloads\\Copy of symptom_precaution.csv'}, page_content='Disease: Migraine\nPrecaution: meditation'),
 Document(metadata={'source': 'C:\\Users\\ANITA\\Downloads\\Copy of symptom_precaution.csv'}, page_content='Disease: Malaria\nPrecaution: Consult nearest hospital'),
 Document(metadata={'source': 'C:\\Users\\ANITA\\Downloads\\Copy of symptom_precaution.csv'}, page_content='Disease: Hypertension\nPrecaution: meditation')]

In [40]:
from langchain.chains import RetrievalQA

# Retriever that hands the 3 closest chunks to the chain.
qa_retriever = vectordb.as_retriever(search_kwargs={"k":3})

# "stuff" chain over the retriever; source documents are not returned.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=False,
)

In [41]:
def process_llm_response(llm_response):
    """Print the answer stored under the 'result' key of a chain response.

    Args:
        llm_response: Mapping returned by the QA chain; must contain 'result'.
    """
    answer = llm_response['result']
    print(answer)


In [43]:
# Ask the retrieval-QA chain and print its answer.
query = "what are precaution of migrane in 3 points"
# Calling the chain object directly is deprecated in LangChain; invoke() is the
# supported entry point. The question is passed under the chain's "query" key.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)


1. Practicing meditation or other relaxation techniques regularly.
2. Avoiding triggers such as stress, certain foods, and lack of sleep.
3. Consulting a doctor for proper diagnosis and treatment.
