### Preprocess data needed for LangChain and Retrieval Augment Generation & Question Answering

In [1]:
import json
import pandas as pd

json_data = json.load(open('merged_file.json'))
print(len(json_data['data']))
print(json_data['data'][0])
print(json_data['data'][0]['paragraphs'][0]['context'])
new_d = []
for i in range(len(json_data['data'])):
  each_d = {}
  each_d['context'] = json_data['data'][i]['paragraphs'][0]['context']
  new_d.append(each_d)
df = pd.DataFrame(new_d)
# Save the DataFrame to a CSV file
df.to_csv('merged_file.csv', index=False)

4532
{'title': 'https://www.webmd.com/digestive-disorders/digestive-diseases-gastritis', 'paragraphs': [{'qas': [{'url': 'https://www.webmd.com/digestive-disorders/qa/what-are-the-symptoms-of-gastritis', 'id': '46a31525fae6a6b203c9be1e0eb7017d', 'answers': [{'text': 'However, the most common symptoms include: Nausea or recurrent upset stomach Abdominal bloating Abdominal pain Vomiting Indigestion Burning or gnawing feeling in the stomach between meals or at night Hiccups Loss of appetite Vomiting blood or coffee ground-like material Black, tarry stools To diagnose gastritis, your doctor will review your personal and family medical history, perform a thorough physical evaluation, and may recommend any of the following tests: Upper endoscopy.', 'answer_span': [7], 'answer_start': 1688, 'answer_starts': [[1688, 482]]}], 'question': 'What are the symptoms of gastritis?', 'is_impossible': False}, {'url': 'https://www.webmd.com/digestive-disorders/qa/what-does-the-treatment-for-gastritis-inv

In [1]:
from operator import itemgetter
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.vectorstores import FAISS
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

  from .autonotebook import tqdm as notebook_tqdm


### Use langchain CSV loader to load the CSV file and use recursive character text splitter

In [2]:
#use langchain to load csv file
from langchain.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(file_path='./data_val_consec.csv', source_column="context")

data = loader.load()

# Split documents

from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
splits = text_splitter.split_documents(loader.load())

In [3]:
# Embed and store splits
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings

In [4]:
#embeddings_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings_model_name = "alibidaran/medical_transcription_generator"
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

No sentence-transformers model found with name /home2/kolubex/.cache/torch/sentence_transformers/alibidaran_medical_transcription_generator. Creating a new one with MEAN pooling.


In [1]:
vectorstore = Chroma.from_documents(documents=splits,embedding=embeddings, persist_directory="./vectorstore_val_consec")
retriever = vectorstore.as_retriever()
# Prompt
# https://smith.langchain.com/hub/rlm/rag-prompt

from langchain import hub
rag_prompt = hub.pull("rlm/rag-prompt")

In [2]:
vectorstore.persist()
vectorstore = None

In [37]:
vectorstore = Chroma(persist_directory="./vectorstore_train", embedding_function=embeddings)

In [38]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch
from langchain.llms import HuggingFaceHub

In [39]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-small'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=False, device_map='auto')

pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=128
)

hf = HuggingFacePipeline(pipeline=pipeline)

In [40]:
retriever = vectorstore.as_retriever()
# Prompt
# https://smith.langchain.com/hub/rlm/rag-prompt

from langchain import hub
rag_prompt = hub.pull("rlm/rag-prompt")

In [41]:
 # RAG chain

from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | hf
)

In [44]:
import time
s = time.time()
ress = rag_chain.invoke("What are the symptoms of ischemic heart disease?")
print(ress)
print(time.time()-s)

systolic heart failure
7.006139039993286
