# RAG (LangChain with Mistral and Cohere)

### Import dependencies

In [1]:
import getpass
import os

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_mistralai import MistralAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [2]:
def _require_secret(env_name):
    """Return the secret held in ``os.environ[env_name]``.

    When the variable is unset or empty, the user is prompted for it (input is
    not echoed) and the answer is stored in ``os.environ`` so that libraries
    reading the environment can see it too.
    """
    if not os.environ.get(env_name):
        os.environ[env_name] = getpass.getpass(f"{env_name}: ")
    return os.environ[env_name]


# LangSmith tracing configuration (non-secret settings).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Secrets come from the environment or an interactive prompt, never from the
# notebook source. Any key that was previously written into this cell should be
# treated as leaked and revoked.
_require_secret('LANGCHAIN_API_KEY')
mistral_api_key = _require_secret('MISTRAL_API_KEY')

In [3]:
# Hugging Face token: reuse the value already exported in the environment,
# otherwise prompt for it (no echo). It is never stored in the notebook; a token
# that was previously written here should be treated as leaked and revoked.
# (The original line also ended with a stray backtick, which is a syntax error.)
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass.getpass('HF_TOKEN: ')

## CSV: RAG over the MTSamples CSV (Cohere embeddings and chat model)

In [61]:
from langchain.embeddings import CohereEmbeddings
from langchain_cohere import ChatCohere
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

In [59]:
# Cohere credential: taken from the environment, otherwise prompted for (no
# echo). It is never hardcoded; a key previously written here should be revoked.
# `cohere_key` is also passed to the ChatCohere model in a later cell.
cohere_key = os.environ.get("COHERE_API_KEY") or getpass.getpass("COHERE_API_KEY: ")

# Location of the MTSamples CSV. Set MTSAMPLES_CSV to point at your own copy
# instead of editing the notebook; the default is the original local path.
CSV_PATH = os.environ.get(
    "MTSAMPLES_CSV",
    "/Users/jean-sebastiengaultier/Desktop/UChicago/Academic/Hackathon/mtsamples_with_rand_names.csv",
)

embeddings_model = CohereEmbeddings(cohere_api_key=cohere_key)
loader = CSVLoader(file_path=CSV_PATH)
data = loader.load()

# Only the first 10 loaded documents are indexed, which keeps the experiment small.
data_test = data[:10]

# Text is cut only at newlines, so a single line longer than chunk_size is kept
# whole; that is what the "Created a chunk of size ..." messages in the recorded
# output report.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
split_csv = text_splitter.split_documents(data_test)

# Embed the chunks with Cohere and index them in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=split_csv,
                                    embedding=embeddings_model)

#retriever = vectorstore.as_retriever()

Created a chunk of size 1346, which is longer than the specified 1000
Created a chunk of size 2446, which is longer than the specified 1000
Created a chunk of size 4437, which is longer than the specified 1000
Created a chunk of size 1633, which is longer than the specified 1000
Created a chunk of size 4349, which is longer than the specified 1000
Created a chunk of size 4030, which is longer than the specified 1000
Created a chunk of size 1538, which is longer than the specified 1000


In [82]:
# Retriever over the vector store using the "similarity_score_threshold" search
# type with a score threshold of 0.1.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.1},
)

# Stand-alone retrieval for the sample question; the chain built below does not
# read `retrieved_docs`, it queries the retriever itself.
retrieved_docs = retriever.invoke("How many people have allergies?")

# RAG prompt template fetched from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Cohere chat model used to generate the answer.
llm = ChatCohere(model="command-r", cohere_api_key=cohere_key)

def format_docs(docs):
    """Join the ``page_content`` of each document in ``docs`` with a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` yields nothing.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# RAG pipeline: the incoming question is passed through unchanged under
# "question", while "context" is produced by running the retriever and
# flattening its documents with format_docs. The resulting mapping feeds the
# prompt, then the chat model, and the output is parsed to a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Stream the answer and print each piece as it arrives, without line breaks.
for piece in rag_chain.stream("How many people have allergies?"):
    print(piece, end="", flush=True)



According to the retrieved context, it is unclear how many people have allergies. However, it is estimated that around 30% of the global population suffers from some form of allergic disease. Allergies are quite common, affecting people of all ages and backgrounds.

At least two people have allergies. A 23-year-old female suffers from allergic rhinitis, and a 42-year-old male is allergic to penicillin.

## Indexing

In [13]:
# Directory holding the Markdown files to index. Set RAG_DATA_PATH to point
# somewhere else without editing the notebook; the default is the original
# local folder.
DATA_PATH = os.environ.get(
    "RAG_DATA_PATH",
    "/Users/jean-sebastiengaultier/Desktop/UChicago/Academic/Hackathon/data_test",
)

# Load data using Markdown Document Loader
def load_markdown_documents(directory):
    """Load every ``.md`` file located directly inside ``directory``.

    Files are visited in sorted (lexicographic) filename order. ``os.listdir``
    gives no ordering guarantee, and later cells address the resulting chunks
    by position, so an unsorted listing would make those positions vary from
    one run or machine to the next.

    Args:
        directory: Path of the folder to scan; sub-folders are not descended.

    Returns:
        A list with one entry per Markdown file, each entry being whatever
        ``UnstructuredMarkdownLoader(path).load()`` returned for that file.
        The results are appended, not flattened.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.md'):
            file_path = os.path.join(directory, filename)
            documents.append(UnstructuredMarkdownLoader(file_path).load())
    return documents

# One entry per Markdown file found in DATA_PATH; each entry is that file's
# loader.load() result, so `documents` is a list of per-file results.
documents = load_markdown_documents(DATA_PATH)

In [18]:
# Split them into chunks
def split_documents_by_paragraph(documents):
    """Chunk each group of loaded documents on blank lines.

    Args:
        documents: Iterable of document groups; each group is handed as-is to
            ``CharacterTextSplitter.split_documents``.

    Returns:
        One flat list containing the chunks of every group, in input order.
    """
    paragraph_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    # The nested comprehension flattens the per-group chunk lists into one list.
    return [
        piece
        for group in documents
        for piece in paragraph_splitter.split_documents(group)
    ]

# Chunk every loaded file. The function extends a single list with each file's
# chunks, so `split` is one flat list of chunks.
split = split_documents_by_paragraph(documents)

In [19]:
def create_embeddings_and_store(split_docs, mistral_api_key):
    """Create a Chroma store backed by MistralAI embeddings and add the chunks to it.

    Args:
        split_docs: Chunks to add to the store.
        mistral_api_key: API key handed to ``MistralAIEmbeddings``.

    Returns:
        The ``Chroma`` instance after ``add_documents(split_docs)`` has been called.
    """
    store = Chroma(embedding_function=MistralAIEmbeddings(api_key=mistral_api_key))
    store.add_documents(split_docs)
    return store

# Embed all chunks with MistralAI and add them to a Chroma store in one call.
# The recorded run of this cell failed with KeyError: 'data' (see the output below).
# Rebinds `vectorstore`, which the CSV section also assigns.
vectorstore = create_embeddings_and_store(split, mistral_api_key)

An error occurred with MistralAI: 'data'


KeyError: 'data'

In [5]:
# Build the MistralAI embeddings client and a Chroma collection named
# "langchain_store" by hand. No documents are passed here; the add_documents
# cells that follow insert them. Rebinds `vectorstore`, which the CSV section
# also assigns.
embeddings = MistralAIEmbeddings(api_key=mistral_api_key)
vectorstore = Chroma(
    collection_name="langchain_store",
    embedding_function=embeddings,
)



In [6]:
# Insert split[0] on its own; the following cells do the same for later indices,
# presumably to find which input triggers the MistralAI failure.
# NOTE(review): split_documents_by_paragraph builds `split` with extend(), i.e. a
# flat list of chunks, yet the two ids recorded below suggest split[0] was a list
# of chunks when this ran. That looks like stale kernel state; confirm on a fresh
# Restart & Run All.
vectorstore.add_documents(split[0])

['0bc171ba-cfdb-4a3e-b681-64e743606356',
 'fa5f5df1-b004-4809-88de-72be27c6fd47']

In [7]:
# Insert split[1] into the store; the recorded output is the list of ids returned.
vectorstore.add_documents(split[1])

['420d91d6-8467-40cc-8621-5f146ee0b290',
 'e526e493-d4e8-4c13-8aa3-2ed14029ebaf']

In [8]:
# Insert split[2] into the store; the recorded output is the list of ids returned.
vectorstore.add_documents(split[2])

['64ab3b7d-af67-48c3-8edf-47194192a537',
 'fc59d596-ac84-4339-91b4-e8c902ed3aca']

In [10]:
# Display split[3], the entry whose insert fails in the next cell. This cell's
# execution count (10) is later than that cell's (9), so it was run after the failure.
split[3]

[Document(page_content='Patient Data: 4\n\nName: Eugene Hewitt\n\nSample Name:  2-D Echocardiogram - 2\n\nMedical Field:  Cardiovascular / Pulmonary\n\nDescription:  2-D Echocardiogram\n\nKeywords: cardiovascular / pulmonary, 2-d, doppler, echocardiogram, annular, aortic root, aortic valve, atrial, atrium, calcification, cavity, ejection fraction, mitral, obliteration, outflow, regurgitation, relaxation pattern, stenosis, systolic function, tricuspid, valve, ventricular, ventricular cavity, wall motion, pulmonary artery', metadata={'source': '/Users/jean-sebastiengaultier/Desktop/UChicago/Academic/Hackathon/data_test/4.md'}),
 Document(page_content='Transcription: 1.  The left ventricular cavity size and wall thickness appear normal.  The wall motion and left ventricular systolic function appears hyperdynamic with estimated ejection fraction of 70% to 75%.  There is near-cavity obliteration seen.  There also appears to be increased left ventricular outflow tract gradient at the mid cav

In [9]:
# Recorded run failed: "An error occurred with MistralAI: 'data'" followed by KeyError: 'data'.
# NOTE(review): the cause is not visible here. Possibly the embeddings API returned an
# error payload without a 'data' field (rate limit or oversized input); confirm.
vectorstore.add_documents(split[3])

An error occurred with MistralAI: 'data'


KeyError: 'data'

In [50]:
# Insert split[4] into the store; the recorded output is the list of ids returned.
vectorstore.add_documents(split[4])

['a890c275-930e-4458-b305-c94ff76cb5f7',
 'ba432d36-2690-46a9-9b6c-e860bd5100ac']

In [51]:
# Insert split[5] into the store; the recorded run returned a single id.
vectorstore.add_documents(split[5])

['33159831-ed1d-4858-bcac-e654b4660be8']

In [52]:
# Display split[5] to inspect what was just inserted.
split[5]

[Document(page_content='Patient Data: 9\n\nName: Johnnie Davis\n\nSample Name:  2-D Echocardiogram - 4\n\nMedical Field:  Cardiovascular / Pulmonary\n\nDescription:  Echocardiogram and Doppler\n\nKeywords: cardiovascular / pulmonary, ejection fraction, lv systolic function, cardiac chambers, regurgitation, tricuspid, normal lv systolic function, normal lv systolic, ejection fraction estimated, normal lv, lv systolic, systolic function, function ejection, echocardiogram, doppler, lv, systolic, ejection, mitral, valve\n\nTranscription: DESCRIPTION:,1.  Normal cardiac chambers size.,2.  Normal left ventricular size.,3.  Normal LV systolic function.  Ejection fraction estimated around 60%.,4.  Aortic valve seen with good motion.,5.  Mitral valve seen with good motion.,6.  Tricuspid valve seen with good motion.,7.  No pericardial effusion or intracardiac masses.,DOPPLER:,1.  Trace mitral regurgitation.,2.  Trace tricuspid regurgitation.,IMPRESSION:,1.  Normal LV systolic function.,2.  Eject

In [46]:
# Wrap the current `vectorstore` in a retriever using as_retriever()'s defaults
# (no search arguments are passed). Rebinds `retriever`, which the CSV section
# also assigns.
retriever = vectorstore.as_retriever()
retriever  # bare last expression: the notebook displays the retriever's repr

VectorStoreRetriever(tags=['Chroma', 'MistralAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x1747854d0>)