In [258]:

import os
import openai
from getpass import getpass

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_pinecone import PineconeVectorStore
from langchain_mistralai import ChatMistralAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from typing import List,Optional
load_dotenv()

True

In [2]:
# API credentials are read from environment variables; a missing variable
# yields None here.
COHERE_TOKEN = os.getenv('COHERE_TOKEN')
MISTRAL_TOKEN = os.getenv('MISTRAL_TOKEN')

# os.environ only accepts string values, so assigning a missing (None) value
# would raise TypeError. Export the Pinecone key only when it is configured.
_pinecone_key = os.getenv('PINECONE')
if _pinecone_key is not None:
    os.environ['PINECONE_API_KEY'] = _pinecone_key

In [3]:
# Read the markdown file of the paper into LangChain documents.
loader = UnstructuredMarkdownLoader(
    'processed_data/pone.0083988.md',
)
loaded_documents = loader.load()

In [4]:
# Split the loaded documents into chunks (chunk_size=2048, chunk_overlap=128)
# and show how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=128,
)
docs = text_splitter.split_documents(loaded_documents)
len(docs)

45

In [5]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output with the full text of every chunk.
docs[:3]

[Document(metadata={'source': 'processed_data/pone.0083988.md'}, page_content='Mice Fed Rapamycin Have an Increase in Lifespan Associated with Major Changes in the Liver Transcriptome\n\nWilson C. Fok, William H. Wood, 3rd, Yongqing Zhang, Kevin G. Becker, Viviana I. Pe1, Yidong Chen, Alex Bokov, Yiqiang Zhang, Adam B. Salmon, Vivian Diaz, Martin Javors, Arlan Richardson*\n\nAffiliations\n\n1 Department of Cellular and Structural Biology, The University of Texas Health Science Center at San Antonio, San Antonio, Texas, United States of America\n\n2 Barshop Institute for Longevity and Aging Studies, The University of Texas Health Science Center at San Antonio, San Antonio, Texas, United States of America\n\n3 Department of Epidemiology & Biostatistics, The University of Texas Health Science Center at San Antonio, San Antonio, Texas, United States of America\n\n4 Greehey Children’s Cancer Research Institute, The University of Texas Health Science Center at San Antonio, San Antonio, Texas

In [6]:
# Name of the embedding model passed to HuggingFaceBgeEmbeddings below.
# The commented-out assignments are alternative model names; exactly one
# assignment should be active.
#model_name = 'intfloat/multilingual-e5-large'
#model_name = 'BAAI/bge-small-en-v1.5'
model_name = 'BAAI/bge-m3'
#model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

In [7]:
# Normalised embeddings: set True to compute cosine similarity.
encode_kwargs = {'normalize_embeddings': True}

# Build the embedding model from the `model_name` chosen in the previous cell.
# NOTE(review): the device is pinned to 'mps'; presumably this has to be
# changed (e.g. to 'cpu' or 'cuda') on machines without that backend.
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'mps'},
    encode_kwargs=encode_kwargs,
)

  return self.fget.__get__(instance, owner)()


In [8]:
# Index every chunk in a FAISS vector store using the embedding model above.
vectorstore_from_docs = FAISS.from_documents(docs, embedding=embeddings)

# Base retriever over that store, configured with search_kwargs k=10.
retriever = vectorstore_from_docs.as_retriever(
    search_kwargs={"k": 10},
)

In [9]:
def pretty_print_docs(docs):
    """Print the text of each document under a numbered header.

    Every entry is rendered as ``Document <n>:`` (numbered from 1), a blank
    line, and the document's ``page_content``. Consecutive entries are
    separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [176]:
# Question used both for retrieval and as the question sent to the LLM.
# The example after "For example:" is kept strictly valid JSON (balanced
# braces, no inline comments) because the answer is parsed as JSON later;
# guidance for individual fields is given in plain text before the example.
query = """What animals are used in this study (divide animals in groups and describe each group)?
Give an answer in proper JSON format, using double quotes around keys and values.
For the "group" field write only "control" or "experiment", depending on which group the animals belong to.
For example:
{"animals": [{"species": "animal_species1",
              "strain": "animal_strain1",
              "group": "control",
              "sex": "animal_sex1",
              "number_of_animals": "number_of_animals1"},
             {"species": "animal_species2",
              "strain": "animal_strain2",
              "group": "experiment",
              "sex": "animal_sex2",
              "number_of_animals": "number_of_animals2"}]}"""

In [177]:
# Fetch candidate chunks for the query from the base retriever.
# `invoke` replaces the deprecated `get_relevant_documents` entry point.
relevant_docs = retriever.invoke(query)
pretty_print_docs(relevant_docs)

Document 1:

B

Control Male, Rapa Male, Control Female, Rapa Female

Figure 3. Multidimensional scaling and heatmap analyses shows the separations of Rapa and control groups. Multidimensional scaling analysis was conducted using all the probes detected (25,697 probes) on the microarrays (A). Control males are shown in red, Rapa males in pink, control females in blue, and Rapa females in cyan. Each dot represents one sample and the lines indicate the three nearest neighbors. In the females, the Rapa mice segregate separately from the control mice, while in the males, 7 Rapa mice group similar to the control males and 6 of the Rapa males do not group with the control males. From this analysis, we separated the Rapa males into two groups for subsequent analysis, Rapa-1 males as the group that appears similar to the control males and Rapa-2 males as the group that appears different from the control males. The heatmap analysis shows the expression of all transcripts that were observed to c

In [178]:
# Re-rank the base retriever's candidates with Cohere's reranker.
compressor = CohereRerank(
    model='rerank-english-v3.0',
    cohere_api_key=COHERE_TOKEN,
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

# `invoke` replaces the deprecated `get_relevant_documents` entry point.
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)

Document 1:

Lifespan Study

A total of 175 mice were used in the lifespan study with 40 mice in the control male group and 45 mice in each of the following groups: Rapa males, control females, and Rapa females. Mice were maintained in pathogen-free barrier conditions with 5 mice per cage and were permitted to live out their lives until death due to natural causes without censoring, with the exception of 2 mice, which were terminated early due to fatal neoplastic disease. The mice used in the lifespan study were not disturbed except to check on the mice twice each day and to remove dead mice. Survival analysis were done using Cox proportional hazard and Weibull’s accelerated failure time models. The mean, 80%, 50%, 10%, and maximum survival data were calculated for each group from the survival analysis.

Gompertz mortality analysis
----------------------------------------------------------------------------------------------------
Document 2:

Materials and Methods

Animals and feeding

In [179]:
# Chat model shared by the RetrievalQA chains in this notebook.
llm = ChatMistralAI(
    model_name="mistral-large-latest",
    temperature=0.1,
    api_key=MISTRAL_TOKEN,
)

In [180]:
# NOTE: `#` comments are used instead of class docstrings on purpose, because
# a docstring on a pydantic model is picked up as the schema description.

# One animal group as described by the LLM; every field is kept as text.
class Animal(BaseModel):
    species: str = Field(description="Species of the animal")
    strain: str = Field(description="Strain of the animal")
    group: str = Field(description="Control or experiment group")
    sex: str = Field(description="Sex of the animal")
    number_of_animals: str = Field(description='Number of animals in this group')


# Top-level answer object: {"animals": [...]}.
class AnimalList(BaseModel):
    # typing.List matches the other list models in this notebook and also
    # works on Python versions where `list[...]` is not subscriptable.
    animals: List[Animal]


# Parses the LLM's JSON answer into an AnimalList instance.
parser = PydanticOutputParser(pydantic_object=AnimalList)


In [181]:
# Prompt shared by all QA chains: the retrieved chunks are inserted as
# {context} and the user's query as {question}.
prompt_template = """
You are an assistant. Use the following information to answer the question very shortly?
Context: {context}
Question: {question}
Answer:
"""

# The declared input variables must match the template placeholders, which
# are 'context' and 'question' (not 'query').
# No `format_instructions` partial variable is supplied: the template has no
# such placeholder, so it would never be rendered. The expected JSON shape is
# described in the query text itself.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["question", "context"],
)

In [182]:
# "stuff" chain: the re-ranked chunks are placed into the prompt's
# `context` variable.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt, "document_variable_name": "context"},
)

In [183]:
# `Chain.run` is deprecated; `invoke` takes the chain's "query" input and
# returns a dict whose "result" entry holds the answer text.
result = qa.invoke({"query": query})["result"]

In [184]:
# Turn the raw LLM text into an AnimalList object.
answer = parser.invoke(result)

In [247]:
# Intervention details for one animal group.
# (`#` comments rather than docstrings, so the model schema is unchanged.)
class AnimalDetails(BaseModel):
    treatment: str = Field(
        description="What type of treatment or intervention are used?"
    )
    way_of_administration: str = Field(
        description="What way of administation are used?"
    )
    age_at_start: int = Field(
        description="Age of the start of treamtment"
    )
    duration_unit: str = Field(
        description="In which units age of the start was Month/Week/Day and e.t.c"
    )
    dosage: str = Field(
        description="Dosage of administration"
    )


# Top-level answer object: {"animal_details": [...]}.
class AnimalDetailsList(BaseModel):
    animal_details: List[AnimalDetails]


# Parses the LLM's JSON answer into an AnimalDetailsList instance.
parser2 = PydanticOutputParser(pydantic_object=AnimalDetailsList)

In [248]:
# One "<sex> <species> <group> <strain>" label per animal group parsed from
# the first answer, then a single comma-separated string for later prompts.
animal_descriptions = [
    f"{entry.sex} {entry.species} {entry.group} {entry.strain}"
    for entry in answer.animals
]
all_animals_description = ", ".join(animal_descriptions)

In [249]:
# Show the combined group description that is interpolated into the
# follow-up queries.
all_animals_description

'male mouse control C57BL/6, male mouse experiment C57BL/6, female mouse control C57BL/6, female mouse experiment C57BL/6'

In [250]:
# Follow-up question about the intervention applied to each animal group.
# Field guidance is given as plain text so that the example itself stays
# valid JSON (commas between all members, no inline comments). The
# `age_at_start` example is a number because AnimalDetails declares it as int.
query2 = f"""
Describe what intervention is used for each group of animals: {all_animals_description}.
Give an answer in proper JSON format using double quotes around keys and values.
Field notes:
- "treatment": short name of the treatment or control
- "way_of_administration": Food, Intravenous, Water, Intraperitoneal
- "age_at_start": write only the numeric value, for example 2 (second month of the life)
- "duration_unit": Year, Month, Week, Day, etc.
- "dosage": only the dosage value
For example:
{{
  "animal_details": [
    {{
      "treatment": "treatment1",
      "way_of_administration": "way_of_administration1",
      "age_at_start": 2,
      "duration_unit": "duration_unit1",
      "dosage": "dosage1"
    }},
    {{
      "treatment": "treatment2",
      "way_of_administration": "way_of_administration2",
      "age_at_start": 3,
      "duration_unit": "duration_unit2",
      "dosage": "dosage2"
    }}
  ]
}}
"""

# Same template as the first chain. The declared input variables match the
# template placeholders ('question' and 'context'); no `format_instructions`
# partial is passed because the template has no such placeholder.
prompt2 = PromptTemplate(
    template=prompt_template,
    input_variables=["question", "context"],
)

# Chain for the intervention question; same retriever and LLM as before.
qa2 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt2, "document_variable_name": "context"},
)

# `Chain.run` is deprecated; `invoke` returns a dict whose "result" entry
# holds the answer text, which is then parsed into AnimalDetailsList.
result2 = qa2.invoke({"query": query2})["result"]
answer2 = parser2.invoke(result2)
print(answer2)

animal_details=[AnimalDetails(treatment='control', way_of_administration='Food', age_at_start=4, duration_unit='Month', dosage='Eudragit capsule'), AnimalDetails(treatment='Rapa', way_of_administration='Food', age_at_start=4, duration_unit='Month', dosage='14 ppm'), AnimalDetails(treatment='control', way_of_administration='Food', age_at_start=4, duration_unit='Month', dosage='Eudragit capsule'), AnimalDetails(treatment='Rapa', way_of_administration='Food', age_at_start=4, duration_unit='Month', dosage='14 ppm')]


In [269]:
# Life-span results for one animal group.
# (`#` comments rather than docstrings, so the model schema is unchanged.)
class AnimalResults(BaseModel):
    median_treatment: Optional[float] = Field(
        description="Median treatment duration in units"
    )
    max_treatment: Optional[float] = Field(
        description="Max treatment duration in units"
    )
    treatment_units: str = Field(
        description="In what units measured lifespan"
    )
    p_value: Optional[str] = Field(
        description="p-value for statistical analysis"
    )


# Top-level answer object: {"animal_results": [...]}.
class AnimalResultsList(BaseModel):
    animal_results: List[AnimalResults]


# Parses the LLM's JSON answer into an AnimalResultsList instance.
parser3 = PydanticOutputParser(pydantic_object=AnimalResultsList)

In [270]:
# Follow-up question about life-span results for each animal group.
# Field guidance is given as plain text so that the example itself stays
# valid JSON (commas between all members, no inline comments).
query3 = f"""
Write life-span results for each group of animals: {all_animals_description}.
Give an answer in proper JSON format using double quotes around keys and values.
Field notes:
- "median_treatment": median treatment lifespan of the group (only value)
- "max_treatment": max treatment lifespan of the group (only value)
- "treatment_units": in what units lifespan is measured: Month, Age, Week
- "p_value": p-value of the statistical test if it exists (only value), otherwise null
For example:
{{
  "animal_results": [
    {{
      "median_treatment": 10.5,
      "max_treatment": 15.3,
      "treatment_units": "treatment_units1",
      "p_value": 0.01
    }},
    {{
      "median_treatment": 12.4,
      "max_treatment": 20.4,
      "treatment_units": "treatment_units2",
      "p_value": null
    }}
  ]
}}
"""

# Same template as the first chain. The declared input variables match the
# template placeholders ('question' and 'context'); no `format_instructions`
# partial is passed because the template has no such placeholder.
prompt3 = PromptTemplate(
    template=prompt_template,
    input_variables=["question", "context"],
)

# Chain for the life-span question; same retriever and LLM as before.
qa3 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt3, "document_variable_name": "context"},
)

# `Chain.run` is deprecated; `invoke` returns a dict whose "result" entry
# holds the answer text, which is then parsed into AnimalResultsList.
result3 = qa3.invoke({"query": query3})["result"]
answer3 = parser3.invoke(result3)
print(answer3)

animal_results=[AnimalResults(median_treatment=28.0, max_treatment=38.0, treatment_units='months', p_value=None), AnimalResults(median_treatment=30.0, max_treatment=41.0, treatment_units='months', p_value='0.01'), AnimalResults(median_treatment=28.0, max_treatment=38.0, treatment_units='months', p_value=None), AnimalResults(median_treatment=32.0, max_treatment=45.0, treatment_units='months', p_value='0.01')]


In [267]:
# Display the parsed life-span results.
# NOTE(review): this cell's execution count (In [267]) is lower than that of
# the cells that define AnimalResults and answer3 above, and its stored output
# has no `treatment_units` field, so the output appears to come from an
# earlier run. Re-run the notebook top to bottom to refresh it.
answer3

AnimalResultsList(animal_results=[AnimalResults(median_treatment=27.0, max_treatment=38.0, p_value=None), AnimalResults(median_treatment=29.0, max_treatment=41.0, p_value='0.05'), AnimalResults(median_treatment=27.0, max_treatment=38.0, p_value=None), AnimalResults(median_treatment=31.0, max_treatment=45.0, p_value='0.05')])

In [265]:
# Display the combined group description again.
# NOTE(review): this repeats an earlier display cell and was executed out of
# order (In [265]); it looks like leftover debugging.
all_animals_description

'male mouse control C57BL/6, male mouse experiment C57BL/6, female mouse control C57BL/6, female mouse experiment C57BL/6'