In [None]:

import os
from getpass import getpass

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_pinecone import PineconeVectorStore
from langchain_mistralai import ChatMistralAI
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from typing import List,Optional
import pandas as pd
# Load environment variables via python-dotenv (presumably from a local .env
# file) so the os.getenv lookups in the next cell can find the API tokens.
load_dotenv()

In [None]:
# API tokens read from the environment; each is None when the variable is unset.
COHERE_TOKEN = os.getenv('COHERE_TOKEN')
MISTRAL_TOKEN = os.getenv('MISTRAL_TOKEN')
OPENAI_TOKEN = os.getenv('OPENAI_TOKEN')

# Re-export two keys under the names the client libraries look up.
# os.environ only accepts str values, so assigning a missing (None) value would
# raise TypeError; these keys are therefore exported only when they are set.
_pinecone_key = os.getenv('PINECONE')
if _pinecone_key is not None:
    os.environ['PINECONE_API_KEY'] = _pinecone_key
if OPENAI_TOKEN is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_TOKEN

In [None]:
# Read the Markdown file for the paper into LangChain documents.
loader = UnstructuredMarkdownLoader(
    file_path='processed_data/j.celrep.2013.07.030.md',
)
loaded_documents = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=4096, chunk_overlap=128).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=128,
    chunk_size=4096,
)
docs = text_splitter.split_documents(loaded_documents)
# Cell output: number of chunks produced.
len(docs)

In [None]:
# Embedding checkpoint passed to the embeddings wrapper.
# Commented-out alternatives kept for reference:
#   'intfloat/multilingual-e5-large'
#   'BAAI/bge-small-en-v1.5'
#   'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model_name = "BAAI/bge-m3"

In [None]:
# Normalised embeddings: set True to compute cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

# NOTE(review): device 'mps' presumably targets an Apple-silicon GPU; confirm
# the device string before running this cell on other hardware.
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device="mps"),
)

In [None]:
# Embed the chunks into a FAISS index and expose it as a retriever
# configured with search_kwargs k=10.
vectorstore_from_docs = FAISS.from_documents(documents=docs, embedding=embeddings)
retriever = vectorstore_from_docs.as_retriever(search_kwargs=dict(k=10))

In [None]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Consecutive documents are separated by a rule of 100 dashes on its own line.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))

In [None]:
# First question sent to the retrievers and the QA chain.
# The literal ends with a newline followed by spaces; kept exactly as written.
query = """What animals are used in this study?
         """

In [None]:
# Fetch the chunks the base retriever returns for the query and print them.
# `invoke` is the Runnable entry point that replaces the deprecated
# `get_relevant_documents`.
relevant_docs = retriever.invoke(query)
pretty_print_docs(relevant_docs)

In [None]:
# Wrap the base retriever with a Cohere reranker: the base retriever supplies
# candidates and the compressor post-processes them before they are returned.
compressor = CohereRerank(
    model='rerank-english-v3.0',
    cohere_api_key=COHERE_TOKEN,
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)
# `invoke` replaces the deprecated `get_relevant_documents`.
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)

In [None]:
# Chat model shared by the QA chains in this notebook.
llm = ChatMistralAI(
    model_name="mistral-large-latest",
    temperature=0,
    api_key=MISTRAL_TOKEN,
)
# Alternative backend (disabled):
# llm = ChatOpenAI(
#     model_name="gpt-4o-mini",
#     openai_api_key=OPENAI_TOKEN,
#     temperature=0.1,
# )

In [None]:
# Schema for one animal group in the model's JSON answer.
# Documented with comments rather than docstrings so the generated pydantic
# schema stays unchanged.
class Animal(BaseModel):
    species: str = Field(description="Species of the animal")
    strain: str = Field(description="Strain of the animal")
    group: str = Field(description="Control or experiment group")
    gender: str = Field(description="Sex of the animal")
    # The two counts are Optional; the prompt asks the model to write null when unknown.
    n_treatment: Optional[int] = Field(description='Number of animals in this group')
    n_control: Optional[int] = Field(description='Number of animals in control for this group')

# Top-level answer object: {"animals": [...]}.
class AnimalList(BaseModel):
    animals: list[Animal]
# Parser that turns the model's text answer into an AnimalList instance.
parser = PydanticOutputParser(pydantic_object=AnimalList)


In [None]:
# Prompt for the first question (which animal groups does the study use).
# Doubled braces are literal braces; {context} and {question} are the only
# placeholders. Both example objects use the same keys as the Animal schema
# ("gender", not "sex") so the model is not steered toward a key the parser
# would reject.
prompt_template = """
You are an assistant. Use the following information to answer the question very shortly
Divide animals in groups ONLY from experimental groups (exclude any control or wildtype groups that are needed only for comparison with experimental groups) and describe each group
Give an aswer in proper JSON format using double quotes around keys and values format: 
For example: {{"animals":[{{"species":"animal_species1",
         "strain":"animal_strain1",
         "group":"experiment1",# Name of the group for example Rapa_male, KO ABC gene and e.t.c
         "gender":"animal_sex1",
         "n_treatment":25,##Number of animals in this specific group
         "n_control":40 #Number of animals in control relative to this group if no information - write null
         }},
         {{"species":"animal_species2",
         "strain":"animal_strain2",
         "group":"experiment2",# Name of the group for example Rapa_male, KO ABC gene and e.t.c
         "gender":"animal_sex2",
         "n_treatment":25,##Number of animals in this specific group
         "n_control":40 ##Number of animals in control relative to this group if no information - write null
         }}]}}
Context: {context}
Question: {question}
Answer:
"""

# Build the prompt for the RetrievalQA "stuff" chain. The template's
# placeholders are {context} (retrieved documents) and {question} (the user
# query as RetrievalQA forwards it), so those are the declared input variables.
# No partial variable is passed: the template has no {format_instructions}
# placeholder, and `parser.model_json_schema()` is not the parser's
# format-instructions API. To inject schema-based instructions, add a
# {format_instructions} placeholder to the template and supply the text there.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [None]:
# QA chain: documents from the reranking retriever are "stuffed" into the
# prompt's {context} slot and sent to the chat model.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=compression_retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt, document_variable_name="context"),
)

In [None]:
# RetrievalQA reads its input from the "query" key and returns the answer text
# under "result"; `invoke` replaces the deprecated `Chain.run`.
result = qa.invoke({"query": query})["result"]

In [None]:
# Parse the model's text answer with the AnimalList parser defined earlier.
answer  = parser.invoke(result)

In [None]:
# Show the parsed answer as the cell output.
answer

In [None]:
# Schema for the intervention applied to one animal group.
# Documented with comments rather than docstrings so the generated pydantic
# schema stays unchanged.
class AnimalDetails(BaseModel):
    treatment: str = Field(description="What type of treatment or intervention are used?")
    way_of_administration: str = Field(description="What way of administation are used?")
    # age_at_start is a bare number; its unit is carried by duration_unit.
    age_at_start: int = Field(description="Age of the start of treamtment")
    duration_unit: str = Field(description="In which units age of the start was Month/Week/Day and e.t.c")
    dosage: str = Field(description="Dosage of administration")
# Top-level answer object: {"animal_details": [...]}.
class AnimalDetailsList(BaseModel):
    animal_details: List[AnimalDetails]
# Parser that turns the model's text answer into an AnimalDetailsList instance.
parser2 = PydanticOutputParser(pydantic_object=AnimalDetailsList)

In [None]:
# One "gender species group strain" line per parsed animal group, then a single
# comma-separated string that is embedded in the follow-up questions.
animal_descriptions = [
    "{0.gender} {0.species} {0.group} {0.strain}".format(animal)
    for animal in answer.animals
]
all_animals_description = ", ".join(animal_descriptions)

In [None]:
# Prompt for the second question (intervention per animal group).
# Doubled braces are literal braces; {context} and {question} are the only
# placeholders. Each "duration_unit" line ends with a comma so that the example
# is valid JSON once the inline # hints are removed.
prompt_template2 = """
You are an assistant. Use the following information to answer the question very shortly
Describe what intervention is used for each groups of animals
Give an answer in proper JSON format using double quotes around keys and values. 
For example: 
{{
  "animal_details": [
    {{
      "treatment": "treatment1", # short name of the treatment or control
      "way_of_administration": "way_of_administration1",# Food, Intravenous, Water, Intraperitoneal, Genomic and e.t.c
      "age_at_start": 2,#write only value for example 2 (second month of the life)
      "duration_unit": "Months",# Year, Month, Week, Day and e.t.c if age_at_start equal to 0 then write here Days
      "dosage": "dosage1"#only doage values
    }},
    {{
      "treatment": "treatment2",#short name of the treatment or control
      "way_of_administration": "way_of_administration2",# Food, Intravenous, Water, Intraperitoneal,Genomic and e.t.c
      "age_at_start": 0,#write only value for example 2 (second month of the life)
      "duration_unit": "Days",# Year, Month, Week, Day and e.t.c if age_at_start equal to 0 then write here Days
      "dosage": "dosage2"#only doage values
    }}
  ]
}}
Context: {context}
Question: {question}
Answer:
"""

# Build the prompt for the intervention question from prompt_template2 (the
# first question's prompt_template does not describe this answer format).
# Declared input variables match the template's {context} and {question}
# placeholders. No partial variable is passed because the template has no
# {format_instructions} placeholder.
prompt2 = PromptTemplate(
    template=prompt_template2,
    input_variables=["context", "question"],
)

In [None]:
# Second question: which intervention each previously extracted group received.
query2 = f"""
What intervention is used for each groups of animals: {all_animals_description}?
"""

# Declared input variables match the template's {context} and {question}
# placeholders. No partial variable is passed: the template has no
# {format_instructions} placeholder, and `parser2.model_json_schema()` is not
# the parser's format-instructions API.
prompt2 = PromptTemplate(
    template=prompt_template2,
    input_variables=["context", "question"],
)

qa2 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt2, "document_variable_name": "context"},
)
# RetrievalQA reads its input from "query" and returns the answer text under
# "result"; `invoke` replaces the deprecated `Chain.run`.
result2 = qa2.invoke({"query": query2})["result"]
answer2 = parser2.invoke(result2)
print(answer2)

In [None]:
# Show the raw model answer for the intervention question as the cell output.
result2

In [None]:
# Show the parsed intervention answer as the cell output.
answer2

In [None]:
# Schema for the life-span result of one animal group.
# Documented with comments rather than docstrings so the generated pydantic
# schema stays unchanged.
class AnimalResults(BaseModel):
    median_treatment: Optional[float] = Field(description="Median treatment duration in units")
    max_treatment: Optional[float] = Field(description="Max treatment duration in units")
    treatment_units: str = Field(description="In what units measured lifespan")
    # Declared as a string although the prompt example shows a bare number.
    p_value: Optional[str] = Field(description="p-value for statistical analysis")
# Top-level answer object: {"animal_results": [...]}.
class AnimalResultsList(BaseModel):
    animal_results: List[AnimalResults]
# Parser that turns the model's text answer into an AnimalResultsList instance.
parser3 = PydanticOutputParser(pydantic_object=AnimalResultsList)

In [None]:
# Prompt for the third question (life-span results per animal group).
# Doubled braces are literal braces; {context} and {question} are the only
# placeholders. The "treatment_units" line ends with a comma so that the
# example is valid JSON once the inline # hints are removed.
prompt_template3 = """
Write life-span results for each group of animals. 
Give an answer in proper JSON format using double quotes around keys and values. 
For example: 
{{
  "animal_results": [
    {{
      "median_treatment": 10.5, # median treatment lifespan of the group (only value)
      "max_treatment": 15.3,# max treatment lifespan of the group (only value)
      "treatment_units":"treatment_units1", # In what units measured lifespan Month, Age, Week
      "p_value":0.01 #p-value of statistical test if exist (only value)
    }}
  ]
}}
Context: {context}
Question: {question}
Answer:
"""

In [None]:
# Third question: life-span results for each previously extracted group.
query3 = f"""
Life-span results for each group of animals: {all_animals_description}
"""

# Declared input variables match the template's {context} and {question}
# placeholders. No partial variable is passed: the template has no
# {format_instructions} placeholder, and `parser3.model_json_schema()` is not
# the parser's format-instructions API.
prompt3 = PromptTemplate(
    template=prompt_template3,
    input_variables=["context", "question"],
)

qa3 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt3, "document_variable_name": "context"},
)
# RetrievalQA reads its input from "query" and returns the answer text under
# "result"; `invoke` replaces the deprecated `Chain.run`.
result3 = qa3.invoke({"query": query3})["result"]
answer3 = parser3.invoke(result3)
print(answer3)