In [562]:

import os
from getpass import getpass

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_pinecone import PineconeVectorStore
from langchain_mistralai import ChatMistralAI
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from typing import List,Optional
import pandas as pd
load_dotenv()

True

In [563]:
# API credentials are read from the environment (load_dotenv() ran in the import cell).
OPENAI_TOKEN = os.getenv('OPENAI_TOKEN')
COHERE_TOKEN = os.getenv('COHERE_TOKEN')
MISTRAL_TOKEN = os.getenv('MISTRAL_TOKEN')

# os.environ only accepts strings: assigning os.getenv('PINECONE') directly raises
# TypeError when the variable is unset, so copy it across only when it exists.
_pinecone_key = os.getenv('PINECONE')
if _pinecone_key is not None:
    os.environ['PINECONE_API_KEY'] = _pinecone_key

# Paper to process; fule_name is the file stem, derived so the two cannot drift apart.
file_path = 'processed_data/s41418-019-0422-6.md'
fule_name = os.path.splitext(os.path.basename(file_path))[0]

In [564]:
# Load the markdown file at file_path with the Unstructured markdown loader.
loader = UnstructuredMarkdownLoader(file_path)
loaded_documents = loader.load()

In [565]:
# Split the loaded documents into overlapping chunks (chunk_size=8000, chunk_overlap=1000).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=1000)
docs = text_splitter.split_documents(loaded_documents)
# Display how many chunks were produced.
len(docs)

9

In [566]:
# Embedding model passed to HuggingFaceBgeEmbeddings in the next cell.
# The commented-out names are alternative models kept for quick swapping.
#model_name = 'intfloat/multilingual-e5-large'
#model_name = 'BAAI/bge-small-en-v1.5'
model_name = 'BAAI/bge-m3'
#model_name = 'dunzhang/stella_en_1.5B_v5'

#model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

In [567]:
# Normalised embeddings are requested so that similarity scores behave as cosine similarity.
encode_kwargs = {'normalize_embeddings': True}

# Embedding model wrapper; the device is fixed to 'mps'.
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs,
    model_kwargs={'device': 'mps'},
)

In [548]:
# Question used both for the retrieval previews and for the first extraction chain.
query = """What groups of animals are used in this study?
         """

In [549]:
# Index the chunks in a FAISS vector store using the embeddings defined above.
vectorstore_from_docs = FAISS.from_documents(docs,
        embedding=embeddings
    )
# Retriever over that store, configured with k=10.
retriever = vectorstore_from_docs.as_retriever(search_kwargs={"k": 10})

In [550]:
def pretty_print_docs(docs):
    """Print each document's ``page_content`` under a numbered heading.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    Nothing is returned; the text is written to stdout with a single print call.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(separator.join(sections))

In [551]:
# Preview what the plain FAISS retriever returns for the query.
# invoke() is the Runnable entry point that supersedes the deprecated get_relevant_documents().
relevant_docs = retriever.invoke(query)
pretty_print_docs(relevant_docs)

Document 1:

Statistical analysis

All statistical analyses were performed using GraphPad Prism software version 6 (GraphPad software Inc). No statistical methods were used to predetermine sample sizes, but our sample sizes are similar to or greater than those reported in previous publications [35–37]. Data represent the mean and standard error of the mean (SEM). Unpaired two-tailed Student’s t test was used for the comparison of two means. ANOVA followed by Bonferroni’s or Tukey’s post hoc test were used for the multiple group analysis. Log-rank test was used for disease onset and survival analysis. The significance level for the two-sided analyses was set at P < 0.05.

Results

CNS-specific LanCL1 transgene expression improves the survival and motor function of ALS mice

As a first step towards the understanding of the role of LanCL1 in ALS, we characterized the temporal expression patterns of LanCL1 in SOD1G93A mice. Since a decline in body weight is highly correlated with denervati

In [552]:
# Wrap the FAISS retriever so its hits are passed through the Cohere reranker.
compressor = CohereRerank(model='rerank-english-v3.0',
                          cohere_api_key=COHERE_TOKEN)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)
# invoke() is the Runnable entry point that supersedes the deprecated get_relevant_documents().
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)

Document 1:

Here, we report that LanCL1 protects MNs against degeneration and reduces the severity of disease manifestations in the mouse model of ALS. By crossing LanCL1 conditional transgenic mice with the SOD1G93A mouse model, we found that CNS-specific expression of LanCL1 transgene significantly prolongs lifespan, delays disease onset, decelerates symptomatic progression, and improves motor performance in SOD1G93A mice. Reciprocally, CNS-specific deletion of LanCL1 causes neurodegenerative phenotypes, including MN loss, neuroinflammation, and oxidative damage in spinal cord. We further show that loss of LanCL1 leads to a decrease in AKT phosphorylation, whereas LanCL1 transgene restores AKT phosphorylation and mitigates oxidative stress in ALS mice. Findings suggest that LanCL1 modulates MN survival by scavenging ROS and enhancing AKT activity.

Material and methods

Animals

The generation and characterization of the LanCL1 conditional knockin (termed LanCL1 cKI) mice and LanCL1

In [555]:
# Chat model shared by all three extraction chains; temperature=0 is set here.
llm = ChatMistralAI(model_name="mistral-large-latest",temperature=0,api_key=MISTRAL_TOKEN)

In [556]:
# Schema for one animal group extracted from the paper.
# The two count fields are Optional; the others are declared as plain str.
class Animal(BaseModel):
    species: str = Field(description="Species of the animal")
    strain: str = Field(description="Strain of the animal")
    group: str = Field(description="Control or experiment group")
    gender: str = Field(description="Sex of the animal")
    n_treatment: Optional[int] = Field(description='Number of animals in this group')
    n_control: Optional[int] = Field(description='Number of animals in control group')

# Top-level object the model is asked to return: {"animals": [...]}.
class AnimalList(BaseModel):
    animals: list[Animal]
# Parser that turns the LLM's text answer into an AnimalList instance.
parser = PydanticOutputParser(pydantic_object=AnimalList)


In [557]:
# Prompt for the first extraction step (animal groups).
# Doubled braces are literal braces; {context} and {question} are the only placeholders.
# Both example objects use the key "gender" so they match the Animal schema
# (the second example previously said "sex", a key the model does not define).
prompt_template = """
You are an assistant. Use the following information to answer the question very shortly.
Identify ONLY main experimental subject groups and describe them (for example female and male groups, groups with different treatments and e.t.c).
Control, wild types and any other groups that are not key subjects of the experiment use only for fill n_control field
Give an aswer in proper JSON format using double quotes around keys and values format
For example: {{"animals":[{{"species":"animal_species1",
         "strain":"animal_strain1", # ONLY strain name of the animal group
         "group":"experiment1",# Name of the group for example Rapa, KO "ABC" gene and e.t.c
         "gender":"male",
         "n_treatment":25,# Number of animals in this group
         "n_control":40 # Number of animals in control group
         }},
         {{"species":"animal_species2",
         "strain":"animal_strain2",# ONLY strain name of the animal group ONLY name
         "group":"experiment2",# Name of the group for example Rapa, KO ABC gene and e.t.c
         "gender":"female",
         "n_treatment":25,# Number of animals in this group
         "n_control":40 # Number of animals in control group
         }}]}}
Context: {context}
Question: {question}
Answer:
"""

# input_variables now names the placeholders the template actually contains.
# The former `format_instructions` partial was dropped: the template has no
# {format_instructions} placeholder, so the value was never rendered into the prompt.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [558]:
# Retrieval QA chain for the animal-group question: documents from the reranking
# retriever are inserted into the prompt's {context} placeholder ("stuff" chain type).
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=compression_retriever,
                                 chain_type_kwargs={
                                     "prompt": prompt,
                                     "document_variable_name": "context"
                                 })

In [559]:
# Chain.run() is deprecated; invoke() returns a dict, and the answer text is under "result".
result = qa.invoke({"query": query})["result"]

NotFoundError: Error code: 404 - {'error': {'message': 'The model `gpt-4o` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [41]:
# Parse the raw LLM text into an AnimalList (parser was built with pydantic_object=AnimalList).
answer  = parser.invoke(result)

In [42]:
answer

AnimalList(animals=[Animal(species='mouse', strain='SOD1G93A', gender='male', n_treatment=15, n_control=17), Animal(species='mouse', strain='SOD1G93A; LanCL1 cKI', gender='male', n_treatment=15, n_control=15), Animal(species='mouse', strain='SOD1G93A', gender='female', n_treatment=21, n_control=18), Animal(species='mouse', strain='SOD1G93A; LanCL1 cKI', gender='female', n_treatment=16, n_control=16)])

In [19]:
# Schema for the intervention applied to one animal group (second extraction step).
# age_at_start is an int; its unit is carried separately in duration_unit.
class AnimalDetails(BaseModel):
    treatment: str = Field(description="What type of treatment or intervention are used?")
    way_of_administration: str = Field(description="What way of administation are used?")
    age_at_start: int = Field(description="Age of the start of treamtment")
    duration_unit: str = Field(description="In which units age of the start was Month/Week/Day and e.t.c")
    dosage: str = Field(description="Dosage of administration")
# Top-level object the model is asked to return: {"animal_details": [...]}.
class AnimalDetailsList(BaseModel):
    animal_details: List[AnimalDetails]
# Parser that turns the LLM's text answer into an AnimalDetailsList instance.
parser2 = PydanticOutputParser(pydantic_object=AnimalDetailsList)

In [20]:
# One "<gender> <species> <group> <strain>" label per extracted animal group,
# then all labels joined into a single comma-separated string for the later prompts.
animal_descriptions = [
    "{} {} {} {}".format(entry.gender, entry.species, entry.group, entry.strain)
    for entry in answer.animals
]
all_animals_description = ", ".join(animal_descriptions)

In [21]:
all_animals_description

'male mouse G93A SOD1G93A, male mouse G93A; LanCL1 cKI SOD1G93A, female mouse G93A SOD1G93A, female mouse G93A; LanCL1 cKI SOD1G93A'

In [22]:
# Prompt for the second extraction step (intervention per animal group).
# `intro` is an f-string, so the group descriptions are baked in here; the rest of the
# template keeps {context} and {question} as placeholders (doubled braces are literal).
intro = f"""
You are an assistant. Use the following information to answer the question very shortly
Describe what intervention is used for each groups of animals (separated by ,): {all_animals_description}
Give an answer in proper JSON format using double quotes around keys and values. 
"""
prompt_template2 = intro+""" 
For example: 
{{
  "animal_details": [
    {{
      "treatment": "treatment1", # short name of the treatment
      "way_of_administration": "way_of_administration1",# Food, Intravenous, Water, Intraperitoneal, Genomic and e.t.c
      "age_at_start": 2,#write only value for example 2 (second month of the life)
      "duration_unit": "Months", # Year, Month, Week, Day and e.t.c if age_at_start equal to 0 then write here Days
      "dosage": "dosage1"#only doage values
    }},
    {{
      "treatment": "treatment2",#short name of the treatment
      "way_of_administration": "way_of_administration2",# Food, Intravenous, Water, Intraperitoneal,Genomic and e.t.c
      "age_at_start": 0, #write only value for example 2 (second month of the life)
      "duration_unit": "Days", # Year, Month, Week, Day and e.t.c if age_at_start equal to 0 then write here Days
      "dosage": "dosage2"#only doage values
    }}
  ]
}}
Context: {context}
Question: {question}
Answer:
"""

# Built from prompt_template2 (this call previously passed prompt_template, the
# animal-group prompt). input_variables names the placeholders the template contains;
# the unused `format_instructions` partial was dropped because the template never
# references it.
prompt2 = PromptTemplate(
    template=prompt_template2,
    input_variables=["context", "question"],
)

In [23]:
# Question for the second extraction step, listing every group found in step one.
query2 = f"""
What treatment or intervention or manipulation are used for each groups of animals: {all_animals_description}?
"""

# prompt_template2 contains only {context} and {question}; input_variables reflects that.
# The `format_instructions` partial was dropped because the template never references it.
prompt2 = PromptTemplate(
    template=prompt_template2,
    input_variables=["context", "question"],
)

qa2 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt2, "document_variable_name": "context"},
)
# Chain.run() is deprecated; invoke() returns a dict, and the answer text is under "result".
result2 = qa2.invoke({"query": query2})["result"]
# Parse the raw text into an AnimalDetailsList.
answer2 = parser2.invoke(result2)
print(answer2)

animal_details=[AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A'), AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A'), AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A'), AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A')]


In [24]:
answer2

AnimalDetailsList(animal_details=[AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A'), AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A'), AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A'), AnimalDetails(treatment='Genomic', way_of_administration='Genomic', age_at_start=0, duration_unit='Days', dosage='N/A')])

In [25]:
# Schema for the outcome reported for one animal group (third extraction step).
# Only treatment_units is required; the numeric fields and p_value are Optional.
# p_value is typed as str, not float.
class AnimalResults(BaseModel):
    median_treatment: Optional[float] = Field(description="Median treatment duration in units")
    max_treatment: Optional[float] = Field(description="Max treatment duration in units")
    treatment_units: str = Field(description="In what units measured lifespan")
    p_value: Optional[str] = Field(description="p-value for statistical analysis")
# Top-level object the model is asked to return: {"animal_results": [...]}.
class AnimalResultsList(BaseModel):
    animal_results: List[AnimalResults]
# Parser that turns the LLM's text answer into an AnimalResultsList instance.
parser3 = PydanticOutputParser(pydantic_object=AnimalResultsList)

In [26]:
# Prompt for the third extraction step (lifespan results per animal group).
# `intro` is an f-string, so the group descriptions are baked in here; {context} and
# {question} remain placeholders and doubled braces are literal.
intro = f"""
Write lifespan results for each group of animals: {all_animals_description}
Give an answer in proper JSON format using double quotes around keys and values. 
"""
# The example object now has a comma after the "treatment_units" entry; without it the
# example shown to the model was not valid JSON.
prompt_template3 = intro+"""
For example: 
{{
  "animal_results": [
    {{
      "median_treatment": 10.5, # median treatment lifespan of the group (only value)
      "max_treatment": 15.3,# max treatment lifespan of the group (only value)
      "treatment_units":"treatment_units1", # In what units measured lifespan Month, Age, Week
      "p_value":0.01 #p-value of statistical test if exist (only value)
    }}
  ]
}}
Context: {context}
Question: {question}
Answer:
"""

In [27]:
# Question for the third extraction step, listing every group found in step one.
query3 = f"""
Lifespan or survival curve/results for {all_animals_description}
"""

# prompt_template3 contains only {context} and {question}; input_variables reflects that.
# The `format_instructions` partial was dropped because the template never references it.
prompt3 = PromptTemplate(
    template=prompt_template3,
    input_variables=["context", "question"],
)

qa3 = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": prompt3, "document_variable_name": "context"},
)
# Chain.run() is deprecated; invoke() returns a dict, and the answer text is under "result".
result3 = qa3.invoke({"query": query3})["result"]
# Parse the raw text into an AnimalResultsList.
answer3 = parser3.invoke(result3)
print(answer3)

animal_results=[AnimalResults(median_treatment=141.0, max_treatment=None, treatment_units='days', p_value=None), AnimalResults(median_treatment=163.0, max_treatment=None, treatment_units='days', p_value='0.01'), AnimalResults(median_treatment=146.0, max_treatment=None, treatment_units='days', p_value=None), AnimalResults(median_treatment=168.0, max_treatment=None, treatment_units='days', p_value='0.01')]


In [52]:
# Start with an empty frame; this paper's rows are concatenated onto it in a later cell.
result_df = pd.DataFrame()

In [53]:
# One empty row per extracted animal group, indexed 0..n-1.
temp_df = pd.DataFrame(index=list(range(len(answer.animals))))

In [54]:
# Tag every row with the identifier of the paper being processed.
temp_df['file'] = fule_name

In [55]:
# Copy the three parsed answers into temp_df cell by cell. Row i of each answer list is
# written to row i of the frame, so the lists are matched purely by position.
# Field tuples are listed in the column order the frame should end up with.
for row, animal in enumerate(answer.animals):
    for column in ("species", "strain", "gender", "n_treatment", "n_control"):
        temp_df.loc[row, column] = getattr(animal, column)

for row, details in enumerate(answer2.animal_details):
    for column in ("treatment", "way_of_administration", "age_at_start", "duration_unit", "dosage"):
        temp_df.loc[row, column] = getattr(details, column)

for row, outcome in enumerate(answer3.animal_results):
    for column in ("median_treatment", "max_treatment", "p_value"):
        temp_df.loc[row, column] = getattr(outcome, column)

In [56]:
# Append this paper's rows to the accumulated results (temp_df's index is kept as is).
result_df = pd.concat([result_df,temp_df])

In [57]:
answer

AnimalList(animals=[Animal(species='mouse', strain='SOD1G93A', gender='male', n_treatment=15, n_control=17), Animal(species='mouse', strain='SOD1G93A; LanCL1 cKI', gender='male', n_treatment=15, n_control=15), Animal(species='mouse', strain='SOD1G93A', gender='female', n_treatment=21, n_control=18), Animal(species='mouse', strain='SOD1G93A; LanCL1 cKI', gender='female', n_treatment=16, n_control=16)])

In [58]:
# Keep a row only when every required field was extracted and its treatment
# is not the literal "Control".
required_columns = [
    "treatment",
    "way_of_administration",
    "age_at_start",
    "duration_unit",
    "p_value",
]
has_all_required = result_df[required_columns].notna().all(axis=1)
is_not_control = result_df["treatment"].ne("Control")
result_df = result_df.loc[has_all_required & is_not_control]

In [59]:
result_df

Unnamed: 0,file,species,strain,gender,n_treatment,n_control,treatment,way_of_administration,age_at_start,duration_unit,dosage,median_treatment,max_treatment,p_value
1,s41418-019-0422-6,mouse,SOD1G93A; LanCL1 cKI,male,15.0,15.0,Genomic,Genomic,0.0,Days,,163.0,,0.01
3,s41418-019-0422-6,mouse,SOD1G93A; LanCL1 cKI,female,16.0,16.0,Genomic,Genomic,0.0,Days,,168.0,,0.01


In [60]:
# Save this paper's filtered rows as "<paper id>.csv".
# NOTE(review): the path is relative to the working directory, while a later cell reads
# CSVs from "results/" — confirm how the file is expected to get there.
result_df.to_csv(f'{fule_name}.csv',index=False)

In [413]:
# Combine every per-paper CSV found in results/ into a single frame.
results_dir = "results"
# os.listdir() returns entries in arbitrary order; sorting makes the row order (and the
# positions in file_list, which is derived from this frame) reproducible. Entries that
# are not .csv files are skipped instead of being handed to read_csv.
csv_paths = sorted(
    os.path.join(results_dir, name)
    for name in os.listdir(results_dir)
    if name.lower().endswith(".csv")
)
# Read all files first and concatenate once rather than growing the frame per file.
# Unlike the previous loop, this does not rebind the notebook-level file_path / temp_df.
result_df = (
    pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)
    if csv_paths
    else pd.DataFrame()  # no CSVs found: same empty frame the loop would have left
)

In [414]:
# From here on the extracted 'treatment' column is called 'intervention'.
result_df = result_df.rename(columns={'treatment':'intervention'})

In [406]:
# Export the combined results to an Excel workbook (index column omitted).
result_df.to_excel("lifespangpt_results_v1.xlsx",index=False)

In [409]:
# Reference table the extracted results are compared against in the cells below.
df = pd.read_excel("CollidaData_2023.xlsx")

In [415]:
df.head(5)

Unnamed: 0,cohort_id,volunteer_name,volunteer_country,study_id,doi,pubmed_id,title,journal,year,full_text_URL,...,max_treatment,n_treatment,median_control,max_control,n_control,p_value,comment,timestamp,trust,extraction_method
0,53,Leon Peshkin,France,595,10.1371/journal.pone.0083988,24409289,Mice fed rapamycin have an increase in lifespa...,PloS one,2014,https://www.ncbi.nlm.nih.gov/pmc/articles/pmid...,...,36.0,45.0,27.0,32.5,45.0,0.0,,2023-02-05 02:46:28,2,Collida+DrugAge+ReviewbyEdouard
1,51,Leon Peshkin,France,595,10.1371/journal.pone.0083988,24409289,Mice fed rapamycin have an increase in lifespa...,PloS one,2014,https://www.ncbi.nlm.nih.gov/pmc/articles/pmid...,...,38.0,45.0,29.0,31.5,45.0,0.0,,2023-02-05 02:42:26,2,Collida+DrugAge+ReviewbyEdouard
2,140,Nickolai Leschov,Russia,594,10.1159/000212659,6519438,Dietary vitamin C improves the survival of mice,Gerontology,1984,https://www.ncbi.nlm.nih.gov/pmc/articles/pmid...,...,993.0,16.0,859.0,965.0,8.0,0.01,It should be noted that the ascorbic acid grou...,2023-02-27 17:42:25,2,Collida+DrugAge+ReviewbyEdouard
3,42,Edouard Debonneuil,France,593,10.1073/pnas.1717065115,29378959,Effects of rapamycin on growth hormone recepto...,Proceedings of the National Academy of Science...,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/pmid...,...,1090.0,27.0,1100.0,1275.0,20.0,0.0001,,2023-02-04 17:21:52,2,Collida+DrugAge+ReviewbyEdouard
4,50,Edouard Debonneuil,France,593,10.1073/pnas.1717065115,29378959,Effects of rapamycin on growth hormone recepto...,Proceedings of the National Academy of Science...,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/pmid...,...,1150.0,32.0,1110.0,1275.0,33.0,0.001,,2023-02-04 18:00:54,2,Collida+DrugAge+ReviewbyEdouard


In [416]:
df[df['extraction_method']=='Collida+DrugAge+ReviewbyEdouard']['doi'].unique()

array(['10.1371/journal.pone.0083988', '10.1159/000212659',
       '10.1073/pnas.1717065115', '10.1016/j.ebiom.2018.09.015',
       '10.1016/j.cmet.2020.08.004', '10.1016/0531-5565(75)90012-1',
       '10.1007/s10522-007-9100-z'], dtype=object)

In [417]:
# Second '/'-separated segment of each DOI, later compared with result_df['file'].
# The vectorised .str accessors give NaN for a missing DOI or one without a slash,
# where the previous list comprehension raised AttributeError / IndexError.
df['files'] = df['doi'].str.split('/').str[1]

In [418]:
# Names of the entries in processed_data/ (order is whatever os.listdir returns).
files = os.listdir('processed_data')

In [419]:
from sklearn.metrics import jaccard_score,mean_absolute_error

In [420]:
# Distinct paper identifiers present in the combined results, in order of first appearance.
file_list = result_df['file'].unique()

In [422]:
file_list[0]

'j.celrep.2013.07.030'

In [533]:
# Extracted rows for a single paper; 5 is a hard-coded position in file_list.
result_temp_df = result_df[result_df['file']==file_list[5]]

In [534]:
# Reference rows for the same paper, matched on the DOI-derived 'files' column.
valid_temp_df = df[df['files']==file_list[5]]

In [535]:
valid_temp_df['doi'].unique()

array(['10.1073/pnas.1717065115'], dtype=object)

In [536]:
# Result column names grouped by extraction step (groups, intervention details, outcomes).
# NOTE(review): `aminal_columns` looks like a typo for `animal_columns`; left unchanged
# in case other cells refer to it.
aminal_columns = ['species','strain','gender','n_treatment','n_control']
animal_details_columns = ['intervention','way_of_administration','age_at_start','duration_unit','dosage']
animal_results_columns = ['median_treatment','max_treatment','p_value']

In [537]:
# Median values for the selected paper: extracted (result_answer) vs. reference (valid_answer).
# The two lists are built independently and are not guaranteed to have the same length.
result_answer = result_temp_df["median_treatment"].tolist()
valid_answer = valid_temp_df['median_treatment'].tolist()

In [538]:
result_answer

[975.0, 1056.0]

In [539]:
valid_answer

[955.0, 1055.0, 1055.0]

In [516]:
result_answer = ["40", "40","40"]

In [517]:
valid_answer = ["30.0","30.0","0"]

In [519]:
jaccard_score(valid_answer,result_answer,average="weighted")

0.4444444444444444

In [103]:
valid_temp_df['species'].tolist()

['mice', 'mice', 'mice']