# Parsing articles

In [12]:
from dotenv import load_dotenv
import os
from tqdm import tqdm
import time
import json

from utils.parser import ParserConfig, Parser
from rag.retriever import RetrieverConfig
from rag.llm import LLMConfig
from rag.pipeline import LifeSpanGPT

load_dotenv()
LLAMA_CLOUD = os.getenv('LLAMA_CLOUD')
COHERE_TOKEN = os.getenv("COHERE_TOKEN")
OPENAI_TOKEN = os.getenv("OPENAI_TOKEN")
ROOT_PATH_ARTICLES = 'data' #folder with articles
ROOT_PATH_RESULTS = 'pipeline_results/test'#folder to save results

In [None]:
# for each article in ROOT_PATH_ARTICLES we parsing it into processed_data folder
for file in tqdm(os.listdir(ROOT_PATH_ARTICLES)):
    file_path = os.path.join(ROOT_PATH_ARTICLES,file)
    parser_config = ParserConfig(path_to_file=file_path,
                                llama_cloud_token=LLAMA_CLOUD,
                                instruction=None)
    parser = Parser(parser_config)
    parser.create_parser()
    parser.parse()

# Generating answers

In [14]:
# for each file in processed_data run pipeline
for file in tqdm(os.listdir("processed_data")):
    os.makedirs(ROOT_PATH_RESULTS, exist_ok=True)
    file_name = file.split(".md")[0]
    print(file_name)
    config = RetrieverConfig(
        file_path=f"processed_data/{file}",
        embeding_model="BAAI/bge-small-en",
        reranker_model="rerank-english-v3.0",
        chunk_size=15000,
        chunk_overlap=2000,
        COHERE_TOKEN=COHERE_TOKEN,
    )
    llm_config = LLMConfig(model_name="gpt-4o", temperature=0.0, api_key=OPENAI_TOKEN)
    pipeline = LifeSpanGPT(config, llm_config)
    answer = pipeline.run_pipeline()
    with open(f"{ROOT_PATH_RESULTS}/{file_name}.json", "w", encoding="utf-8") as f:
        json.dump(answer, f, indent=4)
    time.sleep(30)

  0%|          | 0/59 [00:00<?, ?it/s]

molecules25225339
Creating retriever
Creating llm
Generating prompt...
What treatment or intervention or manipulation are used for mouse C57Bl/6 control male?
Generating prompt...
What are Lifespan or survival curve/results for mouse C57Bl/6 control male?
Generating prompt...
What treatment or intervention or manipulation are used for mouse mut(PD) mut(PD) male?
Generating prompt...
What are Lifespan or survival curve/results for mouse mut(PD) mut(PD) male?
Generating prompt...


  2%|▏         | 1/59 [01:35<1:31:52, 95.05s/it]

2008:754190
Creating retriever
Creating llm
Generating prompt...
What treatment or intervention or manipulation are used for mouse 129/Sv PARP-1−/− female?
Generating prompt...
What are Lifespan or survival curve/results for mouse 129/Sv PARP-1−/− female?
Generating prompt...
What treatment or intervention or manipulation are used for mouse 129/Sv PARP-1+/+ female?
Generating prompt...
What are Lifespan or survival curve/results for mouse 129/Sv PARP-1+/+ female?
Generating prompt...


  2%|▏         | 1/59 [02:20<2:16:07, 140.82s/it]


KeyboardInterrupt: 