In [1]:
# !pip install langchain_chroma langchain_ollama langchain langchain_community langchain_core

In [19]:
# !pip install rank_bm25

In [98]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import DataFrameLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain.retrievers.ensemble import EnsembleRetriever
from langchain.retrievers import BM25Retriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# !curl -fsSL https://ollama.com/install.sh | sh
# !ollama serve &
# !ollama run llama3.2 &

In [9]:
import pandas as pd
import re, json

Load NER data

In [None]:
ner = pd.read_csv('/content/drive/MyDrive/hackathon/mimic-ner/.tsv')

In [5]:
ner = pd.read_csv('/content/drive/MyDrive/hackathon/mimic-ner/ner_test_flat.tsv')

In [6]:
meds = ner[ner['ner_type']=='medication']['ner_text'].value_counts().head(10).index.tolist()
sysm = ner[ner['ner_type']=='symptom']['ner_text'].value_counts().head(10).index.tolist()

In [7]:
ents = meds+sysm

In [45]:
ents

['aspirin',
 'coumadin',
 'warfarin',
 'docusate sodium',
 'methadone',
 'tizanidine',
 'senna',
 'simvastatin',
 'omeprazole',
 'nitroglycerin',
 'chest pain',
 'fever',
 'shortness of breath',
 'sob',
 'orthopnea',
 'dysuria',
 'headache',
 'diarrhea',
 'cough',
 'constipation']

Load pubmed data and select subset with the most frequent keywords in NER data

In [10]:
file_path = '/content/drive/MyDrive/hackathon/pubmed.json'
# file_path = 'pubmed/2019-1-pubmed_articles.json'
with open(file_path, 'r', encoding='utf-8') as file:
    # Load JSON data from the file
    data = json.load(file)

In [11]:
pattern = '|'.join(map(re.escape, ents))

In [12]:
n = 0
samples = []
for d in data:
    if n < 10000:
        if (d['Abstract']!='') and (re.search(pattern, d['Abstract'])):
            samples.append(d)
            n = n + 1

In [63]:
df.head()

Unnamed: 0,PMID,Title,Journal,Authors,Abstract,combined
0,32283117,In-hospital cardiac arrest outcomes among pati...,Resuscitation,"['Fei Shao', 'Shuang Xu', 'Xuedi Ma', 'Zhoumin...",OBJECTIVE: To describe the characteristics and...,In-hospital cardiac arrest outcomes among pati...
1,32282986,Clinical characteristics and immunosuppressant...,American journal of transplantation : official...,"['Zibiao Zhong', 'Qiuyan Zhang', 'Haoyang Xia'...",Over 1 000 000 cases of coronavirus disease 20...,Clinical characteristics and immunosuppressant...
2,32282788,"""Very High-Grade"" Pulsatile Mass of an Aberran...",The American journal of case reports,"['Quang Van Le', 'Vy Thuc Nguyen']",BACKGROUND The prevalence of aberrant internal...,"""Very High-Grade"" Pulsatile Mass of an Aberran..."
3,32281959,Comparison of clinical characteristics of pati...,Turk Kardiyoloji Dernegi arsivi : Turk Kardiyo...,"['Bülent Özlek', 'Eda Özlek', 'Mehmet Tekinalp...",OBJECTIVE: The aim of this study was to assess...,Comparison of clinical characteristics of pati...
4,32281954,Coronary stent embolism to the right posterior...,Turk Kardiyoloji Dernegi arsivi : Turk Kardiyo...,"['Sinan Varol', 'İrfan Şahin', 'Gökmen Kum', '...",A 57-year-old male was admitted to the emergen...,Coronary stent embolism to the right posterior...


In [76]:
df = pd.DataFrame(samples)

In [77]:
df['Authors'] = df['Authors'].astype(str)
# df['combined'] = df['Title'] + ' ' + df['Abstract'] + ' ' + df['Authors'] + ' ' + df['Journal'] + ' ' + df['PMID']

RAG search for pubmed data

In [78]:
loader = DataFrameLoader(df, page_content_column="Abstract")
docs = loader.load()

In [79]:
docs[0]

Document(metadata={'PMID': '32283117', 'Title': 'In-hospital cardiac arrest outcomes among patients with COVID-19 pneumonia in Wuhan, China.', 'Journal': 'Resuscitation', 'Authors': "['Fei Shao', 'Shuang Xu', 'Xuedi Ma', 'Zhouming Xu', 'Jiayou Lyu', 'Michael Ng', 'Hao Cui', 'Changxiao Yu', 'Qing Zhang', 'Peng Sun', 'Ziren Tang']"}, page_content='OBJECTIVE: To describe the characteristics and outcomes of patients with severe COVID-19 and in-hospital cardiac arrest (IHCA) in Wuhan, China.\nMETHODS: The outcomes of patients with severe COVID-19 pneumonia after IHCA over a 40-day period were retrospectively evaluated. Between January 15 and February 25, 2020, data for all cardiopulmonary resuscitation (CPR) attempts for IHCA that occurred in a tertiary teaching hospital in Wuhan, China were collected according to the Utstein style. The primary outcome was restoration of spontaneous circulation (ROSC), and the secondary outcomes were 30-day survival, and neurological outcome.\nRESULTS: Data

In [80]:
embeddings = OllamaEmbeddings(model="llama3.2")
llm = OllamaLLM(model="llama3.2")

In [81]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
# retriever = vectorstore.as_retriever(
#     search_type="similarity",  # Options: "similarity", "mmr", "similarity_score_threshold"
#     search_kwargs={"k": 10}  # Number of top results to retrieve
# )

retriever = BM25Retriever.from_documents(splits, k=10)

In [93]:
prompt = PromptTemplate.from_template("You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. \nContext: {context} \nQuestion: {question} \nAnswer:")

def format_docs(docs):
    # return "\n\n".join(doc.page_content for doc in docs)
    return "\n\n".join(
        f"Content: {doc.page_content}\nMetadata: {doc.metadata}" for doc in docs
    )

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [94]:
# question = 'What are the potential adverse effects of aspirin? hemorrhage'
# question = 'Is chest pain a potential adverse effect of aspirin?'
question = 'Is hemorrhage a potential adverse effect of aspirin? If so, provide evidence and paper title in the context.'

rag_chain.invoke(question)




In [97]:
df.loc[df['PMID']=='33392889']

Unnamed: 0,PMID,Title,Journal,Authors,Abstract
3845,33392889,A maternal death due to the intracerebral hemo...,Clinical rheumatology,"['Bijan Keikhaei', 'Najmieh Saadati', 'Mohamma...",Antiphospholipid syndrome (APS) is a systemic ...


look data process in detail

In [91]:
# question = 'Is Asthma an adverse drug reaction of PGE2?'
retrieved_docs = retriever.invoke(question)

print(len(retrieved_docs))

context = "\n\n".join(doc.page_content for doc in retrieved_docs)
# print(context)

# prompt = hub.pull("rlm/rag-prompt")
prompt = PromptTemplate.from_template("You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nContext: {context} \nQuestion: {question} \nAnswer:")

example_messages = prompt.invoke(
    {"context": context, "question": question}
)

10


In [92]:
print(context)

CONCLUSION: The observed increase in TTM quality in this single-center study may be a result of increased competence through learning and training in different strict TTM protocols. If so, the results of this study further support the protocolization of post-cardiac arrest intensive care.

INTRODUCTION: Pharmacologic therapy options for COVID-19 should include antiviral, anti-inflammatory, and anticoagulant agents. With the limited effectiveness, currently available virus-directed therapies may have a substantial impact on global health due to continued reports of mutant variants affecting repeated waves of COVID-19 around the world.
METHODS: We searched articles pertaining to aspirin, COVID-19, acute lung injury and pharmacology in PubMed and provide a comprehensive appraisal of potential use of aspirin in the management of patients with COVID-19. The scope of this article is to provide an overview of the rationale and currently available clinical evidence that supports aspirin as an 

In [86]:
# print(example_messages)

In [87]:
chain = llm | StrOutputParser()
chain.invoke(example_messages)

'Aspirin is known to increase the risk of bleeding due to its antiplatelet properties. The use of aspirin has been associated with an increased risk of gastrointestinal bleeding and hemorrhage. \n\nFor example, a study titled "Aspirin and risk of intracranial hemorrhage" found that long-term use of aspirin was associated with an increased risk of intracranial hemorrhage in patients with cerebral vasculature disease.\n\nIn this context, the information about aspirin\'s potential to cause bleeding is not directly related to COVID-19 management, but it does highlight one of its known adverse effects.'