In [1]:
import os
import spacy
import pandas as pd
import re
from spacy.matcher import Matcher
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from sectag_utils import sectag_to_regex, find_segs

In [2]:
header_patterns, seg_names = sectag_to_regex("SecTag.csv", seg_col="str", header_col="kmname")

In [3]:
data_dir = "ClinicalNotes"
clinical_notes = {
    fname: open(os.path.join(data_dir, fname), encoding='utf-8').read()
    for fname in os.listdir(data_dir) if fname.endswith(".txt")
}

In [5]:
def extract_subjective(note):
    sections = find_segs(note, header_patterns, seg_names)
    for head, types, start, end in sections:
        if "subjective" in [t.lower() for t in types]:
            return note[start:end].strip()
    return ""

subjective_texts = {fname: extract_subjective(text) for fname, text in clinical_notes.items()}

In [7]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
symptoms = ["pain", "numbness", "tingling", "insomnia", "asthma", "cough", "fever", "diarrhea", "headache", "depression"]
matcher.add("SYMPTOMS", [[{"LOWER": word}] for word in symptoms])

def extract_symptoms(text):
    doc = nlp(text)
    matches = matcher(doc)
    return list(set([doc[start:end].text.lower() for _, start, end in matches]))

extracted_symptoms = {
    fname: extract_symptoms(text) for fname, text in subjective_texts.items()
}

In [9]:
llm = Ollama(model="llama3.2")
prompt = PromptTemplate(
    input_variables=["text", "condition"],
    template="""
Given the patient's subjective report:
---
{text}
---
Is the condition '{condition}' present or absent? Reply only with 'Present' or 'Absent'.
"""
)

chain = LLMChain(llm=llm, prompt=prompt)

classified_results = {}
for fname in subjective_texts:
    classified_results[fname] = {}
    for symptom in extracted_symptoms[fname]:
        response = chain.run(text=subjective_texts[fname], condition=symptom).strip()
        classified_results[fname][symptom] = response



  chain = LLMChain(llm=llm, prompt=prompt)
  response = chain.run(text=subjective_texts[fname], condition=symptom).strip()


In [10]:
for fname, results in classified_results.items():
    print(f"\n--- {fname} ---")
    for symptom, status in results.items():
        print(f"{symptom}: {status}")


--- sample_214.txt ---

--- sample_2627.txt ---
cough: Present.
headache: Absent.
pain: Present

--- sample_343.txt ---
asthma: Present.
diarrhea: Absent.
pain: Present.

--- sample_365.txt ---
numbness: Present
cough: Present.
pain: Present.
insomnia: Present.
tingling: Present.

--- sample_388.txt ---
fever: Present.
diarrhea: Present.

--- sample_391.txt ---
insomnia: Present.
pain: Present.

--- sample_392.txt ---
depression: Present.

--- sample_452.txt ---
asthma: Absent.
cough: Present

--- sample_70.txt ---
cough: Absent
pain: Present

--- sample_71.txt ---
diarrhea: Absent.
pain: Absent.
