In [66]:
import os

# Folder the input notes are read from
input_folder_path = "100notes_txt"

# dflow ("data flow") is the single structure handed from pipe to pipe.
#   key   -> filename of the note
#   value -> starts as the original note text and is later replaced by a
#            list that collects the result of each pipe
dflow = dict()

### Pulling the models to Ollama

In [67]:
# Pull the three LLMs that this notebook can be run with (one per pipeline run).
!ollama pull gemma3:27b
!ollama pull llama3.1:8b
!ollama pull mistral-small3.1:latest

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling e796792eba26... 100% ▕████████████████▏  17 GB                         [K
pulling e0a42594d802... 100% ▕████████████████▏  358 B                         [K
pulling dd084c7d92a3... 100% ▕████████████████▏ 8.4 KB                         [K
pulling 3116c5225075... 100% ▕████████████████▏   77 B                         [K
pulling f838f048d368... 100% ▕████████████████▏  490 B                         [K
verifying sha256 digest [K
writing manifest [K
success [K[?25h[?2026l
[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 667b0c1932bc... 100% ▕████████████████▏ 4.9 GB                         [K
pulling 948af2743fc7... 1

### Each run of the whole pipeline uses one LLM. Follow the instructions below:

- Change the first line of the 'ros_extract' and 'ros_classify' files to name the model you want to run <br> - For example, to run gemma3:27b, change the first line to: <br>FROM gemma3:27b <br> - Similarly for the other models:<br>FROM llama3.1:8b <br> FROM mistral-small3.1:latest

After you change the contents of the files, run create_model.sh to create the ros_extract and ros_classify models based on the LLM you want to use

In [68]:
# Run create_model.sh through the shell; the two models it builds are the ones
# loaded later by name ("ros_extract" and "ros_classify").
!source create_model.sh

[?2026h[?25l[1Ggathering model components [K
using existing layer sha256:1fa8532d986d729117d6b5ac2c884824d0717c9468094554fd1d36412c740cfc [K
using existing layer sha256:6db27cd4e277c91264572b9c899c1980daa8dea11e902f0070a6f4763f3d13c8 [K
using existing layer sha256:78f1631bbae9c1601f7ef958c1a2ded966766a1601ca3a4f8b29d0a0c2c60222 [K
using existing layer sha256:77666e5fb92b1205d96fa2c71c8cadfbf56d2ad82636313603ec0113accfd1e1 [K
writing manifest [K
success [K[?25h[?2026l
[?2026h[?25l[1Ggathering model components [K
using existing layer sha256:1fa8532d986d729117d6b5ac2c884824d0717c9468094554fd1d36412c740cfc [K
using existing layer sha256:6db27cd4e277c91264572b9c899c1980daa8dea11e902f0070a6f4763f3d13c8 [K
using existing layer sha256:e112f3568ab826a6f1f317c70809ca002dad47947352d337c7b2b1bbf995df28 [K
using existing layer sha256:77666e5fb92b1205d96fa2c71c8cadfbf56d2ad82636313603ec0113accfd1e1 [K
writing manifest [K
success [K[?25h[?2026l


Set the output folder to the one that corresponds to the model being run

In [69]:
# Output folder for the model currently being run.
# Keep exactly one of the assignments below active, matching the LLM in use.
# output_folder_path = r"Output_gemma3_27b"
output_folder_path = r"Output_mistral-small3.1"
# output_folder_path = r"Output_llama3.1_8b"

# exist_ok=True creates the folder only when it is missing, so no separate
# existence check is needed (and nothing can slip in between check and create).
os.makedirs(output_folder_path, exist_ok=True)

# PIPELINE

## Pipe 1: Load notes

In [None]:
# Read every .txt file of the input folder into dflow, keyed by filename.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of dflow (and of everything written from it) reproducible.
for filename in sorted(os.listdir(input_folder_path)):
    if not filename.endswith('.txt'):  # Process only .txt files
        continue
    file_path = os.path.join(input_folder_path, filename)
    # Explicit encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the notes are UTF-8 encoded - confirm.
    with open(file_path, encoding='utf-8') as f:
        dflow[filename] = f.read()

print(f'{len(dflow)} notes loaded')
dflow

## Pipe 2: Segmentation to extract the ROS section

Function definition

In [None]:
"""Functions Required for ROS Segmentation"""
#====================================================================
import pandas as pd
def sectag_to_regex(header_file_path, seg_col, header_col):
  """Build one line-anchored regex per section header listed in a CSV file.

  Args:
    header_file_path: Path of the CSV file to read.
    seg_col: Name of the column holding the general section name.
    header_col: Name of the column holding the header text to search for.

  Returns:
    A tuple (header_patterns, seg_names) of two lists of equal length, where
    header_patterns[i] is the regex built from the header whose general
    section name is seg_names[i]. Each regex matches the literal header text
    at the start of a line, followed by a newline or a colon.
  """
  header_df = pd.read_csv(header_file_path)
  header_df = header_df.drop_duplicates()
  # A row without header text cannot be searched for; left in, the f-string
  # below would turn the missing value into the literal pattern "nan".
  header_df = header_df.dropna(subset=[header_col])
  headers = header_df[header_col].astype(str).tolist()
  # Escape the header so characters such as ( ) + . ? are matched literally
  # instead of being interpreted as regex syntax (or raising re.error).
  header_patterns = [f'^{re.escape(header)}[\n:]' for header in headers]
  return header_patterns, header_df[seg_col].tolist()

#====================================================================
import re
def find_segs(note, header_patterns, seg_names):
  """Locate section headers in a note and the span of each section.

  Args:
    note: Full text of the note.
    header_patterns: Regex patterns, searched with re.MULTILINE in a
      lower-cased copy of the note (so patterns are expected in lower case).
    seg_names: General section names; seg_names[i] belongs to
      header_patterns[i].

  Returns:
    A list sorted by start position with one entry per distinct
    (header text, start position) pair:
      [header_text, [general section names], start, end]
    header_text is the matched slice of the original note, and end is the
    start of the next entry (len(note) for the last entry).
  """
  # Lower-case once instead of once per pattern.
  lowered = note.lower()
  if len(lowered) != len(note):
    # str.lower() lengthens a few characters (e.g. U+0130), which would shift
    # every later offset relative to `note`. Fall back to a per-character
    # lower-casing that leaves such characters untouched.
    lowered = ''.join(ch.lower() if len(ch.lower()) == 1 else ch for ch in note)

  # Find the section headers and their start positions
  found = {}
  for i, pattern in enumerate(header_patterns):
    for m in re.finditer(pattern, lowered, re.MULTILINE):
      start, end = m.span()
      seg_head = (note[start:end], start)
      # A seg head can have multiple general seg names
      found.setdefault(seg_head, []).append(seg_names[i])

  segs = [[head, names, start] for (head, start), names in found.items()]
  segs.sort(key=lambda x: x[2])

  # Find the entire sections: each one ends where the next one starts
  for i, seg in enumerate(segs):
    seg.append(segs[i+1][2] if i + 1 < len(segs) else len(note))

  return segs

#====================================================================
def ros_seg(note, segs):
  """Collect the review-of-systems (ROS) text of a note.

  Args:
    note: Full text of the note.
    segs: Section entries laid out as [header, names, start, end], in the
      order they should be walked.

  Returns:
    [ros_text, start] for the last section whose names include
    'review_of_systems', with the text of directly following sections
    appended for as long as one of their names contains "review".
    An empty list when no such section exists.
  """
  ros_data = []        # ros text + start position
  collecting = False   # True while we are directly behind a ROS section
  for entry in segs:
    names = entry[1]
    begin = entry[2]
    text = note[begin:entry[3]]

    # Hierarchical subsections inside ROS: keep extending the ROS text while
    # the following sections still look like "review" sections.
    if collecting:
      if any("review" in name for name in names):
        ros_data[0] += text
      else:
        collecting = False  # a different section has started

    # A ROS header (re)starts the collection from this section.
    if 'review_of_systems' in names:
      ros_data = [text, begin]
      collecting = True

  return ros_data

Extraction

In [None]:
# Build the header regexes once, then replace each note's text in dflow with
# the result of ros_seg(): [ROS text, start position], or [] when the note has
# no ROS section.
header_patterns, seg_names = sectag_to_regex(r'SecTag.csv', 'kmname', 'str')
for filename, note in dflow.items():
    # Only raw note text can be segmented. Once this cell has run, the values
    # are lists, so a re-run skips them instead of failing inside find_segs().
    if not isinstance(note, str):
        continue
    segs = find_segs(note, header_patterns, seg_names)
    ros_data = ros_seg(note, segs)
    dflow[filename] = ros_data

dflow


## Pipe 3: LLM to extract diseases, symptoms and body systems and their positive/negative status

Function to remove unnecessary characters from the JSON output of the LLMs

In [None]:
import re
def remove_char_json(text):
    """Return the bracketed part of text, dropping whatever surrounds it.

    The search is greedy and uses re.DOTALL, so the result runs from the first
    '[' to the last ']' in text, across line breaks.

    Args:
        text: String expected to contain a '[' ... ']' span.

    Returns:
        The substring from the first '[' through the last ']'.

    Raises:
        ValueError: If text contains no '[' that is followed by a ']'.
    """
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match is None:
        # Explicit error instead of an AttributeError on None.group().
        raise ValueError("no '[...]' span found in text")
    return match.group()

Extract diseases and convert to json

In [None]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
# Two user turns: the ROS text itself, followed by the formatting instruction.
prompt = ChatPromptTemplate(
    [
        ("user", "{ros_text}"),
        ("user", "convert to json, remove any unnecessary text and make sure the output starts with ["),
    ]
)

# Chat model registered in Ollama under the name "ros_extract".
llm1 = ChatOllama(model="ros_extract")

# Pipe the filled-in prompt into the model.
chain = prompt | llm1

In [None]:
import json
# For every note that has a ROS section, ask the extraction model for the
# entities and store the parsed JSON in slot 2 of the note's dflow entry.
for filename in dflow.keys():
    if dflow[filename] != []:
        ros_text = dflow[filename][0]
        ros_extract = chain.invoke(ros_text)

        # `except Exception` rather than a bare `except`, so that interrupting
        # the kernel (KeyboardInterrupt) during the retry call below stops the
        # loop instead of being recorded as an empty result.
        try:
            ros_extract_json = json.loads(ros_extract.content)
        except Exception:
            try:
                # Try again, assuming the error was due to the AI output being not a JSON
                ros_extract2 = llm1.invoke(f"Convert to json, remove any unnecessary text, for example ```, and make sure the output starts with [: {ros_extract.content}")
                ros_extract2_corrected = remove_char_json(ros_extract2.content)
                ros_extract_json = json.loads(ros_extract2_corrected)
            except Exception:
                # If it still fails, just give an empty JSON list
                ros_extract_json = []

        # Slots 0 and 1 hold the ROS text and its start position. Writing from
        # slot 2 onward is the same as append() on the first run, and on a
        # re-run it replaces the old result (and anything derived from it)
        # instead of pushing a second copy to the end.
        dflow[filename][2:] = [ros_extract_json]

dflow

## Pipe 4: LLM to identify the systems

Function to clean up and organize the ROS entities captured from the LLM output: remove unnecessary characters and group them into ROS categories

In [None]:
import re
def regexp_ros(text):
  """Pull the label that follows the first '-->' in text.

  The label runs up to (not including) the next '(' or line break and is
  returned with surrounding whitespace stripped. When text has no '-->'
  followed by such a label, the string 'None' is returned.
  """
  found = re.search(r"-->\s*(?P<ros>[^(\n]+)", text, re.I)
  return found.group('ros').strip() if found else 'None'

Identify the systems

In [None]:
# Chat model registered in Ollama under the name "ros_classify".
llm2 = ChatOllama(model="ros_classify")

In [None]:
# For every extracted entity, ask the classification model for its system,
# store the cleaned label under the entity's 'sys' key, and keep the raw model
# replies in slot 3 of the note's dflow entry.
for filename in dflow.keys():
    if dflow[filename] != []:
        ros_extracts = dflow[filename][2]
        ai_output = "" # Track the AI output before regex process
        for i in ros_extracts:
            # The entities come from LLM-generated JSON, so their shape is not
            # guaranteed. An entry that is not a dict with an 'extract' key
            # cannot be classified; report it and move on rather than aborting
            # the whole run.
            if not isinstance(i, dict) or 'extract' not in i:
                print(f'{filename}: skipping malformed extract entry {i!r}')
                continue
            ros_cat = llm2.invoke(i['extract']).content
            ai_output += f'###{ros_cat}\n'
            i['sys']=regexp_ros(ros_cat)

        # Writing from slot 3 onward equals append() on the first run and
        # replaces the previous raw output when this cell is re-run.
        dflow[filename][3:] = [ai_output]
dflow

## Pipe 5: Output to files

In [None]:
import csv

# Raw dump of the whole data flow, for inspection.
with open(f'{output_folder_path}/dflow.txt', 'w', encoding='utf-8') as f:
    f.write(str(dflow))

# One CSV row per extracted entity. csv.writer quotes fields that contain
# commas, quotes or line breaks, so free-text extracts cannot shift columns.
# newline='' is what the csv module expects of the file object;
# lineterminator='\n' keeps the line ending used by the header before.
with open(f'{output_folder_path}/results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(["Filename", "Extract", "Status", "Sys"])
    for filename in dflow.keys():
        if dflow[filename] != []:
            ros_results = dflow[filename][2]
            if len(ros_results) == 0:
                writer.writerow([filename, "NA", "NA", "NA"])
            else:
                for i in ros_results:
                    # Entities come from LLM-generated JSON: a missing key (or
                    # a non-dict entry) is written as NA instead of raising.
                    entity = i if isinstance(i, dict) else {}
                    writer.writerow(
                        [filename]
                        + [str(entity.get(key, "NA")) for key in ("extract", "status", "sys")]
                    )