In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("./data/questionnaire_data.csv").dropna()
data.EXA = data.EXA.str.replace("\\n", " ")
data.ANA = data.ANA.str.replace("\\n", " ")



# Examination Interpretation

In [3]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/juliankraus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliankraus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Textual Adaptions

In [4]:
# List of strings that should not be split
do_not_split = [
    "li.",
    "bds.",
    "med.",
    "neg.",
    "lat.",
    'pos.',
    "re.",
    "mm, links",
    "mm, rechts",
    "mm, li",
    "mm, re",
    "o.B",
    "mm, verletzt",
    "mm, Seitendifferenz",
    "diskr.",
    "flüssig und mit"
]
def custom_split(text, exceptions):
    for exception in exceptions:
        text = re.sub(re.escape(exception), re.sub(r'[^\w\s]', '', exception), text, flags=re.IGNORECASE)
    parts = re.split(r'[.,;]|\sund\s', text)
    parts = [part.strip() for part in parts if part.strip()]
    return parts

In [5]:
# Preprocess the text
pattern_pre = r'\d{1,3}/\d{1,3}/\d{1,3}|\d{1,3}-\d{1,3}-\d{1,3}'

pattern_side = r'(\b\d+[\d.,]*\b)\s*seitendifferenz|\bseitendifferenz\s*(\d+[\d.,]*\b)'


def adaptText(match):
    # Extract the matched text
    matched_text = match.group(0)
    
    # Split the matched text by either '/' or '-'
    if '/' in matched_text:
        parts = matched_text.split('/')
    elif '-' in matched_text:
        parts = matched_text.split('-')
    
    # Convert parts to integers
    parts = list(map(int, parts))
    
    # Evaluate each part in the context of the "Neutral-Null-Methode"
    hyperextension = parts[0]
    extension_deficit = parts[1]
    flexion = parts[2]
    
    # Create descriptions based on the measurements
    if hyperextension > 0:
        hyperextension_desc = "überstreckung des knies ist möglich,"
    else:
        hyperextension_desc = "keine überstreckung des knies,"
    
    if extension_deficit > 0:
        extension_deficit_desc = "streckdefizit des knies,"
    else:
        extension_deficit_desc = "kein streckdefizit des knies,"
    
    if flexion >= 120:
        flexion_desc = "gute beugung des knies,"
    elif flexion >= 90:
        flexion_desc = "angemessene beugung des knies,"
    else:
        flexion_desc = "eingeschränkte beugung des knies,"
    
    # Combine the descriptions into a final string
    result = f",{hyperextension_desc} {extension_deficit_desc} {flexion_desc}"
    return result

def adaptText_2(match):
    # Extract the matched text
    matched_text = match.group(0)
    
    number = match.group(1) if match.group(1) else match.group(2)
    if int(number) > 3:
        return "wacklig"
    else:
        return ""

def preprocess(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub('ds', 'druckschmerz', text, flags=re.IGNORECASE)
    text = re.sub('medial', 'innere', text, flags=re.IGNORECASE)
    text = re.sub('gelenkspalt', 'seite des kniegelenks', text, flags=re.IGNORECASE)



    text = re.sub('lateral', 'äußere', text, flags=re.IGNORECASE)
    text = re.sub('lat', 'äußere', text, flags=re.IGNORECASE)
    text = re.sub('gs', 'seite des kniegelenks', text, flags=re.IGNORECASE)
    text = re.sub('li ', 'links ', text, flags=re.IGNORECASE)
    text = re.sub('re ', 'rechts ', text, flags=re.IGNORECASE)
    text = re.sub('gb ', 'gangbild ', text, flags=re.IGNORECASE)
    text = re.sub('patellafacette', 'kniescheibe', text, flags=re.IGNORECASE)
    text = re.sub('patella', 'kniescheibe', text, flags=re.IGNORECASE)
    text = re.sub('erguss', 'schwellung', text, flags=re.IGNORECASE)
    text = re.sub('lcp', 'fühle mich', text, flags=re.IGNORECASE)
    text = re.sub('lachman neg', '', text, flags=re.IGNORECASE)
    text = re.sub('lachman pos', 'fühle mich unsicher', text, flags=re.IGNORECASE)


    text = re.sub(pattern_side, adaptText_2, text, flags=re.IGNORECASE)


    text = re.sub(pattern_pre, adaptText, text, flags=re.IGNORECASE)

    text = re.sub(r'[^\w\s/-]', '', text)
    return text

In [6]:
patterns = [
    r'^\d{1,3}/\d{1,3}/\d{1,3}$',  # Pattern for 1-3 digits/numbers/1-3 digits/numbers/1-3 digits/numbers
    r'^\d{1,3}-\d{1,3}-\d{1,3}$',  # Pattern for 1-3 digits-numbers-1-3 digits-numbers-1-3 digits-numbers
    r'.*gangbild.*',               
    r'.*druck.*',                 
    r'.*ergu.*',              
    r'.*schwell.*',       
    r'.*rötung.*',          
    r'.*überstreckung.*',     
    r'.*fühle.*',   
    r'.*wacklig.*',     


]

# Combine patterns into a single regular expression
combined_pattern = r'|'.join(patterns)

In [7]:
inter_df = data.copy()
inter_df = inter_df.fillna(value="")

# Filter the list
for idx, row in inter_df.iterrows():
    text = row['EXA']
    if (text is not None) and text != "":
        parts = custom_split(text, do_not_split)  # Custom split by exceptions
        parts = list(map(preprocess, parts))
        
        filtered_parts = [s for s in parts if s and re.match(combined_pattern, s, re.IGNORECASE)]
        
        inter_df.at[idx, 'EXA_formatted'] = ', '.join(filtered_parts)
            
    else:
        inter_df.at[idx, 'EXA_formatted'] = ""

In [8]:
inter_df

Unnamed: 0,patientID,DIA,DIA_text,ANA,EXA,gender,age,questionnaire,EXA_formatted
0,308184,M23.32,Z.n. Sonstige Meniskusschädigungen: Hinterhorn...,Am 5.8.24 beim Aufrichten aus der Hocke plötzl...,"Beinachse gerade, Linkes Kniegelenk: Extension...",Männlich,37,Beim ausrechnen aus der tiefen Hocke\t Leicht...,linkes kniegelenk extension/flexion keine über...
1,308194,M23.32,Innenmeniskus-Komplexriss im Hinterhorn links,2005 und 2014 AC Knie mit IM Teilresektion im ...,"Beinachse 2 QF varisch, \nLinkes Kniegelenk: E...",Männlich,63,Grundsätzlich kein akutes Ereignis außer jetzt...,linkes kniegelenk extension/flexion keine über...
2,308197,M23.34,Sonstige Meniskusschädigungen: Vorderhorn des ...,2019 AC Knie re wegen AM VORderhornganglion un...,Beinachse gerade\nRechtes Kniegelenk: Extensio...,Männlich,61,Nach dem aufstehen aus der Hocke Schwellung l...,"kein gelenkschwellung, fühle mich stabil, mäßi..."
3,60081,M23.32,Innenmeniskus-Riss im Hinterhorn links,"WV mit MRT Bildern links, vor einer Woche nach...",Beinachse gerade\nLinkes Kniegelenk: Extension...,Männlich,48,Beim gehen umgeknickt. Beschwerden und Schmerz...,"kein gelenkschwellung, fühle mich stabil, druc..."
4,307082,S83.2,Innenmeniskuskorbhenkelruptur links,Z.n. AC Knie rechts mit Im TR in domo 06/24. M...,Beinachse gerade\nGB flüssig\nLinkes Kniegelen...,Männlich,49,Danach schlimmer geworden gesagt Position\t K...,"kein gelenkschwellung, fühle mich stabil, druc..."


## Formulation

In [9]:
from mlx_lm import load, generate
import pandas as pd


In [10]:
%%bash
python -m mlx_lm.convert --hf-path jphme/em_german_leo_mistral --mlx-path model/ -q 

[INFO] Loading


Fetching 8 files: 100%|██████████| 8/8 [00:00<00:00, 143395.01it/s]


[INFO] Quantizing


In [11]:
model, tokenizer = load("model/")

In [12]:
few_shot_examples = [
    ("links knie diskr schwellung, kein anteromedialer druckschmerz med äußerechts Seite des Kniegelenks, kein druckschmerz kniescheiben, keine meniskuszeichen rechts knie keine schwellung, druckschmerz proximal seitlich kniescheibe", 
     "Mein linkes Knie hat keine Schwellung, und es gibt keine Schmerzen beim Drücken auf der äußerechts Seite des Kniegelenks und der Kniescheibe. Mein rechtes Knie hat auch keine Schwellung, aber in der Nähe der Seite der Kniescheibe, gibt es Schmerzen beim Drücken."),
    ("rechts knie deutlicher schwellung, keine überstreckung des knies, streckdefizit des knies, angemessene beugung des knies, keine rötung", 
     "Mein rechtes Knie hat eine Schwellung, ich kann es nicht überstrecken und auch nicht komplett strecken. Die Beugung des Knies ist nur angemessen möglich. Mein Knie ist nicht gerötet."),
]

# Construct the few-shot prompt
instruction = "Du bist ein Patient mit den folgenden Symptomen. Ersetze alle lateinischen und deutschen Fachbegriffe und formuliere es umgangssprachlich aus deiner Sicht."
few_shot_prompt = f"{instruction}\n\n" + "\n\n".join([f"USER: {inp}\nASSISTANT: {out}" for inp, out in few_shot_examples])

In [13]:
def generate_response(text):
    if (text != "") and (text is not None) and not (isinstance(text, float)):
        prompt = f"{few_shot_prompt}\n\nUSER: {text}\nASSISTANT:"
        generated_text = generate(model, tokenizer, prompt=prompt, max_tokens=200, verbose=False)
        return generated_text
    return ""

In [14]:
def remove_after_newline(text):
    return text.split('\n')[0] if isinstance(text, str) else text

In [15]:
for idx, row in inter_df.iterrows():
    text = row['EXA_formatted']
    ouput_text = generate_response(text)
    inter_df.at[idx, 'EXA_interpreted'] = remove_after_newline(ouput_text)

## Labels

In [16]:
inter_df.to_csv("./data/questionnaire_inter.csv")

In [2]:
import pandas as pd
inter_df = pd.read_csv("./data/questionnaire_inter.csv")

In [3]:
inter_df

Unnamed: 0.1,Unnamed: 0,patientID,DIA,DIA_text,ANA,EXA,gender,age,questionnaire,EXA_formatted,EXA_interpreted
0,0,308184,M23.32,Z.n. Sonstige Meniskusschädigungen: Hinterhorn...,Am 5.8.24 beim Aufrichten aus der Hocke plötzl...,"Beinachse gerade, Linkes Kniegelenk: Extension...",Männlich,37,Beim ausrechnen aus der tiefen Hocke\t Leicht...,linkes kniegelenk extension/flexion keine über...,Mein linkes Kniegelenk hat keine Überstreckung...
1,1,308194,M23.32,Innenmeniskus-Komplexriss im Hinterhorn links,2005 und 2014 AC Knie mit IM Teilresektion im ...,"Beinachse 2 QF varisch, \nLinkes Kniegelenk: E...",Männlich,63,Grundsätzlich kein akutes Ereignis außer jetzt...,linkes kniegelenk extension/flexion keine über...,"Mein linkes Kniegelenk hat keine Schwellung, i..."
2,2,308197,M23.34,Sonstige Meniskusschädigungen: Vorderhorn des ...,2019 AC Knie re wegen AM VORderhornganglion un...,Beinachse gerade\nRechtes Kniegelenk: Extensio...,Männlich,61,Nach dem aufstehen aus der Hocke Schwellung l...,"kein gelenkschwellung, fühle mich stabil, mäßi...",Ich habe keine Gelenkschwellung und fühle mich...
3,3,60081,M23.32,Innenmeniskus-Riss im Hinterhorn links,"WV mit MRT Bildern links, vor einer Woche nach...",Beinachse gerade\nLinkes Kniegelenk: Extension...,Männlich,48,Beim gehen umgeknickt. Beschwerden und Schmerz...,"kein gelenkschwellung, fühle mich stabil, druc...",Ich habe keine Gelenkschwellung und fühle mich...
4,4,307082,S83.2,Innenmeniskuskorbhenkelruptur links,Z.n. AC Knie rechts mit Im TR in domo 06/24. M...,Beinachse gerade\nGB flüssig\nLinkes Kniegelen...,Männlich,49,Danach schlimmer geworden gesagt Position\t K...,"kein gelenkschwellung, fühle mich stabil, druc...",Ich habe keine Gelenkschwellung und fühle mich...


In [4]:
# Function to translate sex
def translate_sex(sex):
    if sex == 'male':
        return 'männlich '
    elif sex == 'female':
        return 'weiblich '
    else:
        return ""
    
def format_age(age):
    if not pd.isna(age):
        return str(age) + " Jahre alt "
    else:
        return ""

cols = ['ANA', 'EXA_interpreted']

# Ensure age and sex are included in the DataFrame and handle NaN values
inter_df['data'] = inter_df.apply(lambda row: (format_age(row['age']) + translate_sex(row['gender']) + ". ".join(row[cols].dropna().astype(str))
), axis=1)

In [5]:
meniskus = [
    "M23.3",
    "S83.2", # Rupture
]
meniskus_low = [
    "M23.0", # Ganglion doesn't seem to urgent
    "M23.1", # something you have from birth
    "M23.2", # problems because of old rupture
    "M23.9"  # Same as M23.8
]
cruciate = [
    "M23.6", # Ruptur
    "S83.50", # might be rupture
    "S83.53", # Rupture
    "S83.54", # Rupture
    "S83.7" # Multiple areas
]
cruciate_low = [
    "M23.8", # Seems to be no rupture, rather weakness, still operations
    "S83.51", # Distorsion
    "S83.52", # Distorsion
]
other_urgent = [
    "S83.3" # Rupture Kniegelenkknorpel
    "S83.4" # Rupture Seitenband
]
other = [
    "M17",
    "M22",  
    "M23.4", # Freikörper, does have a lot of operations
    "M23.5", # instability
    "S83.1", # Only luxation
    "S83.6" # Distorsion somewhere
]
def createLabel(code):
    if any(elem in code for elem in meniskus):
        return "meniskus-urgent"
    elif any(elem in code for elem in meniskus_low):
        return "meniskus-minor"
    elif any(elem in code for elem in cruciate):
        return "cruciate-ligament-urgent"
    elif any(elem in code for elem in cruciate_low):
        return "cruciate-ligament-minor"
    elif any(elem in code for elem in other_urgent):
        return "other-urgent"
    elif any(elem in code for elem in other):
        return "other-minor"
    else:
        return "other-minor"
    
inter_df['label'] = inter_df['DIA'].apply(createLabel)

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
import math
def preprocess_lstm(text):
    with open("tokenizer.pickle", 'rb') as file:
        tokenizer = pickle.load(file)
    tokens = tokenizer.texts_to_sequences([text])

    max_length = 222
    # Padding sequences
    text_padded = pad_sequences(tokens, maxlen=max_length, padding='post')
    return text_padded

def eval_lstm(prediction, label_encoder):
   
    probabilities = prediction[0]

    # Decode all class labels
    all_labels = label_encoder.inverse_transform(np.arange(len(probabilities)))

    # Build the prediction string and sum probabilities for "major" labels
    prediction_string = ""
    sum_major_probabilities = 0.0
    for idx in range(len(probabilities)):
        label = all_labels[idx]
        probability = probabilities[idx]
        prediction_string += f"{label}: {probability:.4f}; "
        if "urgent" in label:
            sum_major_probabilities += probability

    sum_major_probabilities = math.floor(sum_major_probabilities * 100) / 100

    print(sum_major_probabilities)
    print(prediction_string)


    return sum_major_probabilities, prediction_string




In [8]:
from keras.models import load_model
import pickle
# Define paths for the saved files
model_type = 'lstm'
model_name = 'final_keywords'
model_path = f'./models/{model_type}/{model_name}/{model_name}.h5'
history_path = f'./data/{model_type}/{model_name}/{model_name}_history.pkl'
data_path = f'./data/{model_type}/{model_name}/{model_name}_test_data.pkl'
label_path = f'./data/{model_type}/{model_name}/{model_name}_test_label.pkl'
encoder_path = f'./data/{model_type}/{model_name}/{model_name}_encoder.pkl'

# Load the model
model = load_model(model_path)
print(f"Model loaded from {model_path}")

# Load the history
with open(history_path, 'rb') as file:
    history = pickle.load(file)

# Load the test data
with open(data_path, 'rb') as file:
    test_data = pickle.load(file)

# Load the test labels
with open(label_path, 'rb') as file:
    test_labels = pickle.load(file)

# Load the encoder
with open(encoder_path, 'rb') as file:
    label_encoder = pickle.load(file)

print("Data loaded successfully")


2024-08-13 14:25:34.374616: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Max
2024-08-13 14:25:34.374648: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2024-08-13 14:25:34.374655: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2024-08-13 14:25:34.374670: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-13 14:25:34.374686: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model loaded from ./models/lstm/final_keywords/final_keywords.h5


  saveable.load_own_variables(weights_store.get(inner_path))


Data loaded successfully


In [9]:
data_label = 'data'
for idx, row in inter_df.iterrows():    
    text = preprocess_lstm(row[data_label])

    prediction = model.predict(text)

    risk, risk_string = eval_lstm(prediction, label_encoder)    

    inter_df.at[idx, "" + data_label + "_prediction"] = risk_string

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 245ms/step


2024-08-13 14:25:38.480927: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


0.91
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0000; meniskus-urgent: 0.9151; other-minor: 0.0849; 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
0.14
cruciate-ligament-minor: 0.0001; cruciate-ligament-urgent: 0.0002; meniskus-minor: 0.0252; meniskus-urgent: 0.1456; other-minor: 0.8289; 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
0.99
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0001; meniskus-urgent: 0.9987; other-minor: 0.0013; 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0.99
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0000; meniskus-urgent: 0.9998; other-minor: 0.0002; 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
0.99
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0000; meniskus-urgent: 1.0000; other-minor: 0.0000

In [10]:
data_label = 'questionnaire'
for idx, row in inter_df.iterrows():    
    print("1")
    text = preprocess_lstm(row[data_label])
    print("1")

    prediction = model.predict(text)
    print("1")

    risk, risk_string = eval_lstm(prediction, label_encoder)    
    print("1")

    inter_df.at[idx, "" + data_label + "_prediction"] = risk_string

1
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
1
0.97
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0000; meniskus-urgent: 0.9717; other-minor: 0.0283; 
1
1
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
1
0.0
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0000; meniskus-urgent: 0.0005; other-minor: 0.9994; 
1
1
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
1
0.99
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0003; meniskus-urgent: 0.9940; other-minor: 0.0057; 
1
1
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
1
0.99
cruciate-ligament-minor: 0.0000; cruciate-ligament-urgent: 0.0000; meniskus-minor: 0.0000; meniskus-urgent: 1.0000; other-minor: 0.0000; 
1
1
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
1
0.71
cruciate-ligament-minor: 0

In [None]:
inter_df.to_csv("./data/questionnaire_pred.csv")