In [13]:
import IO

In [14]:
from transformers import MarianMTModel, MarianTokenizer
def translate_column(column):
    """Apply ``translate`` to every element of ``column`` and return the result.

    ``column`` must expose an ``apply(func)`` method; whatever that call
    returns is handed back unchanged.
    """
    translated_column = column.apply(translate)
    return translated_column

# (tokenizer, model) pairs keyed by model name, so that each checkpoint is
# loaded once per kernel instead of once per translated text.
_translation_models = {}


def _load_translation_model(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _translation_models:
        _translation_models[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _translation_models[model_name]


def translate(text):
    """Translate German ``text`` to English with the MarianMT ``opus-mt-de-en`` model.

    Args:
        text: The text to translate. Only the first decoded sequence is
            returned, so a batch of several texts would lose all but the first.

    Returns:
        The first decoded translation, with special tokens stripped.
    """
    # MarianMT model and tokenizer for German to English translation
    model_name = 'Helsinki-NLP/opus-mt-de-en'
    tokenizer, model = _load_translation_model(model_name)

    # NOTE(review): no truncation is requested, so the handling of very long
    # inputs is left to the model — confirm this is acceptable for the data.
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    


In [15]:
import spacy
# Loaded spaCy pipelines keyed by model name, so that the model is read from
# disk once per kernel instead of once per document.
_spacy_pipelines = {}


def extract_entities(text):
    """Return the named entities that the SciSpacy ``en_core_sci_md`` model finds in ``text``.

    Args:
        text: The text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entity in
        ``doc.ents``, in the order spaCy reports them.
    """
    model_name = "en_core_sci_md"
    if model_name not in _spacy_pipelines:
        _spacy_pipelines[model_name] = spacy.load(model_name)
    nlp = _spacy_pipelines[model_name]

    # The parsed document is no longer printed: that was debugging output
    # which echoed every input text back to the notebook.
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from transformers.generation import TFGenerationMixin
import tensorflow

# Hugging Face checkpoint used for medical named-entity recognition.
_MEDICAL_NER_MODEL = "Clinical-AI-Apollo/Medical-NER"
# Token-level NER pipeline, created lazily and then reused so that the
# checkpoint is loaded once per kernel instead of once per text.
_medical_ner_pipeline = None


def _get_medical_ner_pipeline():
    """Return the shared token-level NER pipeline, building it on first use."""
    global _medical_ner_pipeline
    if _medical_ner_pipeline is None:
        tokenizer = AutoTokenizer.from_pretrained(_MEDICAL_NER_MODEL)
        model = AutoModelForTokenClassification.from_pretrained(_MEDICAL_NER_MODEL)
        _medical_ner_pipeline = pipeline(
            "ner", model=model, tokenizer=tokenizer, aggregation_strategy="none"
        )
    return _medical_ner_pipeline


def _piece_separator(text, token, previous_end):
    """Return "" if ``token`` continues the previous word, otherwise " "."""
    start = token.get('start')
    if isinstance(text, str) and start is not None and previous_end is not None:
        # The pipeline's offsets for a '▁' token begin on the preceding space,
        # so touching offsets alone do not prove that two pieces belong to the
        # same word: the character at `start` must also be non-blank.
        same_word = start == previous_end and not text[start:start + 1].isspace()
        return "" if same_word else " "
    # Without offsets only an explicit WordPiece marker identifies a continuation.
    return "" if token['word'].startswith('##') else " "


def extract_entitiesBert(text):
    """Extract entity strings from ``text`` with the Medical-NER token classifier.

    Consecutive tokens are merged into one entity when a ``B-`` token is
    followed by ``I-`` tokens of the same type. Sub-word pieces of a single
    word are concatenated without a space, while separate words of one entity
    are joined with a single space. Tokens whose label contains
    ``NONBIOLOGICAL_LOCATION`` are ignored.

    Args:
        text: The text to analyse, as a single string.

    Returns:
        A list of entity strings in order of appearance; labels are not returned.
    """
    entities = _get_medical_ner_pipeline()(text)

    processed_entities = []
    current_entity = ""
    current_label = ""
    previous_end = None  # end offset of the last token that was kept

    for entity in entities:
        label = entity['entity']
        if "NONBIOLOGICAL_LOCATION" in label:
            continue
        word = entity['word'].replace('▁', '').replace('##', '')

        if label.startswith("I-") and current_label[2:] == label[2:]:
            # Continue the current entity
            current_entity += _piece_separator(text, entity, previous_end) + word
        else:
            # A "B-" label, or an "I-" label of a different type, closes the
            # entity collected so far and starts a new one.
            if current_entity:
                processed_entities.append(current_entity.strip())
            current_entity = word
            current_label = label
        previous_end = entity.get('end')

    # Append the last entity if it exists
    if current_entity:
        processed_entities.append(current_entity.strip())

    return processed_entities

# Example usage: run extract_entitiesBert on one English sentence and print
# the list of entity strings it returns.
# NOTE: `text` and `entities` are notebook-level names; later cells assign
# them again, so their values depend on which cell ran last.
text = "Aspirin is a medication used to reduce pain, fever, or inflammation."
entities = extract_entitiesBert(text)
print(entities)




Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['pain', 'fever', 'inflammation']


In [17]:
# Inspect the raw pipeline output for the example sentence, without the
# merging that extract_entitiesBert performs on top of it.
# NOTE: this cell binds the notebook-level names `text`, `tokenizer`, `model`
# and `nlp`, and loads the checkpoint again.
text = "Aspirin is a medication used to reduce pain, fever, or inflammation."
tokenizer = AutoTokenizer.from_pretrained("Clinical-AI-Apollo/Medical-NER")
model = AutoModelForTokenClassification.from_pretrained("Clinical-AI-Apollo/Medical-NER")

# Create a pipeline for NER (no aggregation_strategy is passed here, unlike
# in extract_entitiesBert, which passes "none")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Get the entities; as the last expression of the cell, the result is
# displayed as the cell output
nlp(text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-SIGN_SYMPTOM',
  'score': 0.25873363,
  'index': 8,
  'word': '▁pain',
  'start': 38,
  'end': 43},
 {'entity': 'B-SIGN_SYMPTOM',
  'score': 0.31540182,
  'index': 10,
  'word': '▁fever',
  'start': 44,
  'end': 50},
 {'entity': 'B-SIGN_SYMPTOM',
  'score': 0.25196818,
  'index': 13,
  'word': '▁inflammation',
  'start': 54,
  'end': 67}]

In [21]:
# Load the 'Anamnese + Befund Whole Text' column, translate every entry and
# then extract the entities of every translation (one result per entry).
german_text = IO.load("../data/patients++.csv")['Anamnese + Befund Whole Text']
translated_text = list(map(translate, german_text))
entities = list(map(extract_entitiesBert, translated_text))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [22]:
# Show, for each entry in turn: the German original, its translation and the
# entities extracted from the translation, each on its own line.
for i in range(len(german_text)):
    print(german_text.iloc[i], translated_text[i], entities[i], sep="\n")

Vor 1 Monate Nachts plötzliche Kniegelenksblockade linkes Kniegelenk. Vorstellung im Krankenhaus Gummersbach. Im Verlauf Vorstellung im Krankenhaus Engelskirchen mit Röntgen und Frakturausschluss. Vor 3 Jahren Voroperation im KH-Engelskirchen. Als Kind schon rezdivierende Blockadeereignisse. Das linke Knie kann eingeschränkt ausgestreckt und schwer gebeugt werden. Es gibt keine Flüssigkeitsansammlung im Knie. Es gibt deutliche Schmerzen außen seitlich beim Druck.
1 months ago sudden knee joint blockage left knee joint. Presentation at the hospital Gummersbach. In the course of presentation at the hospital Engelskirchen with X-ray and fracture exclusion. 3 years ago pre-operation in KH-Engelskirchen. As a child already recurrent blockade events. The left knee can be stretched out and severely bent. There is no accumulation of fluid in the knee. There is clear pain outside at the pressure.
['1 months ago', 'sudden', 'knee joint', 'blockage', 'left knee joint', 'Presentation', 'presentati