In [1]:
import re

import chardet
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher

import en_core_med7_lg
import en_ner_bc5cdr_md

In [2]:
filename = "../../data/aasm_manual_v25 spell corrected.txt"

# Read the raw bytes; encoding detection works on bytes, not decoded text.
with open(filename, 'rb') as file:
    raw_data = file.read()

# Detection only needs the bytes, so it runs after the file has been closed.
encoding = chardet.detect(raw_data)
if encoding['encoding'] is None:
    # chardet could not decide. Fall back to UTF-8 explicitly instead of letting
    # a later open(..., encoding=None) silently use the platform default.
    encoding['encoding'] = 'utf-8'
print(encoding['encoding'])

utf-8


In [3]:
# Read the whole file as text, decoding it with the encoding detected above.
with open(filename, encoding=encoding['encoding']) as file:
    full_text = file.read()

# NB! Not ideal solution, but this is just a "prototype"
def split_text(full_text, max_length=400):
    """Cut ``full_text`` into pieces of at most ``max_length`` words.

    The text is split on whitespace, so every run of whitespace (including
    newlines) collapses to a single space inside the returned pieces.

    Args:
        full_text: The text to split.
        max_length: Maximum number of whitespace-separated words per piece.

    Returns:
        A list of strings; empty when ``full_text`` contains no words.
    """
    tokens = full_text.split()
    pieces = []
    for offset in range(0, len(tokens), max_length):
        window = tokens[offset:offset + max_length]
        pieces.append(' '.join(window))
    return pieces

chunks = split_text(full_text)

# Turn the ".," artefact into a sentence break in every chunk. This is a plain
# literal replacement, so no regular expression is needed.
# NOTE(review): the later cells visible here iterate over `chunks`, not this
# cleaned list -- confirm which one was intended.
transcription_list = [x.replace(".,", ". ") for x in chunks]

# Kept as the last expression of the cell so the chunk count is actually displayed.
len(chunks)

In [4]:
# Load the med7 spaCy pipeline; `generate_annotation` below reads `nlp` as a module-level name.
nlp = spacy.load("en_core_med7_lg")

def generate_annotation(texts, model=None):
    """Run NER over each text and collect the entity spans it finds.

    Args:
        texts: Iterable of strings to annotate.
        model: Optional pipeline object exposing ``pipe(texts)``; when omitted,
            the notebook-level ``nlp`` pipeline is used.

    Returns:
        A list with one ``(text, {'entities': [(start_char, end_char, label), ...]})``
        tuple per input text, in input order.
    """
    pipeline = nlp if model is None else model
    # Materialise first so a generator can be both fed to the pipeline and zipped below.
    texts = list(texts)
    annotations = []
    # pipe() processes the texts in batches instead of one pipeline call per text.
    for text, doc in zip(texts, pipeline.pipe(texts)):
        entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        annotations.append((text, {'entities': entities}))
    return annotations

# Annotate every chunk of the document with the entities the model finds in it.
medical_doc = chunks
annotations = generate_annotation(medical_doc)

# Report how many chunks were annotated, then show the first chunk's text
# followed by the entities found in it.
first_text, first_entities = annotations[0]
print(f"Number of lines with annotations: {len(annotations)}")
print("Document:", first_text, "Annotations:", first_entities, sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


Number of lines with annotations: 81
Document:
) The AASM Manual for the Scoring of Sleep and Associated Events RULES, TERMINOLOGY AND TECHNICAL SPECIFICATIONS r ..._ VERSION 2.5 Richard B. Berry,MD (Chair); Claude L. Albertario, AST, RPSGT; Susan M. Harding, MD; Robin M. Lloyd, MD; David T. Plante, MD; Stuart F.Quan, MD; Matthew M.Troester, DO; Bradley V.Vaughn, MD; for the American Academy of Sleep Medicine .. ASt.l A American Academy of 1:: IYI SLEEP MEDICINE"' ) J ) :..J ) All Content © 2018 American Academy of Sleep Medicine AASM Scoring Manual Version 2.5 1 ( Copyright 1D 2018 American Academy of Sleep Medicine, 2510 North Frontage Road, Darien, IL 60561, U.S.A. r ( Yearly subscriptions to the online manual are available at www.aasm.org. c All rights reserved. Unauthorized reproduction or transmission of this manual or any portion thereof in any ( form or by any means, mechanical or electronic, is strictly prohibited. ( ( Correspondence regarding copyright permissions should be d

In [None]:
# Give each NER label its own highlight colour for the displaCy rendering.
# The `nlp` pipeline loaded earlier is reused; loading the model again is unnecessary.
s_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
ner_labels = nlp.pipe_labels['ner']
# zip stops at the shorter sequence, so labels beyond the colour list get no entry.
col_dict = dict(zip(ner_labels, s_colours))

options = {'ents': ner_labels, 'colors': col_dict}

# Run the pipeline over the whole document in one pass and render the entities inline.
transcription = full_text
doc = nlp(transcription)

displacy.render(doc, style='ent', jupyter=True, options=options)

# Last expression of the cell: the (text, label) pair of every entity found.
[(ent.text, ent.label_) for ent in doc.ents]

In [7]:
# Token patterns intended to capture a drug mention together with a dose.
# NOTE(review): the last token of the first pattern accepts ANY ASCII token, so
# it also fires on non-dose text that follows a number -- tighten it if
# precision matters.
DOSE_UNITS = ["mg", "g", "ml"]
patterns = [
    [{"ENT_TYPE": "DRUG"}, {"LIKE_NUM": True}, {"IS_ASCII": True}],
    [{"LOWER": {"IN": DOSE_UNITS}}, {"ENT_TYPE": "DRUG"}],
    [{"ENT_TYPE": "DRUG"}, {"IS_DIGIT": True, "OP": "?"}, {"LOWER": {"IN": DOSE_UNITS}}],
]

# Reuse the already-loaded pipeline: the matcher is built on the vocab of the
# same `nlp` object that produces the docs it is applied to.
matcher = Matcher(nlp.vocab)
matcher.add("DRUG_DOSE", patterns)

# pipe() batches the chunks instead of invoking the pipeline once per chunk.
for doc in nlp.pipe(chunks):
    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # hash -> pattern name
        span = doc[start:end]
        print(string_id, start, end, span.text)

DRUG_DOSE 361 364 thc 2017 -201R
DRUG_DOSE 143 146 Nole 2.


In [8]:
# Entity extraction reuses the `nlp` pipeline loaded earlier in the notebook;
# the model does not need to be loaded again here.

def extract_keywords(text, model=None):
    """Return the named entities found in ``text``.

    Args:
        text: The string to analyse.
        model: Optional callable pipeline (``model(text)`` returns an object
            with ``ents``); when omitted, the notebook-level ``nlp`` is used.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order.
    """
    pipeline = nlp if model is None else model
    doc = pipeline(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Keyword lists that define each category. Matching is done by case-insensitive
# substring search (see _mentions_any), so a keyword also matches inside longer words.
# NOTE(review): "surgisis" is kept as written -- confirm it is intentional.
surgery_keywords = ["surgery", "operation", "procedure", "acute Cholangitis", "surgisis", "appendicitis"]
cardio_pul_keywords = ["heart", "cardiovascular", "pulmonary", "lungs"]
orthopaedic_keywords = ["orthopaedic", "bone", "joint", "fracture"]
# Fixed typo: "nervours system" could never match the intended phrase.
neurology_keywords = ["neurology", "nervous system", "brain", "nerve"]
general_med_keywords = ["patient", "complaining", "history", "medical"]


def _mentions_any(lowered_text, keywords):
    """Return True if any keyword occurs in the already-lowercased text.

    Keywords are lowercased before comparison; otherwise a keyword containing
    capitals (e.g. "acute Cholangitis") could never match lowercased text.
    """
    return any(keyword.lower() in lowered_text for keyword in keywords)


# Process each chunk: extract its entities and flag which categories it mentions.
medical_doc = chunks
for transcription in medical_doc:
    # Lowercase once per chunk instead of once per check.
    lowered = transcription.lower()
    entities = extract_keywords(lowered)

    is_surgery = _mentions_any(lowered, surgery_keywords)
    is_cardio_pul = _mentions_any(lowered, cardio_pul_keywords)
    is_orthopaedic = _mentions_any(lowered, orthopaedic_keywords)
    is_neurology = _mentions_any(lowered, neurology_keywords)
    is_general_med = _mentions_any(lowered, general_med_keywords)

    print("Transcription:", transcription)
    print("Entities:", entities)
    print("Is Surgery:", is_surgery)
    print("Is Cardio Pulmonary:", is_cardio_pul)
    print("Orthopaedic:", is_orthopaedic)
    print("Neurology:", is_neurology)
    print("General Medicine:", is_general_med)

Transcription: ) The AASM Manual for the Scoring of Sleep and Associated Events RULES, TERMINOLOGY AND TECHNICAL SPECIFICATIONS r ..._ VERSION 2.5 Richard B. Berry,MD (Chair); Claude L. Albertario, AST, RPSGT; Susan M. Harding, MD; Robin M. Lloyd, MD; David T. Plante, MD; Stuart F.Quan, MD; Matthew M.Troester, DO; Bradley V.Vaughn, MD; for the American Academy of Sleep Medicine .. ASt.l A American Academy of 1:: IYI SLEEP MEDICINE"' ) J ) :..J ) All Content © 2018 American Academy of Sleep Medicine AASM Scoring Manual Version 2.5 1 ( Copyright 1D 2018 American Academy of Sleep Medicine, 2510 North Frontage Road, Darien, IL 60561, U.S.A. r ( Yearly subscriptions to the online manual are available at www.aasm.org. c All rights reserved. Unauthorized reproduction or transmission of this manual or any portion thereof in any ( form or by any means, mechanical or electronic, is strictly prohibited. ( ( Correspondence regarding copyright permissions should be directed to the American Academy 