In [211]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.6/12.8 MB 3.3 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.5 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 5.8 MB/s eta 0:00:02
     ------ --------------------------------- 1.9/12.8 MB 6.2 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 6.3 MB/s eta 0:00:02
     -------- ------------------------------- 2.9/12.8 MB 7.3 MB/s eta 0:00:02
     ----------- ---------------------------- 3


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [212]:
# NER Annotator: https://tecoholic.github.io/ner-annotator/

In [348]:
import spacy
from spacy.tokens import DocBin
import json

# Convert the exported annotation JSON into a spaCy DocBin for training.
nlp = spacy.blank("en")

doc_bin = DocBin()

# JSON is UTF-8 by specification; being explicit keeps the platform default
# encoding from corrupting the text (and thereby shifting character offsets).
with open("data/medical/cardio/annotations (11).json", "r", encoding="utf-8") as file:
    data = json.load(file)

classes = data['classes']
annotations = data['annotations']

print(classes)

for annotation in annotations:
    # Each annotation is indexed as [text, {"entities": [(start, end, label), ...]}].
    text = annotation[0]
    entities = annotation[1]["entities"]
    doc = nlp(text)

    spans = []
    for start, end, label in entities:
        # char_span returns None when the character offsets do not line up
        # with token boundaries; assigning None to doc.ents would raise.
        span = doc.char_span(start, end, label=label)
        if span is None:
            print(f"[WARN] skipping misaligned span ({start}, {end}, {label!r}): {text[start:end]!r}")
            continue
        spans.append(span)
    doc.ents = spans
    doc_bin.add(doc)

doc_bin.to_disk("model/training_data.spacy")

['PROCEDURE', 'CONDITION', 'ANATOMY', 'RISK FACTOR']


In [349]:
classes = ["PROCEDURE","CONDITION","ANATOMY","RISK FACTOR"]

In [353]:
import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

# Train a blank English pipeline with a single NER component on the
# DocBin written by the annotation-conversion cell.
nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")

for label in classes:
    ner.add_label(label)

doc_bin = DocBin().from_disk("model/training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

# Build the training examples once. The predicted side is a fresh,
# unannotated doc (nlp.make_doc) so the gold entities live only on the
# reference side instead of being pre-set on the doc the model predicts on.
examples = [
    Example.from_dict(
        nlp.make_doc(gold_doc.text),
        {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in gold_doc.ents]},
    )
    for gold_doc in docs
]

# Language.initialize replaces the deprecated begin_training in spaCy v3;
# passing the examples lets the pipeline initialise from real data.
nlp.initialize(lambda: examples)

for epoch in range(100):
    losses = {}
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
        # One optimiser step per minibatch (previously each example was
        # updated individually, which made the batching a no-op).
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")

nlp.to_disk("model/trained_model")

Epoch 1, Losses: {'ner': 188.64850878715515}
Epoch 2, Losses: {'ner': 174.05992633104324}
Epoch 3, Losses: {'ner': 149.56314465403557}
Epoch 4, Losses: {'ner': 106.20995605736971}
Epoch 5, Losses: {'ner': 57.92552828416228}
Epoch 6, Losses: {'ner': 33.01989497752402}
Epoch 7, Losses: {'ner': 30.885747153838302}
Epoch 8, Losses: {'ner': 30.380023909266843}
Epoch 9, Losses: {'ner': 29.96656486512864}
Epoch 10, Losses: {'ner': 28.827814668901645}
Epoch 11, Losses: {'ner': 25.589529940352804}
Epoch 12, Losses: {'ner': 25.84038987544045}
Epoch 13, Losses: {'ner': 24.09652822334101}
Epoch 14, Losses: {'ner': 22.27117690544401}
Epoch 15, Losses: {'ner': 23.342379815433446}
Epoch 16, Losses: {'ner': 21.744723172267854}
Epoch 17, Losses: {'ner': 17.98454824582041}
Epoch 18, Losses: {'ner': 20.890236390153476}
Epoch 19, Losses: {'ner': 17.43827848198207}
Epoch 20, Losses: {'ner': 18.31509823103935}
Epoch 21, Losses: {'ner': 15.190976136908422}
Epoch 22, Losses: {'ner': 16.471158693904144}
Epoch 

In [354]:
from spacy import displacy

nlp = spacy.load('model/trained_model')
doc = nlp('''During the examination, the Arteries, Septum, and Pulmonary artery were examined. The Aortic valve and LeftAtrium were assessed for any abnormalities. Michael underwent an Echocardiogram, ECG, Catheterization, and a StressTest to gauge his heart function. Angioplasty, Bypass surgery, and Stenting were performed, and he received a Pacemaker. Ongoing treatment includes managing Cardiomyopathy and Arrhythmias.''')

In [355]:
displacy.render(doc, style='ent')

In [11]:
import numpy as np
from numpy.linalg import norm

In [12]:
from gensim.models import Word2Vec

In [13]:
import nltk
from nltk import word_tokenize, sent_tokenize

In [429]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Keerthi
[nltk_data]     Vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [430]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Keerthi
[nltk_data]     Vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [15]:
def cosine_similarity(v1, v2):
  """Return the cosine of the angle between vectors ``v1`` and ``v2``.

  Args:
    v1: 1-D array-like of numbers.
    v2: 1-D array-like of numbers, same length as ``v1``.

  Returns:
    ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has zero
    norm (the cosine is undefined there; returning 0.0 avoids a NaN and a
    divide-by-zero runtime warning).
  """
  denom = norm(v1) * norm(v2)
  if denom == 0:
    return 0.0
  return np.dot(v1, v2) / denom

In [336]:
from nltk.stem import SnowballStemmer
import json

stemmer = SnowballStemmer(language='english')

class Embedding:
    """Word2Vec embedding trained on a single plain-text corpus file.

    Construction reads ``file``, lower-cases it, splits it into sentences,
    tokenises each sentence with spaCy's ``en_core_web_sm`` (dropping
    punctuation, whitespace and stop words), dumps the token lists to
    ``sentences.txt`` as JSON and trains a skip-gram Word2Vec model on them.
    All lookups lower-case the query word first.
    """

    def __init__(self, file):
        """Train the model from the text file at path ``file``."""
        self.nlp = spacy.load('en_core_web_sm')

        # Context managers guarantee the handles are closed even if reading
        # or serialising raises.
        with open(file, 'r') as f:
            text = f.read()

        sentences = []

        for sent in sent_tokenize(text.lower()):
            s = []
            for word in self.nlp(sent):
                if word.is_punct or word.is_space or word.is_stop:
                    continue
                s.append(word.text)
            sentences.append(s)

        # Side effect: the tokenised corpus is written to the working directory.
        with open('sentences.txt', 'w') as f:
            json.dump(sentences, f)

        self.model = Word2Vec(sentences, vector_size=20, window=10, min_count=1, workers=4, sg=1, epochs=20)

    def has_embed(self, word):
        """Return True if the lower-cased ``word`` has a vector in the model."""
        token = word.lower()
        return token in self.model.wv

    def get_embed(self, word):
        """Return the vector stored for the lower-cased ``word``.

        The lookup is a plain ``self.model.wv[token]`` with no fallback, so
        check ``has_embed`` first for words that may be missing.
        """
        token = word.lower()
        return self.model.wv[token]

    def get_similarity(self, word1, word2):
        """Return the cosine similarity of two words, or 0 if either is unknown."""
        if not (self.has_embed(word1) and self.has_embed(word2)):
            return 0
        emb1 = self.get_embed(word1)
        emb2 = self.get_embed(word2)
        return cosine_similarity(emb1, emb2)

In [337]:
classes

['PROCEDURE', 'CONDITION', 'ANATOMY', 'RISK FACTOR']

In [338]:
embedding = Embedding(file='data/medical/cardio/cardio - long.txt')

In [339]:
embedding.get_similarity('Catheterization', 'StressTest')

0.4169105

In [327]:
embedding.get_similarity('cornea', 'retina')

0

In [24]:
from numpy import mean

tagger = spacy.load('en_core_web_sm')

class TextCorpusSearcher:
    """Scores a candidate word against a fixed set of reference terms.

    The reference terms are joined, lower-cased and tokenised with the
    module-level ``tagger``; the resulting non-space, non-punctuation token
    texts are kept in ``self.x``. Similarities come from ``embedding``.
    """

    def __init__(self, inputs, embedding):
        """Store the reference tokens and warn about those without a vector.

        Args:
            inputs: iterable of reference term strings.
            embedding: object providing ``has_embed`` and ``get_similarity``.
        """
        self.embedding: Embedding = embedding

        doc = ' '.join(inputs).lower()
        self.x = [word.text for word in tagger(doc) if not (word.is_space or word.is_punct)]

        for w in self.x:
            if not self.embedding.has_embed(w):
                print("[WARN]", w, "missing in Word2Vec training data")

    def get_score(self, word):
        """Return the similarity of ``word`` to every embeddable reference token.

        Returns:
            An all-zero numpy array of length ``len(self.x)`` when ``word``
            yields no token, is not tagged NOUN/PROPN, or has no embedding.
            Otherwise a list with one similarity per reference token that has
            an embedding (tokens without one are skipped, so the list can be
            shorter than ``self.x`` and may be empty).
        """
        word = word.lower()
        empty = np.zeros(len(self.x))

        parsed = tagger(word)
        # An empty or unparseable string produces no tokens; indexing [0]
        # would raise IndexError, so treat it like any other rejected word.
        if len(parsed) == 0:
            return empty
        token = parsed[0]

        if token.pos_ not in ('NOUN', 'PROPN') or not self.embedding.has_embed(word):
            return empty

        scores = []
        for w in self.x:
            if not self.embedding.has_embed(w):
                continue
            score = self.embedding.get_similarity(w, word)
            scores.append(score)

        return scores

In [25]:
from spacy.tokens import Span

def create_emb_ner(config_file):
    """Build a spaCy pipeline function that adds embedding-based entities.

    ``config_file`` is a JSON file holding a list of entity definitions, each
    with ``name``, ``labels`` and optional ``min_thresh`` / ``max_thresh``
    (defaults 0.2 / 0.5). For every definition a ``TextCorpusSearcher`` is
    built from the first 85% of its labels, using the module-level
    ``embedding`` object.

    Returns:
        A callable ``doc -> doc``. For every token that is not already part
        of an entity and is tagged NOUN/PROPN, it picks the entity definition
        with the highest maximum similarity among those whose minimum score
        is at least ``min_thresh`` and maximum score is at least
        ``max_thresh``, and appends a one-token span with that name to
        ``doc.ents``.
    """
    # Context manager so the handle is closed even if JSON parsing fails.
    with open(config_file, 'rb') as f:
        config = json.load(f)

    searchers = []

    for e in config:
        labels = e['labels']
        name = e['name']

        max_thresh = e.get('max_thresh', 0.5)
        min_thresh = e.get('min_thresh', 0.2)

        # Only the leading share of the labels is used as reference terms.
        split_ratio = 0.85

        split = int(len(labels) * split_ratio)

        train_terms = labels[:split]

        searcher = TextCorpusSearcher(
            inputs=train_terms,
            embedding=embedding
        )

        searchers.append((searcher, min_thresh, max_thresh, name))

    tagger = spacy.load('en_core_web_sm')

    def custom_ner_component(doc):
        new_entities = list(doc.ents)
        for index, token in enumerate(doc):
            # The token is tagged in isolation with the small English model.
            tag = tagger(token.text)[0].pos_
            if token.ent_type != 0 or tag not in ('NOUN', 'PROPN'):
                continue

            maxScore = 0
            maxLabel = ''

            for searcher, min_thresh, max_thresh, label in searchers:
                scores = searcher.get_score(token.text)
                # get_score can return an empty list when none of the
                # reference terms has an embedding; min()/max() would raise.
                if len(scores) == 0:
                    continue
                currMax = max(scores)
                if min(scores) < min_thresh or currMax < max_thresh:
                    continue

                if currMax > maxScore:
                    maxLabel = label
                    maxScore = currMax

            if maxScore > 0:
                print("Adding tag for ", token, maxLabel, "with score", maxScore)
                new_entities.append(Span(doc, index, index + 1, label=maxLabel))

        doc.ents = new_entities
        return doc

    return custom_ner_component

In [26]:
from spacy import Language

@Language.factory(name='embed_ner5', default_config={})
def create_embedding_component(nlp, name, config_file):
    """spaCy factory for 'embed_ner5': returns the component built from ``config_file``.

    ``nlp`` and ``name`` are accepted but not used by this factory.
    """
    component = create_emb_ner(config_file)
    return component

In [27]:
import spacy

nlp = spacy.blank("en")

In [28]:
import random

In [29]:
# Tokenise the corpus used for sampling negative examples; the context
# manager closes the file even if reading or tokenising raises.
with open("data/medical/cardio/long.txt") as t:
    tokens = word_tokenize(t.read())

In [305]:
import spacy

In [446]:
def test_entity(entity, max_thresh, min_thresh, embedding, force_thresh=False):
    labels = entity['labels']
    random.shuffle(labels)

    split_ratio = 0.8

    min_t = entity.get('min_thresh') or min_thresh
    max_t = entity.get('max_thresh') or max_thresh

    if force_thresh:
        min_t = min_thresh
        max_t = max_thresh

    split = int(len(labels) * split_ratio)

    train_terms = labels[:split]
    test_terms = labels[split:]
    
    searcher = TextCorpusSearcher(
        embedding=embedding,
        inputs=train_terms
    )
    
    x_test = []
    y_test = []

    for w in test_terms:
        x_test.append(w)
        y_test.append(1)
    
    while len(y_test) < len(test_terms) * 2:
        w = random.choice(tokens)
        if w in stop_words or w in ',.:;' or w in labels:
            continue
        x_test.append(w)
        y_test.append(0)

    pred = []
    for w in x_test:
        scores = searcher.get_score(w)
        if len(scores) == 0:
            pred.append(0)
            continue
    
        min_score = min(scores)
        max_score = max(scores)
        p = 1 if (min_score >= min_t and max_score >= max_t) else 0
        pred.append(p)

    return y_test, pred

In [448]:
# Load the entity definitions; the context manager closes the file even if
# parsing fails, and JSON is read as UTF-8 regardless of the platform default.
with open("data/medical/cardio/config - long.json", encoding="utf-8") as f:
    config = json.load(f)

config[2]

{'name': 'Procedure',
 'min_thresh': 0.1,
 'max_thresh': 0.25,
 'labels': ['Angioplasty',
  'Bypass',
  'Echocardiogram',
  'ECG',
  'Stenting',
  'Pacemaker',
  'Defibrillator',
  'Catheterization',
  'Replacement',
  'StressTest']}

In [336]:
from sklearn.metrics import f1_score

def find_ideal_parameters(entity, embedding):
    """Grid-search the (min_thresh, max_thresh) pair with the best worst-case F1.

    ``min_thresh`` runs from 0.5 down towards 0 and ``max_thresh`` from 0.9
    down towards the current ``min_thresh``, both in steps of 0.05. Every
    pair is evaluated three times with ``test_entity`` (thresholds forced)
    and judged by the lowest of the three F1 scores; each pair and its score
    are printed.

    Returns:
        ``((min_thresh, max_thresh), f1)`` for the best pair, or ``((), 0)``
        if no pair scored above zero.
    """
    best_f1 = 0
    best_pair = ()

    for low in np.arange(0.5, 0, -0.05):
        for high in np.arange(0.9, low, -0.05):
            trial_scores = [
                f1_score(*test_entity(entity, min_thresh=low, max_thresh=high, embedding=embedding, force_thresh=True))
                for _ in range(3)
            ]
            worst = min(trial_scores)
            print(low, high, worst)
            if worst > best_f1:
                best_f1, best_pair = worst, (low, high)

    return (best_pair, best_f1)

In [340]:
find_ideal_parameters(config[3], embedding)

[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data
[0.75158334, 0.70949733, 0.8153062, 0.73413795, 0.77935517, 0.5942821]
[0.8314764, 0.8330869, 0.90505284, 0.83104575, 0.82009375, 0.681434]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0.70680743, 0.751471, 0.80465966, 0.6903121, 0.72472525, 0.59209806]
[WARN] myocardial missing in Word2Vec training data
[0.77850825, 0.73413795, 0.8483867, 0.64860904, 0.8204289, 0.83104575, 0.7486363]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]
[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data
[0. 0. 0. 0. 0. 0. 0. 0.]
[0.8483867, 0.69510555, 0.8303921, 0.8259416, 0.90505284, 0.8153062]
[0.8788816, 0.67594117, 0.8257297, 0.81475884, 0.8633801, 0.8091389]
[0. 0. 0. 0. 0. 0. 0. 0.]
0.5 0.9 0.0
[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data
[0. 0. 0. 0. 0. 0. 0. 0

((0.30000000000000004, 0.6499999999999998), 0.8)

In [350]:
from sklearn.metrics import classification_report, f1_score

# Evaluate every entity definition once with the fallback thresholds
# (0.2 / 0.5) and keep both the F1 score and the full text report.
f1_scores, reports = [], []

for i, entity_cfg in enumerate(config):
    y_test, pred = test_entity(entity_cfg, embedding=embedding, min_thresh=0.2, max_thresh=0.5)
    f1_scores.append(f1_score(y_test, pred))
    reports.append(classification_report(y_test, pred))

[0.8270435, 0.7713772, 0.84550637, 0.61871165, 0.81112593]
[0.8089149, 0.76729983, 0.8392029, 0.6705724, 0.77037925]
[0. 0. 0. 0. 0.]
[0.77306104, 0.7612724, 0.8150917, 0.63904655, 0.7522536]
[0.85210574, 0.8025976, 0.8672366, 0.6920707, 0.80494565]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0.63205564, 0.75101864, 0.7164752, 0.76324075, 0.74454564]
[0.7035004, 0.6423799, 0.75721914, 0.7975248, 0.68081623]
[0.61218834, 0.6729439, 0.6559361, 0.58920944, 0.69406515]
[0.49711102, 0.5104952, 0.6294285, 0.63578653, 0.61824083]
[0.6347048, 0.6879401, 0.72359705, 0.71836364, 0.69685364]
[0. 0. 0. 0. 0.]
[0.6657413, 0.74940103, 0.7672932, 0.7130998, 0.7443526]
[0.65194744, 0.7107559, 0.7595134, 0.6653476, 0.762206]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0.74248075, 0.64480156, 0.7341798, 0.7186565, 0.6596327]
[0.6662751, 0.54922, 0.6552174, 0.5793142, 0.6335675]
[0.73404825, 0.5779741, 0.6966443, 0.7112961, 0.62966985]
[0

In [351]:
mean(f1_scores)

0.6752525252525252

In [352]:
# Print each entity's name followed by its classification report.
for i, entity_cfg in enumerate(config):
    print(entity_cfg['name'])
    print(reports[i])

Risk Factors
              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       1.00      0.80      0.89         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10

Anatomy
              precision    recall  f1-score   support

           0       0.50      0.40      0.44         5
           1       0.50      0.60      0.55         5

    accuracy                           0.50        10
   macro avg       0.50      0.50      0.49        10
weighted avg       0.50      0.50      0.49        10

Procedure
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.60      0.60      0.60         5

    accuracy                           0.60        10
   macro avg       0.60      0.60      0.60        10
weighted avg       0.60      0.60      0.60

In [192]:
y_test, pred = test_entity(config[0], min_thresh=0.15, max_thresh=0.25, embedding=embedding)

[0.8058911, 0.4118346, -0.011739332, 0.35001975, 0.4719914, 0.6814054, 0.7674778, 0.6528263]
[0.64200836, 0.44677752, 0.36799562, 0.1549766, 0.45173123, 0.61630654, 0.51842326, 0.5980642]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0.35305232, 0.3468652, 0.010354046, 0.3313469, 0.50458556, 0.21937074, 0.44318378, 0.47722352]


In [193]:
from sklearn.metrics import classification_report


print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



## Without Embedding

In [360]:
from spacy import displacy
import spacy

In [361]:
nlp = spacy.load("model/trained_model")

## Embedding NER test

In [362]:
nlp_e = spacy.load('model/trained_model')

nlp_e.add_pipe('embed_ner5', config={
    'config_file': 'data/medical/cardio/config - long.json', 
})

[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data


<function __main__.create_emb_ner.<locals>.custom_ner_component(doc)>

## With Embedding NER

In [363]:
test_string = '''During the examination, the Arteries, Septum, and Pulmonary artery were examined. The Aortic valve and LeftAtrium were assessed for any abnormalities. Michael underwent an Echocardiogram, ECG, Catheterization, and a StressTest to gauge his heart function. Angioplasty, Bypass surgery, and Stenting were performed, and he received a Pacemaker. Ongoing treatment includes managing Cardiomyopathy and Arrhythmias.'''

In [364]:
doc = nlp(test_string)

In [365]:

displacy.render(doc, style='ent', jupyter=True)

In [366]:
doc = nlp_e(test_string)
displacy.render(doc, style='ent', jupyter=True)

During ADP
the PRON
Searching for examination
Risk Factors [0.7002432, 0.75206953, 0.6373908, 0.61263204, 0.7397246, 0.6943776, 0.7448299, 0.78330255]
Anatomy [0.75295854, 0.20976163, 0.44759104, 0.68742317, 0.6183481, 0.75009507, 0.65927994, 0.53264153]
Procedure [0.5489295, 0.7256304, 0.63539845, 0.6039183, 0.6538365, 0.45868728, 0.45639926, 0.4655003]
Condition [0.7002432, 0.81220454, 0.7892523, 0.73706585, 0.70642656, 0.7584331]
, PUNCT
the PRON
Searching for Arteries
Risk Factors [0.52986705, 0.736725, 0.6209072, 0.66538507, 0.8409203, 0.6648301, 0.77541864, 0.65265185]
Anatomy [0.6006989, 0.030804643, 0.50248057, 0.58097523, 0.5738601, 1.0, 0.44654918, 0.60141647]
Procedure [0.6141833, 0.71278846, 0.73332673, 0.52498275, 0.79832506, 0.61636835, 0.41044253, 0.34079102]
Condition [0.52986705, 0.7377396, 0.71843064, 0.7603771, 0.6796596, 0.7640452]
Adding tag for  Arteries Anatomy with score 1.0
, PUNCT
Septum PROPN
, PUNCT
and CCONJ
Pulmonary PROPN
artery NOUN
were AUX
examined VER