In [211]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.6/12.8 MB 3.3 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.5 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 5.8 MB/s eta 0:00:02
     ------ --------------------------------- 1.9/12.8 MB 6.2 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 6.3 MB/s eta 0:00:02
     -------- ------------------------------- 2.9/12.8 MB 7.3 MB/s eta 0:00:02
     ----------- ---------------------------- 3


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [212]:
# NER Annotator: https://tecoholic.github.io/ner-annotator/

In [584]:
import spacy
from spacy.tokens import DocBin
import json

nlp = spacy.blank("en")

doc_bin = DocBin()

with open("data/medical/cardio/annotations.json", "r") as file:
    data = json.load(file)

classes = data['classes']
annotations = data['annotations']

print(classes)

for annotation in annotations:
    text = annotation[0]
    entities = annotation[1]["entities"]
    doc = nlp(text)

    ents = []
    for start, end, label in entities:
        ents.append((start, end, label))
    print(text[start:end])
    doc.ents = [doc.char_span(start, end, label=label) for start, end, label in ents]
    doc_bin.add(doc)

doc_bin.to_disk("model/training_data.spacy")

['PROCEDURE', 'CONDITION', 'ANATOMY', 'RISK FACTOR']
Hypertension

Valvular

Smoking

erwent 

Smoking


In [585]:
classes = ['PROCEDURE', 'CONDITION', 'ANATOMY', 'RISK FACTOR']

In [586]:
import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")

for label in classes:
    ner.add_label(label)

doc_bin = DocBin().from_disk("model/training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

nlp.begin_training()

for epoch in range(50):
    losses = {}
    random.shuffle(docs)
    for batch in minibatch(docs, size=8):
        for doc in batch:
            example = Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
            nlp.update([example], drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")

nlp.to_disk("model/trained_model")

Epoch 1, Losses: {'ner': 325.89090210199356}
Epoch 2, Losses: {'ner': 149.2819656105712}
Epoch 3, Losses: {'ner': 58.29640912677149}
Epoch 4, Losses: {'ner': 52.51561563590014}
Epoch 5, Losses: {'ner': 53.67532374996663}
Epoch 6, Losses: {'ner': 54.79898782155966}
Epoch 7, Losses: {'ner': 53.442647355570514}
Epoch 8, Losses: {'ner': 51.83633285453696}
Epoch 9, Losses: {'ner': 49.62014601961854}
Epoch 10, Losses: {'ner': 49.038833676003215}
Epoch 11, Losses: {'ner': 51.47129590815434}
Epoch 12, Losses: {'ner': 48.74042458819077}
Epoch 13, Losses: {'ner': 46.03989764437786}
Epoch 14, Losses: {'ner': 44.68860874950806}
Epoch 15, Losses: {'ner': 43.90137694823637}
Epoch 16, Losses: {'ner': 41.87045299776471}
Epoch 17, Losses: {'ner': 44.681509582083386}
Epoch 18, Losses: {'ner': 43.18403458857136}
Epoch 19, Losses: {'ner': 45.02116698868052}
Epoch 20, Losses: {'ner': 35.63797421937089}
Epoch 21, Losses: {'ner': 41.09884330733681}
Epoch 22, Losses: {'ner': 36.891682939258686}
Epoch 23, Loss

In [356]:
import numpy as np
from numpy.linalg import norm

In [357]:
from gensim.models import Word2Vec

In [358]:
import nltk
from nltk import word_tokenize, sent_tokenize

In [429]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Keerthi
[nltk_data]     Vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [430]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Keerthi
[nltk_data]     Vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [6]:
def cosine_similarity(v1, v2):
  return np.dot(v1, v2) / (norm(v1) * norm(v2))

In [412]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language='english')

class Embedding:
    def __init__(self, file):
        self.nlp = spacy.load('en_core_web_sm')
       
        f = open(file, 'r')
        text = f.read()
        f.close()
        
        sentences = []
        
        for sent in sent_tokenize(text.lower()):
            s = []
            for word in self.nlp(sent):
                if word.is_punct or word.is_space or word.is_stop:
                    continue
                s.append(word.text)
            sentences.append(s)

        f = open('sentences.txt', 'w')
        json.dump(sentences, f)
        f.close()
        
        self.model = Word2Vec(sentences, vector_size=40, window=8, min_count=1, workers=4, sg=1, epochs=25)

    def has_embed(self, word):
        token = word.lower()
        return token in self.model.wv

    def get_embed(self, word):
        token = word.lower()
        return self.model.wv[token]

    def get_similarity(self, word1, word2):
        if not (self.has_embed(word1) and self.has_embed(word2)):
            return 0
        emb1 = self.get_embed(word1)
        emb2 = self.get_embed(word2)
        return cosine_similarity(emb1, emb2)

In [413]:
classes

['PROCEDURE', 'CONDITION', 'ANATOMY', 'RISK FACTOR']

In [414]:
embedding = Embedding(file='data/medical/cardio/long.txt')

In [415]:
embedding.get_similarity('bypass', 'monitoring')

0.7845788

In [416]:
embedding.get_similarity('bypass', 'Defibrillator')

0.8291508

In [320]:
from numpy import mean

tagger = spacy.load('en_core_web_sm')

class TextCorpusSearcher:
    def __init__(self, inputs, embedding):
        self.embedding: Embedding = embedding

        doc = ' '.join(inputs).lower()
        self.x = [word.text for word in tagger(doc) if not (word.is_space or word.is_punct)]

        for w in self.x:
            if not self.embedding.has_embed(w):
                print("[WARN]", w, "missing in Word2Vec training data")

    def get_score(self, word):
        word = word.lower()
        token = tagger(word)[0]

        empty = np.zeros(len(self.x))
        if token.pos_ not in ('NOUN', 'PROPN') or not self.embedding.has_embed(word):
            return empty
        
        scores = []
        for w in self.x:
            if not self.embedding.has_embed(w):
                continue
            score = self.embedding.get_similarity(w, word)
            scores.append(score)
        
        return scores

In [455]:
from spacy.tokens import Span

def create_emb_ner(config_file):
    f = open(config_file, 'rb')
    config = json.load(f)
    f.close()

    searchers = []

    for e in config:
        labels = e['labels']
        name = e['name']

        max_thresh = e.get('max_thresh', 0.5)
        min_thresh = e.get('min_thresh', 0.2)
        
        split_ratio = 0.85

        split = int(len(labels) * split_ratio)

        train_terms = labels[:split]
        
        searcher = TextCorpusSearcher(
            inputs=train_terms,
            embedding=embedding
        )

        searchers.append((searcher, min_thresh, max_thresh, name))

    tagger = spacy.load('en_core_web_sm')

    def custom_ner_component(doc):
        new_entities = [ent for ent in doc.ents]
        for index, token in enumerate(doc):
            tag = tagger(token.text)[0].pos_
            if token.ent_type != 0 or tag not in ('NOUN', 'PROPN'):
                print(token, tag)
                continue
            maxLabel = ""
            
            maxScore = 0
            maxLabel = ''

            print('Searching for', token)
            for searcher, min_thresh, max_thresh, label in searchers:
                scores = searcher.get_score(token.text)
                currMax = max(scores)

                print(label, scores)
                if max(scores) < 0.95 and (min(scores) < min_thresh or currMax < max_thresh):
                    continue
                
                if currMax > maxScore:
                    maxLabel = label
                    maxScore = currMax
            
            if maxScore > 0:
                print("Adding tag for ", token, maxLabel, "with score", maxScore)
                new_entities.append(Span(doc, index, index + 1, label=maxLabel))
        
        doc.ents = new_entities
        return doc
    
    return custom_ner_component

In [301]:
from spacy import Language

@Language.factory(name='embed_ner5', default_config={})
def create_embedding_component(nlp, name, config_file):
    return create_emb_ner(config_file)

In [302]:
import spacy

nlp = spacy.blank("en")

In [303]:
import random

In [304]:
t = open("data/medical/cardio/long.txt")
tokens = word_tokenize(t.read())
t.close()

In [305]:
import spacy

In [446]:
def test_entity(entity, max_thresh, min_thresh, embedding, force_thresh=False):
    labels = entity['labels']
    random.shuffle(labels)

    split_ratio = 0.8

    min_t = entity.get('min_thresh') or min_thresh
    max_t = entity.get('max_thresh') or max_thresh

    if force_thresh:
        min_t = min_thresh
        max_t = max_thresh

    split = int(len(labels) * split_ratio)

    train_terms = labels[:split]
    test_terms = labels[split:]
    
    searcher = TextCorpusSearcher(
        embedding=embedding,
        inputs=train_terms
    )
    
    x_test = []
    y_test = []

    for w in test_terms:
        x_test.append(w)
        y_test.append(1)
    
    while len(y_test) < len(test_terms) * 2:
        w = random.choice(tokens)
        if w in stop_words or w in ',.:;' or w in labels:
            continue
        x_test.append(w)
        y_test.append(0)

    pred = []
    for w in x_test:
        scores = searcher.get_score(w)
        print(scores)
        if len(scores) == 0:
            pred.append(0)
            continue
    
        min_score = min(scores)
        max_score = max(scores)
        p = 1 if (min_score >= min_t and max_score >= max_t) else 0
        pred.append(p)

    return y_test, pred

In [448]:
f = open("data/medical/cardio/config - long.json")
config = json.load(f)
f.close()

config[2]

{'name': 'Procedure',
 'min_thresh': 0.1,
 'max_thresh': 0.25,
 'labels': ['Angioplasty',
  'Bypass',
  'Echocardiogram',
  'ECG',
  'Stenting',
  'Pacemaker',
  'Defibrillator',
  'Catheterization',
  'Replacement',
  'StressTest']}

In [336]:
from sklearn.metrics import f1_score

def find_ideal_parameters(entity, embedding):
    max_f1 = 0
    curr_value = ()

    for i in np.arange(0.5, 0, -0.05):
        for j in np.arange(0.9, i, -0.05):
            scores = []
            for k in range(3):
                y_test, y_pred = test_entity(entity, min_thresh=i, max_thresh=j, embedding=embedding, force_thresh=True)
                scores.append(f1_score(y_test, y_pred))
            scores.sort()
            real_score = min(scores)
            print(i, j, real_score)
            if real_score > max_f1:
                max_f1 = real_score
                curr_value = (i, j)
    
    return (curr_value, max_f1)

In [340]:
find_ideal_parameters(config[3], embedding)

[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data
[0.75158334, 0.70949733, 0.8153062, 0.73413795, 0.77935517, 0.5942821]
[0.8314764, 0.8330869, 0.90505284, 0.83104575, 0.82009375, 0.681434]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0.70680743, 0.751471, 0.80465966, 0.6903121, 0.72472525, 0.59209806]
[WARN] myocardial missing in Word2Vec training data
[0.77850825, 0.73413795, 0.8483867, 0.64860904, 0.8204289, 0.83104575, 0.7486363]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0.]
[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data
[0. 0. 0. 0. 0. 0. 0. 0.]
[0.8483867, 0.69510555, 0.8303921, 0.8259416, 0.90505284, 0.8153062]
[0.8788816, 0.67594117, 0.8257297, 0.81475884, 0.8633801, 0.8091389]
[0. 0. 0. 0. 0. 0. 0. 0.]
0.5 0.9 0.0
[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data
[0. 0. 0. 0. 0. 0. 0. 0

((0.30000000000000004, 0.6499999999999998), 0.8)

In [350]:
from sklearn.metrics import classification_report, f1_score

f1_scores = []
reports = []

for i in range(len(config)):
    y_test, pred = test_entity(config[i], embedding=embedding, min_thresh=0.2, max_thresh=0.5)
    f1_scores.append(f1_score(y_test, pred))
    reports.append(classification_report(y_test, pred))

[0.8270435, 0.7713772, 0.84550637, 0.61871165, 0.81112593]
[0.8089149, 0.76729983, 0.8392029, 0.6705724, 0.77037925]
[0. 0. 0. 0. 0.]
[0.77306104, 0.7612724, 0.8150917, 0.63904655, 0.7522536]
[0.85210574, 0.8025976, 0.8672366, 0.6920707, 0.80494565]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0.63205564, 0.75101864, 0.7164752, 0.76324075, 0.74454564]
[0.7035004, 0.6423799, 0.75721914, 0.7975248, 0.68081623]
[0.61218834, 0.6729439, 0.6559361, 0.58920944, 0.69406515]
[0.49711102, 0.5104952, 0.6294285, 0.63578653, 0.61824083]
[0.6347048, 0.6879401, 0.72359705, 0.71836364, 0.69685364]
[0. 0. 0. 0. 0.]
[0.6657413, 0.74940103, 0.7672932, 0.7130998, 0.7443526]
[0.65194744, 0.7107559, 0.7595134, 0.6653476, 0.762206]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0.74248075, 0.64480156, 0.7341798, 0.7186565, 0.6596327]
[0.6662751, 0.54922, 0.6552174, 0.5793142, 0.6335675]
[0.73404825, 0.5779741, 0.6966443, 0.7112961, 0.62966985]
[0

In [351]:
mean(f1_scores)

0.6752525252525252

In [352]:
for i in range(len(config)):
    print(config[i]['name'])
    print(reports[i])

Risk Factors
              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       1.00      0.80      0.89         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10

Anatomy
              precision    recall  f1-score   support

           0       0.50      0.40      0.44         5
           1       0.50      0.60      0.55         5

    accuracy                           0.50        10
   macro avg       0.50      0.50      0.49        10
weighted avg       0.50      0.50      0.49        10

Procedure
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.60      0.60      0.60         5

    accuracy                           0.60        10
   macro avg       0.60      0.60      0.60        10
weighted avg       0.60      0.60      0.60

In [192]:
y_test, pred = test_entity(config[0], min_thresh=0.15, max_thresh=0.25, embedding=embedding)

[0.8058911, 0.4118346, -0.011739332, 0.35001975, 0.4719914, 0.6814054, 0.7674778, 0.6528263]
[0.64200836, 0.44677752, 0.36799562, 0.1549766, 0.45173123, 0.61630654, 0.51842326, 0.5980642]
[0. 0. 0. 0. 0. 0. 0. 0.]
[0.35305232, 0.3468652, 0.010354046, 0.3313469, 0.50458556, 0.21937074, 0.44318378, 0.47722352]


In [193]:
from sklearn.metrics import classification_report


print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



## Without Embedding

In [587]:
from spacy import displacy

In [588]:
nlp = spacy.load("model/trained_model")

## Embedding NER test

In [589]:
nlp_e = spacy.load('model/trained_model')

nlp_e.add_pipe('embed_ner5', config={
    'config_file': 'data/medical/cardio/config - long.json', 
})

[WARN] myocardial missing in Word2Vec training data
[WARN] atherosclerosis missing in Word2Vec training data


<function __main__.create_emb_ner.<locals>.custom_ner_component(doc)>

## With Embedding NER

In [605]:
test_string = '''A person is diagnosed with HeartFailure and Arrhythmias. Her medical report states obesity compounded by smoking. He underwent Angioplasty days later.'''

In [606]:
doc = nlp(test_string)
displacy.render(doc, style='ent', jupyter=True)

In [596]:
doc = nlp_e(test_string)
displacy.render(doc, style='ent', jupyter=True)

A PRON
Searching for person
Risk Factors [0. 0. 0. 0. 0. 0. 0. 0.]
Anatomy [0. 0. 0. 0. 0. 0. 0. 0.]
Procedure [0. 0. 0. 0. 0. 0. 0. 0.]
Condition [0. 0. 0. 0. 0. 0. 0. 0.]
is AUX
diagnosed VERB
with ADP
Searching for CAD
Risk Factors [0.8139542, 0.8042925, 0.8644917, 0.8316245, 0.83820957, 0.84899145, 0.8768254, 0.8255233]
Anatomy [0.71407616, 0.74984825, 0.76426035, 0.7873922, 0.80722755, 0.7485509, 0.76859593, 0.8090368]
Procedure [0.83672476, 0.8192345, 0.7949721, 0.79534495, 0.8125044, 0.8336864, 0.7621198, 0.78172475]
Condition [0.8139542, 1.0000001, 0.84696394, 0.8385156, 0.8402989, 0.8112252]
Adding tag for  CAD Condition with score 1.0000001
. PUNCT
The PRON
mitral ADJ
and CCONJ
tricuspid NOUN
Searching for valves
Risk Factors [0.8095302, 0.8875664, 0.84914285, 0.87721676, 0.89304656, 0.8523851, 0.8762391, 0.80273104]
Anatomy [0.76501155, 0.8239975, 0.7748511, 0.819748, 0.87005085, 0.7842771, 0.8673595, 0.75860196]
Procedure [0.80091405, 0.8244416, 0.80501616, 0.79842144, 0.82