In [211]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.6/12.8 MB 3.3 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.5 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 5.8 MB/s eta 0:00:02
     ------ --------------------------------- 1.9/12.8 MB 6.2 MB/s eta 0:00:02
     ------- -------------------------------- 2.3/12.8 MB 6.3 MB/s eta 0:00:02
     -------- ------------------------------- 2.9/12.8 MB 7.3 MB/s eta 0:00:02
     ----------- ---------------------------- 3


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [212]:
# NER Annotator: https://tecoholic.github.io/ner-annotator/

In [17]:
import spacy
from spacy.tokens import DocBin
import json

nlp = spacy.blank("en")

doc_bin = DocBin()

with open("data/annotations_med.json", "r") as file:
    data = json.load(file)

classes = data['classes']
annotations = data['annotations']

for annotation in annotations:
    text = annotation[0]
    entities = annotation[1]["entities"]

    doc = nlp(text)

    ents = []
    for start, end, label in entities:
        ents.append((start, end, label))

    doc.ents = [doc.char_span(start, end, label=label) for start, end, label in ents]

    doc_bin.add(doc)

doc_bin.to_disk("model/training_data.spacy")

In [18]:
classes = ('PROCEDURE / TEST', 'DRUG', 'CONDITION', 'SYMPTOM')

In [19]:
import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")

for label in classes:
    ner.add_label(label)

doc_bin = DocBin().from_disk("model/training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

nlp.begin_training()

for epoch in range(100):
    losses = {}
    random.shuffle(docs)
    for batch in minibatch(docs, size=8):
        for doc in batch:
            example = Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
            nlp.update([example], drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")

nlp.to_disk("model/trained_model")

Epoch 1, Losses: {'ner': 962.2629118859768}
Epoch 2, Losses: {'ner': 142.46544225673108}
Epoch 3, Losses: {'ner': 108.40111186464276}
Epoch 4, Losses: {'ner': 103.71139129452962}
Epoch 5, Losses: {'ner': 102.44336059241577}
Epoch 6, Losses: {'ner': 93.48110673990135}
Epoch 7, Losses: {'ner': 90.24217953867446}
Epoch 8, Losses: {'ner': 78.65201494982807}
Epoch 9, Losses: {'ner': 78.7146151953392}
Epoch 10, Losses: {'ner': 70.98430156292133}
Epoch 11, Losses: {'ner': 72.75402945383172}
Epoch 12, Losses: {'ner': 68.91185505818275}
Epoch 13, Losses: {'ner': 61.294393109088034}
Epoch 14, Losses: {'ner': 66.53415800741051}
Epoch 15, Losses: {'ner': 64.08468256028532}
Epoch 16, Losses: {'ner': 51.83209046303569}
Epoch 17, Losses: {'ner': 56.27322734381741}
Epoch 18, Losses: {'ner': 53.83540639402655}
Epoch 19, Losses: {'ner': 60.29831454971595}
Epoch 20, Losses: {'ner': 56.80565134748797}
Epoch 21, Losses: {'ner': 47.55866545214897}
Epoch 22, Losses: {'ner': 44.33317708948885}
Epoch 23, Losse

In [20]:
import numpy as np
from numpy.linalg import norm

In [21]:
from gensim.models import Word2Vec

In [22]:
import nltk
from nltk import word_tokenize, sent_tokenize

In [23]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Keerthi
[nltk_data]     Vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\Keerthi
[nltk_data]     Vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [26]:
def cosine_similarity(v1, v2):
  return np.dot(v1, v2) / (norm(v1) * norm(v2))

In [27]:
from nltk import pos_tag

In [28]:
from nltk.stem.snowball import SnowballStemmer

In [527]:
from numpy import mean


class TextCorpusSearcher:
    def __init__(self, filename, x, label):
        self.label = label

        nlp = spacy.load('en_core_web_sm')
        doc = ' '.join(x).lower()

        # self.stemmer = SnowballStemmer(language='english')
        self.x = [word.text for word in nlp(doc) if not (word.is_space or word.is_punct)]
        text = self.get_text(filename)
        sentences = []
        
        for sent in sent_tokenize(text.lower()):
            s = []
            for word in nlp(sent):
                if word.is_punct or word.is_space:
                    continue
                s.append(word.text)
            sentences.append(s)

        self.model = Word2Vec(sentences, vector_size=100, window=8, min_count=1, sg=1)
        self.model.train(sentences, total_examples=self.model.corpus_count, epochs=100)

        for w in self.x:
            if w not in self.model.wv:
                print("[WARN]", w, "missing in Word2Vec training data")

    def get_text(self, filename):
        with open(filename) as f:
            return f.read() 
    
    def get_embed(self, word):
        return self.model.wv[word.lower()]
    
    def has_embed(self, word):
        return word in self.model.wv

    def get_score(self, word):
        global curr_model
        word = word.lower()
        tag = pos_tag([word])[0][1]
        empty = np.zeros(len(self.x))

        if not tag.startswith('NN') or word not in self.model.wv:
            return empty
        
        curr_model = self.model
        scores = []
        for w in self.x:
            if w not in self.model.wv:
                continue
            score = cosine_similarity(self.model.wv[word], self.model.wv[w])
            scores.append(score)
        
        return scores    

In [493]:
from spacy.tokens import Span

def create_emb_ner(config_file):
    f = open(config_file, 'rb')
    config = json.load(f)
    f.close()

    searchers = []

    for e in config:
        labels = e['labels']
        name = e['name']

        max_thresh = e.get('max_thresh', 0.5)
        min_thresh = e.get('min_thresh', 0.2)
        
        split_ratio = 0.85

        split = int(len(labels) * split_ratio)

        train_terms = labels[:split]
        
        searcher = TextCorpusSearcher(
            'data/train-medical.txt',
            train_terms,
            name
        )

        searchers.append((searcher, min_thresh, max_thresh, name))

    tagger = spacy.load('en_core_web_sm')

    def custom_ner_component(doc):
        new_entities = [ent for ent in doc.ents]
        for index, token in enumerate(doc):
            tag = tagger(token.text)[0].pos_
            if token.ent_type != 0 or tag != 'NOUN':
                continue
            maxLabel = ""
            
            maxScore = 0
            maxLabel = ''

            print('Searching for', token)
            for searcher, min_thresh, max_thresh, label in searchers:
                scores = searcher.get_score(token.text)
                currMax = max(scores)

                if min(scores) < min_thresh or currMax < max_thresh:
                    continue
                
                print('Passed for', label)
                print(scores)

                s = mean([currMax, min(scores)])
                
                if s > maxScore:
                    maxLabel = label
                    maxScore = s
            
            if maxScore > 0:
                print("Adding tag for ", token, maxLabel, "with score", maxScore)
                new_entities.append(Span(doc, index, index + 1, label=maxLabel))
        
        doc.ents = new_entities
        return doc
    
    return custom_ner_component

In [400]:
from spacy import Language

@Language.factory(name='embed_ner2', default_config={})
def create_embedding_component(nlp, name, config_file):
    return create_emb_ner(config_file)

ValueError: [E004] Can't set up pipeline component: a factory for 'embed_ner2' already exists. Existing factory: <function create_embedding_component at 0x000001E0123620C0>. New factory: <function create_embedding_component at 0x000001E0134D7F60>

In [494]:
import spacy

nlp = spacy.blank("en")

In [495]:
import random

In [496]:
t = open("data/train-medical.txt")
tokens = word_tokenize(t.read())
t.close()

In [497]:
import spacy

In [498]:
t = spacy.load('en_core_web_sm')
a = t('pain')[0]
print(a.pos_)

NOUN


In [499]:
def test_entity(entity, max_thresh, min_thresh):
    labels = entity['labels']
    name = entity['name']

    split_ratio = 0.85

    split = int(len(labels) * split_ratio)

    train_terms = labels[:split]
    test_terms = labels[split:]
    
    searcher = TextCorpusSearcher(
        'data/train-medical.txt',
        train_terms,
        name
    )
    
    x_test = []
    y_test = []

    for w in test_terms:
        x_test.append(w)
        y_test.append(1)
    
    while len(y_test) < len(test_terms) * 2:
        w = random.choice(tokens)
        if w in stop_words or w in ',.:;' or w in labels:
            continue
        x_test.append(w)
        y_test.append(0)

    pred = []
    for w in x_test:
        scores = searcher.get_score(w)
        min_score = min(scores)
        max_score = max(scores)
        p = 1 if (min_score >= min_thresh and max_score >= max_thresh) else 0
        pred.append(p)

    return y_test, pred

In [500]:
config[1]

{'name': 'Condition',
 'labels': ['gallstones',
  'acute pancreatitis',
  'hepatic steatosis',
  'pancreatitis',
  'tachycardia',
  'palpitations',
  'arrhythmias',
  'myocardial infarction',
  'hypertension',
  'diabetes',
  'cardiomyopathy',
  'stroke']}

In [532]:
from sklearn.metrics import f1_score

f1_scores = []

for i in range(4):
    y_test, pred = test_entity(config[i], min_thresh=0.15, max_thresh=0.6)
    f1_scores.append(f1_score(y_test, pred))

In [533]:
mean(f1_scores)

0.725

In [503]:
pred

[0, 1, 0, 0, 0, 0]

## Without Embedding

In [534]:
from spacy import displacy

In [535]:
nlp = spacy.load("model/trained_model")

In [536]:
test_string = "I had chest pain and rash in my left leg, so I went to see the doctor. He ran a biopsy and some scans. Unfortunately, I got diagnozed with pancreatitis and he said I had a high chance of getting stroke. The doctor finally prescribed me aspirin and prednisone."

In [537]:
doc = nlp(test_string)

In [538]:
displacy.render(doc, style="ent", jupyter=True)

## Embedding NER test

In [539]:
nlp.add_pipe('embed_ner2', config={
    'config_file': 'ner_config.json'
})

<function __main__.create_emb_ner.<locals>.custom_ner_component(doc)>

## With Embedding NER

In [540]:
doc = nlp(test_string)

Searching for rash
Passed for Condition
[0.33655444, 0.40822273, 0.21276493, 0.3220555, 0.33729333, 0.21276493, 0.5105303, 0.49390528, 0.39802164, 0.46124947, 0.42895567, 0.3183581, 0.3007358]
Passed for Symptoms
[0.31940433, 0.45735604, 0.4099632, 0.37925756, 0.49390528, 0.51933193, 0.5020989, 0.26578057, 0.38611507, 0.8166873, 0.87939787, 0.99999994]
Adding tag for  rash Symptoms with score 0.6328902
Searching for leg
Searching for doctor
Searching for scans
Searching for chance
Searching for stroke
Passed for Drug
[0.36774987, 0.39049277, 0.35076165, 0.6310034, 0.4010282, 0.42588294, 0.3669644]
Passed for Condition
[0.24415085, 0.8427904, 0.7777064, 0.6175021, 0.63150436, 0.7777064, 0.32434508, 0.33704415, 0.8486899, 0.969083, 0.95720136, 0.34242904, 0.41118282]
Adding tag for  stroke Condition with score 0.6066169
Searching for doctor
Searching for prednisone
Passed for Drug
[0.46974435, 0.47893983, 0.99999994, 0.42986003, 0.6280245, 0.6434636, 0.44853935]
Passed for Condition
[0.6

In [541]:
displacy.render(doc, style='ent', jupyter=True)