In [28]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 3.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip available: 22.3.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
import spacy

NER = spacy.load("en_core_web_sm")

In [30]:
raw_text="The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

In [31]:
text1= NER(raw_text)

In [32]:
for word in text1.ents:
    print(word.text,word.label_)

The Indian Space Research Organisation ORG
India GPE
Bengaluru GPE
Department of Space ORG
India GPE
ISRO ORG
DOS ORG


In [39]:
# NER Annotator: https://tecoholic.github.io/ner-annotator/

In [42]:
import spacy
from spacy.tokens import Doc, DocBin
import json

nlp = spacy.blank("en")

doc_bin = DocBin()

with open("data/annotations.json", "r") as file:
    data = json.load(file)
classes = data['classes']
annotations = data['annotations']
for annotation in annotations:
    text = annotation[0]
    entities = annotation[1]["entities"]

    doc = nlp(text)

    ents = []
    for start, end, label in entities:
        ents.append((start, end, label))

    doc.ents = [doc.char_span(start, end, label=label) for start, end, label in ents]

    doc_bin.add(doc)

doc_bin.to_disk("model/training_data.spacy")

In [43]:
classes = ('PERSON', 'PLACE', 'SERVICE', 'TIME', 'DATE')

In [45]:
import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

nlp = spacy.blank("en")

ner = nlp.add_pipe("ner")

for label in classes:
    ner.add_label(label)

doc_bin = DocBin().from_disk("model/training_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))

nlp.begin_training()

for epoch in range(50):
    losses = {}
    random.shuffle(docs)
    for batch in minibatch(docs, size=8):
        for doc in batch:
            example = Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
            nlp.update([example], drop=0.5, losses=losses)
    print(f"Epoch {epoch + 1}, Losses: {losses}")

nlp.to_disk("model/trained_model")

Epoch 1, Losses: {'ner': 64.92905551195145}
Epoch 2, Losses: {'ner': 62.88421410322189}
Epoch 3, Losses: {'ner': 57.87563592195511}
Epoch 4, Losses: {'ner': 52.684956789016724}
Epoch 5, Losses: {'ner': 41.17852871119976}
Epoch 6, Losses: {'ner': 33.18318738695234}
Epoch 7, Losses: {'ner': 29.434816773689818}
Epoch 8, Losses: {'ner': 27.31542748703214}
Epoch 9, Losses: {'ner': 29.216141655784668}
Epoch 10, Losses: {'ner': 25.985767440293785}
Epoch 11, Losses: {'ner': 28.69254120306141}
Epoch 12, Losses: {'ner': 26.566457065397117}
Epoch 13, Losses: {'ner': 23.863056193607918}
Epoch 14, Losses: {'ner': 23.871349419852777}
Epoch 15, Losses: {'ner': 21.741570208774647}
Epoch 16, Losses: {'ner': 22.87053915472643}
Epoch 17, Losses: {'ner': 20.01375056053439}
Epoch 18, Losses: {'ner': 19.312953634012956}
Epoch 19, Losses: {'ner': 17.25393970157893}
Epoch 20, Losses: {'ner': 17.643029863440916}
Epoch 21, Losses: {'ner': 17.246964328204456}
Epoch 22, Losses: {'ner': 17.700293172580803}
Epoch 2

In [46]:
import numpy as np
from numpy.linalg import norm

In [47]:
from gensim.models import Word2Vec

In [14]:
import nltk
from nltk import word_tokenize, sent_tokenize

In [49]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [2]:
def cosine_similarity(v1, v2):
  return np.dot(v1, v2) / (norm(v1) * norm(v2))

In [3]:
from text_corpus import TextCorpusSearcher

In [15]:
place_search = TextCorpusSearcher(
    "data/train-place.txt",
    ["chennai", "bangalore"],
    "PLACE"
)

In [16]:
place_search.get_score("jodhpur")

(0.5104972, 'PLACE')

In [17]:
service_search = TextCorpusSearcher(
    "data/train-service.txt",
    ["cut", "shave"],
    "SERVICE"
)

In [18]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vasan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
from nltk import pos_tag

In [21]:
from spacy.language import Language
from spacy.tokens import Span

datasets = [
    place_search,
    service_search,
]

@Language.component("custom_ner")
def custom_ner_component(doc):
    confidence = 0.5
    new_entities = [ent for ent in doc.ents]
    for index, token in enumerate(doc):
        tag = pos_tag([token.text])[0][1]
        if token.ent_type != 0 or tag != 'NN':
          continue
        maxLabel = ""
        maxScore = 0
        for searcher in datasets:
          score, label = searcher.get_score(token.text)
          if score > confidence and score > maxScore:
            maxLabel = label
            maxScore = score
        if maxScore > 0:
          print("Adding tag for ", token, maxLabel, "with score", maxScore)
          new_entities.append(Span(doc, index, index + 1, label=maxLabel))
    doc.ents = new_entities
    return doc

In [22]:
import spacy

nlp = spacy.blank("en")

In [23]:
nlp.add_pipe("custom_ner")

<function __main__.custom_ner_component(doc)>

In [24]:
doc = nlp("I want to go to Jodhpur and get a hair trim at 7pm")

Adding tag for  Jodhpur PLACE with score 0.5104972


In [25]:
for ent in doc.ents:
  print(ent)

Jodhpur


In [26]:
from spacy import displacy

In [28]:
service_search.get_score("trim")

(0.057224885, 'SERVICE')

In [27]:
displacy.render(doc, style="ent", jupyter=True)

In [503]:
class TextCorpusSearcher:
    def __init__(self, filename, x, label):
        self.label = label
        self.x = tuple(word.lower() for word in x)
        text = self.get_text(filename)
        sentences = []
        for sent in sent_tokenize(text.lower()):
            s = []
            for word in word_tokenize(sent):
                if word in stop_words or word in ',:;.':
                    continue
                s.append(word)
            sentences.append(s)

        self.model = Word2Vec(sentences, vector_size=24, window=3, min_count=1, sg=1)

        for w in self.x:
            if w not in self.model.wv:
                print("[WARN]", w, "missing in Word2Vec training data")

    def get_text(self, filename):
        with open(filename) as f:
            return f.read() 
    
    def get_embed(self, word):
        return self.model.wv[word]

    def get_score(self, word):
        max_score = 0
        global curr_model
        word = word.lower()
        if not pos_tag([word])[0][1].startswith('NN') or word not in self.model.wv:
            return 0, self.label
        curr_model = self.model
        for w in self.x:
            score = cosine_similarity(self.model.wv[word], self.model.wv[w])
            max_score = max(max_score, score)
        return max_score, self.label

In [504]:
cities = ('Chail', 'Majuli', 'Malana', 'Mawlynnong', 'Gavi', 'Diskit', 'Landour', 'Idukki', 'Mandawa', 'Delhi', 'Mumbai', 'Bangalore', 'Kolkata', 'Chennai', 'Hyderabad', 'Jaipur', 'Pune', 'Ahmedabad', 'Varanasi')

In [505]:
len(cities)

19

In [506]:
import random

In [507]:
input_labels = random.choices(cities, k=12)
expected = [city for city in cities if city not in input_labels]

In [508]:
expected

['Chail',
 'Malana',
 'Mawlynnong',
 'Gavi',
 'Diskit',
 'Delhi',
 'Kolkata',
 'Hyderabad',
 'Pune',
 'Ahmedabad']

In [509]:
t = open("data/train-cities.txt")
tokens = word_tokenize(t.read())

In [510]:
garbage = []
while len(garbage) < len(expected):
    w = random.choice(tokens)
    if w in stop_words or w in ',.:;':
        continue
    garbage.append(w)
t.close()

In [512]:
test_data = [
    (word, 1) for word in expected 
]

test_data.extend(
    [
        (word, 0) for word in garbage
    ]
)

In [513]:
ner = TextCorpusSearcher(filename="data/train-cities.txt", x=input_labels, label="PLACE")

In [514]:
y_test = []
y_pred = []

for word, is_expected in test_data:
    y_test.append(is_expected)
    y_pred.append(1 if ner.get_score(word.lower())[0] > 0.3 else 0)

In [515]:
y_pred

[1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [516]:
from sklearn.metrics import accuracy_score, classification_report

In [517]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86        10
           1       0.89      0.80      0.84        10

    accuracy                           0.85        20
   macro avg       0.85      0.85      0.85        20
weighted avg       0.85      0.85      0.85        20



In [518]:
accuracy_score(y_test, y_pred)

0.85

In [553]:
from spacy.language import Language
from spacy.tokens import Span

datasets = [
  ner,
]

@Language.component("custom_ner")
def custom_ner_component(doc):
    confidence = 0.3
    new_entities = [ent for ent in doc.ents]
    for index, token in enumerate(doc):
        tag = pos_tag([token.text])[0][1]
        if token.ent_type != 0 or tag != 'NN':
          continue
        maxLabel = ""
        maxScore = 0
        for searcher in datasets:
          score, label = searcher.get_score(token.text)
          if score > confidence and score > maxScore:
            maxLabel = label
            maxScore = score
        if maxScore > 0:
          print("Adding tag for ", token, maxLabel, "with score", maxScore)
          new_entities.append(Span(doc, index, index + 1, label=maxLabel))
    doc.ents = new_entities
    return doc

In [554]:
nlp = spacy.blank("en")
nlp.add_pipe("custom_ner")

<function __main__.custom_ner_component(doc)>

In [560]:
doc = nlp("I want to visit Mawlynnong sometime")

Adding tag for  Mawlynnong PLACE with score 0.33339307


In [561]:
displacy.render(doc, style="ent", jupyter=True)