In [337]:
import spacy
import numpy as np
from spacy.training import Example

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [4]:
doc = nlp('Tesla Inc is going to acquire. Twitter Inc for $44 billion ')

In [5]:
for sent in doc.sents:
#     print(token.text, "--->", token.pos_)
#     print(sent.text, token.dep_, token.head.text)
    print(sent.text)

Tesla Inc is going to acquire.
Twitter Inc for $44 billion


In [6]:
for ent in doc.ents:
    print(ent.text, '|', ent.label_,"|", spacy.explain(ent.label_))
    

Tesla Inc | ORG | Companies, agencies, institutions, etc.
Twitter Inc | ORG | Companies, agencies, institutions, etc.
$44 billion | MONEY | Monetary values, including unit


In [7]:
from spacy import displacy
displacy.render(doc, style = 'ent')


In [8]:
document = """Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. 
            The company's name is a tribute to inventor and electrical engineer Nikola Tesla. 
            In February 2004 Elon Musk joined as the company's largest shareholder and in 2008 he was named CEO. 
            In 2008, the company began production of its first car model, the Roadster sports car, 
            followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, 
            the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. 
            The Model 3 is the all-time bestselling plug-in electric car worldwide, and in June 2021 became the 
            first electric car to sell 1 million units globally. In 2023, the Model Y was the best-selling 
            vehicle, of any kind, globally."""


In [9]:
document2 = """
            In the Himalayas, Mount Everest stands as a symbol of human achievement, 
            attracting climbers from all over the world. Last year, on January 1st, a team of ten experienced 
            mountaineers embarked on a journey to conquer the summit. They set off at 5:00 AM, braving the harsh 
            weather conditions and extreme altitudes. After weeks of arduous climbing, they reached the top, 
            achieving a remarkable feat. Their success rate was an impressive 80%, with eight out of ten climbers 
            reaching the peak. This accomplishment marked a significant milestone in mountaineering history and 
            inspired many aspiring adventurers to pursue their dreams.

            """

In [10]:
ddd = nlp(document2)
displacy.render(ddd, style = 'ent')


## NER on CoNLL dataset

In [11]:
import spacy

In [12]:
nlp = spacy.load('en_core_web_sm')

In [13]:
from datasets import load_dataset

# Load the CoNLL 2003 NER dataset
dataset = load_dataset("conll2003")


Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /Users/jyotikumari/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /Users/jyotikumari/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [15]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [35]:
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [237]:
token_ner_tuples = [(token, ner_tag) for token, ner_tag in zip(dataset["train"]["tokens"], dataset["train"]["ner_tags"])]

token_ner_tuples[0]


(['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 [3, 0, 7, 0, 0, 0, 7, 0, 0])

In [240]:
def create_token_ner_tuples(data):
    
    filtered_data = [(tokens, ner_tags) for tokens, ner_tags in data if len(tokens) == len(ner_tags)]
    
    result = [list(zip(tokens, ner_tags)) for tokens, ner_tags in filtered_data]
    
    return result


output = create_token_ner_tuples(token_ner_tuples)
output[0]

[('EU', 3),
 ('rejects', 0),
 ('German', 7),
 ('call', 0),
 ('to', 0),
 ('boycott', 0),
 ('British', 7),
 ('lamb', 0),
 ('.', 0)]

In [204]:
ner_mapping =  { 

 0: 'O', 
 1: 'B-PER', 
 2: 'I-PER', 
 3:  'B-ORG', 
 4: 'I-ORG', 
 5: 'B-LOC', 
 6: 'I-LOC', 
 7: 'B-MISC', 
 8: 'I-MISC', 
 9: 'PAD', 
 10: 'UNK'
}


### inference using spacy pre-trained model

In [241]:
text = ' '.join([x[0] for x in output[1111]])
#print(text)
#print(conll_data[1111])
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))


Wednesday DATE Absolute or relative dates or periods
U.S. GPE Countries, cities, states
first ORDINAL "first", "second", etc.
third ORDINAL "first", "second", etc.


In [243]:
output[0:2]

[[('EU', 3),
  ('rejects', 0),
  ('German', 7),
  ('call', 0),
  ('to', 0),
  ('boycott', 0),
  ('British', 7),
  ('lamb', 0),
  ('.', 0)],
 [('Peter', 1), ('Blackburn', 2)]]

# custom trained on CoNLL dataset

In [280]:
X=[" ".join(x) for x in dataset['train'][0:]["tokens"]]

X[0]

'EU rejects German call to boycott British lamb .'

In [281]:
def extract_char_numbering_and_tag(tokens_with_tags):
    # Initialize variables to keep track of character indices
    start_index = 0
    end_index = 0

    # Initialize an empty list to store the (start, end, tag) tuples
    extracted_data = []

    # Iterate through the list of tuples
    for token, tag in tokens_with_tags:
        # Calculate the end index for the current token
        end_index = start_index + len(token)

        # Append the (start, end, tag) tuple to the list if tag is non-zero
        if tag != 0:
            extracted_data.append((start_index, end_index, tag))

        # Update the start index for the next token (including space)
        start_index = end_index + 1

    return extracted_data

final = []

for tokens_with_tags in output:
    
    extracted_data = extract_char_numbering_and_tag(tokens_with_tags)
    
    final.append(extracted_data)
    
final[0:2]

[[(0, 2, 3), (11, 17, 7), (34, 41, 7)], [(0, 5, 1), (6, 15, 2)]]

In [282]:
y = [[(start, end, ner_mapping[tag]) for start, end, tag in sublist] for sublist in final]

y[0:4]

[[(0, 2, 'B-ORG'), (11, 17, 'B-MISC'), (34, 41, 'B-MISC')],
 [(0, 5, 'B-PER'), (6, 15, 'I-PER')],
 [(0, 8, 'B-LOC')],
 [(4, 12, 'B-ORG'),
  (13, 23, 'I-ORG'),
  (59, 65, 'B-MISC'),
  (94, 101, 'B-MISC')]]

In [283]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [328]:
nlp = spacy.blank('en')
print(nlp.pipe_names)

[]


In [329]:
ner = nlp.add_pipe('ner')
print(nlp.pipe_names)

['ner']


In [331]:
unique_labels = set()
non_empty_y = [l for l in y_train if len(l)!= 0]

for annotations in non_empty_y:
    for entity in annotations:
        #print(entity)
        unique_labels.add(entity[2])
        
unique_labels = list(unique_labels)
unique_labels

['B-ORG', 'B-MISC', 'B-LOC', 'B-PER', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC']

In [330]:
for label in unique_labels:
    ner.add_label(label)
    
ner.labels

('B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER')

In [338]:
# Initialize a list to store Example objects
train_data = []

# Iterate over each pair of text and annotations
for text, annotations in zip(X_train, y_train):
    # Create Doc object using make_doc method
    doc = nlp.make_doc(text)
    
    # Create Example object from Doc and annotations
    example = Example.from_dict(doc, {"entities": annotations})
    
    # Append Example object to the list
    train_data.append(example)

# Print the list of Example objects
print(train_data)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [339]:
train_data[8]

{'doc_annotation': {'cats': {}, 'entities': ['U-B-LOC', 'O', 'O', 'O', 'O', 'O'], 'spans': {}, 'links': {}}, 'token_annotation': {'ORTH': ['PARIS', '1996', '-', '08', '-', '24'], 'SPACY': [True, False, False, False, False, False], 'TAG': ['', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', ''], 'POS': ['', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5], 'DEP': ['', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0]}}

In [340]:
# Train the NER model
import time
start= time.time()
import random

nlp.begin_training()
for itn in range(20):  # Adjust the number of iterations as needed
    random.shuffle(train_data)
    losses = {}
    for example in train_data:
        nlp.update([example], losses=losses)

    print("Losses:", losses)
print(time.time()-start)


Losses: {'ner': 13269.067364351467}
Losses: {'ner': 7228.719209355482}
Losses: {'ner': 5218.623908071141}
Losses: {'ner': 4235.147827521933}
Losses: {'ner': 3527.334686539278}
Losses: {'ner': 3169.6016198390485}
Losses: {'ner': 2772.917481338202}
Losses: {'ner': 2834.981582080788}
Losses: {'ner': 2514.9781875798863}
Losses: {'ner': 2346.2974665604615}
Losses: {'ner': 2122.8133694250478}
Losses: {'ner': 2203.4749957547197}
Losses: {'ner': 2203.4958050560695}
Losses: {'ner': 2070.3356065093394}
Losses: {'ner': 2076.971916894827}
Losses: {'ner': 2045.9435269179116}
Losses: {'ner': 1917.7357531976222}
Losses: {'ner': 1902.9205073020244}
Losses: {'ner': 1869.0906706415199}
Losses: {'ner': 1837.96275765335}
36993.486268758774


In [341]:
# Save the trained model to disk
nlp.to_disk("ner_model")

In [342]:
# Optionally, load the model from disk and test it
nlp_loaded = spacy.load("ner_model")
test_text = "Apple is hiring new engineers for its New York office."
doc = nlp_loaded(test_text)


In [343]:
for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.label_)

Apple 0 1 B-ORG
New 7 8 B-LOC
York 8 9 I-LOC


In [347]:
# Load the trained NER model
nlp_loaded = spacy.load("ner_model")

# Test data
test_data = [
    ("Apple is hiring new engineers for its New York office.", {"entities": [(0, 5, "ORG"), (44, 53, "LOC")]}),
    # Add more test examples as needed
]

# Initialize counters for evaluation metrics
true_positives = 0
false_positives = 0
false_negatives = 0

# Iterate over the test data
for text, annotations in zip(X_test, y_test):
    # Process the text with the loaded NER model
    doc = nlp_loaded(text)
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
    true_entities = [(text[start:end], label) for start, end, label in annotations]

    # Compare predicted entities with true entities and update evaluation metrics
    for entity in predicted_entities:
        if entity in true_entities:
            true_positives += 1
        else:
            false_positives += 1

    for entity in true_entities:
        if entity not in predicted_entities:
            false_negatives += 1
            

In [459]:
# # Calculate precision, recall, and F1 score
# precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
# recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
# f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1_score)


Precision: 0.8774243599689682
Recall: 0.8619112940100594
F1 Score: 0.8695986467784099


In [348]:
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.8942731277533039
Recall: 0.888737922705314
F1 Score: 0.8914969334443856
