https://keras.io/examples/nlp/ner_transformers/

In [123]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
     -------------------------------------- 365.7/365.7 kB 7.7 MB/s eta 0:00:00
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
     ---------------------------------------- 95.8/95.8 kB 5.3 MB/s eta 0:00:00
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
     -------------------------------------- 141.2/141.2 kB 4.2 MB/s eta 0:00:00
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp310-cp310-win_amd64.whl (555 kB)
     ------------------------------------- 555.1/555.1 kB 11.6 MB/s eta 0:00:00
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
     -------------------------------------- 101.5/101.5 kB 5.7 MB/s eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.0.0-cp310-cp310-win_amd64.whl (29 kB)
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting


[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [124]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [125]:
import os

In [126]:
import tensorflow as tf

In [134]:
from tensorflow import keras
from tensorflow.keras import layers

In [128]:
import numpy as np

In [130]:
from datasets import load_dataset

In [131]:
from collections import Counter

In [132]:
from conlleval import evaluate

# transformer

In [135]:
class TransformerBlock(
    layers.Layer,
    ):
    
    def __init__(
        self,
        embed_dim,
        num_heads,
        ff_dim,
        rate = 0.1,
        ):
        
        super(TransformerBlock, self).__init__()
        
        self.att = layers.MultiHeadAttention(
            num_heads = num_heads,
            key_dim = embed_dim,
            )
        
        self.ffn = keras.Sequential(
            [
                layers.Dense(
                    ff_dim,
                    activation="relu",
                ),
                layers.Dense(
                    ff_dim,
                ),                
            ]
        )
        
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(
        self,
        inputs,
        training,
        ):
        
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training = training)      
        
        out1 = self.layernorm1(inputs + attn_output)
        #print(out1.shape)
        
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(
            ffn_output,
            training=training
        )       
        
                
        return self.layernorm2(out1+ffn_output)

In [136]:
class TokenAndPositionEmbedding(
    layers.Layer,
    ):
    
    def __init__(
        self,
        maxlen,
        vocab_size,
        embed_dim,
        ):
        super(TokenAndPositionEmbedding, self).__init__()
        
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            )
        
        self.pos_emb = layers.Embedding(
            input_dim=maxlen,
            output_dim=embed_dim,
            )
    
    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(
            start = 0, 
            limit = maxlen,
            delta = 1,
            )
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x+positions

In [150]:
class NERModel(
    keras.Model
    ):
    def __init__(
        self,
        num_tags,
        vocab_size,
        maxlen = 128,
        embed_dim = 32,
        num_heads = 2,
        ff_dim = 32,
        ):
        super(NERModel, self).__init__()
        
        self.embedding_layer = TokenAndPositionEmbedding(
            maxlen,
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            )
        
        self.transformer_block = TransformerBlock(
            embed_dim=embed_dim,
            num_heads=num_heads,
            ff_dim=ff_dim,
            )
        
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation='relu')
        
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(
            num_tags,
            activation = 'softmax'
            )
    
    def call(
        self,
        inputs,
        training = False,
        ):
        
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training = training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        
        return x        

In [151]:
m = NERModel(
    num_tags = 2,
    vocab_size = 1000,
    )

In [155]:
x = np.random.randint(1000, size = (100,128))

In [157]:
y = m.predict(x)



In [158]:
y.shape

(100, 128, 2)

In [160]:
m.summary()

Model: "ner_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddin  multiple                 36096     
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_2 (Transf  multiple                 10656     
 ormerBlock)                                                     
                                                                 
 dropout_9 (Dropout)         multiple                  0         
                                                                 
 dense_16 (Dense)            multiple                  1056      
                                                                 
 dropout_10 (Dropout)        multiple                  0         
                                                       

# load the data

In [161]:
conll_data = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to C:\Users\jimwa\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to C:\Users\jimwa\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [162]:
conll_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [171]:
for r in conll_data['train']:
    print(r['tokens'])    
    print(r['ner_tags'])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]
['Peter', 'Blackburn']
[1, 2]
['BRUSSELS', '1996-08-22']
[5, 0]
['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.']
[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0]
['"', 'We', 'do', "n't", 'support', 'any', 'such', 'recomme

['OAKLAND', '62', '68', '.477', '13']
[3, 0, 0, 0, 0]
['CALIFORNIA', '59', '68', '.465', '14', '1/2']
[3, 0, 0, 0, 0, 0]
['FRIDAY', ',', 'AUGUST', '23', 'SCHEDULE']
[0, 0, 0, 0, 0]
['SEATTLE', 'AT', 'BOSTON']
[3, 0, 5]
['MILWAUKEE', 'AT', 'CLEVELAND']
[3, 0, 5]
['CALIFORNIA', 'AT', 'BALTIMORE']
[3, 0, 5]
['OAKLAND', 'AT', 'NEW', 'YORK']
[3, 0, 5, 6]
['TORONTO', 'AT', 'CHICAGO']
[3, 0, 5]
['DETROIT', 'AT', 'KANSAS', 'CITY']
[3, 0, 5, 6]
['TEXAS', 'AT', 'MINNESOTA']
[3, 0, 5]
['NATIONAL', 'LEAGUE']
[7, 8]
['EASTERN', 'DIVISION']
[7, 8]
['W', 'L', 'PCT', 'GB']
[0, 0, 0, 0]
['ATLANTA', '79', '47', '.627', '-']
[3, 0, 0, 0, 0]
['MONTREAL', '68', '58', '.540', '11']
[3, 0, 0, 0, 0]
['NEW', 'YORK', '59', '69', '.461', '21']
[3, 4, 0, 0, 0, 0]
['FLORIDA', '58', '69', '.457', '21', '1/2']
[3, 0, 0, 0, 0, 0]
['PHILADELPHIA', '52', '76', '.406', '28']
[3, 0, 0, 0, 0]
['CENTRAL', 'DIVISION']
[7, 8]
['HOUSTON', '68', '60', '.531', '-']
[3, 0, 0, 0, 0]
['ST', 'LOUIS', '67', '60', '.528', '1/2']
[3, 

['Attendance', '16,000', '.']
[0, 0, 0]
['Nice', '1', '(', 'Debbah', '39th', ')', 'Bastia', '1', '(', 'Drobnjak', '82nd', ')', '.']
[3, 0, 0, 1, 0, 0, 3, 0, 0, 1, 0, 0, 0]
['1-0', '.']
[0, 0]
['6,000', '.']
[0, 0]
['Lille', '3', '(', 'Boutoille', '47th', ',', 'Becanovic', '79th', 'pen', ',', '82nd', ')', ')', 'Rennes', '1']
[3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0]
['(', "Guivarc'h", '60th', 'pen', '.']
[0, 1, 0, 0, 0]
[')', '0-0', '.']
[0, 0, 0]
['6,000', '.']
[0, 0]
['Bordeaux', '0', 'Auxerre', '0', '.']
[3, 0, 3, 0, 0]
['30,000', '.']
[0, 0]
['Marseille', '1', '(', 'Gravelaine', '24th', ')', 'Metz', '2', '(', 'Traore', '65th', ',', 'Bombarda']
[3, 0, 0, 1, 0, 0, 3, 0, 0, 1, 0, 0, 1]
['69th', ')', '.']
[0, 0, 0]
['1-0', '.']
[0, 0]
['20,000', '.']
[0, 0]
['Strasbourg', '1', '(', 'Zitelli', '80th', ')', 'Le', 'Havre', '0', '.']
[3, 0, 0, 1, 0, 0, 3, 4, 0, 0]
['0-0', '.']
[0, 0]
['15,000']
[0]
['Caen', '1', '(', 'Bancarel', '70th', ')', 'Lyon', '1', '(', 'Caveglia', '89th', ')', '.

['Weinstein', 'was', 'found', 'dead', 'last', 'weekend', 'alongside', 'the', 'bodies', 'of', 'eight-year-olds', 'Julie', 'Lejeune', 'and', 'Melissa', 'Russo', 'in', 'a', 'house', 'belonging', 'to', 'Detroux', ',', 'who', 'said', 'they', 'starved', 'to', 'death', 'earlier', 'this', 'year', ',', 'nine', 'months', 'after', 'being', 'abducted', 'in', 'June', '1995', '.']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Two', 'other', 'girls', 'have', 'been', 'rescued', 'and', 'police', 'are', 'hunting', 'for', 'at', 'least', 'two', 'more', 'who', 'Dutroux', 'has', 'admitted', 'kidnapping', 'a', 'year', 'ago', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
['"', 'Dutroux', 'has', 'admitted', 'killing', 'Weinstein', 'after', 'a', 'disagreement', 'between', 'the', 'accomplices', 'in', 'an', 'affair', 'of', 'truck', 'theft', ',', '"', 'Bourlet', 'said', '.']
[0, 1, 0, 0, 0, 1, 0, 0, 0

['SQUASH', '-', 'HONG', 'KONG', 'OPEN', 'FIRST', 'ROUND', 'RESULTS', '.']
[0, 0, 7, 8, 8, 0, 0, 0, 0]
['HONG', 'KONG', '1996-08-27']
[5, 6, 0]
['First', 'round', 'results', 'in', 'the', 'Hong']
[0, 0, 0, 0, 0, 7]
['Kong', 'Open', 'squash', 'tournament', 'on', 'Tuesday', '(', 'prefix', 'denotes', 'seeding', ')', ':']
[7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['1', '-', 'Jansher', 'Khan', '(', 'Pakistnn', ')', 'beat', 'Jackie', 'Lee', '(', 'Hong', 'Kong', ')', '15-8', '15-8']
[0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, 5, 6, 0, 0, 0]
['15-6']
[0]
['3', '-', 'Brett', 'Martin', '(', 'Australia', ')', 'beat', 'David', 'Evans', '(', 'Wales', ')', '14-17', '15-1']
[0, 0, 1, 2, 0, 5, 0, 0, 1, 2, 0, 5, 0, 0, 0]
['13-15', '17-14', '15-12']
[0, 0, 0]
['Mark', 'Cairns', '(', 'England', ')', 'beat', '6', '-', 'Del', 'Harris', '(', 'England', ')', '15-12', '7-15']
[1, 2, 0, 5, 0, 0, 0, 0, 1, 2, 0, 5, 0, 0, 0]
['15-6', '15-12']
[0, 0]
['Anthony', 'Hill', '(', 'Australia', ')', 'beat', '8', '-', 'Mark', 'Chaloner', 

['Major', 'League', 'Baseball']
[7, 8, 8]
['standings', 'after', 'games', 'played', 'on', 'Tuesday', '(', 'tabulate', 'under', 'won', ',']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['lost', ',', 'winning', 'percentage', 'and', 'games', 'behind', ')', ':']
[0, 0, 0, 0, 0, 0, 0, 0, 0]
['AMERICAN', 'LEAGUE']
[7, 8]
['EASTERN', 'DIVISION']
[7, 8]
['W', 'L', 'PCT', 'GB']
[0, 0, 0, 0]
['NEW', 'YORK', '74', '57', '.565', '-']
[3, 4, 0, 0, 0, 0]
['BALTIMORE', '70', '61', '.534', '4']
[3, 0, 0, 0, 0]
['BOSTON', '68', '65', '.511', '7']
[3, 0, 0, 0, 0]
['TORONTO', '62', '71', '.466', '13']
[3, 0, 0, 0, 0]
['DETROIT', '47', '85', '.356', '27', '1/2']
[3, 0, 0, 0, 0, 0]
['CENTRAL', 'DIVISION']
[7, 8]
['CLEVELAND', '79', '53', '.598', '-']
[3, 0, 0, 0, 0]
['CHICAGO', '70', '64', '.522', '10']
[3, 0, 0, 0, 0]
['MINNESOTA', '66', '66', '.500', '13']
[3, 0, 0, 0, 0]
['MILWAUKEE', '64', '69', '.481', '15', '1/2']
[3, 0, 0, 0, 0, 0]
['KANSAS', 'CITY', '60', '73', '.451', '19', '1/2']
[3, 4, 0, 0, 0, 0, 0]
['WES

['Summary', 'of', 'Dutch', 'first', 'division', 'soccer', 'match', 'played', 'on', 'Thursday', ':']
[0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0]
['NAC', 'Breda', '1', '(', 'Abdellaoui', '20th', 'penalty', ')', 'NEC', 'Nijmegen', '1', '(', 'Graef', '36th', ')', '.']
[3, 4, 0, 0, 1, 0, 0, 0, 3, 4, 0, 0, 1, 0, 0, 0]
['Halftime', '1-1', '.']
[0, 0, 0]
['Attendance', '10,760', '.']
[0, 0, 0]
['SOCCER', '-', 'DUTCH', 'FIRST', 'DIVISION', 'RESULTS', '/', 'STANDINGS', '.']
[0, 0, 7, 0, 0, 0, 0, 0, 0]
['AMSTERDAM', '1996-08-29']
[5, 0]
['Result', 'of', 'a', 'Dutch', 'first']
[0, 0, 0, 7, 0]
['division', 'soccer', 'match', 'played', 'on', 'Thursday', ':']
[0, 0, 0, 0, 0, 0, 0]
['NAC', 'Breda', '1', 'NEC', 'Nijmegen', '1']
[3, 4, 0, 3, 4, 0]
['Played', 'on', 'Wednesday', ':']
[0, 0, 0, 0]
['Vitesse', 'Arnhem', '1', 'Sparta', 'Rotterdam', '1']
[3, 4, 0, 3, 4, 0]
['Utrecht', '0', 'Twente', 'Enschede', '0']
[3, 0, 3, 4, 0]
['Groningen', '1', 'Roda', 'JC', 'Kerkrade', '1']
[3, 0, 3, 4, 4, 0]
['Feyenoord', '2', 

In [172]:
def export_to_file(
    export_file_path,
    data,
    ):
    
    with open(export_file_path, 'w') as f:
        for record in data:
            ner_tags = record['ner_tags']
            tokens = record['tokens']
            
            if len(tokens) > 0:
                f.write(
                    str(len(tokens))
                    +'\t'
                    +'\t'.join(tokens)
                    +'\t'
                    +'\t'.join(map(str, ner_tags))
                    +'\n'
                )
    
    return None

In [175]:
os.mkdir('C:\\data\\upwork1\\temp\\data')

In [176]:
export_to_file(
    'C:\\data\\upwork1\\temp\\data\\conll_train.txt',
    conll_data['train'],    
    )

In [177]:
export_to_file(
    'C:\\data\\upwork1\\temp\\data\\conll_val.txt',
    conll_data['validation'],    
    )

In [179]:
def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    ner_labels = ["PER", "ORG", "LOC", "MISC"]
    all_labels = [(label1, label2) 
        for label2 in ner_labels
        for label1 in iob_labels
        ]
    all_labels = ["-".join([a,b]) for a,b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    
    return dict(zip(range(0, len(all_labels)+1), all_labels))

In [180]:
mapping = make_tag_lookup_table()

In [181]:
print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}


# end