In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/ner

/content/drive/MyDrive/ner


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import time
import random
import pandas as pd
import torch.nn.functional as F
from torchtext.legacy import data, datasets

In [4]:
def readfile(filename, *, encoding="UTF8"):
    '''
    read file
    return format :
    [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ]
    '''
    with open(filename, mode='r', encoding=encoding) as f:
        sentences = []
        sentence = []
        for line in f:
            if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":   
                if len(sentence) > 0:
                    sentences.append(sentence)
                    sentence = []
                continue
            splits = line.split(' ')
            sentence.append([splits[0], splits[-1]])

    if len(sentence) > 0:
        sentences.append(sentence)
        sentence = []
    return sentences

In [5]:
train_data = readfile('conll2003/train.txt')
valid_data = readfile('conll2003/valid.txt')
test_data = readfile('conll2003/test.txt')

In [6]:
print(len(train_data))
print(len(valid_data))
print(len(test_data))

14041
3250
3453


In [7]:
train_data[0]

[['EU', 'B-ORG\n'],
 ['rejects', 'O\n'],
 ['German', 'B-MISC\n'],
 ['call', 'O\n'],
 ['to', 'O\n'],
 ['boycott', 'O\n'],
 ['British', 'B-MISC\n'],
 ['lamb', 'O\n'],
 ['.', 'O\n']]

In [8]:
def clean_data(dataset):
    for sent in dataset:
        for word in sent:
            word[-1] = word[-1].strip('\n')
    return dataset

In [9]:
train_data = clean_data(train_data)
valid_data = clean_data(valid_data)
test_data = clean_data(test_data)

In [10]:
train_data[0]

[['EU', 'B-ORG'],
 ['rejects', 'O'],
 ['German', 'B-MISC'],
 ['call', 'O'],
 ['to', 'O'],
 ['boycott', 'O'],
 ['British', 'B-MISC'],
 ['lamb', 'O'],
 ['.', 'O']]

In [11]:
SEED = 0

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [12]:
def split_data(dataset):
    text = []
    label = []
    for sent in dataset:
        tokens = []
        tags = []
        for word in sent:
            tokens.append(word[0])
            tags.append(word[-1])
        text.append(' '.join(tokens))
        label.append(' '.join(tags))
    return text, label

In [13]:
train_text, train_label = split_data(train_data)
valid_text, valid_label = split_data(valid_data)
test_text, test_label = split_data(test_data)

In [14]:
train_df = pd.DataFrame({'text' : train_text, 'label' : train_label})
valid_df = pd.DataFrame({'text' : valid_text, 'label' : valid_label})
test_df = pd.DataFrame({'text' : test_text, 'label' : test_label})

train_df.head(5)

Unnamed: 0,text,label
0,EU rejects German call to boycott British lamb .,B-ORG O B-MISC O O O B-MISC O O
1,Peter Blackburn,B-PER I-PER
2,BRUSSELS 1996-08-22,B-LOC O
3,The European Commission said on Thursday it di...,O B-ORG I-ORG O O O O O O B-MISC O O O O O B-M...
4,Germany 's representative to the European Unio...,B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O ...


In [15]:
train_df.to_csv('data/train.csv')
valid_df.to_csv('data/valid.csv')
test_df.to_csv('data/test.csv')

In [16]:
TEXT = data.Field()
NESTING_CHAR = data.Field(tokenize=list)
CHAR = data.NestedField(NESTING_CHAR)
LABEL = data.Field(unk_token = None)
fields = ((None, None), (('text', 'char'), (TEXT, CHAR)), ('label', LABEL))

In [17]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'data',
                                        train = 'train.csv',
                                        validation = 'valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

In [18]:
vars(train_data[0])

{'char': [['E', 'U'],
  ['r', 'e', 'j', 'e', 'c', 't', 's'],
  ['G', 'e', 'r', 'm', 'a', 'n'],
  ['c', 'a', 'l', 'l'],
  ['t', 'o'],
  ['b', 'o', 'y', 'c', 'o', 't', 't'],
  ['B', 'r', 'i', 't', 'i', 's', 'h'],
  ['l', 'a', 'm', 'b'],
  ['.']],
 'label': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 'text': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.']}

In [19]:
MIN_FREQ = 2
TEXT.build_vocab(train_data,
                 min_freq = MIN_FREQ,
                 vectors = 'glove.6B.50d',
                 unk_init = torch.Tensor.normal_)
CHAR.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [20]:
vars(LABEL.vocab)

{'freqs': Counter({'B-LOC': 7140,
          'B-MISC': 3438,
          'B-ORG': 6321,
          'B-PER': 6600,
          'I-LOC': 1157,
          'I-MISC': 1155,
          'I-ORG': 3704,
          'I-PER': 4528,
          'O': 169578}),
 'itos': ['<pad>',
  'O',
  'B-LOC',
  'B-PER',
  'B-ORG',
  'I-PER',
  'I-ORG',
  'B-MISC',
  'I-LOC',
  'I-MISC'],
 'stoi': defaultdict(None,
             {'<pad>': 0,
              'B-LOC': 2,
              'B-MISC': 7,
              'B-ORG': 4,
              'B-PER': 3,
              'I-LOC': 8,
              'I-MISC': 9,
              'I-ORG': 6,
              'I-PER': 5,
              'O': 1}),
 'unk_index': None,
 'vectors': None}

In [21]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
                                        (train_data, valid_data, test_data),
                                        sort = False,
                                        batch_size = BATCH_SIZE,
                                        device = device
)

In [22]:
class BiLSTM_CNN(nn.Module):
    def __init__(self, 
                 word_vocab_size,
                 word_embedding_dim,
                 word_pad_idx,
                 char_vocab_size,
                 char_embedding_dim,
                 char_pad_idx, 
                 hidden_dim, 
                 output_dim, 
                 n_layers,
                 n_filter,
                 cnn_kernel_size, 
                 bidirectional, 
                 lstm_dropout,
                 cnn_dropout,
                 fc_dropout,
                 emb_dropout):
        super().__init__()
        self.word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim, padding_idx = word_pad_idx)   
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim, padding_idx = char_pad_idx)
        self.cnn = nn.Conv1d(in_channels = char_embedding_dim, 
                             out_channels = char_embedding_dim * n_filter, 
                             kernel_size = cnn_kernel_size, 
                             groups=char_embedding_dim) 
        self.lstm = nn.LSTM(word_embedding_dim + char_embedding_dim * n_filter,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = lstm_dropout if n_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.cnn_dropout = nn.Dropout(cnn_dropout)
        self.emb_dropout = nn.Dropout(emb_dropout)
          
    def count_params(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def forward(self, text, chars):
        #text = [sent, batch_size]
        #chars = [batch_size, sent, word]
        word_embedded = self.emb_dropout(self.word_embedding(text))
        #word_embedded = [sent, batch_size, word_embedding_dim]

        char_embedded = self.emb_dropout(self.char_embedding(chars))
        #char_embedded = [batch_size, sent, word, char_embedding]
        (batch_size, sent, word, char_emb_dim) = char_embedded.shape
        char_cnn_max_out = torch.zeros(batch_size, sent, self.cnn.out_channels).to(device)
        for sent_i in range(sent):
            # sent_char_emb = [batch size, word length, char emb dim]
            sent_char_emb = char_embedded[:, sent_i, :, :]  # get the character field of sent i
            # sent_char_emb_p = [batch size, char emb dim, word length]
            sent_char_emb_p = sent_char_emb.permute(0, 2, 1)  # the channel (char emb dim) has to be the last dimension
            # char_cnn_sent_out = [batch size, out channels * char emb dim, word length - kernel size + 1]
            char_cnn_sent_out = torch.tanh(self.cnn(sent_char_emb_p))
            # max pooling over the word length dimension
            char_cnn_max_out[:, sent_i, :] = F.max_pool1d(char_cnn_sent_out, char_cnn_sent_out.shape[2]).squeeze(2)
        char_cnn = self.cnn_dropout(char_cnn_max_out)
        # concat word and char embedding
        # char_cnn_p = [sentence length, batch size, char emb dim * num filter]
        char_cnn_p = char_cnn_max_out.permute(1, 0, 2)
        embedded = torch.cat((word_embedded, char_cnn_p), dim = 2)
        outputs, (hidden, cell) = self.lstm(embedded)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hid dim]
        pred = self.fc(self.fc_dropout(outputs))
        #pred = [sent, batch size, output dim]
        return pred

In [23]:
model = BiLSTM_CNN(
            word_vocab_size = len(TEXT.vocab),
            word_embedding_dim = 50,
            word_pad_idx = TEXT.vocab.stoi['<pad>'],
            char_vocab_size = len(CHAR.vocab),
            char_embedding_dim = 25,
            char_pad_idx = CHAR.vocab.stoi['<pad'], 
            hidden_dim = 200, 
            output_dim = len(LABEL.vocab), 
            n_layers = 2,
            n_filter = 5,
            cnn_kernel_size = 3, 
            bidirectional = True, 
            lstm_dropout = 0.25,
            cnn_dropout = 0.5,
            fc_dropout = 0.5,
            emb_dropout = 0.5
)

In [24]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean = 0, std = 0.1)
            
model.apply(init_weights)

BiLSTM_CNN(
  (word_embedding): Embedding(11984, 50, padding_idx=1)
  (char_embedding): Embedding(86, 25, padding_idx=0)
  (cnn): Conv1d(25, 125, kernel_size=(3,), stride=(1,), groups=25)
  (lstm): LSTM(175, 200, num_layers=2, dropout=0.25, bidirectional=True)
  (fc): Linear(in_features=400, out_features=10, bias=True)
  (fc_dropout): Dropout(p=0.5, inplace=False)
  (cnn_dropout): Dropout(p=0.5, inplace=False)
  (emb_dropout): Dropout(p=0.5, inplace=False)
)

In [25]:
print(f'The model has {model.count_params():,} trainable parameters')

The model has 2,172,260 trainable parameters


In [26]:
pretrained_embeddings = TEXT.vocab.vectors
model.word_embedding.weight.data.copy_(pretrained_embeddings)
model.word_embedding.weight.data[TEXT.vocab.stoi['<pad>']] = torch.zeros(50)
model.char_embedding.weight.data[CHAR.vocab.stoi['<pad>']] = torch.zeros(25)

In [27]:
optimizer = optim.Adam(model.parameters(), lr =  0.0105)
criterion = nn.CrossEntropyLoss(ignore_index = LABEL.vocab.stoi['<pad>'])
model.to(device)
criterion.to(device)

CrossEntropyLoss()

In [28]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / y[non_pad_elements].shape[0]

In [29]:
def train(model, iter, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.train()

    for batch in iter:
        text = batch.text
        chars = batch.char
        label = batch.label
         #text = [sent len, batch size]
         #char = [batch, sent, word]
        optimizer.zero_grad()
        pred = model(text, chars)
        #pred = [sent len, batch size, output dim]
        #label = [sent len, batch size]
        pred = pred.view(-1, pred.shape[-1])
        #pred = [sent len * batch_size, output dim]
        label = label.view(-1)
        #label = [sent len * batch_size]
        loss = criterion(pred, label)
        acc = categorical_accuracy(pred, label, tag_pad_idx)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iter), epoch_acc / len(iter)

In [30]:
def evaluate(model, iter, criterion, tag_pad_idx): 
    epoch_loss = 0
    epoch_acc = 0  
    model.eval()
    
    with torch.no_grad():
        for batch in iter:
            text = batch.text
            label = batch.label
            chars = batch.char
            
            pred = model(text, chars)
            
            pred = pred.view(-1, pred.shape[-1])
            label = label.view(-1)
            
            loss = criterion(pred, label)
            
            acc = categorical_accuracy(pred, label, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iter), epoch_acc / len(iter)

In [32]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs

In [33]:
EPOCH = 30
PATH = '/content/drive/MyDrive/ner/bilstm_cnn.pt'
best_valid_loss = float('inf')

In [None]:
for epoch in range(EPOCH):
    start = time.time()
    train_loss, train_acc = train(model, train_iter, optimizer, criterion, LABEL.vocab.stoi['<pad>'])
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion, LABEL.vocab.stoi['<pad>'])
    end = time.time()
    epoch_mins, epoch_secs = epoch_time(start, end)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save({
            'epoch' : epoch,
            'model_state_dict' : model.state_dict(),
            'optimizer_state_dict' : optimizer.state_dict(),
            'loss' : best_valid_loss
        }, PATH)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 29s
	Train Loss: 0.420 | Train Acc: 87.36%
	 Val. Loss: 0.196 |  Val. Acc: 93.57%
Epoch: 02 | Epoch Time: 0m 29s
	Train Loss: 0.174 | Train Acc: 94.34%
	 Val. Loss: 0.119 |  Val. Acc: 96.36%
Epoch: 03 | Epoch Time: 0m 29s
	Train Loss: 0.124 | Train Acc: 96.09%
	 Val. Loss: 0.095 |  Val. Acc: 97.10%
Epoch: 04 | Epoch Time: 0m 28s
	Train Loss: 0.095 | Train Acc: 96.98%
	 Val. Loss: 0.089 |  Val. Acc: 97.22%
Epoch: 05 | Epoch Time: 0m 29s
	Train Loss: 0.077 | Train Acc: 97.58%
	 Val. Loss: 0.087 |  Val. Acc: 97.37%
Epoch: 06 | Epoch Time: 0m 29s
	Train Loss: 0.068 | Train Acc: 97.86%
	 Val. Loss: 0.091 |  Val. Acc: 97.48%
Epoch: 07 | Epoch Time: 0m 29s
	Train Loss: 0.059 | Train Acc: 98.17%
	 Val. Loss: 0.084 |  Val. Acc: 97.75%
Epoch: 08 | Epoch Time: 0m 29s
	Train Loss: 0.053 | Train Acc: 98.38%
	 Val. Loss: 0.084 |  Val. Acc: 97.71%
Epoch: 09 | Epoch Time: 0m 29s
	Train Loss: 0.048 | Train Acc: 98.49%
	 Val. Loss: 0.083 |  Val. Acc: 97.79%
Epoch: 10 | Epoch T

In [35]:
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
test_loss, test_acc = evaluate(model, test_iter, criterion, LABEL.vocab.stoi['<pad>'])
print(f'loss {test_loss:.3f} | acc {test_acc*100:.2f}%')

loss 0.154 | acc 96.32%


In [36]:
model.eval()
preds = []
labels = []
with torch.no_grad():
    for batch in test_iter:
        #pred = [sent, batch size, output_dim]
        pred = torch.argmax(model(batch.text, batch.char), dim = 2)  #pred = [sent, batch size]
        pred = pred.view(-1).tolist()  #pred = [sent * batch size]
        label = batch.label.view(-1).tolist()  #label = [sent * batch size]
        for idx in range(len(label)):
           if label[idx] < 2 or pred[idx] < 2: continue
           preds.append(LABEL.vocab.itos[pred[idx]])
           labels.append(LABEL.vocab.itos[label[idx]])

In [37]:
from sklearn.metrics import classification_report
print(classification_report(preds, labels, digits = 4))

              precision    recall  f1-score   support

       B-LOC     0.8933    0.9169    0.9049      1588
      B-MISC     0.8328    0.8051    0.8187       662
       B-ORG     0.8924    0.8207    0.8551      1707
       B-PER     0.8503    0.9513    0.8979      1355
       I-LOC     0.8279    0.7891    0.8080       256
      I-MISC     0.8011    0.7062    0.7506       211
       I-ORG     0.8983    0.8337    0.8648       848
       I-PER     0.9229    0.9688    0.9453      1088

    accuracy                         0.8802      7715
   macro avg     0.8649    0.8490    0.8557      7715
weighted avg     0.8804    0.8802    0.8791      7715



In [80]:
def pred_sent(sent):
    model.eval()
    nlp = spacy.load('en_core_web_sm')
    tokens = [token.text for token in nlp(sent)]
    chars = []
    max_word_len = -1
    for token in tokens:
        if max_word_len < len(token): max_word_len = len(token)
        chars.append([char for char in token])

    for char_list in chars:
        for _ in range(max_word_len - len(char_list)):
            char_list.append('<pad>')

    token_idx = [TEXT.vocab.stoi[token] for token in tokens]
    
    for x in range(len(chars)):
        for y in range(max_word_len):
            chars[x][y] = CHAR.vocab.stoi[chars[x][y]]
    
    token_tensor = torch.LongTensor(token_idx).unsqueeze(-1).to(device)
    char_tensor = torch.LongTensor(chars).unsqueeze(0).to(device)
    pred = model(token_tensor, char_tensor).argmax(-1).squeeze()
    predicted_tags = [LABEL.vocab.itos[x] for x in pred]

    return tokens, predicted_tags

In [113]:
sentence = 'John lives in New York. He just graduted from Harvard University. He is working for Google now'
tokens, predicted_tags = pred_sent(sentence)

test_df = pd.DataFrame({'token' : tokens,
                        'predicted_tag': predicted_tags})
test_df

Unnamed: 0,token,predicted_tag
0,John,B-PER
1,lives,O
2,in,O
3,New,B-LOC
4,York,I-LOC
5,.,O
6,He,O
7,just,O
8,graduted,O
9,from,O
