# Load the packages needed

In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import gc
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,TensorDataset, DataLoader, random_split
from torchcrf import CRF# We need this encapsulated for complicated CRF components
import torchcrf
from tqdm import tqdm

# Read in the datasets from .txt files

In [2]:
sentences = []
tags = []
with open('ner_datasets/source_BIO_2014_cropus.txt','r') as f1:
    for line in f1:
        sentences.append(line)
with open('ner_datasets/target_BIO_2014_cropus.txt','r') as f2:
    for line in f2:
        tags.append(line)

# Set the number of data samples we need

In [3]:
sentences = sentences[:1000]
tags = tags[:1000]

# Split the sentences/labels into token by token

In [4]:
for i in range(len(sentences)):
    sentences[i] = sentences[i].split()
for i in range(len(tags)):
    tags[i] = tags[i].split()

# Count the exsiting lable types in the dataset

In [5]:
labels = set()
for i in tags:
    for tag in i:
        labels.add(tag)
print (labels)
print (len(labels))

{'B_T', 'O', 'I_PER', 'B_ORG', 'I_LOC', 'B_LOC', 'I_T', 'I_ORG', 'B_PER'}
9


# Transfer the label symbol into label indexes

In [6]:
label2index = {
    "B_PER":0,
    "I_PER":1,
    "B_LOC":2,
    "I_LOC":3,
    "B_T":4,
    "I_T":5,
    "B_ORG":6,
    "I_ORG":7,
    "O":8,
}

In [7]:
for tag in tags:
    for i in range(len(tag)):
        tag[i] = label2index[tag[i]]

# Tokenize and index the input sentences with pretrained Tokenizer (HuggingFcae)

In [8]:
from transformers import AutoModel, AutoTokenizer
Tokenizer = AutoTokenizer.from_pretrained("FinBERT_L-12_H-768_A-12_pytorch",add_special_tokens=False)
# No special tokens needed for NER task.

In [9]:
sentences = [Tokenizer.convert_tokens_to_ids(x) for x in sentences]

# Count the lengths of of all sentences

In [10]:
max_length = 0
min_length = 100
for i in tags:
    max_length = max(max_length,len(i))
    min_length = min(min_length,len(i))    
print (max_length,min_length)

616 2


# Set Train and Test datastes 

In [11]:
train_sentences = sentences[:900]
test_sentences = sentences[900:]

train_labels = tags[:900]
test_labels = tags[900:]

train_masking = []
test_masking = []

# Generate the "input_ids", "labels", and "masking" tensors of the Train and Test datasets.

In [12]:
for i in range(len(train_sentences)):
    if len(train_sentences[i])>=100:
        train_sentences[i] = train_sentences[i][:100]
        train_labels[i] = train_labels[i][:100]
        train_masking.append([1 for i in range(100)])
    else:
        train_sentences[i] = train_sentences[i]+[0 for i in range(100-len(train_sentences[i]))]
        train_labels[i] = train_labels[i]+[0 for i in range(100-len(train_labels[i]))]
        train_masking.append([1 for i in range(len(train_sentences[i]))]+[0 for i in range(100-len(train_sentences[i]))])

for i in range(len(test_sentences)):
    if len(test_sentences[i])>=100:
        test_sentences[i] = test_sentences[i][:100]
        test_labels[i] = test_labels[i][:100]
        test_masking.append([1 for i in range(100)])
    else:
        test_sentences[i] = test_sentences[i]+[0 for i in range(100-len(test_sentences[i]))]
        test_labels[i] = test_labels[i]+[0 for i in range(100-len(test_labels[i]))]
        test_masking.append([1 for i in range(len(test_sentences[i]))]+[0 for i in range(100-len(test_sentences[i]))])

# Constrcut the dataset class

In [13]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, maskings):
        self.encodings = encodings
        self.labels = labels
        self.maskings = maskings

    def __getitem__(self, idx):
        item = {"input_ids":torch.tensor(self.encodings[idx])}
        item['labels'] = torch.tensor(self.labels[idx])
        item['maskings'] = torch.tensor(self.maskings[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NERDataset(train_sentences,train_labels,train_masking)
test_dataset = NERDataset(test_sentences,test_labels,test_masking)

# Set Batch-Size

In [14]:
batch_size = 6

# Load the datasets into Dataloader, batchlize them at the same time.

In [15]:
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

# Check the content in each batch from the dataloader

In [16]:
#for batch in test_dataloader:
#    print (batch)
#    break
#batch["maskings"].size()

# Build the model

In [57]:
class BERTBiLSTMCRF(nn.Module):
    def __init__(self,num_tags = None):
        super(BERTBiLSTMCRF, self).__init__()
        # Get the sequence encoding from FinBERT
        self.base_model = AutoModel.from_pretrained('FinBERT_L-12_H-768_A-12_pytorch')
        # The hyper-parameters for LSTM
        self.word_embeds = 768# output dimensions from FinBERT
        self.hidden_dim = 1024# double-layer: 2* actual hidden_size
        self.num_tags = num_tags# the number of unique labels
        # build the lstm
        self.lstm = nn.LSTM(self.word_embeds, self.hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first = True)
        # map from the dimension of lstm outputs to the dimension of num_tags
        self.hidden2tag = nn.Linear(self.hidden_dim, self.num_tags)
        # build the CRF
        self.crf = torchcrf.CRF(self.num_tags,batch_first=True)
    
    def forward(self, sequence = None, labels = None, maskings = None):
        #Get the batch_size
        batch_size = sequence.size()[0]
        # add maskings, because various input lengths and the corresponding paddings
        outputs = self.base_model(sequence,attention_mask = maskings)#output tuple:(LastLayerSequenceOuput,PoolerOutput)
        
        
        ##########################################################
        #######The process to help LSTM get rid of padding########
        
        lengths = []# the list to store the real length of each input
        for i in range(batch_size):
            lengths.append(maskings[i,:].tolist().count(1))
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        X = torch.nn.utils.rnn.pack_padded_sequence(outputs[0], torch.Tensor(lengths).long(), batch_first=True)
        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        hidden_state = torch.randn(2, batch_size, self.hidden_dim//2)
        cell_state = torch.randn(2, batch_size, self.hidden_dim//2)
        # now run through LSTM
        X  = self.lstm(X,(hidden_state,cell_state))[0]#The output of lstm is (hidden_output, cell_output)
        # undo the packing operation
        X = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True)[0]
        X = X.contiguous()
        X = X.view(-1,sequence.size()[1] ,X.shape[2])
        
        #lstm_outputs = self.lstm(outputs[0])[0]
        
        ##########################################################
        ##########################################################
        
        emission_scores = self.hidden2tag(X)# map from 1024 to 9
        # decode and get the predicted labels
        predictions = self.crf.decode(emission_scores, mask = maskings.bool())# bool()!!!!!
        # calculate losses
        if labels is not None:
            loss = -self.crf.forward(emission_scores, labels, mask = maskings.bool(), reduction='sum')
            return (loss,predictions)
        else:
            return predictions
model = BERTBiLSTMCRF(num_tags = 9)


##############################################################################################
##############################################################################################
#############The example of the use of CRF modules.
#############The order of seq_length and batch_size can be different, they are all supported. Or
#############set the batch_first=True (defalut is batch_first=False)
#tags = torch.tensor([[0, 1], [2, 4], [3, 1]], dtype=torch.long)  # (seq_length, batch_size)
#emissions = torch.randn(seq_length, batch_size, num_tags)
#model(emissions, tags)   

Some weights of the model checkpoint at FinBERT_L-12_H-768_A-12_pytorch were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Set up epoch number and optimizer

In [58]:
from transformers import AdamW
epochs = 3
#optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
optimizer = AdamW(model.parameters(), lr=5e-5)
#total_steps = len(train_data_loader) * epochs
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training Step

In [59]:
model.train()
for epoch in range(epochs):
    total_loss = 0.
    for step, batch in tqdm(enumerate(train_dataloader),total=len(train_dataloader)):
        optimizer.zero_grad()
        # Extract contents from each batch. They are of the size B*Seq_len
        sent = batch["input_ids"]
        lab = batch["labels"]
        mask = batch["maskings"]
        outputs = model(sent,lab,mask)
        loss = outputs[0]
        loss.backward()
        total_loss+=loss.item()
        optimizer.step()
    print ("loss is: ", total_loss)

100%|██████████| 150/150 [16:22<00:00,  6.55s/it]


loss is:  33213.80809020996


100%|██████████| 150/150 [16:25<00:00,  6.57s/it]


loss is:  8971.87890625


100%|██████████| 150/150 [16:08<00:00,  6.46s/it]

loss is:  4793.221740722656





# Validation Step

In [60]:
results = []
model.eval()
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_dataloader),total=len(test_dataloader)):
        sent = batch["input_ids"]
        mask = batch["maskings"]
        #There is no "lables" in the test step
        outputs = model(sent,maskings = mask)
        results.append(outputs)


100%|██████████| 17/17 [00:28<00:00,  1.65s/it]


In [63]:
final_resulsts = []
for i in results:
    final_resulsts+=i

In [64]:
len(final_resulsts)

100

# Minor Test

In [132]:
sentence = test_sentences[0]
sent = torch.Tensor(sentence).long().view(1,-1)
result = model(sent)
list(zip(result,test_labels[0]))

[([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 0),
 ([8], 1),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8),
 ([8], 8)]

In [144]:
a = [1,2,3,4,5,6,7,8,9,0]

In [145]:
print (a)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]


In [143]:
import random

In [146]:
random.shuffle(a)

[4, 0, 3, 2, 8, 6, 7, 1, 9, 5]

In [43]:
b = torch.Tensor([[1,1,1,1,0,0],[1,1,0,0,0,0]])

In [47]:
b[1,:].tolist().count(1)

2

In [1]:
pip install torchrua

Collecting torchrua
  Downloading torchrua-0.3.1-py3-none-any.whl (11 kB)
Collecting einops
  Downloading einops-0.3.2-py3-none-any.whl (25 kB)
Installing collected packages: einops, torchrua
Successfully installed einops-0.3.2 torchrua-0.3.1
Note: you may need to restart the kernel to use updated packages.
