### 2.Tagging
#### 2.1 LSTM Tagging

##### 2.1.1 Read dataset

In [1]:
import copy
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np



with open("./Mydata.json",'r+') as f:
    Mydata = json.load(f)
tmp = {}
for key in Mydata['id2label']:
    tmp[int(key)] = Mydata['id2label'][key]
Mydata['id2label'] = tmp

device = 'cuda' if torch.cuda.is_available() else 'cpu'

##### 2.2.2 Generate DataLoader

In [2]:
from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    def __init__(self, data: list, label, word2id, id2label, label2id, device):
        self.data = data
        self.label = label
        self.word2id = word2id
        self.id2label = id2label
        self.label2id = label2id
        self.device = device

    def __getitem__(self, item):
        data = []
        label = []
        # print(item)
        for i in range(len(self.data[item])):
            data.append(self.word2id[self.data[item][i]])
            label.append(self.label2id[self.label[item][i]])
        return data, label

    def __len__(self):
        return len(self.data)

    def batchfy(self, batchData):
        # print(batchData)
        maxlen = max([len(each[0]) for each in batchData])
        padc, padi = self.word2id["PAD"], self.label2id["PAD"]
        batchDatas = []
        padlabels = []
        attention = []
        for i in range(len(batchData)):
            if len(batchData[i][0]) >= maxlen:
                batchDatas.append(batchData[i][0][:])
                padlabels.append(batchData[i][1][:])
                attention.append([1] * len(batchData[i][0]))
                continue
            l = len(batchData[i][0])
            padsize = maxlen - l
            batchDatas.append(batchData[i][0] + [padc] * padsize)
            padlabels.append(batchData[i][1] + [padi] * padsize)
            attention.append([1] * l + [0] * padsize)
        return batchDatas, attention, padlabels, maxlen

In [3]:
BatchSize = 100
trainDataSet = MyDataset(Mydata['trainset']['data'], Mydata['trainset']['label'], Mydata['word2id'], Mydata['id2label'],
                         Mydata['label2id'],device)

trainDataloader = DataLoader(dataset=trainDataSet, batch_size=BatchSize,
                             shuffle=False, collate_fn=trainDataSet.batchfy)

testDataSet = MyDataset(Mydata['testset']['data'], Mydata['testset']['label'],
                        Mydata['word2id'], Mydata['id2label'], Mydata['label2id'],device)
testDataloader = DataLoader(dataset=testDataSet, batch_size=BatchSize,
                            shuffle=False, collate_fn=testDataSet.batchfy)

devDataset = MyDataset(Mydata['devset']['data'], Mydata['devset']['label'],
                        Mydata['word2id'], Mydata['id2label'], Mydata['label2id'],device)
devDataloader = DataLoader(dataset=devDataset, batch_size=BatchSize,
                            shuffle=False, collate_fn=devDataset.batchfy)


##### 2.2.3 Model Definition

In [4]:
class MyLSTM(nn.Module):
    def __init__(self, hidden_size,vocalsize, tagsize,
                 embedding_size,dropout_rate,batch_len, device,encoderon=True):
        super(MyLSTM, self).__init__()

        self.hidden_size = hidden_size
        self.vocalsize = vocalsize
        self.tagsize = tagsize
        self.embedding_size = embedding_size
        self.device = device
        self.batch_len = batch_len
        self.encoderon=encoderon
        if self.encoderon:
            self.encoder = nn.Embedding(self.vocalsize, self.embedding_size)

        self.lstm = nn.LSTM(input_size=self.embedding_size,
                            hidden_size=self.hidden_size,
                            num_layers=1,bidirectional=True,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)

        self.output = nn.Linear(self.hidden_size*2, self.tagsize)

        # self.hidden = (torch.zeros(1,2,self.hidden_size).to(self.device))
        # self.hidden = (torch.zeros(2, self.input_size, self.hidden_size).to(self.device),
        #        torch.zeros(2, self.input_size, self.hidden_size).to(self.device))
        self.hidden = (torch.zeros(2, self.batch_len, self.hidden_size).to(self.device),
               torch.zeros(2, self.batch_len, self.hidden_size).to(self.device))


    def forward(self, sentence):
        l = len(sentence)
        self.adjustHiddenSize(l)

        if self.encoderon:
            emb = self.encoder(sentence)
        else:
            emb = sentence

        lstm_out, self.hidden = self.lstm(emb, self.hidden)
        # lstm_out, _ = self.lstm(emb)
        lstm_out = self.dropout(lstm_out)

        output = self.output(lstm_out)
        return output

    def adjustHiddenSize(self,senlen):
        self.hidden = (torch.zeros(2, senlen, self.hidden_size).to(self.device),
               torch.zeros(2, senlen, self.hidden_size).to(self.device))

##### 2.2.3.1 Test Model

In [5]:
def test():
    model = MyLSTM(embedding_size=10, hidden_size=10,
               vocalsize=len(Mydata['word2id']),
               tagsize=len(Mydata['label2id'])-1,
               dropout_rate=0.1,
               batch_len=BatchSize,
               device=device,
               encoderon=True)
    data,label,size = None,None,0
    for i, (batch, atten, labe, size) in enumerate(trainDataloader):
        data = batch
        label = labe
        break
    model.to(device=device)

    criterion = nn.CrossEntropyLoss(ignore_index=Mydata['label2id']["PAD"])
    with torch.no_grad():
        inp = torch.tensor(np.matrix(data),dtype=torch.long).to(device)
        print(inp.shape)
        tag_scores = model(inp)
        print(tag_scores.shape)
        label = torch.tensor(label,dtype=torch.long).to(device)
        print("label",label.shape)
        loss = 0.
        for i in range(BatchSize):
            loss += criterion(tag_scores[i],label[i])
        print(loss)
test()

torch.Size([100, 47])
torch.Size([100, 47, 9])
label torch.Size([100, 47])
tensor(217.3495, device='cuda:0')


##### 2.2.4 Training and Evaluating

In [6]:
import time
import math

model = MyLSTM(embedding_size=64, hidden_size=64,
               vocalsize=len(Mydata['word2id']),
               tagsize=len(Mydata['label2id']),
               dropout_rate=0.1,
               batch_len=BatchSize,
               device=device,
               encoderon=True).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=Mydata['label2id']["PAD"])
lr = 5  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [7]:

def train(model:nn.Module,epochi:int,dataloader:DataLoader,dtype):
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 50
    start_time = time.time()
    num_batches = len(dataloader)
    for i, (batch, _, label, size) in enumerate(dataloader):
        # print(i)
        batch=torch.tensor(batch, dtype=dtype,device=device)
        label = torch.tensor(label ,dtype=torch.long,device=device)
        # print(batch.shape,batch[0])

        output = model(batch)

        # print(output.shape)

        loss = 0.
        for j in range(len(batch)):
            loss += criterion(output[j],label[j])

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epochi:3d} | {i:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

In [8]:
def evaluate(model: nn.Module, eval_data: DataLoader,dtype,id2label:dict) -> tuple:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    l = 0
    predlabel_word = []
    reallabel_word = []
    with torch.no_grad():
        for i, (batch, atten, label, size) in enumerate(eval_data):
            l+=len(batch)
            batch=torch.tensor(batch,dtype=dtype,device=device)
            label = torch.tensor(label ,dtype=torch.long,device=device)

            output = model(batch)
            pred_id = torch.argmax(output.data,dim=-1)
            for i in range(len(batch)):
                total_loss += criterion(output[i],label[i])
                tmp_pred = []
                tmp_real = []
                for j in range(size):
                    if label[i][j] == 9:
                        break
                    tmp_real.append(id2label[label[i][j].item()])
                    tmp_pred.append(id2label[pred_id[i][j].item()])
                predlabel_word.append(tmp_pred[:])
                reallabel_word.append(tmp_real[:])

    return total_loss / l,\
           accuracy_score(reallabel_word,predlabel_word),\
           f1_score(reallabel_word,predlabel_word,zero_division=1)

In [16]:
from seqeval.metrics import classification_report,accuracy_score,f1_score



def generate_Report(model: nn.Module, eval_data: DataLoader,id2label:dict,dtype):
    model.eval()
    predlabel_word = []
    reallabel_word = []
    with torch.no_grad():
        for _, (batch, _, label, size) in enumerate(eval_data):
            batch=torch.tensor(batch,dtype=dtype,device=device)

            output = model(batch)
            pred_id = torch.argmax(output.data,dim=-1)
            for i in range(len(batch)):
                tmp_pred = []
                tmp_real = []
                for j in range(size):
                    if label[i][j] == 9:
                        break
                    tmp_real.append(id2label[label[i][j]])
                    tmp_pred.append(id2label[pred_id[i][j].item()])
                predlabel_word.append(tmp_pred[:])
                reallabel_word.append(tmp_real[:])
    print("-----Output classification report:-----")
    print("ACC:",accuracy_score(reallabel_word,predlabel_word),end=" ")
    try:
        f1 = f1_score(reallabel_word,predlabel_word,zero_division=1)

    except TypeError:
        f1 = 'Nan'
    print("f1: ",f1)

    print(classification_report(reallabel_word,predlabel_word))

##### 2.2.5 Train the Model

In [10]:
from torch.utils.tensorboard import SummaryWriter
best_val_loss = float('inf')
epochs = 10
best_model = None

torch.manual_seed(100)
writer = SummaryWriter("runs/LSTM")

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model,epoch,trainDataloader,torch.long)
    val_loss,val_acc,val_f1 = evaluate(model, devDataloader, torch.long,Mydata['id2label'])
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    writer.add_scalar('Loss',val_loss,epoch)
    writer.add_scalar("Acc",val_acc,epoch)
    writer.add_scalar("F1",val_acc,epoch)

    if val_loss < best_val_loss:
        print("Find a better model")
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()

generate_Report(best_model,testDataloader,Mydata['id2label'],torch.long)

##### 2.2.6 Generate Vector by Glove

In [11]:
from transformers import AutoModel,AutoTokenizer

class MyDatasetWithGlove(MyDataset):
    def __init__(self, data: list, label, word2id, id2label, label2id,
                 device, tokenizer,GloveModel):
        super(MyDatasetWithGlove,self).__init__(data, label, word2id, id2label, label2id, device)
        self.Glovetokenizer = tokenizer
        self.GloveModel = GloveModel

    def checkVector(self,vector):
        l = len(vector)
        vector = vector.last_hidden_state
        if l==1:
            return vector[0].cpu().detach().numpy().tolist()
        elif l>1:
            t = vector[0]
            for i in range(1,l):
                t += vector[i]
            return (t/l).cpu().detach().numpy().tolist()
        else:
            raise "Vector: length error"

    def GenerateVector(self,sentence):
        ret = []
        for word in sentence:
            token = self.Glovetokenizer.encode(word)
            vec = self.GloveModel(token)
            vec = self.checkVector(vec)
            # print(len(vec))
            ret.append(vec)
        return ret

    def __getitem__(self, item):
        label = []
        # print(item)
        for i in range(len(self.data[item])):
            label.append(self.label2id[self.label[item][i]])
        data = self.GenerateVector(self.data[item])

        return data, label

    def batchfy(self, batchData):
        # print(batchData)
        maxlen = max([len(each[0]) for each in batchData])
        padc, padi = self.GenerateVector(["pad"])[0], self.label2id["PAD"]
        batchDatas = []
        padlabels = []
        attention = []
        for i in range(len(batchData)):
            if len(batchData[i][0]) >= maxlen:
                batchDatas.append(batchData[i][0])
                padlabels.append(batchData[i][1][:])
                attention.append([1] * len(batchData[i][0]))
                continue
            l = len(batchData[i][0])
            padsize = maxlen - l
            batchDatas.append(batchData[i][0] + [padc] * padsize)
            padlabels.append(batchData[i][1] + [padi] * padsize)
            attention.append([1] * l + [0] * padsize)
        return batchDatas, attention, padlabels, maxlen

In [12]:
BatchSize = 100
tokenizer = AutoTokenizer.from_pretrained("Iseratho/glove-wiki-gigaword-50")
GloveModel = AutoModel.from_pretrained("Iseratho/glove-wiki-gigaword-50",
                                                    trust_remote_code=True)

trainGlove = MyDatasetWithGlove(Mydata['trainset']['data'], Mydata['trainset']['label'], Mydata['word2id'], Mydata['id2label'],
                         Mydata['label2id'],device,tokenizer=tokenizer,GloveModel=GloveModel)
trainGloveLoader = DataLoader(dataset=trainGlove, batch_size=BatchSize,
                            shuffle=False, collate_fn=trainGlove.batchfy)

testGlove = MyDatasetWithGlove(Mydata['testset']['data'], Mydata['testset']['label'],
                        Mydata['word2id'], Mydata['id2label'], Mydata['label2id'],device,tokenizer=tokenizer,GloveModel=GloveModel)
testGloveloader = DataLoader(dataset=testGlove, batch_size=BatchSize,
                            shuffle=False, collate_fn=testGlove.batchfy)

devGlove = MyDatasetWithGlove(Mydata['devset']['data'], Mydata['devset']['label'],
                        Mydata['word2id'], Mydata['id2label'], Mydata['label2id'],device,tokenizer=tokenizer,GloveModel=GloveModel)
devGloveloader = DataLoader(dataset=devGlove, batch_size=BatchSize,
                            shuffle=False, collate_fn=devGlove.batchfy)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


##### 2.2.7 Train Model with Glove

In [13]:
modelWithGlove = MyLSTM(embedding_size=50, hidden_size=64,
               vocalsize=1, # useless in this situation
               tagsize=len(Mydata['label2id']),
               dropout_rate=0.1,
               device=device,
               batch_len=BatchSize,
               encoderon=False).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=Mydata['label2id']["PAD"])
lr = 1  # learning rate
optimizer = torch.optim.SGD(modelWithGlove.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [14]:
import time
import math

best_val_loss = float('inf')
epochs = 20
best_model = None

torch.manual_seed(100)

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("runs/LSTM")

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(modelWithGlove,epoch,trainGloveLoader,torch.float32)
    val_loss,val_acc,val_f1 = evaluate(modelWithGlove, devGloveloader, torch.float32,Mydata['id2label'])
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    writer.add_scalar('Loss',val_loss,epoch)
    writer.add_scalar("Acc",val_acc,epoch)
    writer.add_scalar("F1",val_f1,epoch)


    if val_loss < best_val_loss:
        print("Find a better model")
        best_val_loss = val_loss
        best_model = copy.deepcopy(modelWithGlove)

    scheduler.step()

generate_Report(best_model,testGloveloader,Mydata['id2label'],torch.float32)

| epoch   1 |    50/  141 batches | lr 1.00 | ms/batch 126.46 | loss 78.33 | ppl 10404234763134928851667631239856128.00
| epoch   1 |   100/  141 batches | lr 1.00 | ms/batch 125.45 | loss 44.66 | ppl 24973057650038095872.00
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 29.53s | valid loss  0.34 | valid ppl     1.41
-----------------------------------------------------------------------------------------
Find a better model
| epoch   2 |    50/  141 batches | lr 0.95 | ms/batch 121.65 | loss 32.62 | ppl 146465655050137.66
| epoch   2 |   100/  141 batches | lr 0.95 | ms/batch 124.59 | loss 29.34 | ppl 5507529384118.45
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 29.18s | valid loss  0.25 | valid ppl     1.29
-----------------------------------------------------------------------------------------
Find a better model
| epoch   3 |    50/  141 bat

  self.dropout, self.training, self.bidirectional, self.batch_first)


-----Output classification report:-----
ACC: 0.9614730268116722 f1:  0.8119886161508361
              precision    recall  f1-score   support

         LOC       0.84      0.88      0.86      1668
        MISC       0.69      0.64      0.66       702
         ORG       0.75      0.74      0.75      1661
         PER       0.91      0.88      0.89      1617

   micro avg       0.82      0.81      0.81      5648
   macro avg       0.80      0.78      0.79      5648
weighted avg       0.81      0.81      0.81      5648



##### 2.2.8 save result file

In [18]:
def getPred(model:nn.Module,eval_data:DataLoader,id2label:dict,dtype):
    model.eval()
    predlabel_word = []
    with torch.no_grad():
        for _, (batch, _, label, size) in enumerate(eval_data):
            batch=torch.tensor(batch,dtype=dtype,device=device)

            output = model(batch)
            pred_id = torch.argmax(output.data,dim=-1)
            for i in range(len(batch)):
                tmp_pred = []
                tmp_real = []
                for j in range(size):
                    if label[i][j] == 9:
                        break
                    tmp_real.append(id2label[label[i][j]])
                    tmp_pred.append(id2label[pred_id[i][j].item()])
                predlabel_word += tmp_pred[:]
    return predlabel_word

In [19]:
predlabel=getPred(best_model,testGloveloader,Mydata['id2label'],torch.float32)

  self.dropout, self.training, self.bidirectional, self.batch_first)


In [25]:
import csv
def writeTestFile(pred):
    reader = csv.reader(open("./data/test.txt"),delimiter=' ')
    wri = csv.writer(open('./test_LSTM.txt','w+',newline=''),delimiter=' ')
    i = 0
    for line in reader:
        if not line or line[0]=="-DOCSTART-":
            wri.writerow(line)
            continue
        line[-1] = pred[i]
        wri.writerow(line)
        i+=1
    if i!=len(predlabel)-1:
        assert "Number Error"
writeTestFile(predlabel)