### 2.Tagging
#### 2.1 LSTM Tagging


In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np



with open("./Mydata.json",'r+') as f:
    dataset = json.load(f)
tmp = {}
for key in dataset['id2label']:
    tmp[int(key)] = dataset['id2label'][key]
dataset['id2label'] = tmp

In [25]:

class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size,
                 vocalsize, tagsize, embedding_size,dropout_rate, device,batch_size):
        super(MyLSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.vocalsize = vocalsize
        self.tagsize = tagsize
        self.embedding_size = embedding_size
        self.device = device
        self.batch_size = batch_size

        # self.word_embeddings = nn.Embedding(self.vocalsize, self.input_size)
        self.encoder = nn.Embedding(self.vocalsize, self.embedding_size)

        self.lstm = nn.LSTM(input_size=self.embedding_size,
                            hidden_size=self.hidden_size,
                            num_layers=1,bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)

        self.output = nn.Linear(self.hidden_size*2, self.tagsize)

        # self.hidden = (torch.zeros(1,2,self.hidden_size).to(self.device))
        # self.hidden = (torch.zeros(2, self.input_size, self.hidden_size).to(self.device),
        #        torch.zeros(2, self.input_size, self.hidden_size).to(self.device))
        self.init_hidden()


    def forward(self, sentence):
        # embed = self.word_embeddings(sentence)
        # vie = embed.view(len(sentence), self.input_size)
        # print(vie.shape)
        # print("print sentence",sentence)
        emb = self.encoder(sentence)
        # print("print inside model",emb,emb.shape)
        # input()
        lstm_out, self.hidden = self.lstm(emb, self.hidden)
        lstm_out = self.dropout(lstm_out)

        # tag_space = self.output()
        # tag_score = F.log_softmax(tag_space, dim=self.tagsize)
        return self.output(lstm_out[:,-1,:])

    def init_hidden(self):
        self.hidden = (torch.zeros(2, self.batch_size, self.hidden_size).to(self.device),
               torch.zeros(2, self.batch_size, self.hidden_size).to(self.device))
        # self.hidden = torch.zeros(1,batch_size,self.hidden_size).to(self.device)

In [19]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device='cpu'
model = MyLSTM(input_size=1, embedding_size=10, hidden_size=10,
               vocalsize=len(dataset['word2id']),
               tagsize=len(dataset['label2id'])-1,
               dropout_rate=0.1,
               device=device,
               batch_size=1)
model = model.to(device)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

with torch.no_grad():
    # inp = torch.tensor(np.matrix(dataset['trainset']['data'][0]).T, dtype=torch.long).unsqueeze(0)
    # inp = torch.tensor(np.matrix(dataset['trainset']['data'][0]).T[0],dtype=torch.long)
    idx = dataset['trainset']['data'][0][1].index(0)
    model.init_hidden()
    # print(idx)
    inp = torch.tensor(np.matrix([dataset['trainset']['data'][0][0][:idx]]),
                       dtype = torch.long).to(device)
    # print(inp)
    # hidden = model.init_hidden(126)
    tag_scores = model(inp)
    print("tag",tag_scores.shape,tag_scores)

tag torch.Size([1, 9]) tensor([[ 0.1796, -0.0555, -0.1661,  0.0823,  0.0715, -0.1701, -0.2067, -0.0733,
         -0.1361]])


In [4]:
def getSeqLen(seqData):
    for i in range(len(seqData[1])):
        if seqData[1][i] == 0:
            return i
    return len(seqData[1])

def getTrueTag(dataset):
    data = dataset['data']
    label = dataset['label']
    trueLabel = []
    for idx,setence in enumerate(data):
        trueLabel.append(label[idx][:getSeqLen(setence)][:])
    return trueLabel

def id2Label(labels,id2label:dict):
    retlabel = []
    for seq in labels:
        tmp = [id2label[each] for each in seq]
        retlabel.append(tmp[:])
    return retlabel

def id2Label_RemoveSpecialToken(labels,id2label:dict):
    retlabel = []
    pad = len(id2label)
    for idx,seq in enumerate(labels):
        tmp = []
        for each in seq:
            if each == pad:
                break
            tmp.append(id2label[each])
        retlabel.append(tmp[:])
    return retlabel

# def get_batch(source,label,idx,batch_size):
#     dataset = source[idx*batch_size:idx*batch_size+batch_size]
#     labels = label[idx*batch_size:idx*batch_size+batch_size]
#     maxlen = max([getSeqLen(each) for each in dataset])
#     batchdata = []
#     for each in dataset:
#         batchdata.append(each[0][:maxlen])
#     label = [la[:maxlen] for la in labels]
#     return batchdata,label,maxlen


In [29]:
# from seqeval.metrics import classification_report
from seqeval.metrics import classification_report,accuracy_score,f1_score
from datetime import datetime

batch_size = 1
EPOCH = 10
learning_rate = 0.3
testFreq = 1
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device='cpu'
torch.manual_seed(100)
traindata = dataset['trainset']['data']
# trainlabel = dataset['trainset']['label']
trainlabel = getTrueTag(dataset['trainset'])

testdata = dataset['testset']['data']
# testlabel = dataset['testset']['label']
testlabel = getTrueTag(dataset['testset'])
testLabelWord = id2Label(testlabel,dataset['id2label'])
testLabelWordnoSpe = id2Label_RemoveSpecialToken(testlabel,dataset['id2label'])

model = MyLSTM(input_size=1, embedding_size=64, hidden_size=64,
               vocalsize=len(dataset['word2id']),
               # tagsize=len(dataset['label2id'])-1,      # remove PAD
               tagsize=len(dataset['label2id']),
               dropout_rate=0.1,
               device=device,
               batch_size=batch_size).to(device)
# model = model.to(device)


minloss = float('inf')
# BestModel = None

for epoch in range(EPOCH):
    # tag_sc = np.array([])
    epoch_loss = 0.0
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    for idx,sentence in enumerate(traindata):
    # for _, i in enumerate(range(0,len(traindata)-1,batch_size)):
        model.train()
        optimizer.zero_grad()
        model.zero_grad()


        # sen_tag = []
        seqlen = getSeqLen(sentence)
        sen_truetag = torch.tensor(trainlabel[idx],dtype=torch.long).to(device)
        # print("sen_shape:",sen_truetag.shape)
        # for word in np.matrix(sentence).T:
        # sentence,sen_truetag,seqlen = get_batch(traindata,dataset['trainset']['label'],i,batch_size)
        model.init_hidden()
        inp = torch.tensor(np.matrix(sentence[0][:seqlen]).T,dtype=torch.long).to(device)
        # inp = torch.tensor(np.matrix(sentence),dtype=torch.long).to(device)
        # print("inp:",inp.shape)
        tag_scores = model(inp)
        sen_truetag = torch.tensor(sen_truetag,dtype=torch.long).to(device)
        loss = criterion(tag_scores,sen_truetag)
        # loss = 0.0
        loss.backward()
        optimizer.step()
        epoch_loss+=tag_scores.shape[0]*loss.item()
    # print(true_tags.shape)
    # print(tag_sc.shape)
    # true_tags = torch.tensor(trainlabel,dtype=torch.long).to(device)
    # tag_sc = torch.tensor(np.array(tag_sc),dtype=torch.long).to(device)

    # print(loss.real)
    # loss.backward()
    # optimizer.step()
    aveloss = epoch_loss/len(traindata)
    print("-----End Epoch ",epoch,"loss:",aveloss,"-----")
    if aveloss<minloss:
        minloss = aveloss
        torch.save(model,f"./LSTModle/EPOCH{epoch}_{aveloss}_{datetime.now().ctime().replace(' ','_').replace(':','_')}.pt")
        print(f"find a better model, model saved as ./LSTModle/EPOCH{epoch}_{aveloss}_{datetime.now().ctime().replace(' ','_').replace(':','_')}.pt")

    if (epoch+1)%testFreq==0:
        # test the model
        print("-----Test model on ",epoch,'-----')
        predict = []
        optimizer.zero_grad()
        for idx,sentence in enumerate(testdata):
            model.init_hidden()
            # sen_tag = []
            seqlen = getSeqLen(sentence)
            sen_truetag = torch.tensor(testlabel[idx][:seqlen],dtype=torch.long).to(device)
            # for word in np.matrix(sentence).T:
            inp = torch.tensor(np.matrix(sentence[0][:seqlen]).T,dtype=torch.long).to(device)
            # print("inp:",idx)
            tag_scores = model(inp)
            thispredict = torch.max(tag_scores.data,1).indices.cpu()
            # print(thispredict.shape)
            predict.append(list(thispredict.numpy()))

        print("-----Output classification report:-----")
        predict = id2Label(predict,dataset['id2label'])
        print("ACC:",accuracy_score(testLabelWordnoSpe,predict)," F1:",f1_score(testLabelWordnoSpe,predict,zero_division=1))
        print(classification_report(testLabelWordnoSpe,predict))





-----End Epoch  0 loss: 6.114168236387533 -----
find a better model, model saved as ./LSTModle/EPOCH0_6.114168236387533_Mon_Nov__7_19_05_42_2022.pt
-----Test model on  0 -----
-----Output classification report:-----
ACC: 0.8722299989232261  F1: 0.3994074074074073
              precision    recall  f1-score   support

         LOC       0.64      0.57      0.60      1668
        MISC       0.45      0.28      0.35       702
         ORG       0.41      0.24      0.30      1661
         PER       0.30      0.30      0.30      1617

   micro avg       0.45      0.36      0.40      5648
   macro avg       0.45      0.35      0.39      5648
weighted avg       0.45      0.36      0.39      5648



KeyboardInterrupt: 

### 2.2 Transformer Tagger

In [None]:
# device='cpu'
# model2 = MyLSTM(input_size=2, embedding_size=10, hidden_size=10,
#                vocalsize=len(dataset['word2id']),
#                tagsize=len(dataset['label2id']),
#                device=device).to(device)
# errdata  = dataset['testset']['data'][3451]
# errlabel = dataset['testset']['label'][3451]
# print(len(errdata[0]),len(errdata[1]))
# print(errlabel)
# print(errdata[0],errdata[1])
# # ... 3 2 3 3 3....???
#
# # model2.init_hidden(2)
# sen_truetag = torch.tensor(errlabel,dtype=torch.long).to(device)
#
# inp = torch.tensor(np.matrix(errdata).T,dtype=torch.long).to(device)
# print(inp.shape)
# tag_scores = model2(inp)
# predict = torch.max(tag_scores.data,1)