In [25]:
import copy

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils import data

import tensorflow as tf
# from seqeval.metrics import f1_score,classification_report

In [26]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import warnings
import random

warnings.filterwarnings('ignore')

# Tab-separated corpora with one token per row. Note that binding `data` here
# replaces the `torch.utils.data` module that was imported under the same name.
data = pd.read_csv("aidai/AIDEA_Train2_p.txt", sep='\t', encoding='utf-8')
dev_data = pd.read_csv("aidai/AIDEA_Dev2_p.txt", sep='\t', encoding='utf-8')

data

Unnamed: 0,0,1,Sentence#
0,醫師,O,1
1,：,O,1
2,啊,O,1
3,回去,O,1
4,還好,O,1
...,...,...,...
304177,照,O,25572
304178,個,O,25572
304179,x,O,25572
304180,光,O,25572


In [27]:
SEED = 1234

# Request deterministic cuDNN kernels, then seed the PyTorch, NumPy and
# Python random number generators with the same value.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [28]:
from collections import defaultdict
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Return True when the chunk open at the previous token ends before the current one.

    Args:
        prev_tag: position marker of the previous token ('B', 'I', 'L', 'U', 'O', ...).
        tag: position marker of the current token.
        prev_type: entity type of the previous token.
        type_: entity type of the current token.
    """
    # 'L' and 'U' always close the chunk they belong to.
    if prev_tag in ('L', 'U'):
        return True
    # A chunk in progress is closed by a new beginning, a unit tag or an outside tag.
    if prev_tag in ('B', 'I') and tag in ('B', 'U', 'O'):
        return True
    # Otherwise the chunk ends only if the entity type changes under a non-outside tag.
    return prev_tag not in ('O', '.') and prev_type != type_

def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Return True when the current token opens a new chunk.

    Args:
        prev_tag: position marker of the previous token ('B', 'I', 'L', 'U', 'O', ...).
        tag: position marker of the current token.
        prev_type: entity type of the previous token.
        type_: entity type of the current token.
    """
    # 'B' and 'U' always open a chunk.
    if tag in ('B', 'U'):
        return True
    # An 'L' or 'I' that follows a closed chunk or an outside token opens one too.
    if prev_tag in ('L', 'U', 'O') and tag in ('L', 'I'):
        return True
    # Otherwise a chunk starts only if the entity type changes under a non-outside tag.
    return tag not in ('O', '.') and prev_type != type_
def get_entities(seq, suffix=False):
    """Extract entity chunks from a tag sequence.

    Args:
        seq: a flat list of tag strings, or a list of such lists. Nested input
            is flattened with an 'O' inserted after every inner list.
        suffix: when True the position marker is the last character of each tag
            and the type is the part before the first '-'; otherwise the marker
            is the first character and the type is the part after the last '-'.

    Returns:
        A list of (type, start, end) tuples; `end` is an inclusive index.
        For example ['B-PER', 'I-PER', 'O', 'B-LOC'] gives
        [('PER', 0, 1), ('LOC', 3, 3)].
    """
    nested = False
    for element in seq:
        if isinstance(element, list):
            nested = True
            break
    if nested:
        flattened = []
        for sentence in seq:
            # The extra 'O' keeps chunks from running across sentence boundaries.
            flattened.extend(sentence + ['O'])
        seq = flattened

    entities = []
    last_tag, last_type, start = 'O', '', 0
    # A trailing 'O' sentinel flushes a chunk that is still open at the end.
    for position, label in enumerate(seq + ['O']):
        if suffix:
            tag = label[-1]
            type_ = label.split('-')[0]
        else:
            tag = label[0]
            type_ = label.split('-')[-1]

        if end_of_chunk(last_tag, tag, last_type, type_):
            entities.append((last_type, start, position - 1))
        if start_of_chunk(last_tag, tag, last_type, type_):
            start = position
        last_tag, last_type = tag, type_

    return entities
def f1_score(y_true, y_pred, average='micro', suffix=False):
    """Entity-level F1 between gold and predicted tag sequences.

    Entities are compared as exact (type, start, end) matches. `average` is
    accepted but not used by the computation. Returns 0 when precision and
    recall are both 0.
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))
    hits = len(gold & predicted)

    precision = hits / len(predicted) if predicted else 0
    recall = hits / len(gold) if gold else 0

    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0
def classification_report(y_true, y_pred, digits=2, suffix=False):
    """Build a text report of entity-level precision, recall and F1 per entity type.

    Args:
        y_true: gold tag sequence(s), in any form `get_entities` accepts.
        y_pred: predicted tag sequence(s), same layout as `y_true`.
        digits: number of decimals printed for each score.
        suffix: forwarded to `get_entities`.

    Returns:
        A multi-line string with one row per entity type found in `y_true`
        (sorted by name), followed by three summary rows:
        'micro avg' (computed over all entities), 'macro avg' (unweighted mean
        over types) and 'weighted avg' (mean over types weighted by support).
        With no gold entities the summary scores are reported as 0.
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    # Group the (start, end) spans by entity type.
    true_by_type = defaultdict(set)
    pred_by_type = defaultdict(set)
    name_width = 0
    for type_name, start, end in true_entities:
        true_by_type[type_name].add((start, end))
        name_width = max(name_width, len(type_name))
    for type_name, start, end in pred_entities:
        pred_by_type[type_name].add((start, end))

    avg_headings = ('micro avg', 'macro avg', 'weighted avg')
    width = max(name_width, max(len(h) for h in avg_headings), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
    report = head_fmt.format(u'', *headers, width=width)
    report += u'\n\n'

    row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'

    def _prf(nb_correct, nb_pred, nb_true):
        """Precision, recall and F1 from raw counts; empty denominators give 0."""
        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0
        return p, r, f1

    ps, rs, f1s, supports = [], [], [], []
    # Sorting keeps the row order stable; set iteration order is not.
    for type_name in sorted(true_by_type):
        gold_spans = true_by_type[type_name]
        pred_spans = pred_by_type.get(type_name, set())
        p, r, f1 = _prf(len(gold_spans & pred_spans), len(pred_spans), len(gold_spans))

        report += row_fmt.format(type_name, p, r, f1, len(gold_spans),
                                 width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        supports.append(len(gold_spans))

    report += u'\n'

    total_support = sum(supports)

    def _macro(values):
        """Unweighted mean over entity types (0 when there are none)."""
        return sum(values) / len(values) if values else 0

    def _weighted(values):
        """Support-weighted mean over entity types (0 when support is 0)."""
        if total_support == 0:
            return 0
        return sum(v * w for v, w in zip(values, supports)) / total_support

    # Micro average: pool every entity regardless of type.
    micro_p, micro_r, micro_f1 = _prf(len(true_entities & pred_entities),
                                      len(pred_entities),
                                      len(true_entities))
    report += row_fmt.format('micro avg', micro_p, micro_r, micro_f1, total_support,
                             width=width, digits=digits)
    report += row_fmt.format('macro avg', _macro(ps), _macro(rs), _macro(f1s), total_support,
                             width=width, digits=digits)
    report += row_fmt.format('weighted avg', _weighted(ps), _weighted(rs), _weighted(f1s),
                             total_support,
                             width=width, digits=digits)

    return report
def precision_score(y_true, y_pred, average='micro', suffix=False):
    """Entity-level precision: exactly matched entities over predicted entities.

    `average` is accepted but not used. Returns 0 when nothing was predicted.
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    if not predicted:
        return 0
    return len(gold & predicted) / len(predicted)
def recall_score(y_true, y_pred, average='micro', suffix=False):
    """Entity-level recall: exactly matched entities over gold entities.

    `average` is accepted but not used. Returns 0 when there are no gold entities.
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    if not gold:
        return 0
    return len(gold & predicted) / len(gold)

In [29]:
# Vocabulary in order of first appearance. `dict.fromkeys` de-duplicates like
# `set` does but keeps a stable order, so word ids are identical on every run
# (set order for strings changes between interpreter sessions).
words = list(dict.fromkeys(data['0'].values))
# Two extra entries: the padding token and the out-of-vocabulary token.
words.append("ENDPAD")
words.append("unk")

In [30]:
n_words = len(words)
# Tag inventory in order of first appearance, so tag ids are stable across
# runs (set order for strings changes between interpreter sessions).
tags = list(dict.fromkeys(data['1'].values))
# Extra tag used to pad label sequences.
tags.append("ENDPAD")

In [31]:
# Number of tags collected from the training frame, plus the "ENDPAD" padding tag.
len(tags)

26

In [32]:
class SentenceGetter(object):
    """Group a token-per-row frame into sentences of (word, tag) pairs.

    The frame must provide the columns '0' (word), '1' (tag) and 'Sentence#'
    (sentence id used for grouping).

    Attributes:
        data: the frame passed in.
        grouped: result of grouping by 'Sentence#', one list of pairs per id.
        sentences: the grouped lists as a plain Python list.
        n_sent: sentence id that `get_next` will look up next (starts at 1).
        empty: initialised to False; not used by this class.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: list(zip(s['0'].values.tolist(),
                                      s['1'].values.tolist()))
        self.grouped = self.data.groupby("Sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence stored under the current id and advance, or None if absent."""
        try:
            s = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a missing sentence id means "no more sentences"; anything
            # else (including KeyboardInterrupt) must propagate.
            return None
        self.n_sent += 1
        return s

In [33]:
# One sentence grouper per split.
getter, dev_getter = SentenceGetter(data), SentenceGetter(dev_data)

In [34]:
# Per-sentence lists of (word, tag) pairs for the training and dev splits.
sentences, dev_sentences = getter.sentences, dev_getter.sentences

In [35]:
# Token count of the longest training sentence.
len(max(sentences,key = len))

378

In [36]:
# Forward (item -> id) and reverse (id -> item) lookup tables; an item's id is
# its position in `words` / `tags`.
word2idx = dict(zip(words, range(len(words))))
idx2word = dict(enumerate(words))
tag2idx = dict(zip(tags, range(len(tags))))
idx2tag = dict(enumerate(tags))

In [37]:
from keras.preprocessing.sequence import pad_sequences
# Dev words that are missing from the training vocabulary map to the 'unk' id.
# `dict.get` handles exactly that case without a catch-all `except`.
dev_X = [[word2idx.get(w[0], word2idx['unk']) for w in s] for s in dev_sentences]

# Every training word is in the vocabulary by construction.
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [38]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]

dev_y = []
for s in dev_sentences:
    b = []
    for w in s:
        tag = w[1]
        if tag not in tag2idx:
            # Tag seen only in the dev split: register it at the end of the
            # inventory so it still gets an id. An explicit membership test
            # replaces the former catch-all `except`.
            tags.append(tag)
            tag2idx[tag] = len(tags) - 1
            idx2tag[len(tags) - 1] = tag
        b.append(tag2idx[tag])
    dev_y.append(b)

# Counted after the dev pass so that dev-only tags are included.
n_tags = len(tags)

In [39]:
# Sanity check: report every dev sentence whose word ids and tag ids differ in length.
for word_ids, tag_ids in zip(dev_X, dev_y):
    if len(word_ids) != len(tag_ids):
        print('asd')

In [40]:
from torch.utils.data import Dataset
from torchcrf import CRF
class NERDataset(Dataset):
    """Dataset of (word-id list, tag-id list) pairs with a padding collate function.

    Args:
        sentences: list of word-id lists.
        labels: list of tag-id lists, parallel to `sentences`.
        word_pad_idx: id appended to short word sequences.
        tag_pad_idx: id appended to short tag sequences.
        max_len: upper bound on the padded length of a batch.
    """

    def __init__(self, sentences, labels, word_pad_idx, tag_pad_idx, max_len = 500):
        self.sentences = sentences
        self.labels = labels
        self.word_pad_idx = word_pad_idx
        self.tag_pad_idx = tag_pad_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return (self.sentences[index], self.labels[index])

    def collate_fn(self, datasets):
        """Pad or truncate a batch to one length and return two LongTensors.

        The batch length is the longest sentence in the batch, capped at
        `self.max_len`.
        """
        sentences = [item[0] for item in datasets]
        labels = [item[1] for item in datasets]
        target_len = min(max(len(sentence) for sentence in sentences), self.max_len)

        padded_sentences, padded_labels = [], []
        for sentence, label in zip(sentences, labels):
            if len(sentence) > target_len:
                # Only possible when the sentence exceeds the max_len cap.
                sentence, label = sentence[:target_len], label[:target_len]
            else:
                sentence = sentence + [self.word_pad_idx] * (target_len - len(sentence))
                label = label + [self.tag_pad_idx] * (target_len - len(label))
            padded_sentences.append(sentence)
            padded_labels.append(label)
        return torch.LongTensor(padded_sentences), torch.LongTensor(padded_labels)

In [41]:
from torch.utils.data import TensorDataset, DataLoader
bs = 4

# Each loader pads per batch through the dataset's own collate function.
# No `shuffle` argument is passed to either loader.
tr_dataset = NERDataset(X, y,
                        word_pad_idx=word2idx['ENDPAD'],
                        tag_pad_idx=tag2idx['ENDPAD'])
train_dataloader = DataLoader(dataset=tr_dataset,
                              batch_size=bs,
                              collate_fn=tr_dataset.collate_fn)

va_dataset = NERDataset(dev_X, dev_y,
                        word_pad_idx=word2idx['ENDPAD'],
                        tag_pad_idx=tag2idx['ENDPAD'])
valid_dataloader = DataLoader(dataset=va_dataset,
                              batch_size=bs,
                              collate_fn=va_dataset.collate_fn)

In [42]:
# Walk the whole training dataset once; `a` is only a placeholder loop body.
for i in tr_dataset:
    a = 'a'

In [43]:
# Splits visited by the training loop, keyed by name. Only 'train' is active;
# adding 'valid': valid_dataloader makes the loop score the dev split as well.
all_dataloader = dict(train=train_dataloader)

for split_name in all_dataloader:
    print(split_name)

train


In [49]:
class EmbeddedRnn(nn.Module):
    """Embedding -> bidirectional LSTM -> linear emissions -> CRF sequence tagger.

    Args:
        vocab: number of rows in the embedding table.
        hidden_dim: LSTM hidden size per direction.
        output_vocab: number of tags (width of the emissions and of the CRF).
        n_layer: number of stacked LSTM layers.
        word_pad_idx: word id used as the embedding `padding_idx`.
        tag_pad_idx: tag id that marks padded label positions.
    """

    def __init__(self, vocab, hidden_dim, output_vocab, n_layer,word_pad_idx,tag_pad_idx):
        super(EmbeddedRnn, self).__init__()
        self.n_layer = n_layer
        self.embedding_size = 50
        self.hidden_dim = hidden_dim
        self.embedded = nn.Embedding(vocab, self.embedding_size , padding_idx  = word_pad_idx)
        self.lstm = nn.LSTM(self.embedding_size, hidden_dim, num_layers=n_layer,batch_first = True, bidirectional=True)
        # The LSTM is bidirectional, so its output feature size is 2 * hidden_dim.
        self.fc1 = nn.Linear(2 * hidden_dim, output_vocab)
        # The three activation modules below are registered but not used in forward().
        self.softmax = nn.Softmax(dim=-1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.crf = CRF(num_tags=output_vocab, batch_first = True)
        self.tag_pad_idx = tag_pad_idx

    def forward(self, x, hidden=None, y_tag=None):
        """Decode tag sequences and, when gold tags are given, compute the CRF loss.

        Args:
            x: batch of word ids, batch-first.
            hidden: initial LSTM state as produced by `initHidden`; when None
                the LSTM starts from its default zero state.
            y_tag: gold tag ids, same layout as `x`. Positions equal to
                `tag_pad_idx` are masked out. May be omitted for inference.

        Returns:
            (crf_out, crf_loss): `crf_out` is the output of `self.crf.decode`;
            `crf_loss` is the negated value returned by `self.crf(...)`, or
            None when `y_tag` is not given.
        """
        embedded = self.embedded(x)
        output, hidden = self.lstm(embedded, hidden)
        output = self.fc1(output)
        if y_tag is None:
            # No labels: decode every position, padding included.
            return self.crf.decode(output), None
        mask = y_tag != self.tag_pad_idx
        crf_out = self.crf.decode(output, mask=mask)
        crf_loss = -self.crf(output, tags=y_tag, mask=mask)
        return crf_out, crf_loss

    def initHidden(self, batch_size):
        """Return zero-filled [hidden, cell] states of shape (2 * n_layer, batch_size, hidden_dim)."""
        state_shape = (2 * self.n_layer, batch_size, self.hidden_dim)
        return [torch.zeros(*state_shape), torch.zeros(*state_shape)]

    def init_crf_transitions(self, tag_names, imp_value=-100):
        """Set the CRF scores of structurally invalid tag moves to `imp_value`.

        Affected scores:
          * starting a sequence with an 'I...' tag or with "ENDPAD";
          * any 'O...' tag followed by any 'I...' tag;
          * a 'B-x' or 'I-x' tag followed by an 'I-y' tag with x != y, where
            the type is the second '-'-separated field of the tag name.

        Args:
            tag_names: tag strings indexed by tag id.
            imp_value: score written for the invalid moves.
        """
        for i, tag_name in enumerate(tag_names):
            if tag_name[0] == "I" or tag_name == "ENDPAD":
                torch.nn.init.constant_(self.crf.start_transitions[i], imp_value)

        # Tag ids grouped by the first character of the tag name.
        tag_is = {
            position: [i for i, tag in enumerate(tag_names) if tag[0] == position]
            for position in ("B", "I", "O")
        }

        # Outside -> inside is never valid.
        for from_tag_i in tag_is["O"]:
            for to_tag_i in tag_is["I"]:
                torch.nn.init.constant_(
                    self.crf.transitions[from_tag_i, to_tag_i], imp_value
                )

        # Begin/inside -> inside is invalid when the entity type changes.
        for from_position in ("B", "I"):
            for from_tag_i in tag_is[from_position]:
                for to_tag_i in tag_is["I"]:
                    if tag_names[from_tag_i].split("-")[1] != tag_names[to_tag_i].split("-")[1]:
                        torch.nn.init.constant_(
                            self.crf.transitions[from_tag_i, to_tag_i], imp_value
                        )

In [50]:
use_cuda = torch.cuda.is_available()
word_pad_idx, tag_pad_idx = word2idx['ENDPAD'], tag2idx['ENDPAD']

# 2-layer BiLSTM-CRF with 256 hidden units per direction.
model = EmbeddedRnn(
    vocab=n_words,
    hidden_dim=256,
    output_vocab=n_tags,
    n_layer=2,
    word_pad_idx=word_pad_idx,
    tag_pad_idx=tag_pad_idx,
)
model

EmbeddedRnn(
  (embedded): Embedding(7579, 50, padding_idx=7577)
  (lstm): LSTM(50, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=27, bias=True)
  (softmax): Softmax(dim=-1)
  (sigmoid): Sigmoid()
  (relu): ReLU()
  (crf): CRF(num_tags=27)
)

In [51]:
# Write the invalid-transition scores into the CRF before training starts.
model.init_crf_transitions(tags)

if use_cuda:
    print('use_cuda')
    model = model.cuda(0)

num_epoch = 10
optimizer = optim.AdamW(model.parameters(), lr=5e-3)

use_cuda


In [53]:
# Per-epoch validation metrics; filled only when a 'valid' split is present
# in `all_dataloader`.
records = {
    'loss':[],
    'F1':[],
}
for epoch in tqdm(range(num_epoch)):
    all_loss = {
        'train': [],
        'valid': [],
    }
    print('')
    for loader in all_dataloader:
        is_train = loader == 'train'
        # Only the training split updates weights; other splits run in eval
        # mode and without building an autograd graph.
        model.train(is_train)
        predictions , true_labels  = [],[]
        # Batch names are kept distinct from the global `X` / `y` lists so the
        # dataset cells can be re-run safely after training.
        for batch_x, batch_y in all_dataloader[loader]:
            if use_cuda:
                batch_x = batch_x.cuda(0)
                batch_y = batch_y.cuda(0)
            hidden = model.initHidden(batch_x.size(0))
            if use_cuda:
                hidden = [state.cuda(0) for state in hidden]

            with torch.set_grad_enabled(is_train):
                output, loss = model(batch_x, hidden, batch_y)

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            all_loss[loader].append(loss.item())

            # `output` holds one list of tag ids per sentence.
            predictions.extend([[idx2tag[j] for j in i] for i in output])

            # Drop padded positions from the gold labels before scoring.
            for row in batch_y.detach().cpu().numpy():
                true_labels.append([idx2tag[j] for j in row if j != tag_pad_idx])

        mean_loss = np.mean(np.array(all_loss[loader]))
        print(f'{loader}_loss : {mean_loss}')
        f_ = f1_score(true_labels,predictions)
        print(f'{loader}_F1: {f_}')
        if loader == 'valid':
            records['loss'].append(mean_loss)
            records['F1'].append(f_)

# records = np.array(records)
# print(classification_report([valid_tags],[pred_tags]))
# plt.plot(np.array(records['loss']), label='valid loss')
# plt.plot(np.array(records['F1']), label='valid F1')
# plt.legend()
# plt.show()

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]


train_loss : 6.133849587515824


 10%|████████▏                                                                         | 1/10 [04:45<42:47, 285.30s/it]

train_F1: 0.004502251125562782

train_loss : 4.118219335109401


 20%|████████████████▍                                                                 | 2/10 [09:37<38:19, 287.44s/it]

train_F1: 0.23779976521884957

train_loss : 2.873372875765868


 30%|████████████████████████▌                                                         | 3/10 [14:31<33:44, 289.20s/it]

train_F1: 0.4257369614512471

train_loss : 2.361416420950756


 40%|████████████████████████████████▊                                                 | 4/10 [19:20<28:55, 289.17s/it]

train_F1: 0.4837733773377338

train_loss : 2.054316849306624


 50%|█████████████████████████████████████████                                         | 5/10 [23:59<23:51, 286.36s/it]

train_F1: 0.5370774263904035

train_loss : 1.8052574842958482


 60%|█████████████████████████████████████████████████▏                                | 6/10 [28:35<18:52, 283.17s/it]

train_F1: 0.5733944954128442

train_loss : 1.5710236426159079


 70%|█████████████████████████████████████████████████████████▍                        | 7/10 [33:22<14:12, 284.33s/it]

train_F1: 0.5934686198091654

train_loss : 1.418414828462368


 80%|█████████████████████████████████████████████████████████████████▌                | 8/10 [38:10<09:30, 285.47s/it]

train_F1: 0.6145734757334055

train_loss : 1.3304320290864176


 90%|█████████████████████████████████████████████████████████████████████████▊        | 9/10 [42:45<04:42, 282.08s/it]

train_F1: 0.624126813541107

train_loss : 1.305042970260693


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [47:37<00:00, 285.71s/it]

train_F1: 0.6333874458874459





In [36]:
# Inspect how many prediction sequences the last loop collected.
np.array(predictions).shape

(400,)

In [None]:
# Dump every decoded tag id of the most recent batch, one per line.
for tag_sequence in output:
    for tag_index in tag_sequence:
        print(tag_index)

In [None]:
# Collect dev-set predictions for the report cells below.
#   predictions : one list of decoded tag ids per sentence (padding excluded)
#   true_labels : one padded row of gold tag ids per sentence
#   x_          : one padded row of word ids per sentence
predictions , true_labels , x_  = [],[],[]

# Read the dev loader directly: `all_dataloader` has no 'valid' entry.
model.eval()
with torch.no_grad():
    for batch_x, batch_y in valid_dataloader:
        if use_cuda:
            batch_x = batch_x.cuda(0)
            batch_y = batch_y.cuda(0)
        hidden = model.initHidden(batch_x.size(0))
        if use_cuda:
            hidden[0] = hidden[0].cuda(0)
            hidden[1] = hidden[1].cuda(0)
        # forward() needs the gold tags to build the padding mask and returns
        # (decoded tag-id lists, loss); the decoded output is already a list of
        # tag ids, so no argmax is needed.
        output, valid_loss = model(batch_x, hidden, batch_y)
        x_.extend(batch_x.detach().cpu().numpy())
        predictions.extend(output)
        true_labels.extend(batch_y.detach().cpu().numpy())
# pred_tags = [idx2tag[p_i] for p, l , __ in zip(predictions, true_labels,x_) for p_i, l_i ,_i in zip(p, l,__) if idx2word[_i] != "ENDPAD"]
# valid_tags = [idx2tag[l_i] for l, __ in zip(true_labels,x_) for l_i,_i in zip(l,__) if idx2word[_i] != "ENDPAD"]
# print(classification_report([valid_tags],[valid_tags]))

In [None]:
# Convert ids back to tag strings sentence by sentence, skipping every
# position whose word is the "ENDPAD" padding token.
pred_tags = []
valid_tags = []
for pred_row, label_row, word_row in zip(predictions, true_labels, x_):
    kept_pairs = [
        (idx2tag[pred_id], idx2tag[label_id])
        for pred_id, label_id, word_id in zip(pred_row, label_row, word_row)
        if idx2word[word_id] != "ENDPAD"
    ]
    pred_tags.append([pred_tag for pred_tag, _gold_tag in kept_pairs])
    valid_tags.append([gold_tag for _pred_tag, gold_tag in kept_pairs])

In [None]:
# Entity-level report comparing the gold dev tags with the predicted tags.
print(classification_report(valid_tags,pred_tags))