In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import pathlib
import time
import os

import torch
!pip install pytorch_pretrained_bert==0.4.0
from pytorch_pretrained_bert import BertTokenizer, BertModel
from pytorch_pretrained_bert.modeling import PreTrainedBertModel
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from torch.utils import data
from torch import nn, optim
from torch.nn.functional import relu, sigmoid

from pathlib import Path

import numpy as np

import sys

from operator import itemgetter
from collections import defaultdict

Collecting pytorch_pretrained_bert==0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/95/68/84de54aea460eb5b2e90bf47a429aacc1ce97ff052ec40874ea38ae2331d/pytorch_pretrained_bert-0.4.0-py3-none-any.whl (45kB)
[K     |███████▎                        | 10kB 14.6MB/s eta 0:00:01[K     |██████████████▌                 | 20kB 3.0MB/s eta 0:00:01[K     |█████████████████████▊          | 30kB 4.0MB/s eta 0:00:01[K     |█████████████████████████████   | 40kB 4.3MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.6MB/s 
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.4.0
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [0]:
BASE_DIR = Path('/content/drive/My Drive')  # Path.cwd().parent
DIRECTORY_TRAIN = BASE_DIR.joinpath('data', 'protechn_corpus_eval', 'train')
DIRECTORY_DEV = BASE_DIR.joinpath('data', 'protechn_corpus_eval', 'dev')
DIRECTORY_TEST = BASE_DIR.joinpath('data', 'protechn_corpus_eval', 'test')
DIRECTORY_MARKUP = BASE_DIR.joinpath('data', 'protechn_corpus_eval', 'markup')
DIRECTORY_PREDICT = BASE_DIR.joinpath('data', 'protechn_corpus_eval', 'predict')
TECHNIQUES = [
    'No', 'Whataboutism', 'Thought-terminating_Cliches', 'Straw_Men', 'Slogans', 'Repetition',
    'Reductio_ad_hitlerum', 'Red_Herring', 'Obfuscation,Intentional_Vagueness,Confusion',
    'Name_Calling,Labeling', 'Loaded_Language', 'Flag-Waving', 'Exaggeration,Minimisation',
    'Doubt', 'Causal_Oversimplification', 'Black-and-White_Fallacy', 'Bandwagon',
    'Appeal_to_fear-prejudice', 'Appeal_to_Authority'
]
HUMAN_READABLE_TECHNIQUES = [
    "No", "Whataboutism", "Thought-terminating Cliches", "Straw Men", "Slogans", "Repetition",
    "Reductio ad hitlerum", "Red Herring", "Obfuscation, Intentional Vagueness, Confusion",
    "Name Calling, Labeling", "Loaded Language", "Flag-Waving", "Exaggeration, Minimisation",
    "Doubt", "Causal Oversimplification", "Black-and-White Fallacy", "Bandwagon",
    "Appeal to fear-prejudice", "Appeal to Authority"
]
ARTICLE = 7

In [0]:
def get_list(id_, directory=DIRECTORY_TRAIN):
    """
    Функция, возвращающая список [set(), set(), ..., {Flag-Waving, Bandwagon}, ..., set(), set()].
    """

    lines = []
    labels_file = directory.joinpath(f'article{id_}.labels.tsv')
    if labels_file.is_file():
        with open(labels_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    with open(directory.joinpath(f'article{id_}.txt'), 'r', encoding='utf-8') as inner_f:
        length = len(inner_f.read())
    lst = [set() for _ in range(length)]
    for line in lines:
        id_, technique, left, right = line.split()
        id_, left, right = list(map(int, (id_, left, right)))
        for i in range(left, right):
            lst[i].add(technique)
    return lst

In [0]:
def get_num_of_techniques_for_id(id_, directory=DIRECTORY_TRAIN):
    """
    Функция, возвращающая словарь с количеством употреблённых техник.
    """

    lines = []
    labels_file = directory.joinpath(f'article{id_}.labels.tsv')
    if labels_file.is_file():
        with open(labels_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    label_count_dct = defaultdict(int)
    lst = []
    for line in lines:
        _, technique, left, right = line.split()
        left, right = list(map(int, [left, right]))
        technique = HUMAN_READABLE_TECHNIQUES[TECHNIQUES.index(technique)]
        label_count_dct[technique] += 1
        lst.append((technique, left, right))
    return label_count_dct, lst

In [0]:
id_to_text = {}
id_to_labels = {}
id_to_label_count = {}
id_to_label_left_right = {}
for directory in (DIRECTORY_TRAIN, DIRECTORY_DEV, DIRECTORY_TEST,
                  DIRECTORY_MARKUP, DIRECTORY_PREDICT):
    for f in directory.glob('*.txt'):
        id_ = int(f.name.split('.')[0][ARTICLE:])
        id_to_text[id_] = f.read_text(encoding='utf-8')
        id_to_labels[id_] = get_list(id_, directory=directory)
        id_to_label_count[id_], id_to_label_left_right[id_] = \
            get_num_of_techniques_for_id(id_, directory=directory)

In [0]:
def parse_labels(label_path):
    """
    returns: [[left, right, technique, intersection, more_than_sent], ...]
    """

    labels = []
    if not Path(label_path).exists():
        return labels
    with open(label_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            _, technique, left, right = line.strip().split('\t')
            labels.append([int(left), int(right), technique, 0, 0])
    labels.sort()
    if not labels:
        return labels
    length = max([label[1] for label in labels])
    visited = np.zeros(length)
    for label in labels:
        if sum(visited[label[0]:label[1]]):
            label[3] = 1  # intersection
        else:
            visited[label[0]:label[1]] = 1
    return labels

In [0]:
def clean_text(articles, ids):
    """
    articles: ['first text here', 'second text here', ...]
    ids: ['id1', 'id2', ...]
    returns: [[[id_, sentence, start, end], ...], ...]
    """

    texts = []
    for article, id_ in zip(articles, ids):
        sentences = article.split('\n')  # ['first sentence', 'second sentence', ...]
        end = -1
        res = []
        for sentence in sentences:
            start = end + 1
            end = start + len(sentence)  # length of sequence
            if sentence:
                res.append([id_, sentence, start, end])  # [[id_, sentence, start, end], ...]
        texts.append(res)
    return texts  # [[[id_, sentence, start, end], ...], ...]

In [0]:
def read_data(directory, is_test=False):
    """
    returns: (['id1', 'id2', ...], ['text1', 'text2', ...], [[[left, right, technique, intersection, more_than_sent], ...], ...])
    """

    ids = []
    texts = []
    if not is_test:
        labels = []
    for f in directory.glob('*.txt'):
        ids.append(f.name.replace('article', '').replace('.txt', ''))
        texts.append(f.read_text(encoding='utf-8'))
        if not is_test:
            labels.append(parse_labels(f.as_posix().replace('.txt', '.labels.tsv')))
    if not is_test:
        return ids, texts, labels
    return ids, texts

In [0]:
def make_dataset(directory):
    """
    returns: [[['id1', 'sentence1', start, end, left, right, technique, intersection, more_than_sent], ...], ...]
    """

    ids, texts, group_labels = read_data(directory)
    # ids: ['id1', 'id2', ...]
    # texts: ['text1', 'text2', ...]
    # group_labels: [[[left, right, technique, intersection, more_than_sent], ...], ...]
    texts = clean_text(texts, ids)
    # texts: [[['id1', 'sentence1', start, end], ['id1', 'sentence2', start, end], ...], ...]
    res = []
    for sents, labels in zip(texts, group_labels):
        # sents: [['id1', 'sentence1', start, end], ['id1', 'sentence2', start, end], ...]
        # labels: [[left, right, technique, intersection, more_than_sent], ...]

        # making positive examples
        tmp = []
        pos_ind = [0] * len(sents)
        for label in labels:
            # label: [left, right, technique, intersection, more_than_sent]
            left, right, technique, intersection, more_than_sent = label
            for i, sent in enumerate(sents):
                # sent: ['id1', 'sentence1', start, end]
                *_, start, end = sent
                if left >= start and left < end and right > end:
                    label[4] = 1
                    tmp.append(sent + [left, end, technique, intersection, label[4]])
                    pos_ind[i] = 1
                    label[0] = end + 1
                elif left != right and left >= start and left < end and right <= end:
                    tmp.append(sent + label)
                    # tmp: [['id1', 'sentence1', start, end, left, right, technique, intersection, more_than_sent], ...]
                    pos_ind[i] = 1

        # making negative examples
        dummy = [0, 0, 'O', 0, 0]
        for i, sent in enumerate(sents):
            if pos_ind[i] != 1:
                tmp.append(sent + dummy)
        res.append(tmp)
        # res: [[['id1', 'sentence1', start, end, left, right, technique, intersection, more_than_sent], ...], ...]
    return res

In [0]:
def make_bert_dataset(dataset, is_test=False, verbose=False):
    """
    dataset: [[['id1', 'sentence1', start, end, left, right, technique, intersection, more_than_sent], ...], ...]
    returns: (
                [ [ ['first_word', 'second_word', ...], ... ], ... ],
                [ [ ['label1', 'label2', ...], ... ], ... ],
                [ [ id1, id2, ... ], ... ]
            )
    """

    words, tags, ids= [], [], []
    for article in dataset:
        # article: [['id1', 'sentence1', start, end, left, right, technique, intersection, more_than_sent], ...]
        tmp_doc, tmp_label, tmp_id = [], [], []
        tmp_sen = article[0][1]
        tmp_i = article[0][0]
        label = ['O'] * len(tmp_sen.split(' '))
        for sentence in article:
            # sentence: ['id1', 'sentence1', start, end, left, right, technique, intersection, more_than_sent]
            tokens = sentence[1].split(' ')
            token_len = [len(token) for token in tokens]
            if len(sentence) == 9:  # label exists
                if tmp_sen != sentence[1] or (sentence[7] and is_test):
                    tmp_label.append(label)
                    tmp_doc.append(tmp_sen.split(' '))
                    tmp_id.append(tmp_i)
                    if tmp_sen != sentence[1]:
                        label = ['O'] * len(token_len)
                start = sentence[4] - sentence[2] 
                end = sentence[5] - sentence[2]
                if sentence[6] != 'O':
                    for i in range(1, len(token_len)): 
                        token_len[i] += token_len[i-1] + 1
                    token_len[-1] += 1
                    token_len = np.asarray(token_len)
                    s_ind = np.min(np.where(token_len > start))
                    tmp = np.where(token_len >= end)  
                    if len(tmp[0]) != 0:
                        e_ind = np.min(tmp)
                    else: 
                        e_ind = s_ind
                    for i in range(s_ind, e_ind+1):
                        label[i] = sentence[6]
                tmp_sen = sentence[1]
                tmp_i = sentence[0]
            else:
                tmp_doc.append(tokens)
                tmp_id.append(sentence[0])
        if len(sentence) == 9:
            tmp_label.append(label)
            tmp_doc.append(tmp_sen.split(' '))
            tmp_id.append(tmp_i)
        words.append(tmp_doc) 
        tags.append(tmp_label)
        ids.append(tmp_id)
    if verbose:
        print(f'words: {words}')
        print(f'tags: {tags}')
        print(f'ids: {ids}')
    return words, tags, ids

In [0]:
training = "training"
checkdir = "checkpoints"
resultdir = "results"

# either of the four variants:
bert = False
joint = False
granu = False
mgn = True

assert bert or joint or granu or mgn
assert not(bert and joint) and not(bert and granu) and not(bert and mgn) and not(joint and granu) and not(joint and mgn) and not(granu and mgn)

# either of the two variants
sigmoid_activation = False
relu_activation = True
assert not(sigmoid_activation and relu_activation) and (sigmoid_activation or relu_activation)

trainset = './drive/My Drive/data/protechn_corpus_eval/train'
validset = './drive/My Drive/data/protechn_corpus_eval/dev'
testset = './drive/My Drive/data/protechn_corpus_eval/test'

In [0]:
if bert:
    num_task = 1
    masking = 0
    hier = 0
elif joint:
    num_task = 2
    masking = 0
    hier = 0
elif granu:
    num_task = 2
    masking = 0
    hier = 1
elif mgn:
    num_task = 2
    masking = 1
    hier = 0
else:
    raise ValueError("You should choose one of [bert, joint, granu and mgn] in options")

In [0]:
BATCH_SIZE = 32
LR = 1e-5
ALPHA = 0.75
N_EPOCHS = 100
PATIENCE = 15
INPUT_SIZE = 768
SEQ_LEN = 212
POS_WEIGHT = 926 / 3532

In [0]:
tag_to_index, index_to_tag = [], []

In [0]:
VOCAB = [
    ("<PAD>", "O", "Name_Calling,Labeling", "Repetition", "Slogans",
     "Appeal_to_fear-prejudice", "Doubt", "Exaggeration,Minimisation",
     "Flag-Waving", "Loaded_Language", "Reductio_ad_hitlerum", "Bandwagon",
     "Causal_Oversimplification", "Obfuscation,Intentional_Vagueness,Confusion",
     "Appeal_to_Authority", "Black-and-White_Fallacy",
     "Thought-terminating_Cliches", "Red_Herring", "Straw_Men", "Whataboutism")
]

In [0]:
if num_task == 2:  # sentence classification
    VOCAB.append(("Non-prop", "Prop"))

In [0]:
for i in range(num_task):
    tag_to_index.append({tag: idx for idx, tag in enumerate(VOCAB[i])})
    index_to_tag.append({idx: tag for idx, tag in enumerate(VOCAB[i])})

In [0]:
# (New, recommended) 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/bert/vocab.txt', do_lower_case=False)  # 'bert-base-multilingual-cased', do_lower_case=False

In [0]:
class PropDataset(data.Dataset):
    def __init__(self, directory_path, is_test=False, verbose=False):
        directory = pathlib.Path(directory_path)
        dataset = make_dataset(directory)
        words, tags, ids = make_bert_dataset(dataset, is_test=is_test, verbose=verbose)
        # if is_test:
        #     words, tags, ids = make_bert_testset(dataset, verbose=verbose)
        # else:
        #     words, tags, ids = make_bert_dataset(dataset, verbose=verbose)

        # (
        #     [ [ ['first_word', 'second_word', ...], ... ], ... ],
        #     [ [ ['label1', 'label2', ...], ... ], ... ],
        #     [ [ id1, id2, ... ], ... ]
        # )

        flat_ids, flat_sents = [], []
        tags_li = [[] for _ in range(num_task)]
        for article_words, article_tags, article_ids in zip(words, tags, ids):
            for inner_words, inner_tags, id_ in zip(article_words, article_tags, article_ids):
                flat_sents.append(["[CLS]"] + inner_words + ["[SEP]"])
                flat_ids.append(id_)

                tmp_tags = []
                if num_task == 1:  # technique classification
                    tmp_tags.append(['O'] * len(inner_tags))
                    for j, inner_tag in enumerate(inner_tags):
                        if inner_tag != 'O' and inner_tag in VOCAB[0]:
                            tmp_tags[0][j] = inner_tag
                    tags_li[0].append(["<PAD>"] + tmp_tags[0] + ["<PAD>"])
                else:  # sentence classification
                    tmp_tags.append(['O'] * len(inner_tags))
                    tmp_tags.append(['Non-prop'])
                    for j, inner_tag in enumerate(inner_tags):
                        if inner_tag != 'O' and inner_tag in VOCAB[0]:
                            tmp_tags[0][j] = inner_tag
                            tmp_tags[1] = ['Prop']
                    for i in range(num_task):
                        tags_li[i].append(["<PAD>"] + tmp_tags[i] + ["<PAD>"])

        self.sents, self.ids, self.tags_li = flat_sents, flat_ids, tags_li
        assert len(self.sents) == len(self.ids) == len(self.tags_li[0])

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, index):
        words = self.sents[index]
        id_ = self.ids[index]
        tags = list(list(zip(*self.tags_li))[index])  # [ ['label1', 'label2', ...] ]

        x, is_heads = [], []  # list of ids
        y = [[] for _ in range(num_task)]  # list of lists of lists
        tt = [[] for _ in range(num_task)]  # list of lists of lists

        for word, tag in zip(words, tags[0]):
            tokens = tokenizer.tokenize(word) if word not in ("[CLS]", "[SEP]") else [word]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0] * (len(tokens) - 1)
            if len(xx) < len(is_head):
                xx = xx + [100] * (len(is_head) - len(xx))  # 100 == "[UNK]"

            tag = [tag] + [tag] * (len(tokens) - 1)
            y[0].extend([tag_to_index[0][each] for each in tag])
            tt[0].extend(tag)

            x.extend(xx)
            is_heads.extend(is_head)

        if num_task == 2:
            if tags[1][1] == 'Non-prop':
                y[1].extend([1, 0])
                tt[1].extend([tags[1][1]])
            elif tags[1][1] == 'Prop':
                y[1].extend([0, 1])
                tt[1].extend([tags[1][1]])

        seqlen = len(y[0])
        words = " ".join([id_] + words)

        for i in range(num_task):
            tags[i] = " ".join(tags[i])

        att_mask = [1] * seqlen
        return words, x, is_heads, att_mask, tags, y, seqlen

In [0]:
def pad(batch):
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    seqlen = f(-1)
    maxlen = SEQ_LEN

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch]  # 0: '[PAD]'
    x = torch.LongTensor(f(1, maxlen))
    att_mask = f(-4, maxlen)

    y = []
    tags = []

    y.append(torch.LongTensor([sample[-2][0] + [0] * (maxlen - len(sample[-2][0])) for sample in batch]))
    for i in range(num_task):
        tags.append([sample[-3][i] for sample in batch])
    if num_task == 2:  # sentence classification
        y.append(torch.LongTensor([sample[-2][1] for sample in batch]))

    return words, x, is_heads, att_mask, tags, y, seqlen

In [0]:
class EarlyStopping:
    def __init__(self, patience=PATIENCE, verbose=False, filepath='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.filepath = filepath

    def __call__(self, val_loss, model, filepath):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model, filepath)
        elif score < self.best_score:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model, filepath)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, filepath):
        """Saves model when validation loss decrease."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), filepath)
        self.val_loss_min = val_loss

In [0]:
class BertMultiTaskLearning(PreTrainedBertModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.ModuleList([nn.Linear(config.hidden_size, len(VOCAB[i])) for i in range(num_task)])
        self.apply(self.init_bert_weights)
        self.masking_gate = nn.Linear(2, 1)

        if num_task == 2:
            self.merge_classifier_1 = nn.Linear(len(VOCAB[0]) + len(VOCAB[1]), len(VOCAB[0]))

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        sequence_output, pooled_output = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        pooled_output = self.dropout(pooled_output)

        if num_task == 1:
            logits = [self.classifier[i](sequence_output) for i in range(num_task)]
        elif num_task == 2 and masking:
            token_level = self.classifier[0](sequence_output)
            sen_level = self.classifier[1](pooled_output)

            if sigmoid_activation:
                gate = sigmoid(self.masking_gate(sen_level))
            else:
                gate = relu(self.masking_gate(sen_level))

            dup_gate = gate.unsqueeze(1).repeat(1, token_level.size()[1], token_level.size()[2])
            wei_token_level = torch.mul(dup_gate, token_level)

            logits = [wei_token_level, sen_level]
        elif num_task == 2 and hier:
            token_level = self.classifier[0](sequence_output)
            sen_level = self.classifier[1](pooled_output)
            dup_sen_level = sen_level.repeat(1, token_level.size()[1])
            dup_sen_level = dup_sen_level.view(sen_level.size()[0], -1, sen_level.size()[-1])
            logits = [
                self.merge_classifier_1(torch.cat((token_level, dup_sen_level), 2)),
                self.classifier[1](pooled_output)
            ]
        elif num_task == 2:
            token_level = self.classifier[0](sequence_output)
            sen_level = self.classifier[1](pooled_output)
            logits = [token_level, sen_level]
        else:
            raise ValueError("Incorrect combination of input arguments")
        y_hats = [logits[i].argmax(-1) for i in range(num_task)]

        return logits, y_hats

Training

In [0]:
timestr = time.strftime("%Y%m%d-%H%M%S")

In [0]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [0]:
def train(model, iterator, optimizer, criterion, binary_criterion):
    model.train()

    train_losses = []

    for k, batch in enumerate(iterator):
        words, x, is_heads, att_mask, tags, y, seqlens = batch
        att_mask = torch.Tensor(att_mask)

        optimizer.zero_grad()
        logits, _ = model(x, attention_mask=att_mask)

        loss = []

        for i in range(num_task):
            logits[i] = logits[i].view(-1, logits[i].shape[-1])  # (N*T, 2)
        y[0] = y[0].view(-1).to(DEVICE)
        loss.append(criterion(logits[0], y[0]))
        if num_task == 2:    
            y[1] = y[1].float().to(DEVICE)
            loss.append(binary_criterion(logits[1], y[1]))

        if num_task == 1:
            joint_loss = loss[0]
        elif num_task == 2:
            joint_loss = ALPHA*loss[0] + (1 - ALPHA)*loss[1]

        joint_loss.backward()
        optimizer.step()
        train_losses.append(joint_loss.item())

        if k % 10 == 0:  # monitoring
            print("step: {}, loss0: {}".format(k, loss[0].item()))

    train_loss = np.average(train_losses)

    return train_loss

In [85]:
sorted(tag_to_index[0].items(), key=lambda elem: elem[1])

[('<PAD>', 0),
 ('O', 1),
 ('Name_Calling,Labeling', 2),
 ('Repetition', 3),
 ('Slogans', 4),
 ('Appeal_to_fear-prejudice', 5),
 ('Doubt', 6),
 ('Exaggeration,Minimisation', 7),
 ('Flag-Waving', 8),
 ('Loaded_Language', 9),
 ('Reductio_ad_hitlerum', 10),
 ('Bandwagon', 11),
 ('Causal_Oversimplification', 12),
 ('Obfuscation,Intentional_Vagueness,Confusion', 13),
 ('Appeal_to_Authority', 14),
 ('Black-and-White_Fallacy', 15),
 ('Thought-terminating_Cliches', 16),
 ('Red_Herring', 17),
 ('Straw_Men', 18),
 ('Whataboutism', 19)]

In [0]:
def eval(model, iterator, f, criterion, binary_criterion, baseline_1=False):
    """ evaluation on SLC and FLC tasks """
    model.eval()

    valid_losses = []

    Words, Is_heads = [], []
    Tags = [[] for _ in range(num_task)]
    Y = [[] for _ in range(num_task)]
    Y_hats = [[] for _ in range(num_task)]
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, att_mask, tags, y, seqlens = batch
            att_mask = torch.Tensor(att_mask)
            logits, y_hats = model(x, attention_mask=att_mask)  # logits: (N, T, VOCAB), y: (N, T)

            loss = []
            for i in range(num_task):
                logits[i] = logits[i].view(-1, logits[i].shape[-1])  # (N * T, 2)
            y[0] = y[0].view(-1).to(DEVICE)
            loss.append(criterion(logits[0], y[0]))
            if num_task == 2:
                y[1] = y[1].float().to(DEVICE)
                loss.append(binary_criterion(logits[1], y[1]))

            if num_task == 1:
                joint_loss = loss[0]
            elif num_task == 2:
                joint_loss = ALPHA*loss[0] + (1-ALPHA)*loss[1]

            valid_losses.append(joint_loss.item())
            Words.extend(words)
            Is_heads.extend(is_heads)

            for i in range(num_task):
                Tags[i].extend(tags[i])
                Y[i].extend(y[i].cpu().numpy().tolist())
                Y_hats[i].extend(y_hats[i].cpu().numpy().tolist())
    valid_loss = np.average(valid_losses)

    with open(f, 'w', encoding='utf-8') as fout:
        y_hats, preds = [[[] for _ in range(num_task)] for _ in range(2)]
        if num_task == 1:
            for words, is_heads, tags[0], y_hats[0] in zip(Words, Is_heads, *Tags, *Y_hats):
                y_hats[0] = [hat for head, hat in zip(is_heads, y_hats[0]) if head == 1]
                preds[0] = [index_to_tag[0][hat] for hat in y_hats[0]]
                fout.write(words.split()[0])
                fout.write("\n")
                for w, t1, p_1 in zip(words.split()[2:-1], tags[0].split()[1:-1], preds[0][1:-1]):
                    fout.write("{} {} {} \n".format(w, t1, p_1))
                fout.write("\n")
        else:  # num_task == 2
            TP, FP, FN, TN = 0, 0, 0, 0
            for words, is_heads, tags[0], tags[1], y_hats[0], y_hats[1] in zip(Words, Is_heads, *Tags, *Y_hats):
                y_hats[0] = [hat for head, hat in zip(is_heads, y_hats[0]) if head == 1]
                preds[0] = [index_to_tag[0][hat] for hat in y_hats[0]]
                preds[1] = index_to_tag[1][y_hats[1]]

                if baseline_1:
                    preds[1] = 'Prop'

                if tags[1].split()[1] == 'Non-prop' and preds[1] == 'Non-prop':
                    TN += 1
                elif tags[1].split()[1] == 'Non-prop' and preds[1] == 'Prop':
                    FP += 1
                elif tags[1].split()[1] == 'Prop' and preds[1] == 'Prop':
                    TP += 1
                elif tags[1].split()[1] == 'Prop' and preds[1] == 'Non-prop':
                    FN += 1

                fout.write(words.split()[0] + "\n")
                for w, t1, p_1 in zip(words.split()[2:-1], tags[0].split()[1:-1], preds[0][1:-1]):
                    fout.write("{} {} {} {} {}\n".format(w, t1, tags[1].split()[1:-1][0], p_1, preds[1]))
                fout.write("\n")

            try:
                precision = TP / (TP + FP)
            except ZeroDivisionError:
                precision = 1.0
            try:
                recall = TP / (TP + FN)
            except ZeroDivisionError:
                recall = 1.0
            try:
                f1 = 2 * precision * recall / (precision + recall)
            except ZeroDivisionError:
                if precision * recall == 0:
                    f1 = 1.0
                else:
                    f1 = 0.0
            print(f"SLC precision: {precision:.4f}")
            print(f"SLC recall: {recall:.4f}")
            print(f"SLC f1-score: {f1:.4f}")

    ## calc metric 
    y_true, y_pred = [], []
    for i in range(num_task):
        y_true.append(np.array([tag_to_index[i][line.split()[i + 1]] for line in open(f, 'r', encoding='utf-8').read().splitlines() if len(line.split()) > 1]))
        if baseline_1:
            if i == 0:
                key = "Loaded_Language"
            else:
                key = "Prop"
            y_pred.append(np.array([tag_to_index[i][key] for line in open(f, 'r', encoding='utf-8').read().splitlines() if len(line.split()) > 1]))
        else:
            y_pred.append(np.array([tag_to_index[i][line.split()[i + 1 + num_task]] for line in open(f, 'r', encoding='utf-8').read().splitlines() if len(line.split()) > 1]))

    num_predicted, num_correct, num_gold = 0, 0, 0

    num_predicted += len(y_pred[0][y_pred[0] > 1])
    num_correct += (np.logical_and(y_true[0] == y_pred[0], y_true[0] > 1)).astype(np.int).sum()
    num_gold += len(y_true[0][y_true[0] > 1])

    print(f"FLC number of predicted techniques: {num_predicted}")
    print(f"FLC number of correct techniques: {num_correct}")
    print(f"FLC number of gold techniques: {num_gold}")

    try:
        precision = num_correct / num_predicted
    except ZeroDivisionError:
        precision = 1.0

    try:
        recall = num_correct / num_gold
    except ZeroDivisionError:
        recall = 1.0

    try:
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        if precision * recall == 0:
            f1 = 1.0
        else:
            f1 = 0

    # final = f + ".P%.4f_R%.4f_F1%.4f" % (precision, recall, f1)
    # with open(f, 'w', encoding='utf-8') as fout:  # final
    #     result = open(f, "r", encoding='utf-8').read()
    #     fout.write("{}\n".format(result))
    #     fout.write("precision={:4f}\n".format(precision))
    #     fout.write("recall={:4f}\n".format(recall))
    #     fout.write("f1={:4f}\n".format(f1))

    # os.remove(f)

    print(f"FLC precision: {precision:.4f}")
    print(f"FLC recall: {recall:.4f}")
    print(f"FLC f1-score: {f1:.4f}")
    return precision, recall, f1, valid_loss

In [0]:
# clear tqdm

from IPython import get_ipython

def tqdm_clear(*args, **kwargs):
    from tqdm import tqdm
    getattr(tqdm, '_instances', {}).clear()

get_ipython().events.register('post_execute', tqdm_clear)

In [0]:
def check_overlap(line_1, line_2):
    if line_1[2] > line_2[3] or line_1[3] < line_2[2]:
        return False
    return True

In [0]:
def remove_duplicates(res):
    sorted_res = sorted(res, key=itemgetter(0, 1, 2, 3))
    ans = []
    skip = 0
    for i, line_1 in enumerate(sorted_res):
        assert line_1 == sorted_res[i]
        for j, line_2 in enumerate(sorted_res[i + 1:]):
            skip = 0
            if line_1[0] != line_2[0]:
                break
            elif line_1[1] != line_2[1]:
                continue

            if check_overlap(line_1, line_2):
                if line_1[2] != line_2[2] or line_1[3] != line_2[3]:
                    sorted_res[i + j + 1][2] = min(line_1[2], line_2[2])
                    sorted_res[i + j + 1][3] = max(line_1[3], line_2[3])
                skip = 1
                break
        if skip == 0:
            ans.append(line_1)
    return ans

In [0]:
import re


def convert(ind, flat_texts, filename):
    """
    1173236160
    Мало O Non-prop O Prop
    того, O Non-prop O Prop
    что O Non-prop O Prop
    «Аркан» O Non-prop O Prop
    приобрел O Non-prop O Prop
    «Ведомости» O Non-prop O Prop
    у O Non-prop O Prop
    кипрского O Non-prop O Prop
    офшора O Non-prop O Prop
    за O Non-prop O Prop
    бóльшую O Non-prop Loaded_Language Prop
    сумму, O Non-prop O Prop
    чем O Non-prop O Prop
    они O Non-prop O Prop
    <...>

    1173236160
    <...>
    """

    with open(filename, 'r', encoding='utf-8') as f1:
        output = []
        for line in f1:
            if len(line.split()) == 1:  # if line is id
                id_ = line.strip()
                continue
            elif line != '\n':  # In the same sentence
                tmp = [id_] + line.strip().split()  # add id to line
                if len(tmp) == 6:  # num_task 2
                    tmp += [tmp[-2]]
                else:
                    tmp += [tmp[-(1 + ind)]]
                output.append(tmp + [len(tmp[1])])  # add word length to line
            else:
                output.append('\n')

    res = []
    aid = output[0][0]
    sub_list = [sentence for sentence in flat_texts if sentence[0] == aid]
    sub_dic = {re.sub('\s+', ' ', sentence): (start, end) for _, sentence, start, end in sub_list}


    start = 0
    end = -1
    sentence = ""
    cur = 0
    on = 0

    tmp_ans = []
    cur_tag = 'O'
    prop_or_not_dict = {}

    slc_task = {}
    sent_predictions = (False, False)

    for line in output:  # ['36081082999', 'вора', 'O', 'Non-prop', 'O', 'Non-prop', 'O', 4]
        # ['1173236160', 'щедрых', 'O', 'Loaded_Language', 'Loaded_Language', 6]
        if line != '\n':
            aid = line[0]
            if int(aid) not in prop_or_not_dict:
                prop_or_not_dict[int(aid)] = [False for _ in range(len(id_to_text[int(aid)]))]

            sentence += line[1] + " "
            prop_or_not_prop = line[-3] != 'Non-prop'
            sent_predictions = (sent_predictions[0] | (line[2] != 'O'),
                                sent_predictions[1] | (line[4] != 'O'))

            if line[-2] != 'O' and line[-2] != '<PAD>':
                if on == 0:
                    on = 1
                    cur_tag = line[-2]
                    start = cur
                    end = cur + line[-1]
                elif line[-2] == cur_tag:
                    end = cur + line[-1]
                else:
                    tmp_ans.append([aid, cur_tag, start, end])
                    cur_tag = line[-2]
                    start = cur
                    end = cur + line[-1]
            else:
                if on:
                    tmp_ans.append([aid, cur_tag, start, end])
                    on = 0
            cur += line[-1] + 1

        else:
            if on:
                tmp_ans.append([aid, cur_tag, start, end])
                on = 0

            cur = 0
            sub_list = [sentence for sentence in flat_texts if sentence[0] == aid]
            sub_dic = {re.sub('\s+', ' ', sentence): (start, end) for _, sentence, start, end in sub_list}

            if sentence[:-1] != "":
                s, e = sub_dic.get(sentence[:-1])
                slc_task[(s, e, sentence[:-1])] = sent_predictions

            if len(tmp_ans) and sentence[:-1] != "":
                s, e = sub_dic.get(sentence[:-1])
                if prop_or_not_prop:
                    prop_or_not_dict[int(aid)][s:e] = [True for _ in range(s, e)]

            # tmp_ans: [['1173236160', 'Loaded_Language', 41, 63]]
            # sentence: Журнал The New Times в 2016 г. назвал ее предполагаемой старшей дочерью Путина.
            # prop_or_not_prop: True
            # start, end: (25180, 25259)
            # sent_predictions: (False, True)

            if len(tmp_ans) and sentence[:-1] != "":
                for ans in tmp_ans:
                    ans[2] += s
                    ans[3] += s
                    res.append(ans)
            sent_predictions = (False, False)
            sentence = ""
            prop_or_not_prop = False
            
            tmp_ans = []
    TP, FP, FN, TN = 0, 0, 0, 0
    for true, pred in slc_task.values():
        if pred == true == True:
            TP += 1
        elif pred == true == False:
            TN += 1
        elif pred != true and pred == True:
            FP += 1
        else:
            FN += 1
    try:
        precision = TP / (TP + FP)
    except ZeroDivisionError:
        precision = 1.0
    try:
        recall = TP / (TP + FN)
    except ZeroDivisionError:
        recall = 1.0
    try:
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        if precision * recall == 0:
            f1 = 1.0
        else:
            f1 = 0.0
    print(f'SLC precision: {precision:.4f}')
    print(f'SLC recall: {recall:.4f}')
    print(f'SLC f1_score: {f1:.4f}')
            
    return res, prop_or_not_dict

In [91]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

model = BertMultiTaskLearning.from_pretrained('/content/drive/My Drive/bert')  # 'bert-base-multilingual-cased'
print("Detect ", torch.cuda.device_count(), "GPUs!")
model = nn.DataParallel(model)
model.to(DEVICE)

train_dataset = PropDataset(trainset, is_test=False)
eval_dataset = PropDataset(validset, is_test=True)
test_dataset = PropDataset(testset, is_test=True)

train_iter = data.DataLoader(
    dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=1, collate_fn=pad
)
eval_iter = data.DataLoader(
    dataset=eval_dataset, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=1, collate_fn=pad
)
test_iter = data.DataLoader(
    dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=1, collate_fn=pad
)

warmup_proportion = 0.1
num_train_optimization_steps = int(len(train_dataset) / BATCH_SIZE) * N_EPOCHS
param_optimizer = list(model.named_parameters())
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = BertAdam(
    optimizer_grouped_parameters, lr=LR, warmup=warmup_proportion,
    t_total=num_train_optimization_steps
)

criterion = nn.CrossEntropyLoss(ignore_index=0)
binary_criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor([POS_WEIGHT]).to(DEVICE))

avg_train_losses = []
avg_valid_losses = []

# initialize the early_stopping object
early_stopping = EarlyStopping(patience=PATIENCE, verbose=True)

Detect  1 GPUs!


#### Training

In [0]:
for epoch in range(1, N_EPOCHS + 1):
    print("=========eval at epoch={epoch}=========")
    if not os.path.exists('checkpoints'):
        os.makedirs('checkpoints')
    if not os.path.exists('results'):
        os.makedirs('results')
    fname = os.path.join('checkpoints', timestr)
    spath = os.path.join('checkpoints', timestr + ".pt")

    train_loss = train(model, train_iter, optimizer, criterion, binary_criterion)
    avg_train_losses.append(train_loss.item())

    precision, recall, f1, valid_loss = eval(model, eval_iter, fname, criterion, binary_criterion)
    avg_valid_losses.append(valid_loss.item())

    epoch_len = len(str(N_EPOCHS))
    print(
        f'[{epoch:>{epoch_len}}/{N_EPOCHS:>{epoch_len}}]     '
        f'train_loss: {train_loss:.5f} '
        f'valid_loss: {valid_loss:.5f}'
    )

    early_stopping(-f1, model, spath)

    if early_stopping.early_stop:
        print("Early stopping")
        break

step: 0, loss0: 2.995732545852661
step: 10, loss0: 2.995732307434082
step: 20, loss0: 2.995732545852661
step: 30, loss0: 2.995732545852661
step: 40, loss0: 2.986283540725708
step: 50, loss0: 2.7114439010620117
SLC precision: 1.0000
SLC recall: 0.0000
SLC f1-score: 0.0000
FLC number of predicted techniques: 0
FLC number of correct techniques: 0
FLC number of gold techniques: 371
FLC precision: nan
FLC recall: 0.0000
FLC f1-score: nan
[  1/100]     train_loss: 2.25847 valid_loss: 1.12684
Validation loss decreased (inf --> nan).  Saving model ...




step: 0, loss0: 1.6528111696243286
step: 10, loss0: 1.101335048675537
step: 20, loss0: 0.7737540602684021
step: 30, loss0: 0.5953529477119446
step: 40, loss0: 0.6294087767601013
step: 50, loss0: 0.9715940952301025
SLC precision: 1.0000
SLC recall: 0.0000
SLC f1-score: 0.0000
FLC number of predicted techniques: 0
FLC number of correct techniques: 0
FLC number of gold techniques: 371
FLC precision: nan
FLC recall: 0.0000
FLC f1-score: nan
[  2/100]     train_loss: 0.71820 valid_loss: 0.55310
Validation loss decreased (nan --> nan).  Saving model ...
step: 0, loss0: 0.8830757141113281
step: 10, loss0: 0.8371393084526062
step: 20, loss0: 0.8986011147499084
step: 30, loss0: 0.9577757120132446
step: 40, loss0: 0.9176406264305115
step: 50, loss0: 0.6308437585830688
SLC precision: 1.0000
SLC recall: 0.0000
SLC f1-score: 0.0000
FLC number of predicted techniques: 0
FLC number of correct techniques: 0
FLC number of gold techniques: 371
FLC precision: nan
FLC recall: 0.0000
FLC f1-score: nan
[  3

In [68]:
pt_files = list(Path('checkpoints').glob('*.pt'))
if pt_files:
    file_path = sorted(pt_files)[-1]
    print(f'last model: {file_path.as_posix()}')

last model: checkpoints/20200529-204137.pt


In [0]:
!mv checkpoints/20200529-204137.pt "drive/My Drive/data/protechn_corpus_eval/BERT_MULTIGRAN_model_sigmoid_ru.pt"

#### Evaluation

In [0]:
def load_model(model_type):
    """ load the the model """
    model = BertMultiTaskLearning.from_pretrained('/content/drive/My Drive/bert')  # 'bert-base-multilingual-cased'
    print("Detect ", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
    model.to(DEVICE)

    spath = f"drive/My Drive/data/protechn_corpus_eval/{model_type}.pt"
    model.load_state_dict(torch.load(spath))
    return model

In [0]:
!mkdir -p results

In [0]:
res = os.path.join('results', 'tmp.txt')

In [40]:
# # Baseline quality
# num_task = 2
# masking = 1
# hier = 0

# model = load_model('BERT_MULTIGRAN_model_relu_ru')
# eval(model, test_iter, res, criterion, binary_criterion, baseline_1=True);

Detect  1 GPUs!
SLC precision: 0.2884
SLC recall: 1.0000
SLC f1-score: 0.4477
FLC number of predicted techniques: 16605
FLC number of correct techniques: 327
FLC number of gold techniques: 2018
FLC precision: 0.0197
FLC recall: 0.1620
FLC f1-score: 0.0351


In [41]:
model = load_model('BERT_MULTIGRAN_model_relu_ru')  # BERT_JOINT_model_ru

Detect  1 GPUs!


In [67]:
eval(model, test_iter, res, criterion, binary_criterion);



SLC precision: 0.6462
SLC recall: 0.6304
SLC f1-score: 0.6382
FLC number of predicted techniques: 645
FLC number of correct techniques: 214
FLC number of gold techniques: 2018
FLC precision: 0.3318
FLC recall: 0.1060
FLC f1-score: 0.1607


In [43]:
directory = pathlib.Path('./drive/My Drive/data/protechn_corpus_eval/test')
ids, texts = read_data(directory, is_test=True)

t_texts = clean_text(texts, ids)
flat_texts = [sentence for article in t_texts for sentence in article]

fi, prop_sents = convert(num_task - 1, flat_texts, res)
results = remove_duplicates(fi)

SLC precision: 0.6508
SLC recall: 0.3994
SLC f1_score: 0.4950
