In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!pip install pytorch_pretrained_bert

import torch
import json

import pandas as pd
import numpy as np

from pytorch_pretrained_bert import BertTokenizer, BertModel
from pathlib import Path
from collections import defaultdict



In [0]:
TECHNIQUES = [
    'No', 'Whataboutism', 'Thought-terminating_Cliches', 'Straw_Men', 'Slogans', 'Repetition',
    'Reductio_ad_hitlerum', 'Red_Herring', 'Obfuscation,Intentional_Vagueness,Confusion',
    'Name_Calling,Labeling', 'Loaded_Language', 'Flag-Waving', 'Exaggeration,Minimisation',
    'Doubt', 'Causal_Oversimplification', 'Black-and-White_Fallacy', 'Bandwagon',
    'Appeal_to_fear-prejudice', 'Appeal_to_Authority'
]

ARTICLE = 7
EMBEDDING_SIZE = 768

In [0]:
tokenizer = BertTokenizer.from_pretrained('/content/drive/My Drive/bert/vocab.txt', do_lower_case=False)
model = BertModel.from_pretrained('/content/drive/My Drive/bert')

In [6]:
tokenizer.tokenize("""Почти треть неплательщиков по кредитам заявили о потере работы.
А так?""")

['Почти',
 'треть',
 'неплат',
 '##ель',
 '##щиков',
 'по',
 'кредитам',
 'заявили',
 'о',
 'потере',
 'работы',
 '.',
 'А',
 'так',
 '?']

In [7]:
tokenizer.wordpiece_tokenizer.tokenize("Почти треть неплательщиков по кредитам заявили о потере работы")

['Почти',
 'треть',
 'неплат',
 '##ель',
 '##щиков',
 'по',
 'кредитам',
 'заявили',
 'о',
 'потере',
 'работы']

In [0]:
input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Я не понимаю, что происходит. А ты?"))).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

#### Make dataset

In [0]:
def get_list(id_, directory):
    """
    Функция, возвращающая список [{}, {}, ..., {Flag-Waving, Bandwagon}, ..., {}, {}].
    """

    lines = []
    labels_file = directory.joinpath(f'article{id_}.labels.tsv')
    if labels_file.is_file():
        with open(labels_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    with open(directory.joinpath(f'article{id_}.txt'), 'r', encoding='utf-8') as inner_f:
        length = len(inner_f.read())
    lst = [set() for _ in range(length)]
    for line in lines:
        id_, technique, left, right = line.split()
        id_, left, right = list(map(int, (id_, left, right)))
        for i in range(left, right):
            lst[i].add(technique)
    return lst

In [0]:
techniques_to_ids = {technique: i for i, technique in enumerate(TECHNIQUES)}
ids_to_techniques = {i: technique for i, technique in enumerate(TECHNIQUES)}

In [0]:
def juxtapose_tokens_with_labels(text, tokenized_text, labels):
    new_labels = []
    text_index = 0
    for token in tokenized_text:
        if token != '#':
            token = token.replace('#', '').strip()
        if token == '[UNK]':
            while text_index < len(text) and text[text_index] != ' ':
                text_index += 1
            new_labels.append(set())
            continue
        i = 0
        while token[i] != text[text_index]:
            text_index += 1
        cur_labels = set()
        while i < len(token) and token[i] == text[text_index]:
            cur_labels |= labels[text_index]
            i += 1
            text_index += 1
        new_labels.append(cur_labels)
        while text_index < len(text) and text[text_index] == ' ':
            text_index += 1
    return new_labels

In [0]:
def get_dataset(directory):
    """ returns [([__emdeddig 768 nums here__], 2), ...] """

    result_lst = []
    for f in directory.glob('*.txt'):
        id_ = int(f.name.split('.')[0][ARTICLE:])


        print(f'id: {id_}')

        text = f.read_text(encoding='utf-8')
        labels = get_list(id_, directory)
        assert len(text) == len(labels)

        slash_n_indices = [-1] + [i for i, symbol in enumerate(text + '\n') if symbol == '\n']
        labels = [labels[ix1+1:ix2] for ix1, ix2 in zip(slash_n_indices, slash_n_indices[1:])]

        sents = text.split('\n')
        for sent, inner_labels in zip(sents, labels):
            assert len(sent) == len(inner_labels)
            if not sent.strip():
                continue
            tokenized_sent = tokenizer.tokenize(sent)
            inner_labels = juxtapose_tokens_with_labels(sent, tokenized_sent, inner_labels)
            input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(tokenized_sent)).unsqueeze(0)
            vectors = model(input_ids)[0][0][0]
            assert len(inner_labels) == len(vectors)
            for label_set, vector in zip(inner_labels, vectors):
                if not label_set:
                    result_lst.append((vector, techniques_to_ids['No']))
                for label in label_set:
                    result_lst.append((vector, techniques_to_ids[label]))
    return result_lst

In [13]:
train_lst = get_dataset(Path('/content/drive/My Drive/data/protechn_corpus_eval/train'))

id: 59526381559
id: 32194915387
id: 33748247649
id: 31808762171
id: 47605119071
id: 41105096806
id: 36163344507
id: 52498183368
id: 69294925216
id: 39414275139
id: 30155268335
id: 46988185699
id: 83173104362
id: 97506920380
id: 8359563559
id: 95967168572
id: 70596768299
id: 12402123807
id: 73261993887
id: 98031283058
id: 53367064078
id: 88984544092
id: 80813611079
id: 76806922030
id: 34042375985
id: 5326402550
id: 83366723989
id: 3490019195
id: 78669648346
id: 2966778328
id: 62799282082
id: 23687061547
id: 27152183323
id: 59051731723


In [14]:
test_lst = get_dataset(Path('/content/drive/My Drive/data/protechn_corpus_eval/test'))

id: 36081082999
id: 68462833391
id: 53424346461
id: 96558350001
id: 73936725916
id: 81020435922
id: 7838448925
id: 1241238761
id: 37505201774
id: 33351244185
id: 40120334507
id: 9894248866
id: 86789309327
id: 66812338278
id: 1173236160


In [15]:
len(train_lst), len(test_lst)

(31954, 22102)

In [0]:
train_tmp_lst = [(list(pair[0].detach().numpy()) + [int(pair[1])]) for pair in train_lst]
test_tmp_lst = [(list(pair[0].detach().numpy()) + [int(pair[1])]) for pair in test_lst]

In [0]:
# with open('/content/drive/My Drive/data/protechn_corpus_eval/train_lst.json', 'w', encoding='utf-8') as f:
#     json.dump(str(train_tmp_lst), f, ensure_ascii=False, indent=4)

In [0]:
# with open('/content/drive/My Drive/data/protechn_corpus_eval/test_lst.json', 'w', encoding='utf-8') as f:
#     json.dump(str(test_tmp_lst), f, ensure_ascii=False, indent=4)

In [0]:
train_df = pd.DataFrame.from_records(train_tmp_lst)
test_df = pd.DataFrame.from_records(test_tmp_lst)

In [0]:
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [19]:
train_df.shape, test_df.shape

((31954, 769), (22102, 769))

In [0]:
train_df = train_df.head(5000)
test_df = test_df.head(5000)

In [0]:
X_train, y_train = train_df.loc[:, train_df.columns != EMBEDDING_SIZE], train_df[EMBEDDING_SIZE]
X_test, y_test = test_df.loc[:, test_df.columns != EMBEDDING_SIZE], test_df[EMBEDDING_SIZE]

In [22]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5000, 768), (5000,), (5000, 768), (5000,))

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [0]:
# "Nearest Neighbors", 
names = [
    "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
    "Naive Bayes", "QDA"
]

In [0]:
classifiers = [
    # KNeighborsClassifier(1),
    # SVC(kernel="linear", C=0.025),
    # SVC(gamma=2, C=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    # DecisionTreeClassifier(max_depth=5),
    # RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [32]:
from sklearn.metrics import f1_score, precision_score, recall_score

for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{name}: {f1_score(y_test, y_pred, average='micro')}")
    print(f"{name}: {precision_score(y_test, y_pred, average='micro')}")
    print(f"{name}: {recall_score(y_test, y_pred, average='micro')}")
    break

Linear SVM: 0.06021775093266668
Linear SVM: 0.0681866559909289
Linear SVM: 0.05975088050525608


  _warn_prf(average, modifier, msg_start, len(result))
