In [7]:
%load_ext autoreload
%autoreload 2

from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

import os
import re
from functools import partial
from typing import List

import pandas as pd 
import numpy as np
from sklearn.metrics import cohen_kappa_score
from transformers import BertTokenizerFast

from src.manipulation_helpers.data_preparation import markup_conll, encode_tags, Markup, \
                                                      read_markup, create_span_targeting_data, \
                                                      SpanTargetingDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Markup files; both are read further down with pd.read_json(..., lines=True).
PATH_TO_FULL_MARKUP = "data/markup_union_with_empty.json"
PATH_TO_MATCHED = "data/markup_union_matched.json"
# Directory that holds the article{idx}.txt texts and the .tsv span files.
# NOTE(review): the doubled "data/data" segment looks unusual — confirm the path.
PATH_TO_ENG_DATA = "data/data/protechn_corpus_eval/train"
# Checkpoint name handed to BertTokenizerFast.from_pretrained.
MODEL_NAME = "DeepPavlov/rubert-base-cased"

In [22]:
# Read one article as bytes and decode it explicitly as UTF-8, then drop all
# newlines. The `with` block closes the file handle instead of leaking it.
# NOTE(review): `idx` is not defined in this cell — it has to exist in the kernel
# already (it is assigned inside the corpus-loading loop); confirm the run order.
with open(PATH_TO_ENG_DATA + f'/article{idx}.txt', 'rb') as article_file:
    text = article_file.read().decode('utf-8').replace('\n', '')

In [23]:
text

'Confirmed: Authorities LIED About Las Vegas Shooter’s Hotel Check-In Date – What Else Are They Hiding?Over the last few days the alternative media has spent countless hours conducting their own investigations into what actually happened during the mass shooting in Las Vegas that left 59 dead and over 500 injured.From reports of multiple shooters to officials seemingly covering up the ISIS connection, many different theories have been put forth that counter the mainstream narrative.Now, new information released by investigative reporter Laura Loomer proves that authorities have directly lied to the American people about the case at least once by claiming that supposed shooter Stephen Paddock checked into the Mandalay Bay Hotel on September 28th when valet records (with photos) prove he actually arrived three days earlier.According to Loomer, she obtained the image from a source which shows that Paddock’s car first arrived September 25th.The photo even has a handwritten note that was re

In [38]:
# Inspect the [191, 222) slice of the first entry of eng_text; the same bounds
# are used when tagged_text is built.
# NOTE(review): this treats eng_text[0] as a raw string, whereas the loading
# loop stores lists of words — it looks like it was run against an earlier
# definition of eng_text; confirm.
eng_text[0][191: 222]

'Stop Islamization of America.\n\n'

In [46]:
# Pair every element of the first article with a 0/1 label: positions
# 191..221 get 1, everything before and after gets 0. zip() stops at the end
# of the article, so surplus labels are ignored for short inputs.
first_article = eng_text[0]
char_labels = [0] * 191 + [1] * 31 + [0] * (len(first_article) - 222)
tagged_text = list(zip(first_article, char_labels))

In [58]:
def custom_split(tagged_text):
    """Group (character, tag) pairs into words separated by spaces/newlines.

    Characters are accumulated until a ' ' or '\\n' is met. Every word is
    labelled with the tag of its last character. Runs of separators produce
    no empty words.

    Args:
        tagged_text: iterable of (character, tag) pairs.

    Returns:
        A ``(words, tags)`` tuple of two lists of equal length.
    """
    words = []
    word_tags = []
    current_chars = []
    last_tag = 0

    for letter, tag in tagged_text:
        if letter == ' ' or letter == '\n':
            if current_chars:
                words.append(''.join(current_chars))
                word_tags.append(last_tag)
                current_chars = []
            last_tag = 0
        else:
            current_chars.append(letter)
            last_tag = tag

    # Flush the trailing word: without this, text that does not end with a
    # separator would silently lose its last word (and its tag).
    if current_chars:
        words.append(''.join(current_chars))
        word_tags.append(last_tag)

    return words, word_tags

def create_bin_span(text, bounds):
    """Label every character of ``text`` with 1 inside a span and 0 outside.

    Args:
        text: sequence of characters (e.g. a str).
        bounds: iterable of ``(start, end)`` offsets into ``text`` with an
            exclusive ``end``. It may be empty, unsorted or overlapping;
            offsets that fall outside the text are clipped.

    Returns:
        A list of ``(character, label)`` pairs, one per character of ``text``.
    """
    text_length = len(text)
    # Start from an all-zero mask and paint each span onto it. This keeps the
    # labels aligned with the characters regardless of the order of the spans,
    # and an empty `bounds` simply yields an all-zero labelling.
    labels = [0] * text_length
    for bound in bounds:
        first = max(bound[0], 0)
        stop = min(bound[1], text_length)
        for position in range(first, stop):
            labels[position] = 1
    return list(zip(text, labels))

In [59]:
# Build the same character-level tagging through create_bin_span, using the
# [191, 222) bounds.
tt_2 = create_bin_span(eng_text[0], [(191, 222)])

In [61]:
# Compare the create_bin_span result with the hand-built tagged_text.
tt_2 == tagged_text

True

In [49]:
# Collapse the character-level tagging into words and one tag per word.
text, tags = custom_split(tagged_text)

In [57]:
# Show words 30..34 next to their tags.
text[30:35], tags[30:35]

(['Stop', 'Islamization', 'of', 'America.', 'They'], [1, 1, 1, 1, 0])

In [62]:
# Build, for every annotated article, the list of its words (eng_text) and the
# matching list of binary span tags (eng_spans).
eng_text = []
eng_spans = []

for filename in os.listdir(PATH_TO_ENG_DATA):
    if not filename.endswith('.tsv'):
        continue

    # The article id is the first run of digits in the annotation file name.
    idx = re.search(r'\d+', filename).group(0)

    # Read as bytes and decode explicitly; the context manager closes the file.
    with open(PATH_TO_ENG_DATA + f'/article{idx}.txt', 'rb') as article_file:
        text = article_file.read().decode('utf-8')

    try:
        spans = pd.read_table(PATH_TO_ENG_DATA + f'/{filename}', header=None)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        # Annotation files that are empty or cannot be parsed are skipped.
        # Catching only these two keeps real problems (and KeyboardInterrupt)
        # visible instead of swallowing them with a bare `except`.
        continue

    # Columns 2 and 3 hold the span offsets; 1 is added to the second one
    # before it is used as an exclusive end.
    # NOTE(review): this assumes the stored end offset is inclusive — confirm
    # against the corpus description.
    bounds = list(zip(spans[2], spans[3] + 1))

    tagged_text = create_bin_span(text, bounds)
    text, tags = custom_split(tagged_text)
    eng_text.append(text)
    eng_spans.append(tags)

In [69]:
# Sanity check: every article must have exactly one tag per word. Fail loudly
# and name the offending positions instead of printing an uninformative marker.
mismatched_articles = [
    i for i in range(len(eng_spans)) if len(eng_text[i]) != len(eng_spans[i])
]
assert not mismatched_articles, (
    f"word/tag count mismatch for articles at positions {mismatched_articles}"
)

In [80]:
def post_proc(span):
    """Turn the first tag of every run of 1s into 2.

    A tag equal to 1 is rewritten to 2 when the tag before it is 0 or when it
    is the very first tag of the sequence, so a run that opens the sequence is
    marked the same way as a run that follows a 0.

    Args:
        span: mutable list of integer tags; it is modified in place.

    Returns:
        The same list object, for convenient use with ``map``.
    """
    previous = 0  # treat the position before the sequence as tag 0
    for position, tag in enumerate(span):
        if tag == 1 and previous == 0:
            span[position] = 2
        # Compare against the original tag, not the rewritten one, so a run
        # gets exactly one leading 2.
        previous = tag
    return span

In [81]:
# Run post_proc over the tag list of every article.
eng_spans = [post_proc(article_tags) for article_tags in eng_spans]

In [82]:
# Inspect the tags of the first article from word 30 onwards.
eng_spans[0][30:]

[2,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [2]:
# Load the matched markup and the full markup (one JSON object per line).
markup = pd.read_json(PATH_TO_MATCHED, lines=True)
markup_union = pd.read_json(PATH_TO_FULL_MARKUP, lines=True)

# Count the non-null assignment_worker_id values per document and attach that
# count to every row. Both sides of the merge carry an `assignment_worker_id`
# column, so pandas suffixes them with `_x` (original) and `_y` (count).
rows_per_document = (
    markup_union
    .groupby(['input_document_id'])['assignment_worker_id']
    .agg('count')
    .reset_index()
)
markup_union = markup_union.merge(rows_per_document, on=['input_document_id'])

print("Всего разметок:", len(markup_union))
print("Всего сматченных разметок:", len(markup))

Всего разметок: 5868
Всего сматченных разметок: 1686


In [45]:
# Keep only the rows whose per-document count (the `_y` column added by the
# merge) equals 3, then split them by markup_index 0, 1 and 2.
has_three_rows = markup_union['assignment_worker_id_y'] == 3
markup_union = markup_union.loc[has_three_rows]

map_idx2markup = {}
for markup_index in range(3):
    rows_for_index = markup_union.loc[markup_union['markup_index'] == markup_index]
    map_idx2markup[markup_index] = {'data': rows_for_index}

In [79]:
# Number of document ids present both in the markup_index == 1 subset and in
# the matched markup.
len(set(map_idx2markup[1]['data'].input_document_id.unique()).intersection(set(markup.input_document_id.unique())))

1488

In [46]:
# Convert every markup row with markup_conll. The results are stored twice:
# per markup_index inside map_idx2markup ('texts' / 'markups') and pooled over
# all three indices in full_texts / full_markups.
full_texts = []
full_markups = []
for i in range(3):
    texts_union, markups_union = [], []
    for index, row in map_idx2markup[i]['data'].iterrows():
        text, row_markup = markup_conll(
            row["input_input"],
            row["output_result"],
            row["input_entitiesdata"],
        )
        texts_union.append(text)
        markups_union.append(row_markup)

    map_idx2markup[i].update(texts=texts_union, markups=markups_union)
    full_texts.extend(texts_union)
    full_markups.extend(markups_union)
    # Wipe whatever the conversion printed for this index.
    clear_output()

In [47]:
# Map every manipulation class to an integer id and back.
# The classes are collected from the pooled `full_markups`; `markups_union` is
# only the leftover loop variable holding the markups of the last processed
# markup_index, so classes seen only under the other indices would be missed.
# Sorting makes the ids stable across kernel restarts (set iteration order of
# strings is not); key=str keeps the sort working for non-string classes.
unique_tags = {x.manipulation_class for m in full_markups for x in m}
tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags, key=str))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

# Ids of the O / I / B labels, passed to encode_tags together with "bio_span".
bio2id = {
    'O': 0,
    'I': 1,
    'B': 2,
}
id2bio = {idx: tag for tag, idx in bio2id.items()}

In [55]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Tokenizer call with the options fixed once: pre-split words as input, no
# special tokens added, padding on, truncation at 512.
encode_options = dict(
    add_special_tokens=False,
    is_split_into_words=True,
    padding=True,
    truncation=True,
    max_length=512,
)
encode = partial(tokenizer, **encode_options)

# First input id produced when tokenizer.sep_token / tokenizer.cls_token is
# encoded as a single word.
encoded_sep_token = encode([tokenizer.sep_token])['input_ids'][0]
encoded_cls_token = encode([tokenizer.cls_token])['input_ids'][0]

# Encode the texts of every markup_index and derive the matching span labels.
for idx, entry in map_idx2markup.items():
    entry['encodings'] = encode(entry['texts'])
    entry['spans'] = encode_tags(entry['markups'], entry['encodings'], "bio_span", tag2id=bio2id)

In [81]:
def span_matrics(span1: List, span2: List):
    """Compare two label sequences: overlap recall, overlap precision, kappa.

    A position counts as positive when its label is greater than 0.

    Args:
        span1: label sequence; its positives are the denominator of recall.
        span2: label sequence of the same shape; its positives are the
            denominator of precision.

    Returns:
        ``None`` when neither sequence has a positive label. Otherwise a
        ``(recall, precision, cohen_kappa)`` tuple, where a ratio with a zero
        denominator is reported as 0.0 and the kappa is computed by
        ``cohen_kappa_score`` on the positions where ``span1`` is not -100.
    """
    span1, span2 = np.array(span1), np.array(span2)
    positive1, positive2 = span1 > 0, span2 > 0
    if positive1.sum() == 0 and positive2.sum() == 0:
        return None

    overlap = positive1 & positive2

    def _overlap_share(positives):
        # Share of `positives` that is also positive in the other sequence.
        # A sequence without positives gives 0/0; that case is mapped to 0.0 by
        # nan_to_num, and errstate keeps numpy from emitting a RuntimeWarning
        # for a situation that is handled on purpose.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.nan_to_num(
                (overlap.sum(-1) / positives.sum(-1)).sum() / positives.any().sum()
            )

    recall = _overlap_share(positive1)
    precision = _overlap_share(positive2)

    # NOTE(review): only span1's -100 positions are dropped; this presumably
    # relies on span2 carrying -100 at the same positions — confirm.
    keep = span1 != -100
    cohen_kappa = cohen_kappa_score(span1[keep], span2[keep])
    return recall, precision, cohen_kappa

# Try the metric on the first pair of span label sequences: markup_index 0
# against markup_index 1.
span_matrics(map_idx2markup[0]['spans'][0], map_idx2markup[1]['spans'][0])

  recall = np.nan_to_num((((span1 > 0) & (span2 > 0)).sum(-1) / (span1 > 0).sum(-1)).sum() / (span1 > 0).any().sum())


(0.0, 0.0, 0.0)

In [85]:
# Average the pairwise scores between markup_index 1 and markup_index 0 over
# all pairs for which at least one of the two has a positive label.
precisions = []
recalls = []
cohen_kappas = []

for span1, span2 in zip(map_idx2markup[1]['spans'], map_idx2markup[0]['spans']):
    output = span_matrics(span1, span2)
    if output is None:
        # Neither sequence has a positive label: nothing to compare.
        continue
    # NOTE(review): span_matrics returns (recall, precision, kappa), while the
    # tuple is unpacked here as (p, r, ck). Together with the (1, 0) argument
    # order this may be intentional — confirm which sequence is the reference.
    p, r, ck = output
    precisions.append(p)
    recalls.append(r)
    cohen_kappas.append(ck)

# Print the three means in the order precision, recall, kappa. If every pair
# was skipped the lists are empty; report nan instead of raising
# ZeroDivisionError.
for scores in (precisions, recalls, cohen_kappas):
    print(sum(scores) / len(scores) if scores else float("nan"))

0.22477731183850802
0.22335330633831046
0.12036711069922158
