# Accuracy functions

In [2]:
def get_count_from_logits(logits, labels):
    """Count how many predictions in a batch match their labels.

    Args:
        logits: tensor of shape [B, C] with one row of class scores per sample.
        labels: tensor holding B class indices; both [B, 1] (as produced by
            ``CustomDataset``) and flat [B] layouts are accepted.

    Returns:
        A tuple ``(correct, total)`` of Python ints: the number of samples
        whose arg-max class equals the label, and the batch size.
    """
    # Softmax is monotonic, so the arg-max of the raw logits is the predicted
    # class; skipping it also avoids ties created by floating point rounding.
    predictions = torch.argmax(logits, dim=1)

    # Flatten instead of squeeze(1): works for [B, 1] and [B] label tensors
    # alike and never collapses a single-sample batch to a 0-d tensor.
    targets = labels.reshape(-1)

    correct = (predictions == targets).long().sum()

    return correct.item(), targets.size(0)

# Install requirements 

In [3]:
%%capture
!pip install transformers tensorboard

In [4]:
%%capture
!pip install tensorflow

# Set parameters 

In [5]:
#@title Enviroments
from datetime import datetime
current_time = datetime.today().strftime('%Y-%m-%d-%H-%M-%S')

language = 'ru' #@param ["eng", "ru", "ROCStories"]
hidden_layers = 0 #@param [0, 1, 2]
hidden_per_layer = 768 #@param [768, 1024] {type:"raw"}
bert_output = 'pooler' #@param ['pooler', 'last_hidden_state', 'concatenation']
bert_model_name = 'DeepPavlov/rubert-base-cased' #@param ['bert-base-uncased', 'DeepPavlov/rubert-base-cased', 'bert-large-uncased', 'bert-base-multilingual-cased']

# Log folder name encodes the full experiment configuration.
folder_name = "1log_{}_{}_{}_{}_{}".format(language, bert_model_name.split('/')[-1], bert_output, hidden_layers, hidden_per_layer)

# language -> (dataset directory, training file, test file, cached test samples file)
_dataset_files = {
    'eng': (
        "/content/drive/My Drive/Colab Notebooks/next_sentence/english_dataset",
        'english_dataset_filtered_v2_train.json',
        'english_dataset_filtered_v2_test.json',
        'english_dataset_filtered_v2_test_samples.json',
    ),
    'ru': (
        "/content/drive/My Drive/Colab Notebooks/next_sentence/russian_dataset",
        'russian_dataset_filtered_train.json',
        'russian_dataset_filtered_test.json',
        'russian_dataset_filtered_test_samples.json',
    ),
}
# Any other language value falls back to the ROCStories files, which live in
# the English dataset directory.
_rocstories_files = (
    "/content/drive/My Drive/Colab Notebooks/next_sentence/english_dataset",
    'ROCStories_train.json',
    'ROCStories_test.json',
    'ROCStories_test_samples.json',
)
dataset_path, training_set_filename, test_set_filename, test_samples_filename = _dataset_files.get(language, _rocstories_files)

# The log location is the same for every language.
log_path_base = "/content/drive/My Drive/Colab Notebooks/next_sentence/logs/"
log_path = log_path_base+folder_name+"/"

checkpoint_path = dataset_path+"/1checkpoint_{}_{}_{}_{}_{}.pt".format(language, bert_model_name.split('/')[-1], bert_output, hidden_layers, hidden_per_layer)

# Global optimisation step counter, advanced by the training loop.
step = 0

In [6]:
from torch.utils.tensorboard import SummaryWriter

# tb = SummaryWriter(log_dir=log_path+"general_{}_{}_{}_{}_{}".format(language, bert_model_name.split('/')[-1], bert_output, hidden_layers, hidden_per_layer))
# tb.add_text("Experiment_details", "Hidden layers: {:} ".format(hidden_layers))
# tb.add_text("Experiment_details", "Bert output utilized: {}".format(bert_output))
# tb.add_text("Experiment_details", "Bert base model: {}".format(bert_model_name.split('/')[-1]))
# tb.add_text("Experiment_details", "Hidden per layer: {:}".format(hidden_per_layer))
# tb.add_text("Experiment_details", "Training set file: {}".format(training_set_filename))
# tb.add_text("Experiment_details", "Test set file: {}".format(test_set_filename))
# tb_training = SummaryWriter(log_dir=log_path+"training_{}_{}_{}_{}_{}".format(language, bert_model_name.split('/')[-1], bert_output, hidden_layers, hidden_per_layer))
# tb_validation = SummaryWriter(log_dir=log_path+"validation_{}_{}_{}_{}_{}".format(language, bert_model_name.split('/')[-1], bert_output, hidden_layers, hidden_per_layer))
# tb_test = SummaryWriter(log_dir=log_path+"test_{}_{}_{}_{}_{}".format(language, bert_model_name.split('/')[-1], bert_output, hidden_layers, hidden_per_layer))

# Prepare dataset

In [7]:
from os.path import join
import json


In [8]:
def _load_articles(filename):
    """Read a JSON article list from the dataset folder, keeping only articles that have paragraphs."""
    with open(join(dataset_path, filename), encoding='utf-8') as f:
        articles = json.load(f)
    return [a for a in articles if len(a['paragraphs'])>0 ]

train_articles = _load_articles(training_set_filename)
test_articles = _load_articles(test_set_filename)

    

In [9]:
print(len(train_articles))
# Optional: cap the number of training articles for quick experiments
# (currently disabled for every language).
#   language == 'eng': train_articles = train_articles[:600]
#   language == 'ru' : train_articles = train_articles[:1800]

1871


In [10]:
# print(json.dumps(dataset[0], indent=4, ensure_ascii=False))

Create the positive sample set

In [11]:
from random import randrange, choice

def get_random_sent_by_bm25(dataset, source_sentence):
    """Pick a BM25-retrieved sentence that is not too similar to ``source_sentence``.

    Relies on the module-level names ``sentence_corpus``, ``bm25`` and
    ``scorer`` (not defined in this part of the notebook; ``scorer`` is
    presumably a BERTScore-style scorer - TODO confirm).

    Args:
        dataset: unused; kept so the signature stays unchanged.
        source_sentence: query sentence, tokenised by splitting on spaces.

    Returns:
        The first of the (up to 100) top BM25 candidates whose F1 score
        against the source is below 0.7, or the last candidate when none is.
    """
    top_n = min(100, len(sentence_corpus))
    cands = bm25.get_top_n(source_sentence.split(" "), sentence_corpus, n=top_n)
    # Score every candidate against the same reference sentence.
    refs = [source_sentence] * top_n
    _, _, F1 = scorer.score(cands, refs, verbose=False)
    F1 = F1.detach().cpu().numpy()

    # Candidates come ranked by BM25; keep the best-ranked dissimilar one.
    dissimilar = [idx for idx in range(top_n) if F1[idx] < 0.7]
    if dissimilar:
        return cands[dissimilar[0]]
    return cands[top_n-1]

def get_random_sent(dataset, exclude_a_index, exclude_p_index):
    """Draw a random sentence to serve as an unrelated second sentence.

    Args:
        dataset: list of articles, each a dict with a 'paragraphs' list of
            sentence lists.
        exclude_a_index: index of the article the first sentence came from.
        exclude_p_index: index of the paragraph the first sentence came from.

    Returns:
        A sentence from a randomly chosen article. When the chosen article is
        the excluded one, the sentence is taken from the paragraph half the
        article length away from the excluded paragraph (wrapping around).
    """
    target_a_index = randrange(len(dataset))
    paragraphs = dataset[target_a_index]['paragraphs']

    if target_a_index != exclude_a_index:
        # Different article: any paragraph, any sentence.
        return choice(choice(paragraphs))

    # Same article: jump to the paragraph on the "opposite side".
    far_index = (exclude_p_index + len(paragraphs)//2) % len(paragraphs)
    return choice(paragraphs[far_index])
        
def get_samples(dataset):
    """Build labelled sentence pairs from a list of articles.

    Positive pairs (label 0) are consecutive sentences of a paragraph.
    For every sentence one negative pair (label 1) is produced by a randomly
    chosen strategy: a sentence 2-3 positions ahead, the swapped neighbour
    pair, the sentence paired with itself, or a random sentence obtained from
    ``get_random_sent``. Strategies that do not fit the sentence position
    fall back to the random sentence.

    Args:
        dataset: list of articles, each a dict with a 'paragraphs' list of
            sentence lists.

    Returns:
        ``(first_sentences, second_sentences, labels)`` - three parallel lists.
    """
    paragraph_list = [p for article in dataset for p in article['paragraphs']]
    sentence_total = sum(len(p) for p in paragraph_list)
    word_total = sum(len(s.split(' ')) for p in paragraph_list for s in p)
    print("Total paragraphs: {}".format(len(paragraph_list)))
    print("Total sentences: {}".format(sentence_total))
    print("Total words: {}".format(word_total))

    first_sentences, second_sentences, labels = [], [], []

    # Positive samples: each sentence followed by its actual successor.
    for paragraph in paragraph_list:
        for current, following in zip(paragraph[:-1], paragraph[1:]):
            first_sentences.append(current)
            second_sentences.append(following)
            labels.append(0)

    # Negative samples: one corrupted pair per sentence.
    for a_index, article in enumerate(dataset):
        for p_index, paragraph in enumerate(article['paragraphs']):
            n_sentences = len(paragraph)
            for i, sentence in enumerate(paragraph):
                strategy = randrange(4)
                if strategy == 0 and i < n_sentences-2:
                    # Skip ahead by two or three sentences.
                    pair = (sentence, paragraph[randrange(i+2, min(n_sentences, i+4))])
                elif strategy == 1 and i < n_sentences-1:
                    # Neighbouring sentences in reversed order.
                    pair = (paragraph[i+1], sentence)
                elif strategy == 2:
                    # The sentence paired with itself.
                    pair = (sentence, sentence)
                else:
                    pair = (sentence, get_random_sent(dataset, a_index, p_index))
                first_sentences.append(pair[0])
                second_sentences.append(pair[1])
                labels.append(1)
    return first_sentences, second_sentences, labels

In [12]:
from sklearn.model_selection import train_test_split
import os.path

# Build sentence pairs from the training articles and hold out 20% of them for validation.
train_s1, train_s2, train_labels = get_samples(train_articles)
(train_s1, val_s1,
 train_s2, val_s2,
 train_labels, val_labels) = train_test_split(train_s1, train_s2, train_labels, train_size=0.8, test_size=0.2)

# Test samples are generated once and cached on disk so that every run is
# evaluated on the same pairs.
test_samples_path = join(dataset_path, test_samples_filename)
if os.path.isfile(test_samples_path):
    with open(test_samples_path, encoding='utf-8') as f:
        test_json = json.load(f)
    test_s1, test_s2, test_labels = test_json['s1'], test_json['s2'], test_json['labels']
    test_json = None
else:
    test_s1, test_s2, test_labels = get_samples(test_articles)
    with open(test_samples_path, 'w', encoding='utf-8') as f:
        json.dump({'s1': test_s1, 's2':test_s2, 'labels':test_labels}, f, ensure_ascii=False)

# Sizes of the training, validation and test sets.
for split_first_sentences in (train_s1, val_s1, test_s1):
    print(len(split_first_sentences))

Total paragraphs: 22454
Total sentences: 92655
Total words: 1575472
130284
32572
4648


In [13]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the pretrained checkpoint selected in the parameters cell.
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1649718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




In [14]:
# Tokenize the training pairs without padding; the next cell uses these
# encodings to measure the token length of the pairs.
train_encodings = tokenizer(train_s1, train_s2, padding=False)

In [15]:
# Number of encoded pairs, total number of tokens, and average tokens per pair.
c = len(train_encodings['input_ids'])
t = sum(len(encodings) for encodings in train_encodings['input_ids'])
print(c,t)
print(t/c)

130284 6740708
51.738571121549846


In [None]:
%%time
def _encode_pairs(first_sentences, second_sentences):
    """Tokenize sentence pairs, truncated and padded to at most 64 tokens."""
    return tokenizer(first_sentences, second_sentences, truncation=True, padding=True, max_length=64)

train_encodings = _encode_pairs(train_s1, train_s2)
val_encodings = _encode_pairs(val_s1, val_s2)
test_encodings = _encode_pairs(test_s1, test_s2)

CPU times: user 59.4 s, sys: 1.23 s, total: 1min
Wall time: 18.3 s


In [None]:
# Sanity check: show the token ids and label of one training pair ...
print(train_encodings['input_ids'][5], train_labels[5])
# print(train_encodings['input_ids'][7000], labels[7000])

# ... and the first twenty labels of the shuffled training split.
print(train_labels[:20])


[101, 845, 8092, 3299, 15459, 4627, 27209, 51553, 845, 304, 21506, 14067, 98715, 9176, 326, 128, 845, 304, 21506, 66089, 326, 2748, 14776, 21150, 68732, 2570, 128, 11267, 68732, 2237, 304, 39018, 6201, 102, 32157, 37789, 10887, 1638, 4415, 25817, 304, 110746, 70509, 326, 128, 48527, 2075, 15484, 22129, 1699, 4564, 845, 3689, 15484, 4161, 128, 130, 106472, 852, 128, 104088, 10058, 37789, 102] 1
[0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]


In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized sentence pairs.

    Each item is a dict holding one tensor per tokenizer output field, the
    label as a one-element tensor under 'labels', and the two raw sentences
    under 'sentence1' / 'sentence2'.
    """

    def __init__(self, sentences1, sentences2, encodings, labels):
        # Parallel sequences: index i of each refers to the same sentence pair.
        self.sentences1, self.sentences2 = sentences1, sentences2
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        # Shape [1] so that a collated batch of labels has shape [B, 1].
        item['labels'] = torch.tensor(self.labels[idx]).unsqueeze(0)
        item['sentence1'] = self.sentences1[idx]
        item['sentence2'] = self.sentences2[idx]
        return item

    def __len__(self):
        # One sample per label.
        return len(self.labels)


In [None]:
# Wrap each split (raw sentences, encodings and labels) in a dataset object.
train_dataset = CustomDataset(train_s1, train_s2, train_encodings, train_labels)
val_dataset = CustomDataset(val_s1, val_s2, val_encodings, val_labels)
test_dataset = CustomDataset(test_s1, test_s2, test_encodings, test_labels)

# Create model and training 


## Create model

In [None]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel, BertModel 

class NextSentenceClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward head with two output classes.

    Args:
        freeze_bert: when True the encoder weights are excluded from training.
        bert_model_name: HuggingFace model identifier of the encoder.
        hidden_per_layer: width of each hidden layer of the head.
        hidden_layers: number of hidden layers in the head (0, 1 or 2).
        bert_output: which encoder output feeds the head - 'pooler' (pooled
            output), 'last_hidden_state' (first token of the last layer) or
            'concatenation' (first token of the last four layers, concatenated).
        state_dict: optional weights for the whole classifier; when given the
            encoder is built from its config only (no pretrained download)
            and the weights are loaded at the end of construction.

    Raises:
        ValueError: if ``hidden_layers`` is not 0, 1 or 2.
    """

    # bert_output = ['pooler', 'last_hidden_state', 'concatenation']
    def __init__(self, freeze_bert = True, bert_model_name='bert-base-uncased', hidden_per_layer=768, hidden_layers = 1, bert_output = 'pooler', state_dict=None):
        super(NextSentenceClassifier, self).__init__()
        if hidden_layers not in (0, 1, 2):
            # Any other value would build a head without an output layer.
            raise ValueError('hidden_layers must be 0, 1 or 2')

        # Hidden states of every layer are needed for the 'concatenation' mode.
        config = AutoConfig.from_pretrained(bert_model_name, output_hidden_states=True)

        if state_dict is not None:
            self.bert_layer = BertModel(config)
        else:
            self.bert_layer = AutoModel.from_pretrained(bert_model_name, config=config)

        self.hidden_layers = hidden_layers
        self.hidden_per_layer = hidden_per_layer
        self.bert_output = bert_output

        # Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Take the encoder width from its config instead of hard-coding it,
        # so that 'concatenation' is also sized correctly for large models.
        # The former name-based values remain as a fallback.
        encoder_width = getattr(config, 'hidden_size', 1024 if bert_model_name=='bert-large-uncased' else 768)
        if bert_output=='concatenation':
            self.input_length = encoder_width*4
        else:
            self.input_length = encoder_width

        # dense layer 1 (output layer when there is no hidden layer)
        if hidden_layers == 0:
            self.fc1 = nn.Linear(self.input_length,2)
        else:
            self.fc1 = nn.Linear(self.input_length,self.hidden_per_layer)

        if hidden_layers!=0:
            # dropout layer
            self.dropout = nn.Dropout(0.1)
            # relu activation function
            self.relu =  nn.ReLU()

        if hidden_layers == 1:
            # dense layer 2 (Output layer)
            self.fc2 = nn.Linear(self.hidden_per_layer,2)
        else:
            # NOTE: also created (but unused) when hidden_layers == 0; kept so
            # that the state_dict keys stay compatible with saved checkpoints.
            self.fc2 = nn.Linear(self.hidden_per_layer,self.hidden_per_layer)

        if hidden_layers>1:
            # dropout layer
            self.dropout2 = nn.Dropout(0.1)

        if hidden_layers == 2:
            # dense layer 3 (Output layer)
            self.fc3 = nn.Linear(self.hidden_per_layer,2)

        if state_dict is not None:
            self.load_state_dict(state_dict)


    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        '''
        Inputs:
            -input_ids : Tensor of shape [B, T] containing token ids of sequences
            -attention_mask : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
            -token_type_ids : Tensor of shape [B, T] containing the segment id of every token
        Returns:
            Tensor of shape [B, 2] with the unnormalised class scores.
        '''

        # Placeholders for missing inputs live on the model's own device and
        # follow the shape of input_ids where that is available.
        if input_ids is None:
            input_ids = torch.zeros(1,64).long().to(next(self.parameters()).device)
        if attention_mask is None:
            # Attend to every token; an all-zero mask would hide the whole input.
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        #Feeding the input to BERT model to obtain contextualized representations
        res = self.bert_layer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)

        #Obtaining the representation of [CLS] head
        if self.bert_output == 'pooler':
            features = res['pooler_output']
        elif self.bert_output == 'last_hidden_state':
            features = res['last_hidden_state'][:,0]
        elif self.bert_output == 'concatenation':
            features = torch.cat([res['hidden_states'][layer][:,0] for layer in (-4, -3, -2, -1)], -1)
        else:
            raise ValueError('Value not supported')

        logits = self.fc1(features)
        if self.hidden_layers in (1, 2):
            # First hidden layer; fc2 is the output layer when hidden_layers == 1.
            logits = self.relu(logits)
            logits = self.dropout(logits)
            logits = self.fc2(logits)
        if self.hidden_layers==2:
            # Second hidden layer followed by the output layer.
            logits = self.relu(logits)
            logits = self.dropout2(logits)
            logits = self.fc3(logits)

        return logits

## Training 

In [None]:
def evaluate(model, validation_loader, step):
    """Run the model over a loader without gradient tracking and report loss and accuracy.

    Uses the module-level ``device`` and ``criterion``. The progress bar shows
    the running loss and accuracy, both divided by the number of samples seen.

    Args:
        model: classifier to evaluate; switched to eval mode.
        validation_loader: loader yielding batches with 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels'.
        step: current training step; only referenced by the disabled
            TensorBoard logging.

    Returns:
        The sum of the per-batch losses returned by ``criterion``.
    """
    model.eval()
    n_correct, n_seen, loss_sum = 0, 0, 0
    with torch.no_grad(), tqdm(iter(validation_loader), total=len(validation_loader)) as progress:
        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            batch_loss = criterion(outputs.view(-1, 2), labels.view(-1))

            batch_correct, batch_size = get_count_from_logits(outputs, labels)
            n_correct += batch_correct
            loss_sum += batch_loss.item()
            n_seen += batch_size
            progress.set_description("Loss : {} Accuracy : {}".format(loss_sum/n_seen, n_correct/n_seen))
        # tb_validation.add_scalar("Loss", loss_sum/n_seen, step)
        # tb_validation.add_scalar("Accuracy", n_correct/n_seen, step)
    return loss_sum

In [None]:
def train(model, train_loader, step):
    """Train the model for one pass over ``train_loader``.

    Uses the module-level ``device``, ``criterion``, ``optimizer`` and
    ``scheduler``; the scheduler is stepped after every batch. Loss and
    accuracy are accumulated over windows of 200 steps: whenever the global
    step counter reaches a multiple of 200 the progress bar description is
    refreshed and the window statistics are reset.

    Args:
        model: classifier to train; switched to train mode.
        train_loader: loader yielding batches with 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels'.
        step: global step counter value before this pass.

    Returns:
        The global step counter after this pass.
    """
    model.train()
    window_correct, window_seen, window_loss = 0, 0, 0
    with tqdm(iter(train_loader), total=len(train_loader)) as progress:
        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            batch_loss = criterion(outputs.view(-1, 2), labels.view(-1))

            # Standard update: clear old gradients, back-propagate, then
            # advance the optimizer and the learning-rate schedule.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            scheduler.step()

            batch_correct, batch_size = get_count_from_logits(outputs, labels)
            window_correct += batch_correct
            window_loss += batch_loss.item()
            window_seen += batch_size

            step += 1
            if step % 200 != 0:
                continue
            # tb_training.add_scalar("Loss", window_loss/window_seen, step)
            # tb_training.add_scalar("Accuracy", window_correct/window_seen, step)
            progress.set_description("Loss : {} Accuracy : {}".format(window_loss/window_seen, window_correct/window_seen))
            window_correct, window_seen, window_loss = 0, 0, 0
    return step

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup

# Run on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fine-tune the whole network: the encoder is not frozen.
model = NextSentenceClassifier(
    freeze_bert = False,
    bert_model_name=bert_model_name,
    hidden_per_layer=hidden_per_layer,
    hidden_layers=hidden_layers,
    bert_output=bert_output,
)
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
# optim = AdamW(model.parameters(), lr=5e-5)
# Summed (not averaged) loss; the reporting code divides by the sample count.
criterion = nn.CrossEntropyLoss(reduction='sum').to(device)

# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)
optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

# Every language setting (including 'ROCStories') currently trains for one epoch.
total_epochs = 1
# Linear decay to zero over all training steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_epochs*len(train_loader))

# Best (lowest) validation loss seen so far.
min_eval_loss = 9999999

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456796.0, style=ProgressStyle(descri…




### Actual training 

In [None]:
# Baseline: validation loss and accuracy before any fine-tuning step.
evaluate(model, validation_loader, step)

Loss : 0.6887256913815788 Accuracy : 0.5550165786565148: 100%|██████████| 1018/1018 [00:38<00:00, 26.22it/s]


22433.173219680786

In [None]:
# One training pass followed by one validation pass per epoch; the lowest
# validation loss seen so far is tracked in min_eval_loss.
for epoch in range(total_epochs):
    # Training
    print("")
    print("Epoch: {}".format(epoch+1), flush=True, end=' ')
    step = train(model, train_loader, step)
    # Evaluation
    eval_loss = evaluate(model, validation_loader, step)
    if eval_loss<min_eval_loss:
        min_eval_loss = eval_loss
        # Checkpointing of the best model is currently disabled:
        # torch.save({
        #     'language':language ,
        #     'bert_model_name':bert_model_name,
        #     'bert_output':bert_output,
        #     'hidden_layers':hidden_layers,
        #     'hidden_per_layer':hidden_per_layer,
        #     'epoch': epoch,
        #     'model_state_dict': model.state_dict()
        #     }, checkpoint_path)



Epoch: 1 

Loss : 0.40648954272270205 Accuracy : 0.7990625: 100%|██████████| 4072/4072 [08:47<00:00,  7.71it/s]
Loss : 0.38953482748966434 Accuracy : 0.8136436202873634: 100%|██████████| 1018/1018 [00:38<00:00, 26.26it/s]


In [None]:
# tb.add_graph(model, torch.zeros(1,64).long().cuda())

# Details on test set

## Load model from checkpoint

In [None]:
# import torch 
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# checkpoint = torch.load(checkpoint_path, map_location='cpu')

# model = NextSentenceClassifier(
#     freeze_bert = True, 
#     bert_model_name=checkpoint['bert_model_name'], 
#     hidden_per_layer=checkpoint['hidden_per_layer'], 
#     hidden_layers=checkpoint['hidden_layers'], 
#     bert_output=checkpoint['bert_output'],
#     state_dict=checkpoint['model_state_dict']
#     ).to(device)

## Define helper functions

In [None]:
import textwrap
import itertools
def plot_confusion_matrix(cm, class_names, title="Confusion matrix", x_label="Predicted label", y_label="True label", normalize=False, side_labels=False):
    """
    Returns a matplotlib figure containing the plotted confusion matrix.
    
    Args:
       cm (array, shape = [n, n]): a confusion matrix of integer classes
       class_names (array, shape = [n]): String names of the integer classes
       title (str): title shown above the matrix
       x_label (str): label of the horizontal axis
       y_label (str): label of the vertical axis
       normalize (bool): if True, the numbers written into the cells are
           row-normalized; the heat map itself is drawn from the values
           passed in, because it is rendered before the normalization
       side_labels (bool): if True, ticks show class indices and a second
           panel to the right lists "index - class name" for every class;
           otherwise the class names are used as tick labels directly
    """

    # Two panels (matrix + legend) when the class names go to the side.
    if side_labels:
        figure = plt.figure(figsize=(20, 10))
        ax1, ax2 = figure.subplots(1, 2)
    else:
        figure = plt.figure(figsize=(10,10))
        ax1 = figure.subplots(1,1)

    im = ax1.imshow(cm, interpolation='nearest', cmap=plt.cm.Oranges, vmin=0)
    ax1.set_title(title,fontsize=24)
    plt.colorbar(im,ax=ax1)
    tick_marks = np.arange(len(class_names))
    ax1.set_xticks(tick_marks)
    ax1.set_yticks(tick_marks)
    if side_labels:
        ax1.set_xticklabels(tick_marks, rotation=45)
        ax1.set_yticklabels(tick_marks)
    else:
        ax1.set_xticklabels(class_names, rotation=45)
        ax1.set_yticklabels(class_names)
    
    if normalize:
        # Normalize the confusion matrix.
        # NOTE(review): a row that sums to zero yields NaN entries here.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    else:
        pass

    # Use white text if the cell value is above half of the maximum; otherwise black.
    # NOTE(review): with normalize=True this threshold is computed on the
    # normalized values while the cell colours come from the raw values.
    threshold = cm.max() / 2.
    cm = np.around(cm, decimals=3)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        ax1.text(j, i, cm[i, j], horizontalalignment="center", color=color)
        
    figure.tight_layout()
    ax1.set_ylabel(y_label)
    ax1.set_xlabel(x_label)

    if side_labels:
        # Legend panel: one wrapped "index - class name" entry per class.
        ax2.axis('off')
        side_text = ""
        for i, cls in enumerate(class_names):
            side_text+=textwrap.fill("{} - {}".format(i,cls))+"\n"
        ax2.text(0, 0.5, side_text, horizontalalignment="left",fontsize=18, va='center', color='black', wrap=True)

    return figure

In [None]:
import matplotlib.pyplot as plt
import io
import tensorflow as tf
def plot_to_image(figure, close=True):
    """
    Converts the matplotlib plot specified by 'figure' to a PNG image and
    returns it.

    Args:
        figure: the matplotlib figure to render.
        close: if True (default) the figure is closed - and is inaccessible
            after this call - and the decoded image is returned. If False
            the figure is left open and None is returned.

    Returns:
        A TensorFlow image tensor with 3 channels, or None when close is False.
    """

    buf = io.BytesIO()

    # Render the figure that was passed in (plt.savefig would render whichever
    # figure happens to be current) to a PNG in memory.
    figure.savefig(buf, format='png')

    if not close:
        return None

    # Closing the figure prevents it from being displayed directly inside
    # the notebook.
    plt.close(figure)
    buf.seek(0)

    # Use tf.image.decode_png to convert the PNG buffer to a TF image with
    # 3 channels, i.e. without the alpha channel.
    image = tf.image.decode_png(buf.getvalue(), channels=3)

    return image

## On neighboring-sentence level 

### Process data 

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support


# Evaluate the trained model on the held-out test samples, collecting the
# true labels and the model's answers for the metrics computed further down.
test_loader = DataLoader(test_dataset, batch_size=32)

model.eval()
eval_accuracy = 0
count = 0
total_loss = 0

criterion = nn.CrossEntropyLoss(reduction='sum').to(device)

# NOTE: from here on `test_labels` holds the label tensors gathered batch by
# batch; `test_dataset` keeps its own reference to the original label list.
test_labels = []
test_answers = []
# Inference only: without no_grad every forward pass would build (and keep
# alive) an autograd graph.
with torch.no_grad(), tqdm(iter(test_loader), total=len(test_loader)) as t:
    for it, batch in enumerate(t):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        first = batch['sentence1']
        second = batch['sentence2']

        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        probs = F.softmax(outputs, dim=1)
        model_answer = torch.argmax(probs, dim=1)

        test_labels.append(labels)
        test_answers.append(model_answer)

        # Per-sample TensorBoard logging (disabled). Class 0 is the positive
        # class, i.e. the second sentence really follows the first one.
        # for i in range(len(first)):
        #     tag = "Positive sample" if model_answer[i]==0 else "Negative sample"
        #     tb_test.add_text(tag, "1st: {}  \n2nd:  {}  \nProbability: {}  \nModel answer: {}  \nTrue label: {}".format(
        #         first[i], second[i], probs[i].detach().cpu().numpy(), model_answer[i].detach().cpu().numpy(),labels[i].cpu().numpy()),
        #         global_step = step )

        loss = criterion(outputs.view(-1, 2), labels.view(-1))

        # print(outputs, labels)
        cnt, ttl = get_count_from_logits(outputs, labels)
        eval_accuracy += cnt
        total_loss += loss.item()
        count+=ttl
        t.set_description("Loss : {} Accuracy : {}".format(total_loss/count, eval_accuracy/count))
    # tb_test.add_text("Loss", str(total_loss/count), global_step = step )
    # tb_test.add_text("Accuracy", str(eval_accuracy/count), global_step = step)

Loss : 0.34142061072544366 Accuracy : 0.8442340791738382: 100%|██████████| 146/146 [00:08<00:00, 18.06it/s]


In [None]:
# Merge the per-batch tensors into one NumPy array each. This cell is not
# re-runnable on its own: afterwards both names hold arrays, not tensor lists.
test_labels = torch.cat(test_labels).cpu().numpy()
test_answers = torch.cat(test_answers).cpu().numpy()

### Neighboring Sentence Result 

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef
import matplotlib.pyplot as plt
import numpy as np
import itertools
import io
import tensorflow as tf

# Rows of the confusion matrix are true labels, columns are model answers.
cm = confusion_matrix(test_labels, test_answers)
# figure = plot_confusion_matrix(cm, class_names=["True", "False"], title="Accuracy: {:3f}\nMc: {:3f}".format(accuracy_score(test_labels, test_answers), matthews_corrcoef(test_labels, test_answers)))
# cm_image = plot_to_image(figure)

# Counts in the order: correct for class 1, correct for class 0,
# class 1 answered as 0, class 0 answered as 1.
print(cm[1][1],cm[0][0],cm[1][0],cm[0][1])

# a, b, c, d = precision, recall, F-score and support, one entry per class in [0, 1];
# the print shows the class-1 scores followed by the support of both classes.
a,b,c,d = precision_recall_fscore_support(test_labels, test_answers, labels=[0,1])
print(a[1],b[1],c[1],d,"\n")
# tb_test.add_image("Neighboring sentence-pair classification result", cm_image.numpy(), global_step=step, dataformats='HWC')

2235 1689 407 317
0.8757836990595611 0.8459500378501136 0.8606083943011167 [2006 2642] 



## On paragraph level 


### Examine sentence level relation in paragraph

In [None]:
from itertools import product
import numpy as np
from torch.utils.data import DataLoader
import torch.nn.functional as F
def create_paragraph_matrix(model, paragraph, mask=None, threshold=0.5):
    """Score sentence pairs of a paragraph with the classifier.

    Cell [i][j] of the result describes the pair (paragraph[i], paragraph[j])
    with paragraph[i] as first sentence.

    Uses the module-level ``tokenizer``, ``CustomDataset`` and ``device``.

    Args:
        model: classifier returning two class scores per pair.
        paragraph: list of sentences.
        mask: optional matrix (list of rows); only pairs whose mask cell
            equals 1 are scored. Expected to have one row and one column per
            sentence, since the result is reshaped to
            len(paragraph) x len(paragraph).
        threshold: with a mask, class-0 probabilities above this value
            become 1, everything else 0.

    Returns:
        A NumPy array of shape (len(paragraph), len(paragraph)). Without a
        mask it holds the class-0 probability of every pair; with a mask it
        holds 0/1 decisions, with 0 for every pair that was not scored. When
        there is no pair to score, an all-zero matrix is returned without
        calling the model.
    """
    size = len(paragraph)

    if mask is not None:
        height = len(mask)
        width = len(mask[0]) if height else 0
        pair_index = []  # flat position of every scored pair inside the mask
        first_sentences = []
        second_sentences = []
        for i in range(height):
            for j in range(width):
                if mask[i][j]==1:
                    pair_index.append(i*width+j)
                    first_sentences.append(paragraph[i])
                    second_sentences.append(paragraph[j])
    else:
        combinations = list(product(paragraph, paragraph))
        first_sentences = [pair[0] for pair in combinations]
        second_sentences = [pair[1] for pair in combinations]
    total_elements = len(second_sentences)

    if total_elements == 0:
        # Nothing to score (e.g. a one-sentence paragraph with a distance
        # mask): tokenizing and concatenating empty batches would fail.
        if mask is not None:
            return np.reshape([0]*(width*height), (size, size))
        return np.zeros((size, size), dtype=np.float32)

    # The dataset requires labels; they are dummies and never read back.
    labels = [0]*total_elements

    encodings = tokenizer(first_sentences, second_sentences, truncation=True, padding=True, max_length=64)
    ds = CustomDataset(first_sentences, second_sentences, encodings, labels)
    loader = DataLoader(ds, batch_size=32, shuffle=False)

    results = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            # Keep the probability of class 0 only.
            probs = F.softmax(outputs, dim=1)
            results.append(probs[:,0])

    if mask is not None:
        results = torch.cat(results).detach().cpu().numpy()
        # Scatter the thresholded decisions back to their matrix positions.
        temp = [0]*(width*height)
        for i, val in enumerate(results):
            temp[pair_index[i]]=1 if val>threshold else 0

        results = np.reshape(temp, (size, size))
    else:
        results = torch.cat(results)
        results = torch.reshape(results, (size, size)).detach().cpu().numpy()

    return results

In [None]:
# p_index = 0
# for article in test_articles:
#     for paragraph in article['paragraphs']:
#         matrix = create_paragraph_matrix(model, paragraph)
#         figure = plot_confusion_matrix(matrix, 
#                                     title="Sentence relation", 
#                                     y_label="First sentence", 
#                                     x_label="Second sentence", 
#                                     class_names=paragraph, 
#                                     normalize=False, 
#                                     side_labels=True)
#         cm_image = plot_to_image(figure)
#         tb_test.add_image("Sentence relation within paragraph", cm_image.numpy(), global_step=p_index, dataformats='HWC')
        
#         p_index+=1
#         if p_index>50:
#             break
#     if p_index>50:
#         break

In [None]:
# print(p_index)

In [None]:
# We work with the CAD routine CylindricalAlgebraicDecompose: part of the RegularChains Library for Maple. 
# It builds decompositions first of $\mathbb{C}^n$ before refining to a CAD of $\mathbb{R}^n$.  
# We ran the code in Maple 2018 but used an updated version of the RegularChains Library.  
# Brown's heuristic and the features for ML were coded in the sympy package v1.3 for Python 2.7.  
# The sotd heuristic was implemented in Maple as part of the ProjectionCAD package [EWBD14]. 
# Training and evaluation of the ML models was done using the scikit-learn package v0.20.2 for Python 2.7.  
# In order to implement our adapted cross-validation procedure we had to rewrite a number of the standard commands within the package to both use the redefined hopt in, and to access the data it requires during the cross-validation.


In [None]:
# We use our algorithm to approximate the coefficients of some small dimension hammocks, i.e., the 3×5, and the 5×5 hammocks. 
# The implementation of the algorithm was done in Maple software. 
# The exact reliability polynomials considered here are taken from. 
# We also compute upper and lower bounds for such networks, more exactly Stanley type of bounds (see), denoted by

### Classify paragraphs and detect unrelated sentences with and without sentence-distance 

In [None]:
import json
# matrix = [[0,1,1,0,0,0],
#           [0,0,0,0,0,0],
#           [0,1,0,1,0,0],
#           [0,1,0,0,1,0],
#           [0,1,0,0,0,1],
#           [0,1,0,0,0,0]]
def create_mask_matrix(sizeY, sizeX, distance):
    """Build a 0/1 mask selecting the cells at most ``distance`` to the right of the diagonal.

    Args:
        sizeY: number of rows.
        sizeX: number of columns.
        distance: maximum allowed value of ``column - row`` (which must also
            be positive) for a cell to be selected.

    Returns:
        ``(masked_matrix, cell_list)``: the mask as a list of rows, and the
        selected cells as ``[row, column]`` pairs in row-major order.
    """
    cell_list = [
        [row, col]
        for row in range(sizeY)
        for col in range(sizeX)
        if col > row and col - row <= distance
    ]
    masked_matrix = [[0] * sizeX for _ in range(sizeY)]
    for row, col in cell_list:
        masked_matrix[row][col] = 1
    return masked_matrix, cell_list

def mask_paragraph_matrix(matrix, distance, threshold=0.5):
    """Binarise a score matrix inside the upper band of width ``distance``.

    A cell ``(i, j)`` becomes 1 when ``j > i``, ``j - i <= distance`` and its
    score is strictly greater than ``threshold``; every other cell becomes 0.
    Scores outside the band are never compared against ``threshold``.

    Args:
        matrix: Sequence of rows of scores. Each row is copied with
            ``row[:]`` before being edited (a real copy for list rows).
        distance: Maximum allowed ``j - i`` for a cell to be kept.
        threshold: Score a band cell must exceed to be kept.

    Returns:
        A ``(masked_matrix, cell_list)`` tuple. ``cell_list`` holds the
        ``[i, j]`` pair of every kept cell in row-major order. An empty
        ``matrix`` yields ``([], [])``.
    """
    masked_matrix = [row[:] for row in matrix]
    height = len(masked_matrix)
    # The width comes from the first row; an empty matrix has no first row,
    # so fall back to 0 instead of raising IndexError.
    width = len(masked_matrix[0]) if height else 0
    cell_list = []
    for i in range(height):
        for j in range(width):
            in_band = j > i and j - i <= distance
            # Short-circuit keeps out-of-band scores from being compared.
            if in_band and masked_matrix[i][j] > threshold:
                masked_matrix[i][j] = 1
                cell_list.append([i, j])
            else:
                masked_matrix[i][j] = 0
    return masked_matrix, cell_list

def cluster_paragraph_matrix(number_elements, edges):
    """Group the indexes ``0 .. number_elements - 1`` into connected clusters.

    Every index starts in its own one-element cluster. Each edge
    ``(edge[0], edge[1])`` whose endpoints sit in different clusters merges
    them: both clusters are removed and their concatenation (first endpoint's
    members, then the second's) is appended at the end of the cluster list.

    Returns:
        The list of clusters, each a list of indexes.
    """
    groups = [[index] for index in range(number_elements)]

    def group_of(member):
        # Linear scan; yields None when no cluster contains the member.
        return next((group for group in groups if member in group), None)

    for edge in edges:
        first = group_of(edge[0])
        second = group_of(edge[1])
        if first == second:
            continue
        # Build the merged cluster before touching the list of clusters.
        merged = [*first, *second]
        groups.remove(first)
        groups.remove(second)
        groups.append(merged)

    return groups

# masked_arr, cell_list = mask_paragraph_matrix(matrix, 1)
# clusters = cluster_paragraph_matrix(len(masked_arr), cell_list)
# large_cluster = max(clusters, key=lambda item: len(item))
# unrelated = [e for arr in clusters for e in arr if arr!=large_cluster]
# print(matrix)
# print(masked_arr)
# print(cells)
# print(clusters)
# print(large_cluster)
# print(unrelated)

In [None]:
def process_paragraph(model, paragraph, distance=1, cmatrix=None, generate_full_matrix=True, threshold=0.5):
    """Flag sentences that are not linked to any other sentence of a paragraph.

    The sentence-relation matrix is binarised with ``mask_paragraph_matrix``
    and the surviving cells are used as edges for
    ``cluster_paragraph_matrix``. A sentence that ends up alone in its
    cluster is reported as unrelated.

    Args:
        model: Forwarded to ``create_paragraph_matrix`` when a matrix has to
            be computed.
        paragraph: Sequence of sentences.
        distance: Maximum index gap between two sentences for their relation
            to be considered.
        cmatrix: Optional precomputed relation matrix; when given, ``model``
            and ``paragraph`` are not used to build one.
        generate_full_matrix: When true, compute the matrix without a mask;
            otherwise pass the band mask from ``create_mask_matrix`` to
            ``create_paragraph_matrix``.
        threshold: Score a relation must exceed to count as a link.

    Returns:
        ``(paragraph_connection, unrelated_sentences)`` where
        ``paragraph_connection`` is 1 if at least one sentence is isolated
        (0 otherwise) and ``unrelated_sentences`` is a 0/1 marking with one
        entry per sentence.
    """
    if cmatrix is not None:
        matrix = cmatrix
    elif generate_full_matrix:
        matrix = create_paragraph_matrix(model, paragraph)
    else:
        band_mask, _ = create_mask_matrix(len(paragraph), len(paragraph), distance)
        matrix = create_paragraph_matrix(model, paragraph, mask=band_mask, threshold=threshold)

    # Binarise the matrix and cluster sentences over the surviving links.
    masked_arr, cell_list = mask_paragraph_matrix(matrix, distance, threshold=threshold)
    clusters = cluster_paragraph_matrix(len(masked_arr), cell_list)

    # A sentence is unrelated when it sits alone in its cluster. The former
    # unused ``max(clusters, ...)`` was dropped: it raised ValueError when
    # there were no clusters at all.
    isolated = {cluster[0] for cluster in clusters if len(cluster) == 1}

    # 1 means "contains at least one unrelated sentence".
    paragraph_connection = 1 if isolated else 0

    # Convert the isolated indexes into a per-sentence 0/1 marking.
    unrelated_sentences = [1 if i in isolated else 0 for i in range(len(masked_arr))]
    return paragraph_connection, unrelated_sentences

In [None]:
%%capture
!pip install bert_score
!pip install rank_bm25

In [None]:
import random
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef

from rank_bm25 import BM25Okapi
from bert_score import BERTScorer

# Flatten every paragraph of every test article into one list of sentences.
sentence_corpus = [
    sentence
    for article in test_articles
    for paragraph in article['paragraphs']
    for sentence in paragraph
]
# BM25 works on token lists; sentences are split on single spaces.
tokenized_sentence_corpus = [sentence.split(" ") for sentence in sentence_corpus]
bm25 = BM25Okapi(tokenized_sentence_corpus)
# Both English datasets use the English scorer; anything else uses Russian.
scorer = BERTScorer(lang='en' if language in ('ROCStories', 'eng') else 'ru')

def run_paragraph_classification_test(distance = 1, threshold = 0.5):
    """Evaluate unrelated-sentence detection on corrupted test paragraphs.

    Each paragraph of each article in the global ``test_articles`` is copied
    and corrupted by overwriting randomly chosen sentence(s) with an
    unrelated one. ``process_paragraph`` is then asked to mark the unrelated
    sentences, and its marking is compared with the injected positions.

    The corruption depends on the global ``language``:
      * ``'ROCStories'``: one sentence replaced via ``get_random_sent_by_bm25``.
      * ``'eng'``: ``min(randrange(1, 3), len(sample) // 2)`` sentences
        replaced via ``get_random_sent_by_bm25``.
      * anything else: one sentence replaced via ``get_random_sent``.

    Args:
        distance: Forwarded to ``process_paragraph``.
        threshold: Forwarded to ``process_paragraph``.

    Prints:
        The sentence-level confusion-matrix cells in the order TP, TN, FN, FP
        (class 1 = unrelated sentence), then precision, recall and F-score of
        class 1 followed by the per-class support array.
    """
    sentence_labels = []
    sentence_predictions = []
    with tqdm(iter(test_articles), total=len(test_articles),leave=True) as t:
        for a_index, article in enumerate(t):
            for p_index, paragraph in enumerate(article['paragraphs']):
                # Every paragraph is corrupted for now; switch this to
                # random.randrange(2) to mix in untouched paragraphs.
                label = 1
                sample = list(paragraph)

                unrelated_sentence_indexes = []
                if label == 1:
                    if language == 'eng':
                        numbers_of_unrelated = min(randrange(1, 3), len(sample) // 2)
                    else:
                        numbers_of_unrelated = 1
                    for _ in range(numbers_of_unrelated):
                        index = randrange(len(sample))
                        if language in ('ROCStories', 'eng'):
                            # BM25 lookup is keyed on the sentence being replaced.
                            unrelated_sentence = get_random_sent_by_bm25(test_articles, sample[index])
                        else:
                            unrelated_sentence = get_random_sent(test_articles, a_index, p_index)
                        sample[index] = unrelated_sentence
                        unrelated_sentence_indexes.append(index)

                # Only the per-sentence marking is scored here.
                _, sentence_pred = process_paragraph(model, sample, distance=distance, generate_full_matrix=False, threshold=threshold)

                sentence_labels.extend([1 if i in unrelated_sentence_indexes else 0 for i in range(len(sample))])
                sentence_predictions.extend(sentence_pred)

    # labels=[0, 1] pins the matrix to 2x2 even when one class never occurs,
    # so the cell lookups below cannot go out of range.
    cm = confusion_matrix(sentence_labels, sentence_predictions, labels=[0,1])
    print(cm[1][1],cm[0][0],cm[1][0],cm[0][1])

    precision, recall, fscore, support = precision_recall_fscore_support(sentence_labels, sentence_predictions, labels=[0,1])
    print(precision[1],recall[1],fscore[1],support,"\n")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




### Paragraph Classification and Incoherent Sentence Detection

In [None]:
# Sweep the sentence-distance window from 1 to 10 at a fixed threshold of 0.5.
for distance_window in range(1, 11):
    run_paragraph_classification_test(distance_window, threshold=0.5)

100%|██████████| 209/209 [00:30<00:00,  6.90it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

1953 4951 176 1690
0.5360966236618172 0.9173320807891029 0.6767151767151768 [6641 2129] 



100%|██████████| 209/209 [00:33<00:00,  6.22it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

1868 5999 261 642
0.7442231075697211 0.8774072334429309 0.8053459797370123 [6641 2129] 



100%|██████████| 209/209 [00:34<00:00,  5.99it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

1843 6116 286 525
0.778293918918919 0.8656646312822922 0.8196575494774294 [6641 2129] 



100%|██████████| 209/209 [00:35<00:00,  5.83it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

1819 6109 310 532
0.773713313483624 0.8543917332080789 0.8120535714285715 [6641 2129] 



100%|██████████| 209/209 [00:36<00:00,  5.70it/s]
  0%|          | 1/209 [00:00<00:21,  9.70it/s]

1780 6125 349 516
0.7752613240418118 0.8360732738374824 0.8045197740112994 [6641 2129] 



100%|██████████| 209/209 [00:38<00:00,  5.47it/s]
  0%|          | 1/209 [00:00<00:20,  9.94it/s]

1797 6143 332 498
0.7830065359477124 0.8440582433067167 0.8123869801084991 [6641 2129] 



100%|██████████| 209/209 [00:38<00:00,  5.38it/s]
  0%|          | 1/209 [00:00<00:23,  8.92it/s]

1801 6150 328 491
0.7857766143106457 0.8459370596524189 0.8147477946166026 [6641 2129] 



100%|██████████| 209/209 [00:38<00:00,  5.41it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

1799 6103 330 538
0.7697903294822422 0.8449976514795678 0.80564263322884 [6641 2129] 



100%|██████████| 209/209 [00:38<00:00,  5.42it/s]
  0%|          | 1/209 [00:00<00:24,  8.53it/s]

1804 6139 325 502
0.782307025151778 0.8473461719116956 0.8135287485907553 [6641 2129] 



100%|██████████| 209/209 [00:38<00:00,  5.40it/s]

1813 6135 316 506
0.7818025010780509 0.8515735086895256 0.8151978417266187 [6641 2129] 






## Missing Sentence


In [None]:
def process_paragraph2(model, paragraph, distance=1, cmatrix=None, generate_full_matrix=True,threshold=0.5):
    """Mark unrelated sentences and likely gaps between neighbouring sentences.

    The relation matrix is binarised with ``mask_paragraph_matrix`` and
    clustered with ``cluster_paragraph_matrix``. A gap is reported between
    sentences ``i`` and ``i + 1`` when they fall into different clusters and
    at least one of those two clusters has more than one sentence.

    Args:
        model: Forwarded to ``create_paragraph_matrix`` when a matrix has to
            be computed.
        paragraph: Sequence of sentences.
        distance: Maximum index gap between two sentences for their relation
            to be considered.
        cmatrix: Optional precomputed relation matrix.
        generate_full_matrix: When true, compute the matrix without a mask;
            otherwise pass the band mask from ``create_mask_matrix`` to
            ``create_paragraph_matrix``.
        threshold: Score a relation must exceed to count as a link.

    Returns:
        ``(paragraph_connection, unrelated_sentences, missing_sentences)``:
        ``paragraph_connection`` is 1 if any gap was reported (0 otherwise),
        ``unrelated_sentences`` is a 0/1 marking per sentence (1 = alone in
        its cluster), and ``missing_sentences`` is a 0/1 marking per pair of
        adjacent sentences (one entry fewer than there are sentences).
    """
    if cmatrix is not None:
        matrix = cmatrix
    elif generate_full_matrix:
        matrix = create_paragraph_matrix(model, paragraph)
    else:
        band_mask, _ = create_mask_matrix(len(paragraph), len(paragraph), distance)
        matrix = create_paragraph_matrix(model, paragraph, mask=band_mask, threshold=threshold)

    # Binarise the matrix and cluster sentences over the surviving links.
    masked_arr, cell_list = mask_paragraph_matrix(matrix, distance, threshold=threshold)
    sentence_count = len(masked_arr)
    clusters = cluster_paragraph_matrix(sentence_count, cell_list)

    # The former unused ``max(clusters, ...)`` was dropped: it raised
    # ValueError when there were no clusters at all.
    isolated = {cluster[0] for cluster in clusters if len(cluster) == 1}
    unrelated_sentences = [1 if i in isolated else 0 for i in range(sentence_count)]

    # Map every sentence to the position of its cluster in ``clusters``.
    cluster_marking = [0] * sentence_count
    for cluster_index, cluster in enumerate(clusters):
        for sentence_index in cluster:
            cluster_marking[sentence_index] = cluster_index

    # A cluster change between neighbours counts as a gap unless both
    # neighbours are isolated sentences.
    missing_sentences = []
    for left, right in zip(cluster_marking, cluster_marking[1:]):
        crosses_clusters = left != right
        involves_group = len(clusters[left]) > 1 or len(clusters[right]) > 1
        missing_sentences.append(1 if crosses_clusters and involves_group else 0)

    # 1 means "at least one gap was detected".
    paragraph_connection = 1 if 1 in missing_sentences else 0

    return paragraph_connection, unrelated_sentences, missing_sentences

In [None]:
import random
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef

def run_paragraph_classification_test2(distance = 1, threshold=0.5):
    """Evaluate missing-sentence detection on test paragraphs.

    For every paragraph with more than two sentences in the global
    ``test_articles``, one interior sentence (never the first or last) is
    removed from a copy. ``process_paragraph2`` then predicts, for each pair
    of adjacent sentences, whether something is missing between them.

    Args:
        distance: Forwarded to ``process_paragraph2``.
        threshold: Forwarded to ``process_paragraph2``.

    Prints:
        The gap-level confusion-matrix cells in the order TP, TN, FN, FP
        (class 1 = gap), then precision, recall and F-score of class 1
        followed by the per-class support array.
    """
    gap_labels = []
    gap_predictions = []
    with tqdm(iter(test_articles), total=len(test_articles),leave=True) as t:
        for article in t:
            long_paragraphs = [x for x in article['paragraphs'] if len(x)>2]
            for paragraph in long_paragraphs:
                sample = list(paragraph)

                # Remove one interior sentence; the gap then sits between the
                # sentences now at positions removed_at - 1 and removed_at.
                removed_at = randrange(1, len(sample) - 1)
                sample.pop(removed_at)
                gap_index = removed_at - 1

                _, _, missing_pred = process_paragraph2(model, sample, distance=distance, generate_full_matrix=False, threshold=threshold)

                # One label per adjacent sentence pair.
                gap_labels.extend([1 if i == gap_index else 0 for i in range(len(sample) - 1)])
                gap_predictions.extend(missing_pred)

    cm = confusion_matrix(gap_labels, gap_predictions, labels=[0,1])
    print(cm[1][1],cm[0][0],cm[1][0],cm[0][1])

    precision, recall, fscore, support = precision_recall_fscore_support(gap_labels, gap_predictions, labels=[0,1])
    print(precision[1],recall[1],fscore[1],support,"\n")

### Missing Sentence Result 

In [None]:
# Sweep the sentence-distance window from 1 to 5 at a fixed threshold of 0.5.
for distance_window in range(1, 6):
    run_paragraph_classification_test2(distance_window, threshold=0.5)

100%|██████████| 209/209 [00:31<00:00,  6.56it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

321 2079 1808 304
0.5136 0.15077501174260216 0.23311546840958602 [2383 2129] 



100%|██████████| 209/209 [00:32<00:00,  6.35it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

176 2233 1953 150
0.5398773006134969 0.08266791921089714 0.1433808553971487 [2383 2129] 



100%|██████████| 209/209 [00:33<00:00,  6.23it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

128 2278 2001 105
0.5493562231759657 0.060122123062470646 0.1083827265029636 [2383 2129] 



100%|██████████| 209/209 [00:34<00:00,  6.12it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

136 2295 1993 88
0.6071428571428571 0.06387975575387506 0.11559711007224818 [2383 2129] 



100%|██████████| 209/209 [00:34<00:00,  6.02it/s]


121 2290 2008 93
0.5654205607476636 0.05683419445749178 0.1032863849765258 [2383 2129] 



## Paragraph classification 


In [None]:
import random
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef

def run_paragraph_classification_test3(distance = 1, distance2 =1):
    """Four-way paragraph classification: intact, missing, unrelated, or both.

    Paragraphs with more than two sentences from the global ``test_articles``
    are copied and corrupted according to a label:

      * 0 - left intact
      * 1 - one interior sentence removed
      * 2 - one sentence replaced by an unrelated one
      * 3 - both corruptions applied

    Labels 0-2 are assigned round-robin (``count % 3``); a paragraph with more
    than four sentences is re-labelled 3 with probability 2/3. The prediction
    is ``2 * connection_pred + connection_pred2`` (``process_paragraph`` and
    ``process_paragraph2`` respectively), which uses the same coding.

    Replacement sentences come from ``get_random_sent_by_bm25`` when the
    global ``language`` is ``'ROCStories'`` or ``'eng'`` and from
    ``get_random_sent`` otherwise.

    Args:
        distance: Distance passed to ``process_paragraph``.
        distance2: Distance passed to ``process_paragraph2``.

    Prints:
        The 4x4 confusion matrix, then the per-class precision, recall,
        F-score and support arrays.
    """
    use_bm25 = language in ('ROCStories', 'eng')

    def drop_interior_sentence(sample):
        # Never removes the first or the last sentence.
        sample.pop(randrange(1, len(sample) - 1))

    def replace_sentence(sample, index, a_index, p_index):
        # Overwrite sample[index] with an unrelated sentence.
        if use_bm25:
            # BM25 lookup is keyed on the sentence being replaced.
            sample[index] = get_random_sent_by_bm25(test_articles, sample[index])
        else:
            # NOTE(review): p_index counts only paragraphs longer than two
            # sentences; confirm get_random_sent does not expect the index
            # into the unfiltered article['paragraphs'].
            sample[index] = get_random_sent(test_articles, a_index, p_index)

    def two_separated_positions(sample):
        # Two interior positions at least two apart: the larger pick is
        # shifted right by one after sorting.
        picks = sorted(random.sample(range(1, len(sample) - 2), 2))
        return [x + i for i, x in enumerate(picks)]

    paragraph_labels = []
    paragraph_predictions = []
    count = 0
    with tqdm(iter(test_articles), total=len(test_articles),leave=True) as t:
        for a_index, article in enumerate(t):
            for p_index, paragraph in enumerate([x for x in article['paragraphs'] if len(x)>2]):
                count += 1
                label = count % 3
                if len(paragraph) > 4 and random.randrange(3) != 0:
                    label = 3
                sample = list(paragraph)

                if label == 3 and language != 'ROCStories':
                    first, second = two_separated_positions(sample)
                    if language == 'eng':
                        sample.pop(first)
                        # Fix: the BM25 query used to read sample[index],
                        # where `index` was undefined or left over from an
                        # earlier paragraph. The positions are also drawn
                        # from the real paragraph length now, not a
                        # hard-coded 5, matching the non-English branch.
                        replace_sentence(sample, second, a_index, p_index)
                    else:
                        replace_sentence(sample, first, a_index, p_index)
                        sample.pop(second)
                else:
                    if label in (1, 3):
                        drop_interior_sentence(sample)
                    if label in (2, 3):
                        replace_sentence(sample, randrange(len(sample)), a_index, p_index)

                connection_pred, _ = process_paragraph(model, sample, distance=distance, generate_full_matrix=False)
                connection_pred2, _, _ = process_paragraph2(model, sample, distance=distance2, generate_full_matrix=False)

                paragraph_labels.append(label)
                # 2 * unrelated-flag + missing-flag reproduces labels 0..3.
                paragraph_predictions.append(connection_pred*2 + connection_pred2)

    cm = confusion_matrix(paragraph_labels, paragraph_predictions, labels=[0,1,2,3])
    print(cm)

    precision, recall, fscore, support = precision_recall_fscore_support(paragraph_labels, paragraph_predictions, labels=[0,1,2,3])
    print(precision,recall,fscore,support,"\n")

### Paragraph Classification Result

In [None]:
# Try every combination of the two distance windows in {1, 2}; the printed
# header shows the zero-based loop indexes.
for i in (0, 1):
    for j in (0, 1):
        print(i, "-----", j)
        run_paragraph_classification_test3(distance=i + 1, distance2=j + 1)


  0%|          | 0/209 [00:00<?, ?it/s]

0 ----- 0


100%|██████████| 209/209 [01:04<00:00,  3.23it/s]
  0%|          | 1/209 [00:00<00:32,  6.36it/s]

[[374  28  12 156]
 [369  14 103  91]
 [ 25  11 150 415]
 [  8  16  48 309]]
[0.48195876 0.20289855 0.47923323 0.31822863] [0.65614035 0.02426343 0.24958403 0.81102362] [0.55572065 0.04334365 0.32822757 0.45710059] [570 577 601 381] 

0 ----- 1


100%|██████████| 209/209 [01:04<00:00,  3.22it/s]
  0%|          | 1/209 [00:00<00:31,  6.60it/s]

[[396   3  82  81]
 [396   2 128  54]
 [ 23   2  81 472]
 [ 10   7  41 351]]
[0.48       0.14285714 0.2439759  0.36638831] [0.70462633 0.00344828 0.14013841 0.85819071] [0.57101658 0.00673401 0.17802198 0.51353328] [562 580 578 409] 

1 ----- 0


100%|██████████| 209/209 [01:05<00:00,  3.21it/s]
  0%|          | 1/209 [00:00<00:32,  6.36it/s]

[[374 116  12  70]
 [378  49 103  57]
 [ 24  44 130 397]
 [  1  48  48 278]]
[0.48133848 0.19066148 0.44368601 0.34663342] [0.65384615 0.0834753  0.21848739 0.74133333] [0.5544848  0.11611374 0.29279279 0.47238743] [572 587 595 375] 

1 ----- 1


100%|██████████| 209/209 [01:05<00:00,  3.17it/s]

[[483  11   4  74]
 [440   5  95  44]
 [ 46   3  63 471]
 [ 34  15  15 326]]
[0.48155533 0.14705882 0.3559322  0.35628415] [0.84440559 0.00856164 0.10806175 0.83589744] [0.61333333 0.01618123 0.16578947 0.49961686] [572 584 583 390] 






In [None]:
# Save the language setting, the BERT model name and the model weights
# (state dict) as a single dictionary at checkpoint_path.
torch.save({
    'language':language ,
    'bert_model_name':bert_model_name,
    'model_state_dict': model.state_dict()
    }, checkpoint_path)

## Paragraph classification 


In [None]:
import random
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef

def run_paragraph_classification_test4(distance = 1, type=1):
    """Evaluate binary paragraph-corruption detection over ``test_articles``.

    Every paragraph with more than two sentences is visited. Labels alternate
    deterministically (``count % 2``, so the first paragraph is labelled 1):
    label 0 keeps the paragraph untouched, label 1 corrupts a copy of it.

    Args:
        distance: forwarded unchanged to ``process_paragraph`` /
            ``process_paragraph2`` as their ``distance`` argument.
        type: corruption / scoring mode.
            0 -- remove one interior sentence (never the first or the last)
                 and score the sample with ``process_paragraph``.
            1 -- overwrite one randomly chosen sentence with a sentence from
                 ``get_random_sent_by_bm25`` (when ``language`` is
                 'ROCStories' or 'eng') or ``get_random_sent`` (otherwise),
                 and score the sample with ``process_paragraph2``.

    Raises:
        ValueError: if ``type`` is neither 0 nor 1.

    Returns:
        None. The confusion matrix and the per-class precision, recall,
        F-score and support (for labels 0 and 1) are printed.

    Reads the notebook globals ``test_articles``, ``language`` and ``model``.
    """
    # Fail early: with any other value no prediction branch would run and
    # ``connection_pred`` would be unbound (or stale from a previous paragraph).
    if type not in (0, 1):
        raise ValueError(
            "type must be 0 (remove a sentence) or 1 (replace a sentence), got {!r}".format(type))

    # The per-language branches only differ in which sampler supplies the
    # replacement sentence for type 1.
    use_bm25 = language in ('ROCStories', 'eng')

    paragraph_labels = []
    paragraph_predictions = []
    count = 0
    with tqdm(iter(test_articles), total=len(test_articles),leave=True) as t:
        for a_index, article in enumerate(t):
            long_paragraphs = [x for x in article['paragraphs'] if len(x)>2]
            # NOTE(review): p_index is the position within the filtered list,
            # not within article['paragraphs'] -- confirm that this is what
            # get_random_sent expects.
            for p_index, paragraph in enumerate(long_paragraphs):
                # Deterministic alternation keeps the two classes balanced.
                count += 1
                label = count % 2

                # Work on a copy so the dataset itself is never modified.
                sample = list(paragraph)
                if label == 1:
                    if type == 0:
                        # len(sample) >= 3 here, so the interior range is non-empty.
                        index = randrange(1, len(sample)-1)
                        sample.pop(index)
                    else:
                        index = randrange(len(sample))
                        if use_bm25:
                            unrelated_sentence = get_random_sent_by_bm25(test_articles, sample[index])
                        else:
                            unrelated_sentence = get_random_sent(test_articles, a_index, p_index)
                        sample[index] = unrelated_sentence

                # Only the paragraph-level prediction is used; the remaining
                # return values are ignored.
                if type == 0:
                    connection_pred, _ = process_paragraph(
                        model, sample, distance=distance, generate_full_matrix=False)
                else:
                    connection_pred, _, _ = process_paragraph2(
                        model, sample, distance=distance, generate_full_matrix=False)

                paragraph_labels.append(label)
                paragraph_predictions.append(connection_pred)

    cm = confusion_matrix(paragraph_labels, paragraph_predictions, labels=[0,1])
    print(cm)

    a,b,c,d = precision_recall_fscore_support(paragraph_labels, paragraph_predictions, labels=[0,1])
    print(a,b,c,d,"\n")

### Paragraph Classification Result

In [None]:
# Evaluate every combination of corruption type (i in {0, 1}) and
# distance (j + 1 in {1, 2}); the banner shows the raw loop indices.
for i, j in ((outer, inner) for outer in range(2) for inner in range(2)):
    print(i,j)
    run_paragraph_classification_test4(distance=j + 1, type=i)


  0%|          | 0/209 [00:00<?, ?it/s]

0 0


100%|██████████| 209/209 [00:33<00:00,  6.26it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

[[770 294]
 [715 350]]
[0.51851852 0.54347826] [0.72368421 0.3286385 ] [0.60415849 0.40959626] [1064 1065] 

0 1


100%|██████████| 209/209 [00:33<00:00,  6.29it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

[[927 137]
 [812 253]]
[0.53306498 0.64871795] [0.8712406  0.23755869] [0.66143418 0.34776632] [1064 1065] 

1 0


100%|██████████| 209/209 [00:32<00:00,  6.50it/s]
  0%|          | 0/209 [00:00<?, ?it/s]

[[685 379]
 [252 813]]
[0.73105656 0.68204698] [0.64379699 0.76338028] [0.68465767 0.72042534] [1064 1065] 

1 1


100%|██████████| 209/209 [00:33<00:00,  6.22it/s]

[[906 158]
 [199 866]]
[0.8199095  0.84570312] [0.85150376 0.81314554] [0.83540802 0.82910483] [1064 1065] 






## Close Tensorboard Writers

In [None]:
# tb.close()
# tb_test.close()
# tb_training.close()
# tb_validation.close()

# Tensorboard Summary

In [None]:
# %load_ext tensorboard

**1. Run the next block to list the available experiment reports.**

In [None]:
#@title Available save files
# import os
# from os.path import join

# for p in [ f.path for f in os.scandir(join(log_path_base)) if f.is_dir() ]:
#     print("\""+p+"\"")

**2. Copy one of the listed paths, paste it into `examining_path`, and run the cell.**

In [None]:
# import ipywidgets as widgets
# examining_path = "\"/content/drive/My Drive/Colab Notebooks/next_sentence/logs/log_ROCStories_bert-base-uncased_pooler_0_768\"" #@param {type:"string"}

**3. Launch tensorboard**

In [None]:
# %tensorboard --logdir {examining_path} --samples_per_plugin "text=100,images=200"

# Interactive form


Note: this section requires running the first three main sections, plus either (prepare dataset + create model and training) or (load model from checkpoint).

In [None]:
# #@title form 

# input = "There are commonly two approaches when evaluating a \
# keyphrase extraction model. The first approach involves a human \
# annotator, who reads the article and the result extracted by the \
# model and assesses them manually. \
# Thus, the result of this paper can serve as a valuable resource to improve existing systems. \
# This approach requires a high \
# amount of manual effort, and the result can be affected by subjective \
# opinions. The second approach makes use of the metrics like \
# Precision, Recall, and F1-score and compares the extracted \
# list of keyphrases with the list of keyphrases annotated by \
# authors." 

# distance = 1 #@param {type:"slider", min:1, max:5, step:1}

In [None]:
# import numpy 
# # preprocessing input
# paragraph = [s.strip() for s in input.split('.') if s.strip()]
# # process with model
# matrix = create_paragraph_matrix(model, paragraph)
# # output result (sentence relation matrix, paragraph integrity prediction (+unrelated sentence prediction in negative case))
# figure = plot_confusion_matrix(matrix, 
#                             title="Sentence relation", 
#                             y_label="First sentence", 
#                             x_label="Second sentence", 
#                             class_names=paragraph, 
#                             normalize=False, 
#                             side_labels=True)
# cm_image = plot_to_image(figure, close=False)

# masked_matrix, _ = mask_paragraph_matrix(matrix, 2)
# figure = plot_confusion_matrix(numpy.array(masked_matrix), 
#                             title="Sentence relation", 
#                             y_label="First sentence", 
#                             x_label="Second sentence", 
#                             class_names=paragraph, 
#                             normalize=False, 
#                             side_labels=True)
# cm_image = plot_to_image(figure, close=False)

# integrity, unrelated = process_paragraph(model, paragraph, distance, cmatrix=matrix)

# print("Paragraph integrity: {}; Unrelated sentences: {}".format(str(integrity), str(unrelated)))
