In [1]:
import pandas as pd
import numpy as np
class Webis17:
    truth_file = None
    problem_file = None
    corpus = [] # (title, paragraphs, label)

    def __init__(self, path):
        self.truth_file = path + 'truth.jsonl'
        self.problem_file = path + 'instances.jsonl'

    def get_truths(self, size=100):
        df = pd.read_json(self.truth_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['truthMean'].values

    def get_texts(self, size=100):
        df = pd.read_json(self.problem_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['targetTitle'], df['targetParagraphs']

    def build_corpus(self, size=100):
        (truth_id, label) = self.get_truths(size)
        ground_truth = {truth_id[i] : label[i] for i in range(len(label))}
        (tweet_id, titles, texts) = self.get_texts(size)
        for i, tid in enumerate(tweet_id):
            try:
                if abs(ground_truth[tid] - 0.5) > 0.2: # getting only high confidence examples
                    self.corpus.append( (titles[i], ' '.join(txt for txt in texts[i]), ground_truth[tid]) ) # tid is discarded from now on
            except KeyError:
                print(f'Tweet {tid} is not in ground truth!')
                pass
        print(f'Getting {len(self.corpus)} valid examples from training set.')
web17 = Webis17('./data/clickbait17/')
web17.build_corpus(size=19538)

Getting 12963 valid examples from training set.


In [2]:
import nltk
sentences = []
truths = []
for tweet in web17.corpus:
    title, paragraph, label = tweet
    sent_pairs = []
    # for each sent in paragraph, pair it with the text
    for sent in nltk.sent_tokenize(paragraph):
        #sent_pairs.append("[CLS] " + title + " [SEP]" + sent + "[sep]")
        sent_pairs.append( (title, sent) )
    sentences.append(sent_pairs)
    truths.append(label)

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sentences, truths, test_size=0.01)
class Trainset():
    def __getitem__(self, i):
        return X_train[i], y_train[i]
    def __len__(self):
        return len(y_train)
class Testset():
    def __getitem__(self, i):
        return X_test[i], y_test[i]
    def __len__(self):
        return len(y_test)
trainset, testset = Trainset(), Testset()


In [21]:
def get_attentions(outputs, layer=0, attention_head=0, avg=False):
    '''
    get the particular output for a particular layer and attention head
    layer -> 0 to 11
    attention_head -> 0 to 11
    '''
    if avg:
        #avg over all attention heads in a layer
        return outputs[layer].squeeze(0).mean(dim=0)  
    #return values for a particular attention head inside a specific layer
    return outputs[layer].squeeze(0)[attention_head]

In [22]:
len(testset)

130

In [14]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
bert_model.eval()

COS = torch.nn.CosineSimilarity(dim=0)


In [24]:
import torch
from torch.utils.data import DataLoader

batch_size = 256
train_loader = torch.utils.data.DataLoader( trainset, batch_size, shuffle=True )
test_loader = torch.utils.data.DataLoader( testset, batch_size, shuffle=True )

preds = []
labels = []

for data, label in test_loader:
    similarity_scores = []
    for sent_pair in data:
        (title, sent) = sent_pair
        input1 = tokenizer(title, return_tensors="pt", padding=True, truncation=True)
        input2 = tokenizer(sent,  return_tensors="pt", padding=True, truncation=True)

        #print(input1)
        #print(input2)

        with torch.no_grad():
            outputs1 = bert_model(**input1)
            outputs2 = bert_model(**input2)
            attention1 = get_attentions(outputs1).detach()
            attention2 = get_attentions(outputs2).detach()
            similarity_scores.append( COS(attention1, attention2).item() )
    score = np.mean(np.array(similarity_scores))
    preds.append(score)
    labels.append(label)

RuntimeError: The size of tensor a (29) must match the size of tensor b (114) at non-singleton dimension 0