In [4]:
import pandas as pd
import numpy as np
class Webis17:
    truth_file = None
    problem_file = None
    corpus = [] # (title, paragraphs, label)

    def __init__(self, path):
        self.truth_file = path + 'truth.jsonl'
        self.problem_file = path + 'instances.jsonl'

    def get_truths(self, size=100):
        df = pd.read_json(self.truth_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['truthMean'].values

    def get_texts(self, size=100):
        df = pd.read_json(self.problem_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['targetTitle'], df['targetParagraphs']

    def build_corpus(self, size=100):
        (truth_id, label) = self.get_truths(size)
        ground_truth = {truth_id[i] : label[i] for i in range(len(label))}
        (tweet_id, titles, texts) = self.get_texts(size)
        for i, tid in enumerate(tweet_id):
            try:
                if abs(ground_truth[tid] - 0.5) > 0.2: # getting only high confidence examples
                    self.corpus.append( (titles[i], ' '.join(txt for txt in texts[i]), ground_truth[tid]) ) # tid is discarded from now on
            except KeyError:
                print(f'Tweet {tid} is not in ground truth!')
                pass
        print(f'Getting {len(self.corpus)} valid examples from training set.')
web17 = Webis17('./data/clickbait17/')
web17.build_corpus(size=19538)

Getting 12963 valid examples from training set.


In [8]:
import nltk
sentences = []
truths = []
for tweet in web17.corpus:
    title, paragraph, label = tweet
    seconds = []
    # for each sent in paragraph, pair it with the text
    for sent in nltk.sent_tokenize(paragraph):
        seconds.append( sent )
    sentences.append((title, seconds))
    truths.append(label)

In [6]:
def get_attentions(outputs, layer=0, attention_head=0, avg=False):
    '''
    get the particular output for a particular layer and attention head
    layer -> 0 to 11
    attention_head -> 0 to 11
    '''
    if avg:
        #avg over all attention heads in a layer
        return outputs[layer].squeeze(0).mean(dim=0)  
    #return values for a particular attention head inside a specific layer
    return outputs[layer].squeeze(0)[attention_head]

In [26]:
import torch
import numpy as np
from transformers import BertModel
from transformers import AutoTokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model.eval()
COS = torch.nn.CosineSimilarity()


In [31]:
def predict(tweet):
    title, sents = tweet
    titles = [title] * len(sents)
    input1 = tokenizer(titles, padding='max_length', max_length=200, return_tensors="pt", truncation=True) #'longest_first', 
    input2 = tokenizer(sents,  padding='max_length', max_length=200, return_tensors="pt", truncation=True) #'longest_first', 
    with torch.no_grad():
        outputs1 = bert_model(**input1)
        outputs2 = bert_model(**input2)
        attention1 = get_attentions(outputs1).detach()
        attention2 = get_attentions(outputs2).detach()
        score = torch.mean(COS(attention1, attention2)).item() 
        print(f'Getting score {score}')
        return score

In [32]:
test_size = 1000

predictions = [predict(sentences[i]) for i in range(test_size)]
labels = truths[:test_size]


Getting score 0.7008105516433716
Getting score 0.7388438582420349
Getting score 0.6172273755073547
Getting score 0.7433063387870789
Getting score 0.6820316314697266
Getting score 0.7619144916534424
Getting score 0.6675407886505127
Getting score 0.7138804793357849
Getting score 0.7003026604652405
Getting score 0.6875993609428406


In [38]:
cos_vec = torch.nn.CosineSimilarity(dim=0)
print(f'The overall score on the test set is {cos_vec(torch.tensor(predictions), torch.tensor(labels)).item()}.')

The overall score on the test set is 0.6527574062347412.
