In [6]:
import pandas as pd
import numpy as np
class Webis17:
    truth_file = None
    problem_file = None
    corpus = [] # (title, paragraphs, label)

    def __init__(self, path):
        self.truth_file = path + 'truth.jsonl'
        self.problem_file = path + 'instances.jsonl'

    def get_truths(self, size=100):
        df = pd.read_json(self.truth_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['truthMean'].values

    def get_texts(self, size=100):
        df = pd.read_json(self.problem_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['targetTitle'], df['targetParagraphs']

    def build_corpus(self, size=100):
        (truth_id, label) = self.get_truths(size)
        ground_truth = {truth_id[i] : label[i] for i in range(len(label))}
        (tweet_id, titles, texts) = self.get_texts(size)
        for i, tid in enumerate(tweet_id):
            try:
                if abs(ground_truth[tid] - 0.5) > 0.2: # getting only high confidence examples
                    self.corpus.append( (titles[i], ' '.join(txt for txt in texts[i]), ground_truth[tid]) ) # tid is discarded from now on
            except KeyError:
                print(f'Tweet {tid} is not in ground truth!')
                pass
        print(f'Getting {len(self.corpus)} valid examples from training set.')
web17 = Webis17('../input/eecs498/data/clickbait17/')
web17.build_corpus(size=19538)

Getting 12963 valid examples from training set.


In [7]:
import nltk
sentences = []
truths = []
for tweet in web17.corpus:
    title, paragraph, label = tweet
    seconds = []
    # for each sent in paragraph, pair it with the text
    for sent in nltk.sent_tokenize(paragraph):
        seconds.append( sent )
    sentences.append((title, seconds))
    truths.append(label)

In [8]:
def get_attentions(outputs, layer=0, attention_head=0, avg=False):
    '''
    get the particular output for a particular layer and attention head
    layer -> 0 to 11
    attention_head -> 0 to 11
    '''
    if avg:
        #avg over all attention heads in a layer
        return outputs[layer].squeeze(0).mean(dim=0)  
    #return values for a particular attention head inside a specific layer
    return outputs[layer].squeeze(0)[attention_head]

In [9]:
import torch
import numpy as np
from transformers import BertModel
from transformers import AutoTokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_model.eval()
COS = torch.nn.CosineSimilarity()


In [10]:
def predict(tweet):
    title, sents = tweet
    titles = [title] * len(sents)
    input1 = tokenizer(titles, padding='max_length', max_length=200, return_tensors="pt", truncation=True) #'longest_first', 
    input2 = tokenizer(sents,  padding='max_length', max_length=200, return_tensors="pt", truncation=True) #'longest_first', 
    input1 = input1.to(device)    
    input2 = input2.to(device)

    with torch.no_grad():
        outputs1 = bert_model(**input1)
        outputs2 = bert_model(**input2)
        attention1 = get_attentions(outputs1).detach()
        attention2 = get_attentions(outputs2).detach()
        score = torch.mean(COS(attention1, attention2)).item() 
        #print(f'Getting score {score}')
        return score

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
test_size = 12963
predictions, labels = [], []
counter = 0
for i in range(test_size):
    try:
        predictions.append(predict(sentences[i]))
        labels.append(truths[i])
        counter += 1
        if counter % 200 == 0:
            print(f'Finish running on example #  {counter}/10000', end='\r')
    except KeyboardInterrupt:
        exit(1)
    except:
        pass
predictions = torch.tensor(predictions)
labels = torch.tensor(labels)

Finish running on example #       1200/10000

In [16]:
'''
run evaluation on the results
'''
import torch.nn as nn
loss = nn.MSELoss()
print(f'The MSE loss of naive similarity model on first {counter} examples is {loss(predictions, labels)}.')    

The MSE loss of naive similarity model on first 10 examples is 0.2851293122376381.


In [17]:
import pickle
pickle.dump(predictions, open(f'../working/predictions_{counter}.pkl', 'wb'))
pickle.dump(labels, open(f'../working/labels_{counter}.pkl', 'wb'))