In [2]:
import pandas as pd
import numpy as np
import os
class Webis17:
    truth_file = None
    problem_file = None
    corpus = [] # (title, paragraphs, label)

    def __init__(self, path):
        self.truth_file = path + 'truth.jsonl'
        self.problem_file = path + 'instances.jsonl'

    def get_truths(self, size=100):
        df = pd.read_json(self.truth_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['truthMean'].values

    def get_texts(self, size=100):
        df = pd.read_json(self.problem_file, lines=True)
        df = df.loc[:size, :]
        return df['id'], df['targetTitle'], df['targetParagraphs']

    def build_corpus(self, size=100):
        (truth_id, label) = self.get_truths(size)
        ground_truth = {truth_id[i] : label[i] for i in range(len(label))}
        (tweet_id, titles, texts) = self.get_texts(size)
        for i, tid in enumerate(tweet_id):
            try:
                self.corpus.append( (titles[i], ' '.join(txt for txt in texts[i]), ground_truth[tid]) ) # tid is discarded from now on
            except KeyError:
                pass
                #print(f'Tweet {tid} is not in ground truth!')
        print(f'Getting {len(self.corpus)} valid examples from training set.')
web17 = Webis17('./data/clickbait17/')
web17.build_corpus(size=19538)

Getting 19538 valid examples from training set.


In [4]:
import torch
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [21]:
import nltk
train_sentences = []
truths = []
for data in web17.corpus:
    title, texts, label = data
    sent_pairs = []
    for text in nltk.sent_tokenize(texts):
        sent_pairs.append("[CLS] " + title + " [SEP]" + text + "[sep]")
    train_sentences.append(sent_pairs)
    truths.append(label)

In [9]:
bert_model.train()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
my_model = nn.Sequential(
    nn.Linear(768, 1)
)

In [23]:
import torch.nn as nn
def finetune():
    ''' make word embeddings of a given corpus '''
    ''' then dump all the results in .pkl files for later use '''
    for sentence in train_sentences[:size]:
        sentence = sentence[0]
        print(sentence)
        tokenized = tokenizer.tokenize(sentence)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized)
        indexed_tokens = torch.tensor([indexed_tokens])
        segment_ids = [1] * len(tokenized)
        segment_ids = torch.tensor([segment_ids])
        #with torch.no_grad():
        outputs = bert_model(indexed_tokens, segment_ids)
        print(outputs.shape)
        print(outputs.dtype)


finetune()

[CLS] ‘Inexcusable’ failures in UK’s response to modern slavery leaving victims destitute while abusers go free, report warns [SEP]Thousands of modern slavery victims have not come forward, while others who have chosen to report their enslavers have ended up destitute as a result of insufficient support, say MPs “Inexcusable” failures in the UK’s system for dealing with modern slavery are leaving victims reduced to destitution while their abusers go free because they are not adequately supported to testify against them, an alarming report has warned.[sep]


AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'shape'

In [24]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
bert_classi = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [26]:
len(train_sentences[0][0])

561

In [28]:
from transformers import BertTokenizer
input_ids = []
attention_masks = []
for sent in train_sentences:
    sent = sent[0]
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = False, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [29]:
labels = torch.tensor(truths)

In [32]:
optimizer = AdamW(model.parameters(), lr=2e-5)
MAX_LEN = 100
for epoch in range(10):
    print(f'======== Epoch {epoch} ========')
    bert_classi.train()
    b_input_ids = input_ids[:MAX_LEN]
    b_input_mask = attention_masks[:MAX_LEN]
    b_labels = labels[:MAX_LEN]
    bert_classi.zero_grad()        
    loss, logits = bert_classi(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=b_input_mask, 
                               labels=b_labels)
#       total_train_loss += loss.item()
    loss.backward()
    print(f'curr loss is {loss}')
    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()



In [None]:
'''
        prediction = nn.Linear(outputs)



        hidden_states = outputs[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)

        token_vecs = np.zeros((MAX_LEN, token_embeddings.shape[2]))
        # `token_embeddings` is a [22 x 12 x 768] tensor.
        for i, token in enumerate(token_embeddings):
            sum_vec = torch.sum(token[-4:], dim=0)
            token_vecs[i] = sum_vec.numpy()
        Sen.append(token_vecs)
bert_input = torch.tensor(train_sentences)
'''
