In [1]:
#!pip install transformers

# adapted and expanded from here: 
# https://github.com/GeorgeLuImmortal/Hierarchical-BERT-Model-with-Limited-Labelled-Data/blob/main/run_hbm.py
import os
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
#from google.colab import drive
import textwrap
import progressbar
import keras
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import json

from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig


In [2]:
# we're going to generate token embeddings using the pretrained Roberta provided 
model = RobertaForSequenceClassification.from_pretrained("./Finetuned/mini_RoBERTa/")

Some weights of the model checkpoint at ./Finetuned/mini_RoBERTa/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
#give mini_dataset path
#GIVE ABSOLUTE PATHS HERE
mini_dataset_path = "./Data/ILDC_single.csv"          #give path to single_dataset.csv
path_on_which_finetuned_model_is_saved = " "      #obvious
dataset_path = "./Data/ILDC_multi.csv"           # give path to multi_dataset.csv
#path_train_npy_file = " "  #give .npy filename as well e.g. "/content/Drive/My Drive/LNLP/Hierarchical/RoBERTa_full/RoBERTa_train.npy"
#path_val_npy_file = " "    #similar as above
#path_test_npy_file = " "   #similar as above

In [4]:
df = pd.read_csv(mini_dataset_path)
train_set = df.query(" split=='train' ")
test_set = df.query(" split=='test' ")
validation_set = df.query(" split=='dev' ")

In [5]:

MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)}

model_type = 'roberta' ###--> CHANGE WHAT MODEL YOU WANT HERE!!! <--###
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]
model_name = 'roberta-base'

In [37]:
#chunk documents into sentences which can be turned into BERT representations to be fed to an attention model later


from nltk.tokenize import sent_tokenize


all_things = [sent_tokenize(row) for row in validation_set['text']]
lengths = [len(document) for document in all_things]

print(max(lengths))

1647


In [7]:
def att_masking(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks

In [8]:
def grouped_input_ids(all_toks):
  splitted_toks = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    l+=410
    r+=410

  CLS = tokenizer.cls_token
  SEP = tokenizer.sep_token
  e_sents = []
  for l_t in splitted_toks:
    l_t = [CLS] + l_t + [SEP]
    encoded_sent = tokenizer.convert_tokens_to_ids(l_t)
    e_sents.append(encoded_sent)

  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="post")
  att_masks = att_masking(e_sents)
  return e_sents, att_masks

In [9]:
def generate_np_files_for_training(dataf, tokenizer):
  all_input_ids, all_att_masks, all_labels = [], [], []
  for i in progressbar.progressbar(range(len(dataf['text']))):
    text = dataf['text'].iloc[i]
    toks = tokenizer.tokenize(text, add_prefix_space=True)
    if(len(toks) > 10000):
      toks = toks[len(toks)-10000:]

    splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)
    doc_label = dataf['label'].iloc[i]
    for i in range(len(splitted_input_ids)):
      all_input_ids.append(splitted_input_ids[i])
      all_att_masks.append(splitted_att_masks[i])
      all_labels.append(doc_label)

  return all_input_ids, all_att_masks, all_labels

In [10]:
def generate_np_files_for_evaluation(dataf, tokenizer):
  all_input_ids, all_att_masks, all_labels, all_docs = [], [], [], []
  for i in progressbar.progressbar(range(len(dataf['text']))):
    text = dataf['text'].iloc[i]
    toks = tokenizer.tokenize(text, add_prefix_space=True)
    if(len(toks) > 10000):
      toks = toks[len(toks)-10000:]

    splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)
    doc_label = dataf['label'].iloc[i]
    for i in range(len(splitted_input_ids)):
      all_input_ids.append(splitted_input_ids[i])
      all_att_masks.append(splitted_att_masks[i])
      all_labels.append(doc_label)
      all_docs.append(dataf['name'].iloc[i])

  return all_input_ids, all_att_masks, all_labels, all_docs

In [11]:
def input_id_maker(dataf, tokenizer):
  input_ids = []
  lengths = []

  for i in progressbar.progressbar(range(len(dataf['text']))):
    sen = dataf['text'].iloc[i]
    sen = tokenizer.tokenize(sen, add_prefix_space=True)
    CLS = tokenizer.cls_token
    SEP = tokenizer.sep_token
    if(len(sen) > 510):
      sen = sen[len(sen)-510:]

    sen = [CLS] + sen + [SEP]
    encoded_sent = tokenizer.convert_tokens_to_ids(sen)
    input_ids.append(encoded_sent)
    lengths.append(len(encoded_sent))

  input_ids = pad_sequences(input_ids, maxlen=512, value=0, dtype="long", truncating="pre", padding="post")
  return input_ids, lengths

In [12]:
"""
lr = 2e-6
max_grad_norm = 1.0
epochs = 10
num_total_steps = len(train_dataloader)*epochs
num_warmup_steps = 1000
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_total_steps)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

seed_val = 21


np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
"""

'\nlr = 2e-6\nmax_grad_norm = 1.0\nepochs = 10\nnum_total_steps = len(train_dataloader)*epochs\nnum_warmup_steps = 1000\nwarmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1\noptimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)\nscheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_total_steps)\n\ndef flat_accuracy(preds, labels):\n    pred_flat = np.argmax(preds, axis=1).flatten()\n    labels_flat = labels.flatten()\n    return np.sum(pred_flat == labels_flat) / len(labels_flat)\n\nseed_val = 21\n\n\nnp.random.seed(seed_val)\ntorch.manual_seed(seed_val)\ntorch.cuda.manual_seed_all(seed_val)\n'

In [13]:
"""
train_loss_values = []
train_accuracy = []
val_loss_values = []
val_accuracy = []

# For each epoch...
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    model.train()
    total_loss=0
    train_batch_accuracy = 0

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}. : loss: {:} '.format(step, len(train_dataloader), total_loss/step))


        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]

        total_loss+=loss.item()

        loss.backward()

        batch_logits = logits
        logits = batch_logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        train_batch_accuracy = flat_accuracy(logits, label_ids)

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        if step%1000 == 0 and not step == 0:
            print("\nRunning Validation...")
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for batch in validation_dataloader:
              batch = tuple(t.to(device) for t in batch)
              b_input_ids, b_input_mask, b_labels = batch
              with torch.no_grad():        
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

              loss = outputs[0]
              logits = outputs[1]
    
              logits = logits.detach().cpu().numpy()
              label_ids = b_labels.to('cpu').numpy()
        
              tmp_eval_accuracy = flat_accuracy(logits, label_ids)
              eval_accuracy += tmp_eval_accuracy

              eval_loss+=loss

              nb_eval_steps += 1

            val_accuracy.append(eval_accuracy/nb_eval_steps)
            val_loss_values.append(eval_loss/nb_eval_steps)

            print('Validation loss: {:} : Validation accuracy: {:}'.format(val_loss_values[-1], val_accuracy[-1]))

        
    train_loss_values.append(total_loss/len(train_dataloader))
    train_accuracy.append(train_batch_accuracy/len(train_dataloader))


print("Training complete!")
"""



In [14]:
def att_masking(input_ids):
  attention_masks = []
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)
  return attention_masks

In [15]:
def grouped_input_ids(all_toks):
  splitted_toks = []
  l=0
  r=510
  while(l<len(all_toks)):
    splitted_toks.append(all_toks[l:min(r,len(all_toks))])
    l+=410
    r+=410

  CLS = tokenizer.cls_token
  SEP = tokenizer.sep_token
  e_sents = []
  for l_t in splitted_toks:
    l_t = [CLS] + l_t + [SEP]
    encoded_sent = tokenizer.convert_tokens_to_ids(l_t)
    e_sents.append(encoded_sent)

  e_sents = pad_sequences(e_sents, maxlen=512, value=0, dtype="long", padding="post")
  att_masks = att_masking(e_sents)
  return e_sents, att_masks

In [16]:
def get_output_for_one_vec(input_id, att_mask):
  input_ids = torch.tensor(input_id)
  att_masks = torch.tensor(att_mask)
  input_ids = input_ids.unsqueeze(0)
  att_masks = att_masks.unsqueeze(0)
  model.eval()
  input_ids = input_ids.to(device)
  att_masks = att_masks.to(device)
  with torch.no_grad():
      logits, encoded_layers = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

  vec = encoded_layers[12][0][0]
  vec = vec.detach().cpu().numpy()
  return vec

In [17]:
def generate_np_files_for_emb(dataf, tokenizer):
  all_docs = []
  for i in progressbar.progressbar(range(len(dataf['text']))):
    text = dataf['text'].iloc[i]
    toks = tokenizer.tokenize(text, add_prefix_space=True)
    if(len(toks) > 10000):
      toks = toks[len(toks)-10000:]

    splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)

    vecs = []
    for index,ii in enumerate(splitted_input_ids):
      vecs.append(get_output_for_one_vec(ii, splitted_att_masks[index]))
 
    one_doc = np.asarray(vecs)
    all_docs.append(one_doc)

  all_docs = np.asarray(all_docs)
  return all_docs

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
output_dir = "./Finetuned/mini_RoBERTa/"
tokenizer = RobertaTokenizer.from_pretrained(output_dir)
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [19]:
#vecs_dev = generate_np_files_for_emb(validation_set, tokenizer)
#np.save(path_val_npy_file, vecs_dev)

In [20]:
#vecs_test = generate_np_files_for_emb(test_set, tokenizer)
#np.save(path_test_npy_file, vecs_test)

In [21]:
#vecs_train = generate_np_files_for_emb(train_set, tokenizer)
#np.save(path_train_npy_file, vecs_train)

In [22]:
path_val_np_file = "./Finetuned/mini_RoBERTa/RoBERTa_val.npy"

np_load_old = np.load

# modify the default parameters of np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

vecs_dev = np.load(path_val_np_file)

np.load = np_load_old


In [23]:
def predict_document_whole(row):
    return row

In [24]:
mini_dataset_path = "./Data/ILDC_single.csv"          #give path to single_dataset.csv
dataset_path = "./Data/ILDC_multi.csv"           # give path to multi_dataset.csv

df = pd.read_csv(mini_dataset_path)
train_set = df.query(" split=='train' ")
test_set = df.query(" split=='test' ")
validation_set = df.query(" split=='dev' ")

In [25]:
#TODO: unclear if I should just be using the OG tokenizer?
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
train_input_ids, train_att_masks, train_labels,train_doc_names = generate_np_files_for_evaluation(train_set, tokenizer)
validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)

100% (5082 of 5082) |####################| Elapsed Time: 0:02:20 Time:  0:02:20
100% (994 of 994) |######################| Elapsed Time: 0:00:20 Time:  0:00:20


In [31]:
validation_attention_masks = att_masking(validation_input_ids)
validation_labels = validation_set['label'].to_numpy().astype('int')


validation_inputs = validation_input_ids
validation_masks = validation_attention_masks
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)


train_doc_name_dict = {}
index = 0
new_thing = []
for name in train_doc_names:
    if name not in train_doc_name_dict:
        train_doc_name_dict[name] = index
        index+=1
    
    new_thing.append(train_doc_name_dict[name])
    
train_doc_names = new_thing
        
    
    

train_input_ids = torch.tensor(train_input_ids)
train_att_masks = torch.tensor(train_att_masks)
train_labels = torch.tensor(train_labels)
train_doc_names = torch.tensor(train_doc_names)



batch_size = 4
validation_data = TensorDataset(train_input_ids, train_att_masks, train_labels,train_doc_names)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size = batch_size)




  train_input_ids = torch.tensor(train_input_ids)
  train_att_masks = torch.tensor(train_att_masks)
  train_labels = torch.tensor(train_labels)


In [32]:
lr = 2e-6
max_grad_norm = 1.0
epochs = 10
num_warmup_steps = 1000

optimizer = AdamW(model.parameters(), lr=lr, correct_bias=True)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_total_steps)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

seed_val = 21


In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
output_dir = "./Finetuned/mini_RoBERTa/"
tokenizer = RobertaTokenizer.from_pretrained(output_dir)
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [34]:
predictions_dict = {}

for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_docnames = batch

    b_labels = b_labels.type(torch.LongTensor)
    
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    
    loss = outputs[0]
    logits = outputs[1]

    batch_logits = logits
    
    logits = batch_logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    
    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()
    

    if b_docnames not in predictions_dict:
        predictions_dict[b_docnames] = {}
        predictions_dict[b_docnames]["predictions"] = []
        predictions_dict[b_docnames]["logits"] = []
        predictions_dict[b_docnames]["actual"] = []
    
    predictions_dict[b_docnames]["predictions"].append(pred_flat)
    predictions_dict[b_docnames]["logits"].append(logits)
    predictions_dict[b_docnames]["actual"].append(labels_flat)
    print("Batch done")


Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done
Batch done


KeyboardInterrupt: 

In [None]:
print(len(predictions_dict))

In [None]:
bad = []

new_dict = {}

for doc in predictions_dict:
    key = doc.item()
    if key not in new_dict:
        new_dict[key] = {}
        new_dict[key]["predictions"] = []
        new_dict[key]["logits"] = []
        new_dict[key]["actual"] = []
        
    new_dict[key]["predictions"].extend(predictions_dict[doc]["predictions"])
    new_dict[key]["logits"].extend(predictions_dict[doc]["logits"])
    new_dict[key]["actual"].extend(predictions_dict[doc]["actual"])
    
print(len(new_dict))
    
with open("stats.txt",'w+') as outfile:
    for doc in new_dict:
        whole_doc = sum([pred.item() for pred in new_dict[doc]["predictions"]])
        whole_doc = round(whole_doc / len(new_dict[doc]))
        actual = new_dict[doc]["actual"][0].item()
        if(actual != whole_doc):
            bad.append(doc)
        

print(len(bad) / len(new_dict))
        


In [None]:
print(bad)