In [1]:
import os
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
import textwrap
import progressbar
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import json

In [2]:
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

# MODEL_CLASSES = {
#     'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
#     'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
#     'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
#     'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
#     'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)}

# model_type = 'roberta' ###--> CHANGE WHAT MODEL YOU WANT HERE!!! <--###
# model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]
# model_name = 'roberta-base'

In [3]:
# Directory holding the saved RoBERTa classifier and its tokenizer files.
output_dir = "Results/RoBERTa_Multi/pretrained/"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Alternative load that also returns hidden states (kept for reference):
# model = RobertaForSequenceClassification.from_pretrained(output_dir, output_hidden_states=True)
model = RobertaForSequenceClassification.from_pretrained(output_dir)
tokenizer = RobertaTokenizer.from_pretrained(output_dir)
# Move the weights to the chosen device; as the last expression of the cell
# this also displays the module structure.
model.to(device)
     

Some weights of the model checkpoint at Results/RoBERTa_Multi/pretrained/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [4]:
# Load the ILDC multi CSV and partition its rows by the value of the `split` column.
df = pd.read_csv('Data/ILDC_multi.csv')
train_set = df.query(" split=='train' ")
test_set = df.query(" split=='test' ")
# Rows marked 'dev' are used as the validation set.
validation_set = df.query(" split=='dev' ")

In [5]:
def att_masking(input_ids):
  """Build 0/1 attention masks for a batch of token-id sequences.

  Every id strictly greater than 0 is marked 1 (attend) and every other id is
  marked 0, i.e. id 0 is treated as padding.

  NOTE(review): a real token whose id is 0 is masked out as well -- confirm
  that this is intended for the tokenizer in use.

  Args:
    input_ids: iterable of sequences of numeric token ids.

  Returns:
    A list containing one list of ints (0 or 1) per input sequence.
  """
  return [[int(tok > 0) for tok in row] for row in input_ids]
     

In [6]:
def grouped_input_ids(all_toks):
  """Split one document's tokens into overlapping, padded model inputs.

  The token list is cut into windows of at most 510 tokens that start every
  410 tokens, so consecutive windows share up to 100 tokens. Each window is
  wrapped in the global tokenizer's CLS/SEP tokens, converted to ids and
  right-padded with 0 to length 512.

  NOTE(review): the pad value is 0 while the loaded model's word embedding
  reports padding_idx=1 -- confirm this matches how the model was trained.

  Args:
    all_toks: list of token strings for a single document.

  Returns:
    (e_sents, att_masks): the padded id matrix produced by pad_sequences and
    the matching attention masks produced by att_masking.
  """
  window, stride = 510, 410
  # Slicing clips at the end of the list, so the last window may be shorter.
  chunks = [all_toks[start:start + window]
            for start in range(0, len(all_toks), stride)]

  cls_tok, sep_tok = tokenizer.cls_token, tokenizer.sep_token
  encoded = [tokenizer.convert_tokens_to_ids([cls_tok] + chunk + [sep_tok])
             for chunk in chunks]

  e_sents = pad_sequences(encoded, maxlen=512, value=0, dtype="long", padding="post")
  return e_sents, att_masking(e_sents)

In [7]:
def get_output_for_one_vec(input_id, att_mask):
  """Predict the class label of a single chunk with the global `model`.

  Args:
    input_id: token ids of one chunk.
    att_mask: attention mask aligned with `input_id`.

  Returns:
    A 1-element numpy array holding the argmax class index of the logits.
  """
  # Add a leading batch dimension of size 1 and move both tensors to `device`.
  input_ids = torch.tensor(input_id).unsqueeze(0).to(device)
  att_masks = torch.tensor(att_mask).unsqueeze(0).to(device)

  model.eval()
  with torch.no_grad():
      outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

  # The logits live on `device`; Tensor.numpy() only works for CPU tensors, so
  # copy them back first (calling .numpy() directly fails when `device` is CUDA).
  logits = outputs.logits.detach().cpu().numpy()
  label = np.argmax(logits, axis=1).flatten()

  return label

In [8]:
def generate_np_files_for_emb(dataf, tokenizer):
  """Predict one binary label per document by averaging chunk-level outputs.

  Each text in `dataf['text']` is tokenized, cut down to its last 10000
  tokens, split into chunks by grouped_input_ids, and every chunk is scored
  by get_output_for_one_vec. The document gets label 1 when the mean of the
  chunk outputs exceeds 0.5 and label 0 otherwise.

  NOTE(review): a document that yields no chunks makes the division by the
  chunk count fail -- confirm that empty texts cannot occur.

  Args:
    dataf: DataFrame with a 'text' column.
    tokenizer: tokenizer whose tokenize() accepts `add_prefix_space`.

  Returns:
    List of 0/1 predictions, one per row of `dataf`, in row order.
  """
  my_preds = []
  for text in dataf['text']:
    toks = tokenizer.tokenize(text, add_prefix_space=True)
    # Very long documents keep only their tail.
    if len(toks) > 10000:
      toks = toks[-10000:]

    chunk_ids, chunk_masks = grouped_input_ids(toks)
    votes = [get_output_for_one_vec(ids, mask)
             for ids, mask in zip(chunk_ids, chunk_masks)]

    mean_vote = sum(votes, 0.0) / len(votes)
    my_preds.append(1 if mean_vote > 0.5 else 0)

  return my_preds

In [9]:
# Predict a label for every dev document and fetch the gold labels.
preds_dev = generate_np_files_for_emb(validation_set, tokenizer)
dev_labels = validation_set['label'].to_list()
print(len(preds_dev))

# Accuracy: share of documents whose prediction equals the gold label.
correct = sum(1 for index in range(len(preds_dev))
              if preds_dev[index] == dev_labels[index])
print(correct/len(preds_dev))

# Save the predictions to disk.
np.save("Results/RoBERTa_Multi/ensemble npy/RoBERTa_dev.npy", preds_dev)

994
0.6348088531187123


In [10]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import textwrap
import progressbar
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences


def metrics_calculator(preds, test_labels, num_classes=2):
    """Compute macro/micro precision, recall and F1 from a confusion matrix.

    Args:
        preds: predicted class indices.
        test_labels: gold class indices, aligned with `preds`.
        num_classes: number of classes; labels are taken to be
            0 .. num_classes - 1. Defaults to 2, the previously hard-coded value.

    Returns:
        Tuple (macro_precision, macro_recall, macro_f1,
               micro_precision, micro_recall, micro_f1).
        Macro F1 is the harmonic mean of macro precision and macro recall.
        Any ratio whose denominator is 0 is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Avoid nan / RuntimeWarning when a class is never predicted or never present.
        return numerator / denominator if denominator else 0.0

    class_ids = range(num_classes)
    # Pin the label set so the matrix is always num_classes x num_classes, even
    # when a class is missing from both `preds` and `test_labels`.
    cm = confusion_matrix(test_labels, preds, labels=list(class_ids))

    TP = [cm[i][i] for i in class_ids]
    # Row i: samples whose true class is i; off-diagonal entries were missed.
    FN = [sum(cm[i][j] for j in class_ids if j != i) for i in class_ids]
    # Column i: samples predicted as i; off-diagonal entries are false positives.
    FP = [sum(cm[j][i] for j in class_ids if j != i) for i in class_ids]

    precision = [_ratio(TP[i], TP[i] + FP[i]) for i in class_ids]
    recall = [_ratio(TP[i], TP[i] + FN[i]) for i in class_ids]

    macro_precision = sum(precision)/num_classes
    macro_recall = sum(recall)/num_classes
    micro_precision = _ratio(sum(TP), sum(TP) + sum(FP))
    micro_recall = _ratio(sum(TP), sum(TP) + sum(FN))
    micro_f1 = _ratio(2*micro_precision*micro_recall, micro_precision + micro_recall)
    macro_f1 = _ratio(2*macro_precision*macro_recall, macro_precision + macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1

In [11]:
# Display (macro P, macro R, macro F1, micro P, micro R, micro F1) for the dev predictions.
metrics_calculator(preds_dev, dev_labels)

(0.6942946832843206,
 0.6348088531187123,
 0.663220583725073,
 0.6348088531187123,
 0.6348088531187123,
 0.6348088531187123)

In [12]:
# Predict a label for every test document and fetch the gold labels.
preds_test = generate_np_files_for_emb(test_set, tokenizer)
test_labels = test_set['label'].to_list()
print(len(preds_test))

# Accuracy: share of documents whose prediction equals the gold label.
correct = sum(1 for index in range(len(preds_test))
              if preds_test[index] == test_labels[index])
print(correct/len(preds_test))

# Save the predictions to disk.
np.save("Results/RoBERTa_Multi/ensemble npy/RoBERTa_test.npy", preds_test)

1517
0.6242584047462096


In [13]:
# Display (macro P, macro R, macro F1, micro P, micro R, micro F1) for the test predictions.
metrics_calculator(preds_test, test_labels)

(0.6820188702068465,
 0.6255462272513949,
 0.6525630456204138,
 0.6242584047462096,
 0.6242584047462096,
 0.6242584047462096)

### Raw logits

In [14]:
def get_output_for_one_vec(input_id, att_mask):
  """Return the raw class-1 logit of a single chunk from the global `model`.

  Args:
    input_id: token ids of one chunk.
    att_mask: attention mask aligned with `input_id`.

  Returns:
    A 1-element numpy array holding the (unnormalised) logit at index 1 for
    the only item in the batch.
  """
  # Add a leading batch dimension of size 1 and move both tensors to `device`.
  input_ids = torch.tensor(input_id).unsqueeze(0).to(device)
  att_masks = torch.tensor(att_mask).unsqueeze(0).to(device)

  model.eval()
  with torch.no_grad():
      outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

  # The logits live on `device`; Tensor.numpy() only works for CPU tensors, so
  # copy the value back first (calling .numpy() directly fails when `device` is CUDA).
  label = outputs.logits[0][1].detach().cpu().numpy().flatten()

  return label

def generate_np_files_for_emb(dataf, tokenizer):
  """Predict one binary label per document from the mean chunk output.

  Each text in `dataf['text']` is tokenized, cut down to its last 10000
  tokens, split into chunks by grouped_input_ids, and every chunk is scored
  by get_output_for_one_vec. The document gets label 1 when the mean of the
  chunk outputs exceeds 0.5 and label 0 otherwise.

  NOTE(review): if the chunk outputs are raw logits rather than probabilities
  or 0/1 votes, the 0.5 cut-off is not a majority/probability threshold --
  confirm the intended decision rule.
  NOTE(review): a document that yields no chunks makes the division by the
  chunk count fail -- confirm that empty texts cannot occur.

  Args:
    dataf: DataFrame with a 'text' column.
    tokenizer: tokenizer whose tokenize() accepts `add_prefix_space`.

  Returns:
    List of 0/1 predictions, one per row of `dataf`, in row order.
  """
  my_preds = []
  for text in dataf['text']:
    toks = tokenizer.tokenize(text, add_prefix_space=True)
    # Very long documents keep only their tail.
    if len(toks) > 10000:
      toks = toks[-10000:]

    chunk_ids, chunk_masks = grouped_input_ids(toks)
    scores = [get_output_for_one_vec(ids, mask)
              for ids, mask in zip(chunk_ids, chunk_masks)]

    mean_score = sum(scores, 0.0) / len(scores)
    my_preds.append(1 if mean_score > 0.5 else 0)

  return my_preds

In [15]:
# Predict a label for every dev document and fetch the gold labels.
preds_dev = generate_np_files_for_emb(validation_set, tokenizer)
dev_labels = validation_set['label'].to_list()
print(len(preds_dev))

# Accuracy: share of documents whose prediction equals the gold label.
correct = sum(1 for index in range(len(preds_dev))
              if preds_dev[index] == dev_labels[index])
print(correct/len(preds_dev))

# Save the predictions to disk.
np.save("Results/RoBERTa_Multi/ensemble npy/raw_logits_dev.npy", preds_dev)

994
0.5895372233400402


In [16]:
# Display (macro P, macro R, macro F1, micro P, micro R, micro F1) for the dev predictions.
metrics_calculator(preds_dev, dev_labels)

(0.733028479912337,
 0.5895372233400402,
 0.6534988373189736,
 0.5895372233400402,
 0.5895372233400402,
 0.5895372233400402)

In [17]:
# Predict a label for every test document and fetch the gold labels.
preds_test = generate_np_files_for_emb(test_set, tokenizer)
test_labels = test_set['label'].to_list()
print(len(preds_test))

# Accuracy: share of documents whose prediction equals the gold label.
correct = sum(1 for index in range(len(preds_test))
              if preds_test[index] == test_labels[index])
print(correct/len(preds_test))

# Save the predictions to disk.
np.save("Results/RoBERTa_Multi/ensemble npy/raw_logits_test.npy", preds_test)

1517
0.5880026367831246


In [18]:
# Display (macro P, macro R, macro F1, micro P, micro R, micro F1) for the test predictions.
metrics_calculator(preds_test, test_labels)

(0.7221584262317644,
 0.5897855069440823,
 0.6492938649870699,
 0.5880026367831246,
 0.5880026367831246,
 0.5880026367831246)

### Weight later chunks

In [21]:
def get_output_for_one_vec(input_id, att_mask):
  """Predict the class label of a single chunk with the global `model`.

  Args:
    input_id: token ids of one chunk.
    att_mask: attention mask aligned with `input_id`.

  Returns:
    A 1-element numpy array holding the argmax class index of the logits.
  """
  # Add a leading batch dimension of size 1 and move both tensors to `device`.
  input_ids = torch.tensor(input_id).unsqueeze(0).to(device)
  att_masks = torch.tensor(att_mask).unsqueeze(0).to(device)

  model.eval()
  with torch.no_grad():
      outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

  # The logits live on `device`; Tensor.numpy() only works for CPU tensors, so
  # copy them back first (calling .numpy() directly fails when `device` is CUDA).
  logits = outputs.logits.detach().cpu().numpy()
  label = np.argmax(logits, axis=1).flatten()

  return label

def generate_np_files_for_emb(dataf, tokenizer):
  """Predict one binary label per document with position-weighted chunk votes.

  Each text in `dataf['text']` is tokenized, cut down to its last 10000
  tokens, split into chunks by grouped_input_ids, and every chunk is scored
  by get_output_for_one_vec. The k-th chunk (counting from 1) gets weight k,
  so later chunks count more. The document gets label 1 when the weighted
  mean (weighted sum divided by 1 + 2 + ... + number of chunks) exceeds 0.5
  and label 0 otherwise.

  NOTE(review): a document that yields no chunks makes the division by the
  total weight fail -- confirm that empty texts cannot occur.

  Args:
    dataf: DataFrame with a 'text' column.
    tokenizer: tokenizer whose tokenize() accepts `add_prefix_space`.

  Returns:
    List of 0/1 predictions, one per row of `dataf`, in row order.
  """
  my_preds = []
  for text in dataf['text']:
    toks = tokenizer.tokenize(text, add_prefix_space=True)
    # Very long documents keep only their tail.
    if len(toks) > 10000:
      toks = toks[-10000:]

    chunk_ids, chunk_masks = grouped_input_ids(toks)
    votes = [get_output_for_one_vec(ids, mask)
             for ids, mask in zip(chunk_ids, chunk_masks)]

    weighted_sum = sum((rank * vote for rank, vote in enumerate(votes, start=1)), 0.0)
    total_weight = sum(range(len(votes) + 1))
    my_preds.append(1 if weighted_sum / total_weight > 0.5 else 0)

  return my_preds

In [22]:
# Predict a label for every dev document and fetch the gold labels.
preds_dev = generate_np_files_for_emb(validation_set, tokenizer)
dev_labels = validation_set['label'].to_list()
print(len(preds_dev))

# Accuracy: share of documents whose prediction equals the gold label.
correct = sum(1 for index in range(len(preds_dev))
              if preds_dev[index] == dev_labels[index])
print(correct/len(preds_dev))

# Save the predictions to disk.
np.save("Results/RoBERTa_Multi/ensemble npy/weighted_natural_dev.npy", preds_dev)

994
0.6569416498993964


In [23]:
# Display (macro P, macro R, macro F1, micro P, micro R, micro F1) for the dev predictions.
metrics_calculator(preds_dev, dev_labels)

(0.7322429906542056,
 0.6569416498993963,
 0.6925514497712028,
 0.6569416498993964,
 0.6569416498993964,
 0.6569416498993964)

In [24]:
# Predict a label for every test document and fetch the gold labels.
preds_test = generate_np_files_for_emb(test_set, tokenizer)
test_labels = test_set['label'].to_list()
print(len(preds_test))

# Accuracy: share of documents whose prediction equals the gold label.
correct = sum(1 for index in range(len(preds_test))
              if preds_test[index] == test_labels[index])
print(correct/len(preds_test))

# Save the predictions to disk.
np.save("Results/RoBERTa_Multi/ensemble npy/weighted_natural_test.npy", preds_test)

1517
0.6413974950560316


In [25]:
# Display (macro P, macro R, macro F1, micro P, micro R, micro F1) for the test predictions.
metrics_calculator(preds_test, test_labels)

(0.7077636780538669,
 0.6426917661782343,
 0.6736599718680749,
 0.6413974950560316,
 0.6413974950560316,
 0.6413974950560316)

### Base transformer model 

In [7]:
def input_id_maker(dataf, tokenizer):
  """Encode each text in `dataf['text']` as one fixed-length id sequence.

  Texts longer than 510 tokens keep only their last 510 tokens. Every text is
  then wrapped in the tokenizer's CLS/SEP tokens, converted to ids and
  right-padded with 0 to length 512.

  NOTE(review): the pad value is 0 while the loaded model's word embedding
  reports padding_idx=1 -- confirm this matches how the model was trained.

  Args:
    dataf: DataFrame with a 'text' column.
    tokenizer: tokenizer providing tokenize(), cls_token, sep_token and
      convert_tokens_to_ids().

  Returns:
    (input_ids, lengths): the padded id matrix produced by pad_sequences and
    the list of unpadded sequence lengths (special tokens included).
  """
  encoded_rows = []
  for sen in dataf['text']:
    pieces = tokenizer.tokenize(sen, add_prefix_space=True)
    if len(pieces) > 510:
      pieces = pieces[-510:]
    wrapped = [tokenizer.cls_token] + pieces + [tokenizer.sep_token]
    encoded_rows.append(tokenizer.convert_tokens_to_ids(wrapped))

  lengths = [len(row) for row in encoded_rows]
  input_ids = pad_sequences(encoded_rows, maxlen=512, value=0, dtype="long", truncating="pre", padding="post")
  return input_ids, lengths


In [9]:
# Encode the validation texts: padded id matrix plus the unpadded length of each row.
validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)


In [10]:
def att_masking(input_ids):
  """Build 0/1 attention masks for a batch of token-id sequences.

  Every id strictly greater than 0 is marked 1 (attend) and every other id is
  marked 0, i.e. id 0 is treated as padding.

  NOTE(review): a real token whose id is 0 is masked out as well -- confirm
  that this is intended for the tokenizer in use.

  Args:
    input_ids: iterable of sequences of numeric token ids.

  Returns:
    A list containing one list of ints (0 or 1) per input sequence.
  """
  return [[int(tok > 0) for tok in row] for row in input_ids]


In [11]:
# Attention masks for the padded validation ids, and the gold labels as an integer array.
validation_attention_masks = att_masking(validation_input_ids)
validation_labels = validation_set['label'].to_numpy().astype('int')


In [12]:
# Convert the validation ids, attention masks and labels to torch tensors.
validation_inputs = torch.tensor(validation_input_ids)
validation_masks = torch.tensor(validation_attention_masks)
validation_labels = torch.tensor(validation_labels)

In [13]:
# max batch size should be 6 due to colab limits
batch_size = 6
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# Evaluation gains nothing from shuffling: iterate in dataset order so the
# collected predictions follow the row order of the dataset and every run
# yields the same ordering (a RandomSampler here made the order differ per run).
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size = batch_size)

In [14]:
# Build the test-set inputs: gold labels, padded token ids and attention masks.
labels = test_set.label.to_numpy().astype(int)

input_ids, input_lengths = input_id_maker(test_set, tokenizer)
attention_masks = att_masking(input_ids)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 6  

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
# Sequential sampling keeps the batches in dataset order.
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [15]:
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
model.eval()

predictions , true_labels = [], []

for batch in prediction_dataloader:
  # Each batch is (input ids, attention mask, labels); move all three to `device`.
  b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

  # Inference only, so gradient tracking is switched off for the forward pass.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # The first element of the model output holds the logits; keep them and the
  # gold labels as CPU numpy arrays.
  predictions.append(outputs[0].detach().cpu().numpy())
  true_labels.append(b_labels.to('cpu').numpy())

print('    DONE.')


Predicting labels for 1,517 test sentences...
    DONE.


In [16]:
# Stack the per-batch arrays into one array each, then reduce the logits to
# predicted class indices (argmax over axis 1).
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose argmax prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: numpy array of gold class indices (flattened before use).

    Returns:
        Number of matching samples divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    gold = labels.flatten()
    n_correct = np.sum(predicted_classes == gold)
    return n_correct / len(gold)

# Accuracy of the argmax predictions; shown as the cell output.
flat_accuracy(predictions,true_labels)

0.7125906394199077

In [17]:
import os
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import textwrap
import progressbar
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random


In [18]:
def metrics_calculator(preds, test_labels, num_classes=2):
    """Compute macro/micro precision, recall and F1 from a confusion matrix.

    Args:
        preds: predicted class indices.
        test_labels: gold class indices, aligned with `preds`.
        num_classes: number of classes; labels are taken to be
            0 .. num_classes - 1. Defaults to 2, the previously hard-coded value.

    Returns:
        Tuple (macro_precision, macro_recall, macro_f1,
               micro_precision, micro_recall, micro_f1).
        Macro F1 is the harmonic mean of macro precision and macro recall.
        Any ratio whose denominator is 0 is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Avoid nan / RuntimeWarning when a class is never predicted or never present.
        return numerator / denominator if denominator else 0.0

    class_ids = range(num_classes)
    # Pin the label set so the matrix is always num_classes x num_classes, even
    # when a class is missing from both `preds` and `test_labels`.
    cm = confusion_matrix(test_labels, preds, labels=list(class_ids))

    TP = [cm[i][i] for i in class_ids]
    # Row i: samples whose true class is i; off-diagonal entries were missed.
    FN = [sum(cm[i][j] for j in class_ids if j != i) for i in class_ids]
    # Column i: samples predicted as i; off-diagonal entries are false positives.
    FP = [sum(cm[j][i] for j in class_ids if j != i) for i in class_ids]

    precision = [_ratio(TP[i], TP[i] + FP[i]) for i in class_ids]
    recall = [_ratio(TP[i], TP[i] + FN[i]) for i in class_ids]

    macro_precision = sum(precision)/num_classes
    macro_recall = sum(recall)/num_classes
    micro_precision = _ratio(sum(TP), sum(TP) + sum(FP))
    micro_recall = _ratio(sum(TP), sum(TP) + sum(FN))
    micro_f1 = _ratio(2*micro_precision*micro_recall, micro_precision + micro_recall)
    macro_f1 = _ratio(2*macro_precision*macro_recall, macro_precision + macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1

# Score the flattened predictions and print the six metrics on one line.
scores = metrics_calculator(pred_flat, labels_flat)
macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = scores
print(*scores)


0.7225405297460197 0.7130712137803967 0.7177746418902077 0.7125906394199077 0.7125906394199077 0.7125906394199077


In [19]:
batch_size = 6

# Create the DataLoader over the validation tensors.
prediction_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# Use the dataset and sequential sampler built right here. Previously the
# loader was created from `validation_data` with the shuffling
# `validation_sampler`, leaving `prediction_data`/`prediction_sampler` unused
# and making the prediction order random.
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size = batch_size)


In [21]:
# The count printed is len(validation_inputs), so the message names the
# validation split (it used to say "test sentences").
print('Predicting labels for {:,} validation sentences...'.format(len(validation_inputs)))
model.eval()

predictions , true_labels = [], []

for batch in prediction_dataloader:
  # Each batch is (input ids, attention mask, labels); move all three to `device`.
  batch = tuple(t.to(device) for t in batch)

  b_input_ids, b_input_mask, b_labels = batch

  # Inference only, so gradient tracking is switched off for the forward pass.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # The first element of the model output holds the logits.
  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')


Predicting labels for 994 test sentences...
    DONE.


In [22]:

# Stack the per-batch arrays into one array each, then reduce the logits to
# predicted class indices (argmax over axis 1).
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

# Accuracy of the argmax predictions; shown as the cell output.
flat_accuracy(predictions,true_labels)

0.7193158953722334

In [23]:
# Score the flattened predictions and print the six metrics on one line.
scores = metrics_calculator(pred_flat, labels_flat)
macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1 = scores
print(*scores)

0.7416193891386569 0.7193158953722334 0.7302973931556175 0.7193158953722334 0.7193158953722334 0.7193158953722335
