# BERT **ASPECT SENTIMENT CLASSIFICATION** TASK

**Prepare GPU:**

1. Check: Edit -> Notebook settings -> Hardware accelerator -> *GPU*


2. Upload the following datasets to the *content* folder:

*   predictions_ae.json
*   test_set_final.csv

3. Upload the following files to the *MyDrive* folder:


*   germeval_pt folder
*   train_asc_task.json
*   test_asc_task.json
*   dev_asc_task.json



In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU device name is treated as "no GPU".
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Identify and specify the GPU as the device
import torch

# Pick the torch device: fall back to the CPU when CUDA is unavailable.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# **Install packages**

In [None]:
#!pip install pytorch_pretrained_bert
#!pip install transformers
#!pip install nltk
import os
import logging
import argparse
import random
import json
import sklearn.metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import nltk

import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#from transformers import BertForSequenceClassification 

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam


# Configure root logging once for the whole notebook: timestamped INFO-level
# records in the form "<date time> - <level> - <logger name> -   <message>".
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
# Logger used by the cells below for progress and validation-loss messages.
logger = logging.getLogger(__name__)

# **Helper Functions**

In [None]:
def warmup_linear(x, warmup=0.002):
    """Piecewise-linear multiplier: ramp up during warm-up, then decay.

    Args:
        x: Progress value compared against ``warmup``.
        warmup: Threshold below which the ramp-up branch is used.

    Returns:
        ``x / warmup`` while ``x < warmup``; otherwise ``1.0 - x``.
    """
    in_warmup = x < warmup
    return x / warmup if in_warmup else 1.0 - x

class InputExample(object):
    """One raw (untokenized) example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
            guid: Unique id for the example.
            text_a: Untokenized text of the first sequence; the only text
                needed for single-sequence tasks.
            text_b: (Optional) Untokenized text of the second sequence, used
                for sequence-pair tasks.
            label: (Optional) Label of the example; expected for train and dev
                examples, may be omitted for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

# specifically for asc task

def create_examples(lines, set_type):
    """Build one ``InputExample`` per entry of ``lines["data"]``.

    Args:
        lines: Parsed JSON object whose ``"data"`` mapping holds records with
            ``'term'``, ``'sentence'`` and ``'polarity'`` keys.
        set_type: Prefix for the generated guid (e.g. ``"train"``).

    Returns:
        List of ``InputExample`` where ``text_a`` is the term, ``text_b`` the
        sentence and ``label`` the polarity; the guid is ``"<set_type>-<key>"``.
    """
    records = lines["data"]
    return [
        InputExample(
            guid="%s-%s" % (set_type, key),
            text_a=records[key]['term'],
            text_b=records[key]['sentence'],
            label=records[key]['polarity'],
        )
        for key in records
    ]

def read_json(input_file):
    """Read and parse a JSON file for the sentiment-analysis tasks.

    The file is opened explicitly as UTF-8 so that non-ASCII text (e.g. German
    umlauts) is decoded the same way regardless of the platform's default
    encoding.

    Args:
        input_file: Path to the JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(input_file, encoding="utf-8") as f:
        return json.load(f)
        
def get_train_examples(data_dir):
    """Load the training examples.

    Args:
        data_dir: Path of the JSON file to read (it is passed directly to
            ``read_json``, so despite the name it is a file path).

    Returns:
        Examples built by ``create_examples`` with the ``"train"`` guid prefix.
    """
    parsed = read_json(data_dir)
    return create_examples(parsed, "train")

def get_dev_examples(data_dir):
    """Load the development (validation) examples.

    Args:
        data_dir: Path of the JSON file to read (it is passed directly to
            ``read_json``, so despite the name it is a file path).

    Returns:
        Examples built by ``create_examples`` with the ``"dev"`` guid prefix.
    """
    parsed = read_json(data_dir)
    return create_examples(parsed, "dev")

def get_test_examples(data_dir):
    """Load the test examples.

    Args:
        data_dir: Path of the JSON file to read (it is passed directly to
            ``read_json``, so despite the name it is a file path).

    Returns:
        Examples built by ``create_examples`` with the ``"test"`` guid prefix.
    """
    parsed = read_json(data_dir)
    return create_examples(parsed, "test")
    
class BertForSequenceLabeling(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear layer applied to the
    encoder's sequence output.

    ``forward`` returns the cross-entropy loss when ``labels`` is given and
    the raw logits otherwise. Label value ``-1`` is ignored by the loss.
    """

    def __init__(self, config, num_labels=3):
        super().__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # Only the first output of the encoder is used; the second is discarded.
        encoded, _unused = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        logits = self.classifier(self.dropout(encoded))

        if labels is None:
            return logits

        # Flatten logits and labels so every position is scored independently.
        criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)
        return criterion(logits.view(-1, self.num_labels), labels.view(-1))


class InputFeatures(object):
    """Model-ready features for one example: ids, mask, segments and label."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, mode):
    """Convert examples into fixed-length ``InputFeatures``.

    Args:
        examples: Iterable of examples exposing ``text_a``, ``text_b`` and
            ``label``.
        label_list: Labels; a label's position in this list becomes its id.
        max_seq_length: Length every feature is padded or truncated to.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``
            (and ``subword_tokenize`` when ``mode == "ae"``).
        mode: ``"ae"`` selects per-token labels via sub-word tokenization; any
            other value yields a single label id per example.

    Returns:
        List of ``InputFeatures``, one per example.
    """
    label_map = {label: index for index, label in enumerate(label_list)}
    is_ae = mode == "ae"

    features = []
    for example in examples:
        if is_ae:
            # Only sub-word tokenization; labels are expanded with the tokens.
            tokens_a, labels_a, example.idx_map = tokenizer.subword_tokenize(
                [token.lower() for token in example.text_a], example.label)
        else:
            tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None

        if tokens_b:
            # Pair input: reserve three slots for [CLS], [SEP], [SEP].
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        elif len(tokens_a) > max_seq_length - 2:
            # Single input: reserve two slots for [CLS] and [SEP].
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        # First segment: [CLS] tokens_a [SEP], all with segment id 0.
        tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        # Optional second segment: tokens_b [SEP], all with segment id 1.
        if tokens_b:
            tokens += list(tokens_b) + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Mask is 1 for real tokens and 0 for padding.
        input_mask = [1] * len(input_ids)

        # Zero-pad all three lists up to the sequence length.
        padding = max_seq_length - len(input_ids)
        if padding > 0:
            input_ids.extend([0] * padding)
            input_mask.extend([0] * padding)
            segment_ids.extend([0] * padding)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if is_ae:
            # -1 marks positions the loss should ignore.
            label_id = [-1] * len(input_ids)
            lb = [label_map[label] for label in labels_a]
            # Truncate the labels if they exceed the available positions.
            if len(lb) > max_seq_length - 2:
                lb = lb[0:(max_seq_length - 2)]
            # Position 0 is [CLS]; token labels start at position 1.
            label_id[1:len(lb)+1] = lb
        else:
            label_id = label_map[example.label]

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label_id))
    return features

class ABSATokenizer(BertTokenizer):
    """``BertTokenizer`` with a word-piece splitter that keeps labels aligned."""

    def subword_tokenize(self, tokens, labels): # for AE
        """Split each token into word pieces and expand its label to match.

        The first piece of a token keeps the token's label. Later pieces of a
        token labelled ``"B"`` receive ``"I"``; any other label is copied.

        Returns:
            ``(pieces, piece_labels, word_index)`` where ``word_index[k]`` is
            the index of the original token that produced piece ``k``.
        """
        pieces, piece_labels, word_index = [], [], []
        for position, word in enumerate(tokens):
            for offset, piece in enumerate(self.wordpiece_tokenizer.tokenize(word)):
                pieces.append(piece)
                is_continuation_of_b = labels[position] == "B" and offset > 0
                piece_labels.append("I" if is_continuation_of_b else labels[position])
                word_index.append(position)
        return pieces, piece_labels, word_index
 

### Evaluation part

def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    """Return the confusion matrix as a labelled DataFrame.

    Rows are the actual classes and columns the predicted classes, both in the
    order given by ``classes``. The index codes are derived from
    ``len(classes)``, so any number of classes works (the former hard-coded
    codes only supported exactly two).

    Args:
        true_labels: Ground-truth class ids.
        predicted_labels: Predicted class ids.
        classes: Class ids, in display order.

    Returns:
        ``pandas.DataFrame`` with ``('Actual:', cls)`` rows and
        ``('Predicted:', cls)`` columns.
    """
    class_labels = list(classes)
    # Every entry sits under the single outer label (code 0); the inner codes
    # enumerate the classes in the order supplied.
    outer_codes = [0] * len(class_labels)
    inner_codes = list(range(len(class_labels)))

    cm = confusion_matrix(y_true=true_labels,
                          y_pred=predicted_labels,
                          labels=class_labels)
    cm_frame = pd.DataFrame(data=cm,
                            columns=pd.MultiIndex(levels=[['Predicted:'], class_labels],
                                                  codes=[outer_codes, inner_codes]),
                            index=pd.MultiIndex(levels=[['Actual:'], class_labels],
                                                codes=[outer_codes, inner_codes]))
    return cm_frame

# **Set Arguments**

In [None]:
# Hyper-parameters and paths shared by the cells below.
train_batch_size = 16           # batch size for the training and validation DataLoaders
num_train_epochs = 4      # number of passes over the training set
max_seq_length = 250            # every example is padded/truncated to this many tokens
learning_rate = 2e-5            # learning rate handed to the optimizer
# NOTE(review): warmup_proportion and do_valid are only referenced by the
# commented-out legacy training loop; the active loop does not read them.
warmup_proportion = 0.1

do_valid = True
# Directory of the pretrained model/vocabulary that is fine-tuned here.
pretrained_model = "./drive/MyDrive/BERT_Files/germeval_pt/" # tweets_unlabeled_pt, base_german_cased_dbmdz, "bert-base-german-cased" , tweets_unlabeled_pt ./drive/MyDrive/BERT_Files/germeval_pt/

# **Training Setup**

In [None]:
# Label order defines the class ids: "negative" -> 0, "positive" -> 1.
label_list = ["negative", "positive"]

tokenizer = ABSATokenizer.from_pretrained(pretrained_model)
train_examples = get_train_examples(data_dir = "./drive/MyDrive/BERT_Files/BERT_ASC_Task/train_asc_task.json")
# The DataLoader below keeps the final partial batch (drop_last defaults to
# False), so the number of batches per epoch is the ceiling of
# len(train_examples) / train_batch_size rather than the floor.
num_train_steps = ((len(train_examples) + train_batch_size - 1) // train_batch_size) * num_train_epochs

# Tokenize (term, sentence) pairs and pad them to max_seq_length.
train_features = convert_examples_to_features(examples = train_examples,
                                              label_list = label_list,
                                              max_seq_length = max_seq_length,
                                              tokenizer = tokenizer,
                                              mode = "asc")
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
logger.info("  Num steps = %d", num_train_steps)

# Stack the per-example feature lists into tensors.
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

# Tensor order here (ids, segments, mask, labels) is the order in which the
# training loop unpacks each batch.
train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_label_ids)

# Shuffle the training examples each epoch.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

03/19/2021 22:15:40 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file ./drive/MyDrive/BERT_Files/germeval_pt/vocab.txt
03/19/2021 22:15:41 - INFO - __main__ -   ***** Running training *****
03/19/2021 22:15:41 - INFO - __main__ -     Num examples = 778
03/19/2021 22:15:41 - INFO - __main__ -     Batch size = 16
03/19/2021 22:15:41 - INFO - __main__ -     Num steps = 192


# **Validation Setup**

In [None]:
# Build the validation set the same way as the training set ("asc" mode,
# same label list, sequence length and tokenizer).
valid_examples = get_dev_examples(data_dir = "./drive/MyDrive/BERT_Files/BERT_ASC_Task/dev_asc_task.json")
valid_features= convert_examples_to_features(
    valid_examples, label_list,  max_seq_length, tokenizer, "asc")
valid_all_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long)
valid_all_segment_ids = torch.tensor([f.segment_ids for f in valid_features], dtype=torch.long)
valid_all_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long)
valid_all_label_ids = torch.tensor([f.label_id for f in valid_features], dtype=torch.long)
# Tensor order (ids, segments, mask, labels) matches the unpacking in the loops below.
valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids, valid_all_input_mask, valid_all_label_ids)

logger.info("***** Running validations *****")
logger.info("  Num orig examples = %d", len(valid_examples))
logger.info("  Num split examples = %d", len(valid_features))
logger.info("  Batch size = %d",  train_batch_size)

# Validation is read in a fixed order and reuses the training batch size.
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size= train_batch_size)    

# State consumed by the training loop: the lowest validation loss seen so far
# (used to decide when to checkpoint) and the per-epoch loss history.
best_valid_loss=float('inf')
valid_losses=[]

03/19/2021 22:15:44 - INFO - __main__ -   ***** Running validations *****
03/19/2021 22:15:44 - INFO - __main__ -     Num orig examples = 194
03/19/2021 22:15:44 - INFO - __main__ -     Num split examples = 194
03/19/2021 22:15:44 - INFO - __main__ -     Batch size = 16


# **Optimization & Actual Training**

In [None]:
    # Load the pretrained encoder and attach a classification head with one
    # output per entry of label_list (instead of a hard-coded 2).
    # Alternative checkpoint:
    # "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased.tar.gz"
    model = BertForSequenceClassification.from_pretrained(pretrained_model,
                                                          num_labels = len(label_list))
    model.cuda()

    # Prepare optimizer.
    # BertAdam names its epsilon parameter `e`; passing `eps` does not set it,
    # so the intended 1e-8 was never applied.
    optimizer = BertAdam(model.parameters(),
                         lr=learning_rate,
                         e = 1e-8)

03/19/2021 22:15:47 - INFO - pytorch_pretrained_bert.modeling -   loading archive file ./drive/MyDrive/BERT_Files/germeval_pt/
03/19/2021 22:15:47 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

03/19/2021 22:15:58 - INFO - pytorch_pretrained_bert.modeling -   Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
03/19/2021 22:15:58 - INFO - pytorch_pretrained_bert.modeling -   Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', '

In [None]:
from transformers import get_linear_schedule_with_warmup

# Total number of optimizer steps: batches per epoch times number of epochs.
t_total = len(train_dataloader) * num_train_epochs

# Create the learning rate scheduler.
# NOTE(review): no warm-up steps are used here; `warmup_proportion` from the
# arguments cell is not applied — confirm this is intended.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = t_total)

In [None]:
# helper function for formatting elapsed times
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    '''
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [None]:
# global_step = 0
# model.train()
# for _ in range(num_train_epochs):
#     for step, batch in enumerate(train_dataloader):
#         batch = tuple(t.cuda() for t in batch)
#         input_ids, segment_ids, input_mask, label_ids = batch
#         loss = model(input_ids, segment_ids, input_mask, label_ids)
#         loss.backward()

#         lr_this_step = learning_rate * warmup_linear(global_step/t_total, warmup_proportion)
#         for param_group in optimizer.param_groups:
#             param_group['lr'] = lr_this_step
#         optimizer.step()
#         optimizer.zero_grad()
#         global_step += 1
#         #>>>> perform validation at the end of each epoch .
#     if do_valid:
#         model.eval()
#         with torch.no_grad():
#             losses=[]
#             valid_size=0
#             for step, batch in enumerate(valid_dataloader):
#                 batch = tuple(t.cuda() for t in batch) # multi-gpu does scattering it-self
#                 input_ids, segment_ids, input_mask, label_ids = batch
#                 loss = model(input_ids, segment_ids, input_mask, label_ids)
#                 losses.append(loss.data.item()*input_ids.size(0) )
#                 valid_size+=input_ids.size(0)
#             valid_loss=sum(losses)/valid_size
#             logger.info("validation loss: %f", valid_loss)
#             valid_losses.append(valid_loss)
#         if valid_loss<best_valid_loss:
#             torch.save(model, "model.pt")
#             best_valid_loss=valid_loss
#         model.train()
# if do_valid:
#     with open("valid.json", "w") as fw:
#         json.dump({"valid_losses": valid_losses}, fw)
# else:
#     torch.save(model, "model.pt")

03/19/2021 18:47:29 - INFO - __main__ -   validation loss: 0.406666
03/19/2021 18:48:15 - INFO - __main__ -   validation loss: 0.354458
03/19/2021 18:49:01 - INFO - __main__ -   validation loss: 0.327326
03/19/2021 18:49:47 - INFO - __main__ -   validation loss: 0.394943


In [None]:
# helper function for calculating accuracy
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Fraction of rows whose arg-max over axis 1 equals the given label.

    Args:
        preds: Array of per-class scores, one row per example.
        labels: Array of integer class ids.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    matches = np.sum(predicted == expected)
    return matches / len(expected)

In [None]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator used below.
# NOTE(review): the model and the training DataLoader are created in earlier
# cells, i.e. before these seeds are set — confirm whether the freshly
# initialised classifier weights are meant to be covered by the seed.
seed_val = 55

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss per epoch, kept for plotting the learning curve.
loss_values = []

#global_step = 0
#model.train()
for epoch_i in range(0, num_train_epochs):

   ## TRAIN
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Turn on the training mode for the model.
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Move every tensor of the batch to the GPU.
        batch = tuple(t.cuda() for t in batch)
        
        # Same order in which the TensorDataset was built.
        input_ids, segment_ids, input_mask, label_ids = batch
        
        # Remove any previously calculated gradients before performing a
        # backward pass.
        model.zero_grad()  

        # Forward pass; the result is used as the loss because labels are passed.

        loss = model(input_ids, segment_ids, input_mask, label_ids)
        
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end.
        total_loss += loss.item()
        
        # Do a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Average loss per batch over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Needed for plotting the learning curve later.
    loss_values.append(avg_train_loss)

    print("")
    print("  Avg training loss: {0:.2f}".format(avg_train_loss))
    print("  Epoch time: {:}".format(format_time(time.time() - t0)))


    # Perform validation at the end of each epoch.
    model.eval()
    with torch.no_grad():
        losses=[]
        valid_size=0
        for step, batch in enumerate(valid_dataloader):
            batch = tuple(t.cuda() for t in batch) # multi-gpu does scattering it-self
            input_ids, segment_ids, input_mask, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            # Weight each batch loss by its size so the final value is a
            # per-example average even when the last batch is smaller.
            losses.append(loss.data.item()*input_ids.size(0) )
            valid_size+=input_ids.size(0)
        valid_loss=sum(losses)/valid_size
        logger.info("validation loss: %f", valid_loss)
        valid_losses.append(valid_loss)
    # Checkpoint the whole model object whenever the validation loss improves;
    # "model.pt" therefore always holds the best epoch so far.
    if valid_loss<best_valid_loss:
        torch.save(model, "model.pt")
        best_valid_loss=valid_loss
    model.train()
# Persist the per-epoch validation losses once training has finished.
with open("valid.json", "w") as fw:
        json.dump({"valid_losses": valid_losses}, fw)



Training...

  Avg training loss: 0.42
  Epoch time: 0:00:43


03/19/2021 22:17:09 - INFO - __main__ -   validation loss: 0.265672



Training...

  Avg training loss: 0.17
  Epoch time: 0:00:42


03/19/2021 22:17:56 - INFO - __main__ -   validation loss: 0.253996



Training...

  Avg training loss: 0.03
  Epoch time: 0:00:43


03/19/2021 22:18:43 - INFO - __main__ -   validation loss: 0.408720



Training...

  Avg training loss: 0.01
  Epoch time: 0:00:42


03/19/2021 22:19:29 - INFO - __main__ -   validation loss: 0.374914


In [None]:

# ## VALIDATE
#     # After each training epoch, we measure performance on
#     # validation set.

#     t0 = time.time()

#     # Turn on the evaluation mode of the model.
#     model.eval()

#     # Tracking variables 
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps, nb_eval_examples = 0, 0

#     # Evaluate data for one epoch
#     for batch in valid_dataloader:
        
#         # Add batch to GPU
#         batch = tuple(t.to(device) for t in batch)
        
#         # Unpack the inputs from our dataloader
#         input_ids, segment_ids, input_mask, label_ids = batch
        
        
#         with torch.no_grad():        

#             # Forward pass, calculate logit predictions.
#             outputs = model(input_ids, 
#                             segment_ids, 
#                             input_mask)
        
#         # Get logit values predicted for each class.
#         logits = outputs[0]

#         # Move logits and labels to CPU
#         logits = logits.detach().cpu().numpy()
#         label_ids = label_ids.to('cpu').numpy()
        
#         # Accuracy for this batch of validation tweets.
#         tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
#         # Accumulate the total accuracy.
#         eval_accuracy += tmp_eval_accuracy

#         # Number of batches
#         nb_eval_steps += 1

#     # Report the final accuracy for this validation run.
#     print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
#     print("  Validation time: {:}".format(format_time(time.time() - t0)))

# print("")
# print("Training complete!")

# step = list(enumerate(valid_dataloader))[0][0]
# batch = list(enumerate(valid_dataloader))[0][1]
# batch = tuple(t.cuda() for t in batch) # multi-gpu does scattering it-self
# input_ids, segment_ids, input_mask, label_ids = batch
# loss = model(input_ids, segment_ids, input_mask)
# logits = loss[0]
# logits

# **Final Evaluation**

This is the end-to-end analysis: the combined AE (aspect extraction) and ASC (aspect sentiment classification) tasks are evaluated in this section:


1.   Insert original Tweets without topics and labels: tweets
2.   Insert predicted and post-processed aspects: y_pred_aspects
3.   Insert tweets in tokenized form during AE task: pred_json["raw_X"]
4.   Result: Aspect Extraction. Fill in the topic-column

Prepare the test examples based on the topics obtained from the AE task.

In [None]:
# Load the test split and convert it with the same label list, sequence
# length, tokenizer and "asc" mode that were used for training.
eval_batch_size = 16
eval_examples = get_test_examples(data_dir = "./drive/MyDrive/BERT_Files/BERT_ASC_Task/test_asc_task.json")
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer, "asc")

In [None]:


logger.info("***** Running evaluation *****")
logger.info("  Num examples = %d", len(eval_examples))
logger.info("  Batch size = %d", eval_batch_size)
# NOTE: these names are also assigned in the training-setup cell; from here on
# they hold the test tensors.
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_label_ids)
# Run prediction for the full data, in a fixed order.
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size= eval_batch_size)

# Reload the checkpoint written by the training loop (saved whenever the
# validation loss improved) instead of using the model from the last epoch.
model = torch.load("model.pt" )
model.cuda()
model.eval()

# Collected over all batches: one logits row and one gold label per example.
full_logits=[]
full_label_ids=[]
for step, batch in enumerate(eval_dataloader):
    batch = tuple(t.cuda() for t in batch)
    input_ids, segment_ids, input_mask, label_ids = batch
    
    # No labels are passed, so the result is treated as logits rather than a loss.
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)

    # Move results to the CPU and convert them to plain Python lists for JSON.
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.cpu().numpy()

    full_logits.extend(logits.tolist() )
    full_label_ids.extend(label_ids.tolist() )

# Persist raw logits and gold labels; the metrics cell reads this file.
with open("predictions_asc.json", "w") as fw:
    json.dump({"logits": full_logits, "label_ids": full_label_ids}, fw)

03/19/2021 22:20:17 - INFO - __main__ -   ***** Running evaluation *****
03/19/2021 22:20:17 - INFO - __main__ -     Num examples = 243
03/19/2021 22:20:17 - INFO - __main__ -     Batch size = 16


# **Evaluation Metrics**

In [None]:
# Various evaluation metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# Reload the predictions written by the evaluation cell.
with open( "predictions_asc.json" ) as f:
    results=json.load(f)
y_true=results['label_ids']
# Predicted class = index of the largest logit.
y_pred=[np.argmax(logit) for logit in results['logits'] ]
# NOTE(review): despite the *_macro names, these values are computed with
# average='weighted'. f_macro is then overwritten with the harmonic mean of the
# weighted precision and recall, which is not the same quantity as the weighted
# F1 computed below. acc and f_mac do not appear to be read by any later cell —
# confirm whether they are still needed.
p_macro, r_macro, f_macro, _=sklearn.metrics.precision_recall_fscore_support(y_true, y_pred, average='weighted')
f_macro = 2*p_macro*r_macro/(p_macro+r_macro)
acc = 100*sklearn.metrics.accuracy_score(y_true, y_pred)
f_mac = 100*f_macro 

# The two figures reported below the confusion matrix: weighted F1 and accuracy.
f1_value = f1_score(y_true, y_pred, average="weighted")
accuracy = accuracy_score(y_true, y_pred)

# Confusion matrix with the default class order [1, 0]; as the last expression
# of the cell it is rendered as the cell output.
display_confusion_matrix(true_labels = y_true, predicted_labels = y_pred)


Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,1,0
Actual:,1,47,20
Actual:,0,7,169


In [None]:
f1_value

0.8848988012126244

In [None]:
accuracy

0.8888888888888888