In [1]:
# A new enviroment was created to run this experiment
# Pip install pytorch_pretrained_bert
# The model class and dataloader class called base class provided by https://github.com/huggingface/pytorch-pretrained-BERT
# Pytorch v1.0
# Run with 2 GPUs

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertPreTrainedModel, BertModel, BertConfig

from pathlib import Path
import torch
import re
from torch import Tensor
from torch import nn
import torch.nn.functional as F
from itertools import chain
import pandas as pd
import collections
import os
import pdb
from tqdm import tqdm, trange
import sys
# import apex
import random
import numpy as np
from sklearn.model_selection import train_test_split
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
import logging

In [2]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [3]:
DATA_PATH=Path('Data/experiment')
DATA_PATH.mkdir(exist_ok=True)

PATH = Path('Data/')
PATH.mkdir(exist_ok= True)

OUT_PATH = Path('Output/')
OUT_PATH.mkdir(exist_ok = True)

logger = logging.getLogger(__name__)

## Model Parameters

In [4]:
#Change these parameters to run experiments

args = {
    "train_size": -1,
    "val_size": -1,
    "full_data_dir": DATA_PATH,
    "data_dir": PATH,
    "task_name": "Sentiment_multiclass",
    "bert_model": 'bert-base-cased',
    "output_dir": OUT_PATH,
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": False,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "learning_rate": 3e-5,
    "num_train_epochs": 4.0,
    "warmup_proportion": 0.1,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 421,
    "gradient_accumulation_steps": 1,
    "fp16": False,
    "loss_scale": 128,
    'n_embd': 768,
    "classifier_dropout": 0.3,
    "heads_per_class": 1,
    "num_hidden_layers": 12,
    "use_softmax": True,
    "layer_sizes": [2048,1024, 5 ],
    "train_file": 'train_5000_train.csv',
    "val_file": 'train_5000_val.csv',
    "test_file": 'test_onehot.csv',
    'train_on_full': True,
    'train_full_file': 'train_5000.csv',
    'model_fname': 'BertFineTuneClf',
    'test_pred_file': 'test_using_5000.csv'
    
}

## Model Class


In [5]:
class BertMultiClassClf(BertPreTrainedModel):
    """BERT model for multi-class classification.
    This module is composed of the BERT model with multiple linear layers on top of
    the pooled output.
    Each linear layers has PReLU activation and dropout. 
    
    Params:
        config: a BertConfig class instance with the configuration to build a new model.
        num_labels: the number of classes for the classifier
    Inputs:
        input_ids: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        token_type_ids: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        attention_mask: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        labels: labels for the classification output: torch.LongTensor of shape [batch_size, num_labels].
    Outputs:
        if labels is not None:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if labels is None:
            Outputs the classification logits of shape [batch_size, num_labels].

    """
    def __init__(self, config, num_labels=2):
        super(BertMultiClassClf, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        #self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.classifier = MultiLayerClassifier(input_layer_size=args['n_embd'], 
                                               layer_sizes = args['layer_sizes'], 
                                               dropout=args['classifier_dropout'], 
                                                softmax=args['use_softmax'])
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        out = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(out.view(-1, self.num_labels), torch.max(labels,1)[1])
            return loss
        else:
            return out
        
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True
    
class MultiLayerClassifier(nn.Module):
    def __init__(self, input_layer_size, layer_sizes, dropout=0.1, init_dropout=False,
                 softmax=True):
        super(MultiLayerClassifier, self).__init__()
        self.nclasses = num_labels
        self.device = -1
        self.final = num_labels
        self.dropout = dropout
        self.nonlinearity = nn.PReLU()  #Use PReLU for activation
        self.layer_sizes = [input_layer_size] + list(map(int, layer_sizes))
        self.final = self.layer_sizes[-1]
        # layer_sizes are sizes of the input and hidden layers, so the final 1 is assumed.
        layer_list = []
        if init_dropout:
            layer_list.extend([nn.Dropout(p=self.dropout)])
#         layer_list.extend(list(chain.from_iterable(
#             [[nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]),
#               nn.BatchNorm1d(self.layer_sizes[i+1]), self.nonlinearity, 
#               nn.Dropout(p=self.dropout)] for i in range(len(self.layer_sizes) - 2)])))
        layer_list.extend(list(chain.from_iterable(
            [[nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]),
              self.nonlinearity, 
              nn.Dropout(p=self.dropout)] for i in range(len(self.layer_sizes) - 2)])))
        self.final_layer = nn.Linear(*self.layer_sizes[-2:])
        extend_list = [self.final_layer]
        if not softmax:
            extend_list += [nn.Sigmoid()]
        layer_list.extend(extend_list)

        self.model = nn.Sequential(*layer_list)
        self.softmax = softmax

    def forward(self, X, **kwargs):
        out = self.model(X).float()
 
        return out

## Data Class

In [6]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, labels=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.labels = labels


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

In [7]:
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir, filename):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_val_examples(self, data_dir, filename):
        """Gets a collection of `InputExample`s for the validation set."""
        raise NotImplementedError()
    
    def get_test_examples(self, data_dir, filename):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError() 

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

In [8]:
class MultiLabelTextProcessor(DataProcessor):
    
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.labels = None
    
    
    def get_train_examples(self, data_dir, filename):
        data_df = pd.read_csv(os.path.join(data_dir, filename))
        return self._create_examples(data_df, "train")

        
    def get_val_examples(self, data_dir,filename):
        data_df = pd.read_csv(os.path.join(data_dir, filename))
        return self._create_examples(data_df, "val")
        
    
    def get_test_examples(self, data_dir, filename):
        data_df = pd.read_csv(os.path.join(data_dir, filename))
        return self._create_examples(data_df, "test")


    def get_labels(self):
        
        return ['rating_1','rating_2','rating_3','rating_4','rating_5']

    def _create_examples(self, df, set_type, labels_available=True):
        """Creates examples for the training and val sets."""
        examples = []
        for (i, row) in enumerate(df.values):
            guid = i
            text_a = row[0]
            if labels_available:
                labels = row[1:]
            else:
                labels = []
            examples.append(
                InputExample(guid=guid, text_a=text_a, labels=labels))
        return examples

In [9]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length -2)]
            
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        
        labels_ids = []
        for label in example.labels:           
            labels_ids.append(float(label))

        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=labels_ids))
    return features

## Metrics

In [11]:
def simple_accuracy(preds, labels):
    return (preds == labels).mean()

from sklearn.metrics import fbeta_score

def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = fbeta_score(y_true=labels, y_pred=preds, beta = 1,average='weighted' )
    return {
        "acc": acc,
        "f1-macro": f1,
        "acc_and_f1-macro": (acc + f1) / 2,
    }


## Process for Training

In [12]:
processors = {
    "sentiment_multiclass": MultiLabelTextProcessor
}

# Setup GPU parameters

if args["local_rank"] == -1 or args["no_cuda"]:
    device = torch.device("cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    n_gpu = torch.cuda.device_count()

else:
    torch.cuda.set_device(args['local_rank'])
    device = torch.device("cuda", args['local_rank'])
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args['local_rank'] != -1), args['fp16']))

In [13]:
args['train_batch_size'] = int(args['train_batch_size'] / args['gradient_accumulation_steps'])

In [14]:
#Set up task
task_name = args['task_name'].lower()

if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

In [15]:
processor = processors[task_name](args['data_dir'])
label_list = processor.get_labels()
num_labels = len(label_list)

In [16]:
label_list

['rating_1', 'rating_2', 'rating_3', 'rating_4', 'rating_5']

In [17]:
# Load Tokenizer from Bert model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [18]:
# Prepare training set
train_examples = None
num_train_steps = None
if args['do_train']:
    if args['train_on_full']:
        train_examples = processor.get_train_examples(args['full_data_dir'], args['train_full_file'])
    else:
        train_examples = processor.get_train_examples(args['full_data_dir'],args['train_file'])
    
    num_train_steps = int(
        len(train_examples) / args['train_batch_size'] / args['gradient_accumulation_steps'] * args['num_train_epochs'])
    
    
    

In [20]:
# Prepare model
def get_model():
    model = BertMultiClassClf.from_pretrained('bert-base-cased', num_labels = num_labels)
    return model

model = get_model()

if args['fp16']:
    model.half()
model.to(device)
if args['local_rank'] != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

In [21]:
model.to(device)

DataParallel(
  (module): BertMultiClassClf(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): FusedLaye

In [None]:
# Set up Learning Rate
from torch.optim.lr_scheduler import _LRScheduler, Optimizer

class SlantedTriangularLR(_LRScheduler):
    """
    Implement the slanted triangular learning rate schedule used for ULMFiT.
    
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        lr_ratio (float): ratio of minimum to maximum learning rate
        max_val (float): highest learning rate
        cut_frac (float): fraction of iterations we increase the LR
        num_iters (int): number of epochs times the number of updates per epoch
    """
    def __init__(self, optimizer, lr_ratio=32, max_val=0.01, cut_frac=0.1, num_iters=1000):
        self.optimizer = optimizer
        self.min_val = max_val / lr_ratio
        self.max_val = max_val
        self.cut = num_iters * cut_frac 
        self.end_triangle_iter = num_iters
        self.num_iters = 0
        self.lr_func = self.create_lr_func()


    def create_lr_func(self):
        lr_range = self.max_val - self.min_val

        up_slope = lr_range / self.cut
        up_intercept = self.min_val
        down_slope = -lr_range / (self.end_triangle_iter - self.cut)
        down_intercept = -down_slope * self.cut + self.max_val

        def lr_func():
            if self.num_iters <= self.cut:
                return up_slope * self.num_iters + up_intercept
            else:
                return down_slope * self.num_iters + down_intercept

        return lr_func

    def step(self, step_num=None):
        if step_num is None:
            step_num = self.num_iters + 1
        self.num_iters = step_num
        new_lr = self.lr_func()
                
            

In [23]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
t_total = num_train_steps
if args['local_rank'] != -1:
    t_total = t_total // torch.distributed.get_world_size()

optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args['learning_rate'],
                         warmup=args['warmup_proportion'],
                         t_total=t_total)





In [None]:
# Add Evaluation step
eval_examples = processor.get_val_examples(args['full_data_dir'], args['val_file'])

def eval():
    
    eval_features = convert_examples_to_features(
        eval_examples, label_list, args['max_seq_length'], tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    
    model.eval()
    eval_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0
    preds = []
    
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
        
        loss_fct = torch.nn.CrossEntropyLoss()
        tmp_eval_loss = loss_fct(logits.view(-1, num_labels), torch.max(label_ids,1)[1])
        
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)

        eval_loss += tmp_eval_loss.mean().item()

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    preds = np.argmax(preds, axis=1)
    all_label_ids2 = np.argmax(all_label_ids.numpy(), axis = 1)
    eval_accuracy = simple_accuracy(preds, all_label_ids2)
    eval_acc_f1 = acc_and_f1(preds, all_label_ids2)

    

    result = {'eval_loss': eval_loss,
              'eval_accuracy': eval_accuracy,
             'f1-macro': eval_acc_f1['f1-macro']}

    output_eval_file = os.path.join(OUT_PATH, "eval_results.txt")
    if os.path.exists(output_eval_file):
        cmd = 'a'
    else: cmd = 'w'
        
    with open(output_eval_file, cmd) as writer:
        logger.info("***** Eval results *****")
        writer.write("Model "+ args['model_fname']+"\n")
        writer.write("Layers "+ ' + '.join([str(i) for i in args['layer_sizes']])+'\n')
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result

## Training


In [None]:
train_features = convert_examples_to_features(
    train_examples, label_list, args['max_seq_length'], tokenizer)

In [None]:
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", args['train_batch_size'])
logger.info("  Num steps = %d", num_train_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args['local_rank'] == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
iters_per_epoch = len(train_features)
num_iters = iters_per_epoch * args['num_train_epochs']

scheduler = SlantedTriangularLR(optimizer, lr_ratio=32, 
                                max_val=0.01, cut_frac=0.1, num_iters=num_iters)


def fit(num_epocs=args['num_train_epochs']):
    global_step = 0
    model.train()
    for i_ in tqdm(range(int(num_epocs)), desc="Epoch"):

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']
            
            loss.backward()
                
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
#                 scheduler.optimizer.step()
                # modify learning rate with special warm up BERT uses
                lr_this_step = args['learning_rate'] * warmup_linear(global_step/t_total, args['warmup_proportion'])
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
#                 scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        logger.info('Loss after epoc {}'.format(tr_loss / nb_tr_steps))
        if args['train_on_full'] == False:
            logger.info('Eval after epoc {}'.format(i_+1))
            eval()
            
        

In [None]:
model.module.freeze_bert_encoder()

In [None]:
fit(1)

In [None]:
model.module.unfreeze_bert_encoder()

In [None]:
fit()

In [None]:
#Save model
# Save a trained model
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE


model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, "finetuned_pytorch_model_full.bin")
torch.save(model_to_save.state_dict(), output_model_file)


# #Load pretrained model
# model_state_dict = torch.load(output_model_file)
# model = BertMultiClassClf.from_pretrained(args['bert_model'], num_labels = num_labels, state_dict=model_state_dict)
# model.to(device)

## Evaluation

In [35]:
result = eval()
result

HBox(children=(IntProgress(value=0, description='Evaluating', max=63, style=ProgressStyle(description_width='i…




{'eval_loss': 0.9228694003725809,
 'eval_accuracy': 0.64,
 'f1-macro': 0.6428485577571229}

## Test Evaluation

In [33]:
def predict(model, path, test_filename):
    predict_processor = MultiLabelTextProcessor(path)
    test_examples = predict_processor.get_test_examples(path, test_filename)
    
    # Hold input data for returning it 
    input_data = [{ 'id': input_example.guid, 'text': input_example.text_a } for input_example in test_examples]

    test_features = convert_examples_to_features(
        test_examples, label_list, args['max_seq_length'], tokenizer)
    
    logger.info("***** Running prediction *****")
    logger.info("  Num examples = %d", len(test_examples))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    
    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in test_features], dtype=torch.long)
    
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    
    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args['eval_batch_size'])
    
    model.eval()
    test_loss = 0
    nb_eval_steps, nb_eval_examples = 0, 0
    preds = []
    
    for step, batch in enumerate(tqdm(test_dataloader, desc="Prediction Iteration")):
        input_ids, input_mask, segment_ids,label_ids = batch
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
        
        loss_fct = torch.nn.CrossEntropyLoss()
        tmp_test_loss = loss_fct(logits.view(-1, num_labels), torch.max(label_ids,1)[1])
        
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
        test_loss += tmp_test_loss.mean().item()
        
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1
        
    test_loss = test_loss / nb_eval_steps
    
    preds = preds[0]
    preds = np.argmax(preds, axis=1)
    all_label_ids2 = np.argmax(all_label_ids.numpy(), axis = 1)
    test_accuracy = simple_accuracy(preds, all_label_ids2)
    test_acc_f1 = acc_and_f1(preds, all_label_ids2)

    

    result = {'test_loss': test_loss,
              'test_accuracy': test_accuracy,
             'test_f1-macro': test_acc_f1['f1-macro']}

    output_test_file = os.path.join(OUT_PATH, "test_results.txt")
    if os.path.exists(output_test_file):
        cmd = 'a'
    else: cmd = 'w'
        
    with open(output_test_file, cmd) as writer:
        logger.info("***** Test results *****")
        writer.write("*******Test Results *******\n")
        writer.write("Model "+ args['model_fname']+"\n")
        writer.write("Layers "+ ' + '.join([str(i) for i in args['layer_sizes']])+'\n')
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write("------------------------------------\n")
    
    test_df = pd.DataFrame(input_data) 
    test_df['prediction'] = preds
    return result, test_df

In [34]:
result, test_df = predict(model, args['data_dir'], args['test_file'])

HBox(children=(IntProgress(value=0, description='Prediction Iteration', max=3125, style=ProgressStyle(descript…




  'precision', 'predicted', average, warn_for)


In [35]:
result

{'test_loss': 1.6622514208984376,
 'test_accuracy': 0.1994,
 'test_f1-macro': 0.10847427651888963}

In [36]:
#Save prediction result: 
test_df.to_csv(OUT_PATH/args['test_pred_file'], index = False)