<a href="https://colab.research.google.com/github/heraclex12/R-BERT-Relation-Classification/blob/master/BERT_for_Relation_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip3 install vncorenlp
# !pip3 install fairseq
# !pip3 install fastBPE
!pip3 install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 46kB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 69kB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 172kB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[

In [2]:
from transformers import *
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm, trange
os.chdir('drive/My Drive/vncorenlp')

In [3]:
class FCLayer(torch.nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate=0., use_activation=True):
    super(FCLayer, self).__init__()
    self.use_activation = use_activation
    self.dropout = torch.nn.Dropout(dropout_rate)
    self.linear = torch.nn.Linear(input_dim, output_dim)
    self.tanh = torch.nn.Tanh()
  
  def forward(self, x):
    x = self.dropout(x)
    if self.use_activation:
      x = self.tanh(x)
    return self.linear(x)

In [5]:
class RBERT(BertPreTrainedModel):
  base_model_prefix = "roberta"
  config_class = RobertaConfig
  pretrained_model_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
  def __init__(self, config, args):
    super(RBERT, self).__init__(config)
    self.roberta = RobertaModel(config=config)
    self.num_labels = config.num_labels
    self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, args['DROPOUT_RATE'])
    self.e1_fc_layer = FCLayer(config.hidden_size, config.hidden_size, args['DROPOUT_RATE'])
    self.e2_fc_layer = FCLayer(config.hidden_size, config.hidden_size, args['DROPOUT_RATE'])
    self.label_classifier = FCLayer(config.hidden_size * 3, self.num_labels, args['DROPOUT_RATE'], use_activation=False)

  @staticmethod
  def entity_average(hidden_output, e_mask):
    e_mask_unqueeze = e_mask.unsqueeze(1)
    length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1)

    sum_vector = torch.bmm(e_mask_unqueeze.float(), hidden_output).squeeze(1)
    avg_vector = sum_vector.float() / length_tensor.float()
    return avg_vector
  
  def forward(self, input_ids, attention_mask, token_type_ids, labels, e1_mask, e2_mask):
    outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    sequence_output = outputs[0]
    pooled_output = outputs[1]

    e1_h = self.entity_average(sequence_output, e1_mask)
    e2_h = self.entity_average(sequence_output, e2_mask)

    pooled_output = self.cls_fc_layer(pooled_output)
    e1_h = self.e1_fc_layer(e1_h)
    e2_h = self.e2_fc_layer(e2_h)

    concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
    logits = self.label_classifier(concat_h)

    outputs = (logits, ) + outputs[2:]

    if labels is not None:
      if self.num_labels == 1:
        loss_fct = torch.nn.MSELoss()
        loss = loss_fct(logits.view(-1), labels.view(-1))
      else:
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      
      outputs = (loss,) + outputs
    return outputs

In [6]:
train_df = pd.read_csv('relation_data/train.tsv', sep='\t', header=None, names=['label', 'text'])
test_df = pd.read_csv('relation_data/test.tsv', sep='\t', header=None, names=['label', 'text'])
labels = pd.read_fwf('relation_data/label.txt', header=None, names=['label'])
train_df.head()

Unnamed: 0,label,text
0,"Component-Whole(e2,e1)",The system as described above has its greatest...
1,Other,The <e1> child </e1> was carefully wrapped and...
2,"Instrument-Agency(e2,e1)",The <e1> author </e1> of a keygen uses a <e2> ...
3,Other,A misty <e1> ridge </e1> uprises from the <e2>...
4,"Member-Collection(e1,e2)",The <e1> student </e1> <e2> association </e2> ...


In [7]:
def seed_everything(SEED):
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  torch.cuda.manual_seed(SEED)
  torch.backends.cudnn.deterministic = True

if torch.cuda.is_available():
  device = torch.device("cuda")

  print("We will use the GPU:", torch.cuda.get_device_name())

args = {
    'NUM_LABELS' : len(labels),
    'DROPOUT_RATE' : 0.1,
    'LEARNING_RATE' : 2e-5,
    'EPOCHS' : 5,
    'MAX_SEQUENCE_LENGTH' : 128,
    'BATCH_SIZE' : 32,
    'ADAM_EPSILON' : 1e-8,
    'MAX_STEPS' : -1,
    'GRADIENT_ACCUMULATION_STEPS' : 1,
    'MAX_GRAD_NORM' : 1.0,
    'LOGGING_STEPS' : 250,
    'SAVE_STEPS' : 250,
    'WEIGHT_DECAY' : 0.1,
    'NUM_WARMUP_STEPS' : 100,
}

We will use the GPU: Tesla V100-SXM2-16GB


In [8]:
ADDITIONAL_SPECIAL_TOKENS = ["<e1>", "</e1>", "<e2>", "</e2>"]
def load_model(args, mode='en'):
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
  tokenizer.add_special_tokens({"additional_special_tokens" : ADDITIONAL_SPECIAL_TOKENS})

  config = RobertaConfig.from_pretrained('roberta-base', num_labels = args['NUM_LABELS'])
  model = RBERT.from_pretrained('roberta-base', config=config, args=args)
  model.cuda()
  return config, tokenizer, model

seed_everything(69)
config, tokenizer, model = load_model(args)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RBERT: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RBERT were not initialized from the model checkpoint at roberta-base and are newly initialized: ['cls_fc_layer.linear.weight', 'cls_fc_layer.linear.bias', 'e1_fc_layer.linear.weight', 'e1_fc_layer.linear.bias', 'e2_fc_layer.linear.weight', 'e2_fc_layer.linear.bias', 'label_classifier.linear.weight', '

In [9]:
def convert_lines(df, label_indexes, max_seq_len, tokenizer, cls_token='[CLS]', cls_token_segment_id=0,
                  sep_token='[SEP]', pad_token=0, pad_token_segment_id=0, sequence_a_segment_id=0,
                  add_sep_token=False, mask_padding_with_zero=True):
  input_ids = []
  attention_masks = []
  token_type_ids = []
  e1_masks = []
  e2_masks = []
  labels = []
  print("Converting sentence...")
  for row in df.itertuples():
    if (row.Index % 5000 == 0 and row.Index > 0) or row.Index == len(df) - 1:
      print('Parsing {} of {}'.format(row.Index + 1, len(df)))
    
    tokens = tokenizer.tokenize(row.text)
    e11_p = tokens.index("<e1>")
    e12_p = tokens.index("</e1>")
    e21_p = tokens.index("<e2>")
    e22_p = tokens.index("</e2>")

    tokens[e11_p] = '$'
    tokens[e12_p] = '$'
    tokens[e21_p] = '#'
    tokens[e22_p] = '#'

    e11_p += 1
    e12_p += 1
    e21_p += 1
    e22_p += 1

    if add_sep_token:
      special_tokens_count = 2
    else:
      special_tokens_count = 1
    
    if len(tokens) > max_seq_len - special_tokens_count:
      tokens = tokens[:(max_seq_len - special_tokens_count)]
    
    if add_sep_token:
      tokens += [sep_token]
    
    token_type_id = [sequence_a_segment_id] * len(tokens)
    tokens = [cls_token] + tokens
    token_type_id = [cls_token_segment_id] + token_type_id

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_id)

    padding_length = max_seq_len - len(input_id)
    input_id = input_id + ([pad_token] * padding_length)
    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
    token_type_id = token_type_id + ([pad_token_segment_id] * padding_length)

    e1_mask = [0] * len(attention_mask)
    e2_mask = [0] * len(attention_mask)

    for i in range(e11_p, e12_p + 1):
      e1_mask[i] = 1
    for i in range(e21_p, e22_p + 1):
      e2_mask[i] = 1

    assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
    assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
    assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    labels.append(label_indexes.index[label_indexes.label == row.label][0])
    e1_masks.append(e1_mask)
    e2_masks.append(e2_mask)
  
  dataset = torch.utils.data.TensorDataset(torch.tensor(input_ids), 
                          torch.tensor(attention_masks), 
                          torch.tensor(token_type_ids), 
                          torch.tensor(labels), 
                          torch.tensor(e1_masks),
                          torch.tensor(e2_masks))
  return dataset

In [10]:
train_dataset = convert_lines(train_df, labels, args['MAX_SEQUENCE_LENGTH'], tokenizer, cls_token='<s>', sep_token='</s>', pad_token=1)
test_dataset = convert_lines(test_df, labels, args['MAX_SEQUENCE_LENGTH'], tokenizer, cls_token='<s>', sep_token='</s>', pad_token=1)

train_sampler = torch.utils.data.RandomSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=args['BATCH_SIZE'])

test_sampler = torch.utils.data.SequentialSampler(test_dataset)
test_loader = torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=args['BATCH_SIZE'] * 2)

Converting sentence...
Parsing 5001 of 8000
Parsing 8000 of 8000
Converting sentence...
Parsing 2717 of 2717


In [11]:
def evaluate(model, device, test_loader):
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None

    model.eval()

    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels': batch[3],
                      'e1_mask': batch[4],
                      'e2_mask': batch[5]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)

    result = {'accuracy' : accuracy_score(out_label_ids, preds), 'pred' : preds}
    return result

def save_model():
    torch.save(model.state_dict(), 'relation_data/trained_models/model.bin')

def load_saved_model(args):
  tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
  tokenizer.add_special_tokens({"additional_special_tokens" : ADDITIONAL_SPECIAL_TOKENS})

  config = RobertaConfig.from_pretrained('roberta-base', output_hideen_states=True, num_labels = args['NUM_LABELS'])
  model = RBERT.from_pretrained('relation_data/trained_models/model.bin', config=config, args=args)
  model.cuda()
  return config, tokenizer, model

In [12]:

if args['MAX_STEPS'] > 0:
  t_total = args['MAX_STEPS']
  args['EPOCHS'] = args['MAX_STEPS'] // (len(train_loader) // args['GRADIENT_ACCUMULATION_STEPS']) + 1
else:
  t_total = len(train_loader) // args['GRADIENT_ACCUMULATION_STEPS'] * args['EPOCHS']

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': args['WEIGHT_DECAY']},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['LEARNING_RATE'], eps=args['ADAM_EPSILON'])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['NUM_WARMUP_STEPS'], num_training_steps=t_total)


global_step = 0
tr_loss = 0.0
model.zero_grad()

train_iterator = trange(int(args['EPOCHS']), desc="Epoch")


for _ in train_iterator:
    epoch_iterator = tqdm(train_loader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(device) for t in batch)  # GPU or CPU
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels': batch[3],
                  'e1_mask': batch[4],
                  'e2_mask': batch[5]}
        outputs = model(**inputs)
        loss = outputs[0]

        if args['GRADIENT_ACCUMULATION_STEPS'] > 1:
            loss = loss / args['GRADIENT_ACCUMULATION_STEPS']

        loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args['GRADIENT_ACCUMULATION_STEPS'] == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args['MAX_GRAD_NORM'])

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args['LOGGING_STEPS'] > 0 and global_step % args['LOGGING_STEPS']== 0:
                print("ACCURACY: ", evaluate(model, device, test_loader)['accuracy'])

            # if args['SAVE_STEPS'] > 0 and global_step % args['SAVE_STEPS'] == 0:
            #     save_model()

        if 0 < args['MAX_STEPS'] < global_step:
            epoch_iterator.close()
            break

    if 0 < args['MAX_STEPS'] < global_step:
        train_iterator.close()
        break

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/250 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/250 [00:00<02:40,  1.55it/s][A
Iteration:   1%|          | 2/250 [00:00<02:09,  1.92it/s][A
Iteration:   1%|          | 3/250 [00:01<01:47,  2.30it/s][A
Iteration:   2%|▏         | 4/250 [00:01<01:32,  2.65it/s][A
Iteration:   2%|▏         | 5/250 [00:01<01:21,  2.99it/s][A
Iteration:   2%|▏         | 6/250 [00:01<01:14,  3.29it/s][A
Iteration:   3%|▎         | 7/250 [00:02<01:08,  3.54it/s][A
Iteration:   3%|▎         | 8/250 [00:02<01:04,  3.73it/s][A
Iteration:   4%|▎         | 9/250 [00:02<01:01,  3.89it/s][A
Iteration:   4%|▍         | 10/250 [00:02<01:00,  3.99it/s][A
Iteration:   4%|▍         | 11/250 [00:02<00:58,  4.08it/s][A
Iteration:   5%|▍         | 12/250 [00:03<00:57,  4.15it/s][A
Iteration:   5%|▌         | 13/250 [00:03<00:56,  4.20it/s][A
Iteration:   6%|▌         | 14/250 [00:03<00:55,  4.23it/s][A
Iteration:   6%|▌         | 

ACCURACY:  0.742730953257269



Iteration:   0%|          | 1/250 [00:00<00:57,  4.31it/s][A
Iteration:   1%|          | 2/250 [00:00<00:59,  4.19it/s][A
Iteration:   1%|          | 3/250 [00:00<00:59,  4.19it/s][A
Iteration:   2%|▏         | 4/250 [00:00<00:58,  4.20it/s][A
Iteration:   2%|▏         | 5/250 [00:01<00:58,  4.21it/s][A
Iteration:   2%|▏         | 6/250 [00:01<00:57,  4.23it/s][A
Iteration:   3%|▎         | 7/250 [00:01<00:58,  4.18it/s][A
Iteration:   3%|▎         | 8/250 [00:01<00:57,  4.19it/s][A
Iteration:   4%|▎         | 9/250 [00:02<00:57,  4.20it/s][A
Iteration:   4%|▍         | 10/250 [00:02<00:56,  4.22it/s][A
Iteration:   4%|▍         | 11/250 [00:02<00:56,  4.24it/s][A
Iteration:   5%|▍         | 12/250 [00:02<00:56,  4.24it/s][A
Iteration:   5%|▌         | 13/250 [00:03<00:55,  4.25it/s][A
Iteration:   6%|▌         | 14/250 [00:03<00:55,  4.23it/s][A
Iteration:   6%|▌         | 15/250 [00:03<00:55,  4.23it/s][A
Iteration:   6%|▋         | 16/250 [00:03<00:55,  4.24it/s][A


ACCURACY:  0.8148693411851307



Iteration:   0%|          | 1/250 [00:00<00:59,  4.18it/s][A
Iteration:   1%|          | 2/250 [00:00<00:58,  4.21it/s][A
Iteration:   1%|          | 3/250 [00:00<00:58,  4.23it/s][A
Iteration:   2%|▏         | 4/250 [00:00<00:58,  4.23it/s][A
Iteration:   2%|▏         | 5/250 [00:01<00:57,  4.23it/s][A
Iteration:   2%|▏         | 6/250 [00:01<00:57,  4.25it/s][A
Iteration:   3%|▎         | 7/250 [00:01<00:57,  4.25it/s][A
Iteration:   3%|▎         | 8/250 [00:01<00:57,  4.22it/s][A
Iteration:   4%|▎         | 9/250 [00:02<00:56,  4.24it/s][A
Iteration:   4%|▍         | 10/250 [00:02<00:56,  4.21it/s][A
Iteration:   4%|▍         | 11/250 [00:02<00:57,  4.18it/s][A
Iteration:   5%|▍         | 12/250 [00:02<00:57,  4.17it/s][A
Iteration:   5%|▌         | 13/250 [00:03<00:56,  4.19it/s][A
Iteration:   6%|▌         | 14/250 [00:03<00:55,  4.22it/s][A
Iteration:   6%|▌         | 15/250 [00:03<00:55,  4.24it/s][A
Iteration:   6%|▋         | 16/250 [00:03<00:54,  4.26it/s][A


ACCURACY:  0.8251748251748252



Iteration:   0%|          | 1/250 [00:00<00:58,  4.23it/s][A
Iteration:   1%|          | 2/250 [00:00<00:58,  4.22it/s][A
Iteration:   1%|          | 3/250 [00:00<00:58,  4.22it/s][A
Iteration:   2%|▏         | 4/250 [00:00<00:58,  4.22it/s][A
Iteration:   2%|▏         | 5/250 [00:01<00:58,  4.20it/s][A
Iteration:   2%|▏         | 6/250 [00:01<00:58,  4.20it/s][A
Iteration:   3%|▎         | 7/250 [00:01<00:58,  4.19it/s][A
Iteration:   3%|▎         | 8/250 [00:01<00:57,  4.19it/s][A
Iteration:   4%|▎         | 9/250 [00:02<00:57,  4.21it/s][A
Iteration:   4%|▍         | 10/250 [00:02<00:56,  4.21it/s][A
Iteration:   4%|▍         | 11/250 [00:02<00:56,  4.20it/s][A
Iteration:   5%|▍         | 12/250 [00:02<00:57,  4.17it/s][A
Iteration:   5%|▌         | 13/250 [00:03<00:56,  4.19it/s][A
Iteration:   6%|▌         | 14/250 [00:03<00:56,  4.16it/s][A
Iteration:   6%|▌         | 15/250 [00:03<00:56,  4.16it/s][A
Iteration:   6%|▋         | 16/250 [00:03<00:56,  4.15it/s][A


ACCURACY:  0.8292234081707766



Iteration:   0%|          | 1/250 [00:00<00:59,  4.19it/s][A
Iteration:   1%|          | 2/250 [00:00<00:59,  4.19it/s][A
Iteration:   1%|          | 3/250 [00:00<00:58,  4.20it/s][A
Iteration:   2%|▏         | 4/250 [00:00<00:58,  4.21it/s][A
Iteration:   2%|▏         | 5/250 [00:01<00:57,  4.22it/s][A
Iteration:   2%|▏         | 6/250 [00:01<00:57,  4.22it/s][A
Iteration:   3%|▎         | 7/250 [00:01<00:57,  4.22it/s][A
Iteration:   3%|▎         | 8/250 [00:01<00:57,  4.24it/s][A
Iteration:   4%|▎         | 9/250 [00:02<00:56,  4.24it/s][A
Iteration:   4%|▍         | 10/250 [00:02<00:56,  4.26it/s][A
Iteration:   4%|▍         | 11/250 [00:02<00:56,  4.25it/s][A
Iteration:   5%|▍         | 12/250 [00:02<00:55,  4.25it/s][A
Iteration:   5%|▌         | 13/250 [00:03<00:55,  4.24it/s][A
Iteration:   6%|▌         | 14/250 [00:03<00:55,  4.25it/s][A
Iteration:   6%|▌         | 15/250 [00:03<00:55,  4.26it/s][A
Iteration:   6%|▋         | 16/250 [00:03<00:55,  4.24it/s][A


ACCURACY:  0.8354803091645197





In [16]:
result = evaluate(model, device, test_loader)
print("\nAccuracy: ", result['accuracy'])
# labels.iloc[result['pred']].values

Evaluating: 100%|██████████| 43/43 [00:05<00:00,  7.73it/s]


Accuracy:  0.8354803091645197





In [17]:
def convert_lines_new(df, label_indexes, max_seq_len, tokenizer, cls_token='[CLS]', cls_token_segment_id=0,
                  sep_token='[SEP]', pad_token=0, pad_token_segment_id=0, sequence_a_segment_id=0,
                  add_sep_token=False, mask_padding_with_zero=True):
  input_ids = []
  attention_masks = []
  token_type_ids = []
  e1_masks = []
  e2_masks = []
  labels = []
  print("Converting sentence...")
  for row in df.itertuples():
    if (row.Index % 5000 == 0 and row.Index > 0) or row.Index == len(df) - 1:
      print('Parsing {} of {}'.format(row.Index + 1, len(df)))
    
    tokens = tokenizer.tokenize(row.text)

    if add_sep_token:
      special_tokens_count = 2
    else:
      special_tokens_count = 1
    
    if len(tokens) > max_seq_len - special_tokens_count:
      tokens = tokens[:(max_seq_len - special_tokens_count)]
    
    if add_sep_token:
      tokens += [sep_token]
    
    token_type_id = [sequence_a_segment_id] * len(tokens)
    tokens = [cls_token] + tokens
    token_type_id = [cls_token_segment_id] + token_type_id

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_id)

    padding_length = max_seq_len - len(input_id)
    input_id = input_id + ([pad_token] * padding_length)
    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
    token_type_id = token_type_id + ([pad_token_segment_id] * padding_length)

    e1_mask = [0] * len(attention_mask)
    e2_mask = [0] * len(attention_mask)

    assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
    assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
    assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)
    labels.append(label_indexes.index[label_indexes.label == row.label][0])
    e1_masks.append(e1_mask)
    e2_masks.append(e2_mask)
  
  dataset = torch.utils.data.TensorDataset(torch.tensor(input_ids), 
                          torch.tensor(attention_masks), 
                          torch.tensor(token_type_ids), 
                          torch.tensor(labels), 
                          torch.tensor(e1_masks),
                          torch.tensor(e2_masks))
  return dataset

In [18]:
own_data = {
    'label' : ['Message-Topic(e1,e2)', 'Message-Topic(e1,e2)', 'Component-Whole(e1,e2)',
               'Message-Topic(e2,e1)', 'Cause-Effect(e2,e1)', 'Product-Producer(e1,e2)',
               'Entity-Destination(e1,e2)', 'Component-Whole(e1,e2)', 'Entity-Origin(e1,e2)'],
    'text' : ['	This <e1> article </e1> gives details on 2004 in <e2> music </e2> in the United Kingdom, including the official charts from that year.',
	'We have therefore taken the initiative to convene the first international open <e1> meeting </e1> dedicated solely to <e2> rural history </e2>.',
	'The <e1> timer </e1> of the <e2> device </e2> automatically eliminates wasted "standby power" consumption by automatically turn off electronics plugged into the "auto off" outlets.',
	'Bob Parks made a similar <e1> offer </e1> in a <e2> phone call </e2> made earlier this week.',
	'He had chest pains and <e1> headaches </e1> from <e2> mold </e2> in the bedrooms.',
	'The silver-haired author was not just laying India\'s politician saint to rest but healing a generations-old rift in the family of the <e1> country </e1>\'s founding <e2> father </e2>.',
	'It describes a method for loading a horizontal <e1> stack </e1> of containers into a <e2> carton </e2>.',
	'The Foundation decided to repurpose the building in order to reduce wear and tear on the <e1> plumbing </e1> in the <e2> manor house </e2> by redirecting visitors during restoration projects and beyond.',
	'The technology is available to produce and transmit <e1> electricity </e1> economically from OTEC <e2> systems </e2>.']
}

In [19]:
df = pd.DataFrame(data=own_data)
p_data = convert_lines(df, labels, 128, tokenizer)

Converting sentence...
Parsing 9 of 9


In [20]:
# config, tokenizer, model = load_saved_model(args)

p_sampler = torch.utils.data.SequentialSampler(p_data)
p_loader = torch.utils.data.DataLoader(p_data, sampler=p_sampler, batch_size=args['BATCH_SIZE'] * 2)

result_p = evaluate(model, device, p_loader)
result_p

Evaluating: 100%|██████████| 1/1 [00:00<00:00, 37.38it/s]


{'accuracy': 1.0, 'pred': array([17, 17, 13, 18,  2,  5, 11, 13,  9])}