In [1]:
import sys
sys.path.append('..')

import torch
import pandas as pd
import re
import numpy as np
from datasets import DatasetDict, Dataset
from torch import nn 
from transformers import BertModel, AutoModelForSequenceClassification, BertForTokenClassification, EarlyStoppingCallback

from utils import *
from dataset import *
from preprocess import *
from wrapper import *

# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [2]:
# Load the tab-separated train / test files.
train_df_full = pd.read_csv('../data/train.csv', sep='\t')
test_df_full = pd.read_csv('../data/test.csv', sep='\t')

# Work on small subsets so this sample run stays fast.
train_df = train_df_full.iloc[:1000]
# Draw the test subset WITHOUT replacement and with a fixed seed: the previous
# np.random.randint draw could pick the same row several times and changed on
# every kernel restart, so results were not reproducible.
test_df = test_df_full.sample(n=min(200, len(test_df_full)), random_state=42)

# Class counts (label 0, label 1) of the training subset. On the recorded run this
# was (257, 743), i.e. the classes are noticeably imbalanced towards label 1.
(train_df.label == 0).sum(), (train_df.label == 1).sum()

(257, 743)

In [3]:
# Checkpoint name of the primary encoder (passed to the dataset wrappers and the model).
MODEL_NAME = 'hfl/chinese-macbert-base'

# Hyper-parameters for the training run, grouped by concern.
arguments = AdversarialTrainingArguments(
    # -- checkpointing / evaluation cadence --
    output_dir="sample_trainer",      # where checkpoints are written
    evaluation_strategy="epoch",      # validate once per epoch
    save_strategy="epoch",            # checkpoint once per epoch
    load_best_model_at_end=True,      # reload the best checkpoint when training finishes
    # -- optimisation --
    num_train_epochs=4,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # -- data plumbing --
    label_names=['labels'],           # must be given so the labels are handed to the trainer
    # -- extra knobs consumed by AdversarialTrainingArguments --
    # NOTE(review): their exact meaning is defined in the project's wrapper module,
    # which is not visible from this notebook - confirm there before tuning.
    epsilon=0,
    gamma=0.2,
    alpha=0.3,
)

In [4]:
# Earlier variant without the auxiliary encoder, kept for reference:
# test = SimpleDataset(test_df, model_name=MODEL_NAME, test=True, split_words=False)

# Wrap the sampled test frame for the two-encoder model: MODEL_NAME is the primary
# checkpoint, the CLUENER checkpoint is the auxiliary one.
test = DatasetWithAuxiliaryEmbeddings(
    test_df,
    test=True,
    split_words=False,
    model_name=MODEL_NAME,
    aux_model_name='uer/roberta-base-finetuned-cluener2020-chinese',
)
# Two-step preparation exposed by the project's dataset wrapper; the result is read
# later through `test.dataset`.
test.tokenize()
test.construct_dataset()

  indexed_value = torch.tensor(value[index]).squeeze()


In [5]:
# Cross-validation setup.
k = 8          # number of folds
logits = []    # per-fold test predictions are appended here by the k-fold loop
# One entry per fold; each entry is later passed as `val_idx` to construct_dataset().
folds = generate_folds(len(train_df), k)

In [6]:
# Single-fold trial run: hold out the first fold for validation.
val_idx = folds[0]

train = DatasetWithAuxiliaryEmbeddings(
    train_df,
    model_name=MODEL_NAME,
    aux_model_name='uer/roberta-base-finetuned-cluener2020-chinese',
    split_words=False,
    train_val_split=0.8,
)
train.tokenize()
# The explicit fold indices select the validation rows for this run.
train.construct_dataset(val_idx=val_idx)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  indexed_value = torch.tensor(value[index]).squeeze()


In [7]:
class BertWithNER(nn.Module):
    """Sequence classifier that fuses a trainable BERT encoder with a frozen auxiliary encoder.

    Both checkpoints are loaded with ``BertModel.from_pretrained``. For each input the
    hidden state at position 0 of each encoder's last layer is taken, the two vectors
    are concatenated, and the result is passed through a small MLP head.

    Args:
        bert_model: name or path of the primary checkpoint (fine-tuned).
        ner_model: name or path of the auxiliary checkpoint (all parameters frozen).
        n_labels: number of output classes.
    """

    def __init__(self, bert_model, ner_model, n_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model)
        self.ner = BertModel.from_pretrained(ner_model)
        # The auxiliary encoder is used as a fixed feature extractor.
        for param in self.ner.parameters():
            param.requires_grad = False

        # Read the widths from the loaded configs instead of hard-coding 768, so
        # checkpoints of another size (e.g. *-large) work too. For base-size
        # checkpoints this yields exactly the previous 768*2 -> 768 -> n_labels head.
        bert_dim = self.bert.config.hidden_size
        ner_dim = self.ner.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(bert_dim + ner_dim, bert_dim, bias=True),
            nn.Tanh(),
            nn.Dropout(p=0.1),
            nn.Linear(bert_dim, n_labels, bias=True)
        )

    def forward(self, input_ids, attention_mask, auxiliary_input_ids, **kwargs):
        """Return ``{'logits': tensor}`` with one row of ``n_labels`` scores per input.

        Args:
            input_ids: token ids for the primary encoder.
            attention_mask: attention mask for the primary encoder.
            auxiliary_input_ids: token ids for the auxiliary encoder.
            **kwargs: extra batch fields. ``auxiliary_attention_mask`` is used for the
                auxiliary encoder when present; everything else is ignored.
        """
        # Fall back to the primary mask (the original behaviour) when the batch carries
        # no dedicated auxiliary mask. NOTE(review): sharing the mask is only correct if
        # both tokenizers pad the same positions - confirm against the dataset wrapper.
        aux_attention_mask = kwargs.get('auxiliary_attention_mask')
        if aux_attention_mask is None:
            aux_attention_mask = attention_mask

        # Hidden state at sequence position 0 of each encoder's last layer.
        bert_vector = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        ner_vector = self.ner(auxiliary_input_ids, attention_mask=aux_attention_mask).last_hidden_state[:, 0, :]

        fused = torch.cat((bert_vector, ner_vector), dim=1)
        output = self.classifier(fused)
        return {'logits': output}

# Build the two-encoder classifier: MODEL_NAME is fine-tuned, the CLUENER checkpoint
# is the frozen auxiliary encoder.
model = BertWithNER(
    bert_model=MODEL_NAME,
    ner_model='uer/roberta-base-finetuned-cluener2020-chinese',
    n_labels=2,
)

# Move to whichever device is actually available; an unconditional .cuda() raises on
# CPU-only machines. Kept as the last expression so the module summary is displayed.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

Some weights of the model checkpoint at hfl/chinese-macbert-base were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at uer/roberta-base-finetuned-cluener2020-chinese were not used when initializi

BertWithNER(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [8]:
# Keep every dataset column in the batches (presumably so the extra
# `auxiliary_input_ids` field reaches the model's forward()).
arguments.remove_unused_columns = False

trainer = AdversarialTrainer(
    model=model,
    args=arguments,
    tokenizer=train.tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train.dataset['train'],
    eval_dataset=train.dataset['val'],   # change to test when you do your final evaluation!
)

# Early stopping: halt once the monitored metric has failed to improve for
# 3 consecutive evaluations (any improvement counts, threshold 0.0).
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0,
)
trainer.add_callback(early_stopping)

In [9]:
# Run fine-tuning; the returned TrainOutput is shown as the cell's result.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 875
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 220


  0%|          | 0/220 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


  0%|          | 0/8 [00:00<?, ?it/s]

Saving model checkpoint to sample_trainer\checkpoint-55
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'eval_loss': 3.7417609691619873, 'eval_F1': 0.8173076923076923, 'eval_precision': 0.7327586206896551, 'eval_recall': 0.9239130434782609, 'eval_runtime': 1.7419, 'eval_samples_per_second': 71.76, 'eval_steps_per_second': 4.593, 'epoch': 1.0}


tokenizer config file saved in sample_trainer\checkpoint-55\tokenizer_config.json
Special tokens file saved in sample_trainer\checkpoint-55\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


  0%|          | 0/8 [00:00<?, ?it/s]

Saving model checkpoint to sample_trainer\checkpoint-110
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'eval_loss': 3.7796630859375, 'eval_F1': 0.8450704225352111, 'eval_precision': 0.743801652892562, 'eval_recall': 0.9782608695652174, 'eval_runtime': 1.79, 'eval_samples_per_second': 69.833, 'eval_steps_per_second': 4.469, 'epoch': 2.0}


tokenizer config file saved in sample_trainer\checkpoint-110\tokenizer_config.json
Special tokens file saved in sample_trainer\checkpoint-110\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


  0%|          | 0/8 [00:00<?, ?it/s]

Saving model checkpoint to sample_trainer\checkpoint-165
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'eval_loss': 3.625704288482666, 'eval_F1': 0.7314285714285714, 'eval_precision': 0.7710843373493976, 'eval_recall': 0.6956521739130435, 'eval_runtime': 1.8023, 'eval_samples_per_second': 69.355, 'eval_steps_per_second': 4.439, 'epoch': 3.0}


tokenizer config file saved in sample_trainer\checkpoint-165\tokenizer_config.json
Special tokens file saved in sample_trainer\checkpoint-165\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 16


  0%|          | 0/8 [00:00<?, ?it/s]

Saving model checkpoint to sample_trainer\checkpoint-220
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'eval_loss': 3.725074291229248, 'eval_F1': 0.8358208955223881, 'eval_precision': 0.7706422018348624, 'eval_recall': 0.9130434782608695, 'eval_runtime': 1.8031, 'eval_samples_per_second': 69.324, 'eval_steps_per_second': 4.437, 'epoch': 4.0}


tokenizer config file saved in sample_trainer\checkpoint-220\tokenizer_config.json
Special tokens file saved in sample_trainer\checkpoint-220\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from sample_trainer\checkpoint-165 (score: 3.625704288482666).


{'train_runtime': 223.7147, 'train_samples_per_second': 15.645, 'train_steps_per_second': 0.983, 'train_loss': 3.4824873490767048, 'epoch': 4.0}


TrainOutput(global_step=220, training_loss=3.4824873490767048, metrics={'train_runtime': 223.7147, 'train_samples_per_second': 15.645, 'train_steps_per_second': 0.983, 'train_loss': 3.4824873490767048, 'epoch': 4.0})

In [None]:
# k-fold ensemble: train one model per fold and collect its predictions on the test set.

# Start from an empty list so re-running this cell does not stack predictions from an
# earlier run on top of the new ones.
logits = []

# The datasets below carry extra auxiliary columns; keep them in the batches. Set here
# as well so this cell does not depend on another cell having been run first.
arguments.remove_unused_columns = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for i in range(k):
    val_idx = folds[i]

    # Use the same dataset wrapper and model as the single-fold run above. `test` was
    # built with DatasetWithAuxiliaryEmbeddings, so a plain sequence-classification
    # model trained on SimpleDataset would be handed an auxiliary column at prediction
    # time that its forward() does not accept.
    train = DatasetWithAuxiliaryEmbeddings(
        train_df,
        model_name=MODEL_NAME,
        aux_model_name='uer/roberta-base-finetuned-cluener2020-chinese',
        train_val_split=0.8,
        split_words=False,
    )
    train.tokenize()
    train.construct_dataset(val_idx=val_idx)

    model = BertWithNER(
        bert_model=MODEL_NAME,
        ner_model='uer/roberta-base-finetuned-cluener2020-chinese',
        n_labels=2,
    )
    model.to(device)  # works on CPU-only machines too, unlike an unconditional .cuda()

    trainer = AdversarialTrainer(
        model=model,
        args=arguments,
        train_dataset=train.dataset['train'],
        eval_dataset=train.dataset['val'],   # change to test when you do your final evaluation!
        tokenizer=train.tokenizer,
        compute_metrics=compute_metrics,
    )

    # Early stopping: halt once the monitored metric has failed to improve for
    # 3 consecutive evaluations.
    trainer.add_callback(EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.0,
    ))

    # Train the model (~15min for bert with max_seq_len=128)
    trainer.train()

    # Predict BEFORE releasing anything: the trainer still needs the model here.
    hiddens = trainer.predict(test.dataset['train']).predictions
    logits.append(hiddens)

    # Drop both references (the trainer holds the model too) so the GPU memory can
    # actually be reclaimed before the next fold.
    del trainer, model
    torch.cuda.empty_cache()

In [None]:
def voting(logits, val_accuracy=None):
    """Majority-vote ensemble, decided independently for every sample.

    The previous version counted identical *rows*, i.e. it returned the most frequent
    complete prediction vector instead of the most frequent label per sample.

    Args:
        logits: one of
            * 3-D ``(n_models, n_samples, n_classes)`` scores - each model's vote is
              the argmax over the last axis;
            * 2-D ``(n_models, n_samples)`` predicted labels;
            * 1-D ``(n_models,)`` predicted labels for a single sample.
        val_accuracy: unused; accepted so the signature matches ``averaging``.

    Returns:
        Array of shape ``(n_samples,)`` with the winning label per sample (a scalar for
        1-D input). Ties are resolved in favour of the smallest label.

    Raises:
        ValueError: if ``logits`` is empty.
    """
    votes = np.asarray(logits)
    if votes.size == 0:
        raise ValueError("voting() needs at least one prediction")
    if votes.ndim == 3:
        # Raw scores: turn each model's scores into its predicted label first.
        votes = votes.argmax(axis=-1)

    labels = np.unique(votes)
    # counts[j] = number of models that voted for labels[j], per sample.
    counts = np.stack([(votes == label).sum(axis=0) for label in labels])
    return labels[counts.argmax(axis=0)]

def averaging(logits, val_accuracy):
    """Rank-weighted average of per-model scores, followed by an argmax over classes.

    Models are ranked by validation accuracy (worst = 1, best = n) and each rank is
    divided by the sum of all ranks, so the weights sum to 1.

    The previous version summed the weights *before* multiplying (operator precedence),
    so every model was scaled by 1.0, the models were never summed, and the argmax ran
    over the sample axis.

    Args:
        logits: sequence of ``n_models`` score arrays, each ``(n_samples, n_classes)``
            (a single ``(n_classes,)`` vector per model also works).
        val_accuracy: sequence of ``n_models`` validation accuracies.

    Returns:
        Array of shape ``(n_samples,)`` with the predicted class per sample.

    Raises:
        AssertionError: if ``logits`` and ``val_accuracy`` differ in length.
        ValueError: if no model outputs are given.
    """
    assert len(logits) == len(val_accuracy)
    if len(logits) == 0:
        raise ValueError("averaging() needs at least one model's logits")

    stacked = np.asarray(logits, dtype=float)
    # Double argsort turns the accuracies into 0-based ranks; +1 makes them 1..n.
    ranks = np.argsort(np.argsort(val_accuracy)) + 1
    weights = ranks / ranks.sum()
    # Reshape to (n_models, 1, ..., 1) so each weight scales its whole model slice.
    weights = weights.reshape((-1,) + (1,) * (stacked.ndim - 1))
    ensemble_logits = (stacked * weights).sum(axis=0)
    return np.argmax(ensemble_logits, axis=-1)


In [None]:
# Ensemble by logits: average the per-fold predictions, then take the top class per row.
assert len(logits) > 0, "no fold predictions collected - run the k-fold cell first"
hiddens = np.array(logits).mean(0)
predictions = np.argmax(hiddens, 1)
result = pd.DataFrame(predictions, columns=['label'])

# Write results. to_csv opens and closes the file itself; the former surrounding
# open(fname, 'w+') only created a second handle that was never written to.
fname = 'submission.csv'
result.to_csv(fname, index=False)