In [1]:
import os
path = '/content/drive/MyDrive/VLSP_ReINTEL'
#you should change this path to your project folder path
os.chdir(path)

In [2]:
# install requirement library
!pip install transformers
!pip install fastBPE
!pip install fairseq
!pip install vncorenlp

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 12.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.5MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 50.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=05c2b631061fd37c69

In [3]:
#download PhoBERT base transformer
# you must run this cell first time you run this code
#!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
#!tar -xzvf PhoBERT_base_transformers.tar.gz

In [4]:
# Download VnCoreNLP-1.1.1.jar & its word segmentation component (i.e. RDRSegmenter) 
# you must run this cell first time you run this code
#!mkdir -p vncorenlp/models/wordsegmenter
#!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
#!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
#!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
#!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
#!mv vi-vocab vncorenlp/models/wordsegmenter/
#!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/

In [5]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import json
import numpy as np
import pickle
import os
import torch

def convert_lines(df, vocab, bpe, max_sequence_length):
    outputs = np.zeros((len(df), max_sequence_length))
    
    cls_id = 0
    eos_id = 2
    pad_id = 1

    for idx, row in tqdm(df.iterrows(), total=len(df)): 
        subwords = bpe.encode('<s> '+row.post_message+' </s>')
        input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        if len(input_ids) > max_sequence_length: 
            input_ids = input_ids[:max_sequence_length] 
            input_ids[-1] = eos_id
        else:
            input_ids = input_ids + [pad_id, ]*(max_sequence_length - len(input_ids))
        outputs[idx,:] = np.array(input_ids)
    return outputs

def seed_everything(SEED):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [6]:
from vncorenlp import VnCoreNLP
rdrsegmenter = VnCoreNLP(path+'/vncorenlp/VnCoreNLP-1.1.1.jar', annotators="wseg", max_heap_size='-Xmx500m') 
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default=path+"/PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file(path+"/PhoBERT_base_transformers/dict.txt")

In [7]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from torch import nn
import json
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from transformers import *
import torch
import matplotlib.pyplot as plt
import torch.utils.data
import torch.nn.functional as F
import argparse
from transformers.modeling_utils import * 

In [8]:
#load data
import pandas as pd
train = pd.read_csv(path+'/VSLP_data/public_train (1).csv')
public_test = pd.read_csv(path+'/VSLP_data/public_test.csv')
test = pd.read_csv(path+'/VSLP_data/final_private_test_dropped_no_label - final_private_test_dropped_no_label.csv')

In [9]:
class RobertaForReINTEL(BertPreTrainedModel):
   config_class = RobertaConfig
   ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
    'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin",
    'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin",
}

   pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
   base_model_prefix = "roberta"
   def __init__(self, config):
       super(RobertaForReINTEL, self).__init__(config)
       self.num_labels = config.num_labels
       self.roberta = RobertaModel(config)
       self.qa_outputs = nn.Linear(4*config.hidden_size, self.num_labels)

       self.init_weights()

   def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):

       outputs = self.roberta(input_ids,
                            attention_mask=attention_mask,
                            position_ids=position_ids,
                            head_mask=head_mask)
       cls_output = torch.cat((outputs[2][-1][:,0, ...],outputs[2][-2][:,0, ...], outputs[2][-3][:,0, ...], outputs[2][-4][:,0, ...]),-1)
       logits = self.qa_outputs(cls_output)
       return logits

In [10]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW
config = RobertaConfig.from_pretrained(
    path+"/PhoBERT_base_transformers/config.json",
    output_hidden_states=True,
    num_labels=1
)
model_bert = RobertaForReINTEL.from_pretrained(path+'/PhoBERT_base_transformers/model.bin', config=config)

Some weights of the model checkpoint at /content/drive/MyDrive/VLSP_ReINTEL/PhoBERT_base_transformers/model.bin were not used when initializing RobertaForReINTEL: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForReINTEL from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForReINTEL from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForReINTEL were not initialized from the model checkpoint at /content/drive/MyDrive/VLSP_ReINTEL/PhoBERT_base_transformers/model.bin and are newly initialized: ['qa_outputs.wei

In [11]:
if torch.cuda.device_count():
    print(f"Training using {torch.cuda.device_count()} gpus")
    model_bert = nn.DataParallel(model_bert)
    tsfm = model_bert.module.roberta 
else:
    tsfm = model_bert.roberta

Training using 1 gpus


In [12]:
kmax_sequence_length= 256
kbatch_size= 32
kaccumulation_steps= 5
kepochs= 5
kfold= 0
kseed= 42
klr= 2e-5

In [13]:
train_df = train[['id','post_message','label']].fillna('none')
train_df.post_message = train_df.post_message.progress_apply(lambda x: ' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(x)])) 
y = train_df.label.values
X_train = convert_lines(train_df, vocab, bpe, kmax_sequence_length)

100%|██████████| 4372/4372 [00:31<00:00, 139.32it/s]
100%|██████████| 4372/4372 [00:07<00:00, 609.25it/s]


In [14]:
param_optimizer = list(model_bert.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(kepochs*len(train_df)/kbatch_size/kaccumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=klr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
scheduler0 = get_constant_schedule(optimizer)  # PyTorch scheduler


In [15]:
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(X_train, y))
for fold, (train_idx, val_idx) in enumerate(splits):
    if fold != kfold:
      continue
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train[train_idx],dtype=torch.long), torch.tensor(y[train_idx],dtype=torch.long))
    valid_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train[val_idx],dtype=torch.long), torch.tensor(y[val_idx],dtype=torch.long))
    tq = tqdm(range(kepochs + 1))
    for child in tsfm.children():
        for param in child.parameters():
            if not param.requires_grad:
                print("whoopsies")
            param.requires_grad = False
    frozen = True
    for epoch in tq:

        if epoch > 0 and frozen:
            for child in tsfm.children():
                for param in child.parameters():
                    param.requires_grad = True
            frozen = False
            del scheduler0
            torch.cuda.empty_cache()

        val_preds = None
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=kbatch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=kbatch_size, shuffle=False)
        avg_loss = 0.
        avg_accuracy = 0.

        optimizer.zero_grad()
        pbar = tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
        for i,(x_batch, y_batch) in pbar:
            model_bert.train()
            y_pred = model_bert(x_batch.cuda(), attention_mask=(x_batch>0).cuda())
            loss =  F.binary_cross_entropy_with_logits(y_pred.view(-1).cuda(),y_batch.float().cuda())
            loss = loss.mean()
            loss.backward()
            if i % kaccumulation_steps == 0 or i == len(pbar) - 1:
                optimizer.step()
                optimizer.zero_grad()
                if not frozen:
                    scheduler.step()
                else:
                    scheduler0.step()
            lossf = loss.item()
            pbar.set_postfix(loss = lossf)
            avg_loss += loss.item() / len(train_loader)

        model_bert.eval()
        pbar = tqdm(enumerate(valid_loader),total=len(valid_loader),leave=False)
        for i,(x_batch, y_batch) in pbar:
            y_pred = model_bert(x_batch.cuda(), attention_mask=(x_batch>0).cuda())
            y_pred = y_pred.squeeze().detach().cpu().numpy()
            val_preds = np.atleast_1d(y_pred) if val_preds is None else np.concatenate([val_preds, np.atleast_1d(y_pred)])
        val_preds = sigmoid(val_preds)
        score = f1_score(y[val_idx], val_preds > 0.5)
        print(f"\nAUC = {roc_auc_score(y[val_idx], val_preds):.4f}, F1 score @0.5 = {score:.4f}")

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/110 [00:00<?, ?it/s][A
  0%|          | 0/110 [00:00<?, ?it/s, loss=0.654][A
  1%|          | 1/110 [00:00<01:11,  1.52it/s, loss=0.654][A
  1%|          | 1/110 [00:01<01:11,  1.52it/s, loss=0.691][A
  2%|▏         | 2/110 [00:01<01:04,  1.68it/s, loss=0.691][A
  2%|▏         | 2/110 [00:01<01:04,  1.68it/s, loss=0.643][A
  3%|▎         | 3/110 [00:01<00:59,  1.79it/s, loss=0.643][A
  3%|▎         | 3/110 [00:02<00:59,  1.79it/s, loss=0.653][A
  4%|▎         | 4/110 [00:02<00:56,  1.89it/s, loss=0.653][A
  4%|▎         | 4/110 [00:02<00:56,  1.89it/s, loss=0.625][A
  5%|▍         | 5/110 [00:02<00:53,  1.96it/s, loss=0.625][A
  5%|▍         | 5/110 [00:02<00:53,  1.96it/s, loss=0.623][A
  5%|▌         | 6/110 [00:02<00:51,  2.01it/s, loss=0.623][A
  5%|▌         | 6/110 [00:03<00:51,  2.01it/s, loss=0.651][A
  6%|▋         | 7/110 [00:03<00:50,  2.05it/s, loss=0.651][A
  6%|▋         | 7/110 [00:03<00:50,  2.05it/


AUC = 0.5842, F1 score @0.5 = 0.0000



  0%|          | 0/110 [00:01<?, ?it/s, loss=0.537][A
  1%|          | 1/110 [00:01<02:44,  1.51s/it, loss=0.537][A
  1%|          | 1/110 [00:03<02:44,  1.51s/it, loss=0.46] [A
  2%|▏         | 2/110 [00:03<02:42,  1.51s/it, loss=0.46][A
  2%|▏         | 2/110 [00:04<02:42,  1.51s/it, loss=0.481][A
  3%|▎         | 3/110 [00:04<02:41,  1.51s/it, loss=0.481][A
  3%|▎         | 3/110 [00:06<02:41,  1.51s/it, loss=0.534][A
  4%|▎         | 4/110 [00:06<02:39,  1.51s/it, loss=0.534][A
  4%|▎         | 4/110 [00:07<02:39,  1.51s/it, loss=0.499][A
  5%|▍         | 5/110 [00:07<02:38,  1.51s/it, loss=0.499][A
  5%|▍         | 5/110 [00:09<02:38,  1.51s/it, loss=0.502][A
  5%|▌         | 6/110 [00:09<02:39,  1.53s/it, loss=0.502][A
  5%|▌         | 6/110 [00:10<02:39,  1.53s/it, loss=0.562][A
  6%|▋         | 7/110 [00:10<02:37,  1.53s/it, loss=0.562][A
  6%|▋         | 7/110 [00:12<02:37,  1.53s/it, loss=0.483][A
  7%|▋         | 8/110 [00:12<02:36,  1.53s/it, loss=0.483][A



AUC = 0.6499, F1 score @0.5 = 0.0000



  0%|          | 0/110 [00:01<?, ?it/s, loss=0.272][A
  1%|          | 1/110 [00:01<02:48,  1.55s/it, loss=0.272][A
  1%|          | 1/110 [00:03<02:48,  1.55s/it, loss=0.364][A
  2%|▏         | 2/110 [00:03<02:45,  1.54s/it, loss=0.364][A
  2%|▏         | 2/110 [00:04<02:45,  1.54s/it, loss=0.557][A
  3%|▎         | 3/110 [00:04<02:43,  1.53s/it, loss=0.557][A
  3%|▎         | 3/110 [00:06<02:43,  1.53s/it, loss=0.204][A
  4%|▎         | 4/110 [00:06<02:41,  1.52s/it, loss=0.204][A
  4%|▎         | 4/110 [00:07<02:41,  1.52s/it, loss=0.439][A
  5%|▍         | 5/110 [00:07<02:39,  1.52s/it, loss=0.439][A
  5%|▍         | 5/110 [00:09<02:39,  1.52s/it, loss=0.624][A
  5%|▌         | 6/110 [00:09<02:39,  1.53s/it, loss=0.624][A
  5%|▌         | 6/110 [00:10<02:39,  1.53s/it, loss=0.456][A
  6%|▋         | 7/110 [00:10<02:37,  1.52s/it, loss=0.456][A
  6%|▋         | 7/110 [00:12<02:37,  1.52s/it, loss=0.4]  [A
  7%|▋         | 8/110 [00:12<02:35,  1.52s/it, loss=0.4][A
 


AUC = 0.8803, F1 score @0.5 = 0.3770



  0%|          | 0/110 [00:01<?, ?it/s, loss=0.333][A
  1%|          | 1/110 [00:01<02:49,  1.56s/it, loss=0.333][A
  1%|          | 1/110 [00:03<02:49,  1.56s/it, loss=0.424][A
  2%|▏         | 2/110 [00:03<02:47,  1.55s/it, loss=0.424][A
  2%|▏         | 2/110 [00:04<02:47,  1.55s/it, loss=0.323][A
  3%|▎         | 3/110 [00:04<02:44,  1.54s/it, loss=0.323][A
  3%|▎         | 3/110 [00:06<02:44,  1.54s/it, loss=0.254][A
  4%|▎         | 4/110 [00:06<02:42,  1.53s/it, loss=0.254][A
  4%|▎         | 4/110 [00:07<02:42,  1.53s/it, loss=0.297][A
  5%|▍         | 5/110 [00:07<02:40,  1.53s/it, loss=0.297][A
  5%|▍         | 5/110 [00:09<02:40,  1.53s/it, loss=0.18] [A
  5%|▌         | 6/110 [00:09<02:40,  1.54s/it, loss=0.18][A
  5%|▌         | 6/110 [00:10<02:40,  1.54s/it, loss=0.326][A
  6%|▋         | 7/110 [00:10<02:38,  1.53s/it, loss=0.326][A
  6%|▋         | 7/110 [00:12<02:38,  1.53s/it, loss=0.312][A
  7%|▋         | 8/110 [00:12<02:36,  1.53s/it, loss=0.312][A



AUC = 0.9201, F1 score @0.5 = 0.6875



  0%|          | 0/110 [00:01<?, ?it/s, loss=0.245][A
  1%|          | 1/110 [00:01<02:48,  1.54s/it, loss=0.245][A
  1%|          | 1/110 [00:03<02:48,  1.54s/it, loss=0.354][A
  2%|▏         | 2/110 [00:03<02:45,  1.53s/it, loss=0.354][A
  2%|▏         | 2/110 [00:04<02:45,  1.53s/it, loss=0.243][A
  3%|▎         | 3/110 [00:04<02:43,  1.53s/it, loss=0.243][A
  3%|▎         | 3/110 [00:06<02:43,  1.53s/it, loss=0.359][A
  4%|▎         | 4/110 [00:06<02:41,  1.52s/it, loss=0.359][A
  4%|▎         | 4/110 [00:07<02:41,  1.52s/it, loss=0.228][A
  5%|▍         | 5/110 [00:07<02:39,  1.52s/it, loss=0.228][A
  5%|▍         | 5/110 [00:09<02:39,  1.52s/it, loss=0.169][A
  5%|▌         | 6/110 [00:09<02:39,  1.53s/it, loss=0.169][A
  5%|▌         | 6/110 [00:10<02:39,  1.53s/it, loss=0.346][A
  6%|▋         | 7/110 [00:10<02:36,  1.52s/it, loss=0.346][A
  6%|▋         | 7/110 [00:12<02:36,  1.52s/it, loss=0.214][A
  7%|▋         | 8/110 [00:12<02:34,  1.52s/it, loss=0.214][A


AUC = 0.9391, F1 score @0.5 = 0.5025



  0%|          | 0/110 [00:01<?, ?it/s, loss=0.155][A
  1%|          | 1/110 [00:01<02:49,  1.55s/it, loss=0.155][A
  1%|          | 1/110 [00:03<02:49,  1.55s/it, loss=0.0673][A
  2%|▏         | 2/110 [00:03<02:46,  1.54s/it, loss=0.0673][A
  2%|▏         | 2/110 [00:04<02:46,  1.54s/it, loss=0.0212][A
  3%|▎         | 3/110 [00:04<02:44,  1.53s/it, loss=0.0212][A
  3%|▎         | 3/110 [00:06<02:44,  1.53s/it, loss=0.155] [A
  4%|▎         | 4/110 [00:06<02:42,  1.53s/it, loss=0.155][A
  4%|▎         | 4/110 [00:07<02:42,  1.53s/it, loss=0.157][A
  5%|▍         | 5/110 [00:07<02:39,  1.52s/it, loss=0.157][A
  5%|▍         | 5/110 [00:09<02:39,  1.52s/it, loss=0.0589][A
  5%|▌         | 6/110 [00:09<02:39,  1.53s/it, loss=0.0589][A
  5%|▌         | 6/110 [00:10<02:39,  1.53s/it, loss=0.203] [A
  6%|▋         | 7/110 [00:10<02:37,  1.53s/it, loss=0.203][A
  6%|▋         | 7/110 [00:12<02:37,  1.53s/it, loss=0.285][A
  7%|▋         | 8/110 [00:12<02:35,  1.53s/it, loss=0


AUC = 0.9439, F1 score @0.5 = 0.5946





In [16]:
torch.save(model_bert.state_dict(),os.path.join(path, f'phobert_model.bin'))

In [17]:
test_df = test[['id','post_message']].fillna('none')
test_df.post_message = test_df.post_message.progress_apply(lambda x: ' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(x)])) 
X_test = convert_lines(test_df, vocab, bpe,kmax_sequence_length)

100%|██████████| 1646/1646 [00:11<00:00, 148.59it/s]
100%|██████████| 1646/1646 [00:02<00:00, 595.10it/s]


In [18]:
preds_en = []
for fold in range(5):
    print(f"Predicting for fold {fold}")
    preds_fold = []
    model_bert.load_state_dict(torch.load(os.path.join(path, f'phobert_model.bin')))
    test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test,dtype=torch.long))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=kbatch_size, shuffle=False)
    model_bert.eval()
    pbar = tqdm(enumerate(test_loader),total=len(test_loader),leave=False)
    for i, (x_batch,) in pbar:
        y_pred = model_bert(x_batch.cuda(), attention_mask=(x_batch>0).cuda())
        y_pred = y_pred.view(-1).detach().cpu().numpy()
        preds_fold = np.concatenate([preds_fold, y_pred])
    preds_fold = sigmoid(preds_fold)
    preds_en.append(preds_fold)
preds_en = np.mean(preds_en,axis=0)
#test_df["label"] = (preds_en > 0.5).astype(np.int)

Predicting for fold 0




Predicting for fold 1




Predicting for fold 2




Predicting for fold 3




Predicting for fold 4




In [19]:
x = pd.DataFrame({'test_id':test_df['id'],'label probability':preds_en})

In [20]:
x.to_csv('results.csv',index=False,header=False)