<a href="https://colab.research.google.com/github/haeggee/error-detection-mt/blob/main/siamese_ja_zh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Siamese BERT (two monolingual encoders) for sentence pair classification in English-Japanese and English-Chinese

#### Sections

1. [Installation of libraries and imports](#section01)

2. [Classes and functions](#section02)

3. [English-Japanese](#section03)

4. [English-Chinese](#section04)



## Installation of libraries and imports

In [None]:
# Version-pinned Hugging Face libraries used throughout this notebook.
# pickle5 is imported below as `pickle` and used to read the dataset pickles.
!pip install datasets==1.0.1 -q
!pip install transformers==3.1.0 -q
!pip install pickle5 -q

In [None]:
# Presumably required by the tokenizer of "cl-tohoku/bert-base-japanese" — TODO confirm.
# NOTE(review): unlike the installs above, these two packages are not version-pinned.
!pip install fugashi -q 
!pip install ipadic -q

In [None]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
import pickle5 as pickle
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
os.environ["TOKENIZERS_PARALLELISM"] = "false"

PyTorch version 1.9.0+cu102 available.
TensorFlow version 2.5.0 available.


In [None]:
# Check that we are using 100% of GPU memory footprint support libraries/code
# from https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip -q install gputil
!pip -q install psutil
!pip -q install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 12.5 GB  | Proc size: 597.6 MB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total 16280MB



In case GPU utilisation (Util) is not at 0%, you can uncomment and run the following line to kill all processes to get the full GPU afterwards. Make sure to comment out the line again to not constantly crash the notebook on purpose.

In [None]:
#!kill -9 -1

In [None]:
#!unzip dataset

## Classes and functions

In [None]:
def dataset_splitting(lang_pair):
    """
    Load the WMT21 pickles and build train/validation/test frames for one language pair.

    Args:
        lang_pair: value of the 'language_pair' column to keep (e.g. 'en-ja').

    Returns:
        (df_train, df_val, df_test): the training pickle filtered to `lang_pair` and split
        95% / 5% (fixed random_state=42), and the dev pickle filtered to `lang_pair`, which
        serves as the test set. All three frames have a fresh 0..n-1 index.
    """
    filename_train = "dataset/wmt21_multi_train.pkl"
    filename_dev = "dataset/wmt21_multi_dev.pkl"
    # NOTE(security): pickle.load executes arbitrary code; only use with trusted dataset files.
    # `with` guarantees the file handles are closed (they were previously leaked).
    with open(filename_train, 'rb') as f_train:
        dataset_train = pickle.load(f_train)
    with open(filename_dev, 'rb') as f_dev:
        dataset_dev = pickle.load(f_dev)

    dataset_train = dataset_train[dataset_train['language_pair'] == lang_pair]
    dataset_dev = dataset_dev[dataset_dev['language_pair'] == lang_pair]
    # Split the original training data to obtain a validation set
    df_train, df_val = train_test_split(dataset_train, test_size=0.05, random_state=42)
    df_test = dataset_dev

    # CustomDataset is indexed by position, so give every frame a clean 0..n-1 index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    print("Training set: ", df_train.shape)
    print("Validation set: ", df_val.shape)
    print("Test set:", df_test.shape)
    return df_train, df_val, df_test

In [None]:
class CustomDataset(Dataset):
    """
    Dataset of (source sentence, machine translation) pairs, each tokenized by its own tokenizer.

    Rows are read from the 'src' and 'mt' columns of `data`; when `with_labels` is True the
    'critical' column provides the integer label.
    """

    def __init__(self, data, maxlen, bert_model_src='bert-base-uncased', bert_model_mt='cl-tohoku/bert-base-japanese', with_labels=True):
        """
        Args:
            data: pandas DataFrame with 'src' and 'mt' columns (and 'critical' if labelled).
            maxlen: length every sentence is padded / truncated to.
            bert_model_src: pretrained model name whose tokenizer encodes 'src'.
            bert_model_mt: pretrained model name whose tokenizer encodes 'mt'.
            with_labels: whether __getitem__ also returns the label.
        """
        self.data = data  # pandas dataframe
        # One tokenizer per side of the pair
        self.tokenizer_src = AutoTokenizer.from_pretrained(bert_model_src)
        self.tokenizer_mt = AutoTokenizer.from_pretrained(bert_model_mt)
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def _encode(self, tokenizer, sentence):
        """Tokenize one sentence, padded/truncated to `self.maxlen`, as torch tensors of shape (1, maxlen)."""
        return tokenizer(sentence,
                         padding='max_length',  # Pad to max_length
                         truncation=True,  # Truncate to max_length
                         max_length=self.maxlen,
                         return_tensors='pt')  # Return torch.Tensor objects

    def __getitem__(self, index):
        """
        Return ([src, mt] token ids, [src, mt] attention masks, [src, mt] token type ids[, label])
        for the row at *position* `index`.
        """
        # Positional lookup (.iloc): a DataLoader hands out positions 0..len-1, so this also
        # works for frames whose index was not reset (label-based .loc raised KeyError there).
        sent_src = str(self.data['src'].iloc[index])
        sent_mt = str(self.data['mt'].iloc[index])

        encoded_src = self._encode(self.tokenizer_src, sent_src)
        encoded_mt = self._encode(self.tokenizer_mt, sent_mt)

        # squeeze(0) drops the batch dimension added by return_tensors='pt'
        token_ids = [encoded_src['input_ids'].squeeze(0), encoded_mt['input_ids'].squeeze(0)]
        # binary tensors with "0" for padded positions and "1" elsewhere
        attn_masks = [encoded_src['attention_mask'].squeeze(0), encoded_mt['attention_mask'].squeeze(0)]
        # NOTE: 'token_type_ids' is indexed directly, so both tokenizers must return it
        token_type_ids = [encoded_src['token_type_ids'].squeeze(0), encoded_mt['token_type_ids'].squeeze(0)]

        if self.with_labels:  # True if the dataset has labels
            label = int(self.data['critical'].iloc[index])
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

In [None]:
class SentencePairClassifier(nn.Module):
    """
    Two-tower classifier: one pretrained encoder for the source sentence and one for the
    translation. The two pooled outputs are concatenated and fed to a single linear layer
    that emits one logit per pair.
    """

    def __init__(self, bert_model_src='bert-base-uncased', bert_model_mt='cl-tohoku/bert-base-japanese',freeze_bert=False):
        """
        Args:
            bert_model_src: pretrained model name for the source-side encoder.
            bert_model_mt: pretrained model name for the translation-side encoder.
            freeze_bert: if True, encoder weights are frozen and only the classifier is trained.
        """
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model objects
        self.bert_layer_src = AutoModel.from_pretrained(bert_model_src)
        self.bert_layer_mt = AutoModel.from_pretrained(bert_model_mt)

        # Read the encoder output size from each model's config instead of a hard-coded
        # name -> size table (an unknown name previously ended in a NameError below).
        hidden_size_src = self.bert_layer_src.config.hidden_size
        hidden_size_mt = self.bert_layer_mt.config.hidden_size

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer_src.parameters():
                p.requires_grad = False
            for p in self.bert_layer_mt.parameters():
                p.requires_grad = False

        # Classification layer
        self.dropout_src = nn.Dropout(p=0.2)
        self.dropout_mt = nn.Dropout(p=0.2)
        self.clf = nn.Linear(hidden_size_src+hidden_size_mt, 1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs (each a pair [src, mt]):
            -input_ids : Tensors containing token ids
            -attn_masks : Tensors containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensors containing token type ids
        Returns:
            Tensor of logits with one column (one logit per sentence pair).
        '''

        # Index [1] selects the pooled output and works both when the encoder returns a plain
        # tuple and when it returns a ModelOutput (where tuple-unpacking would yield the keys).
        pooler_output_src = self.bert_layer_src(input_ids=input_ids[0],
                                                attention_mask=attn_masks[0],
                                                token_type_ids=token_type_ids[0])[1]
        pooler_output_mt = self.bert_layer_mt(input_ids=input_ids[1],
                                              attention_mask=attn_masks[1],
                                              token_type_ids=token_type_ids[1])[1]

        # The pooled output is the last-layer hidden state of the [CLS] token further processed
        # by a Linear layer and a Tanh activation (weights come from the next sentence
        # prediction objective during BERT pre-training).
        pooled = torch.cat((self.dropout_src(pooler_output_src), self.dropout_mt(pooler_output_mt)), 1)
        logits = self.clf(pooled)
        return logits

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU + all GPUs) and force deterministic cuDNN so runs are repeatable."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN auto-tuning for determinism
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    

def evaluate_loss(net, device, criterion, dataloader):
    """
    Run `net` in eval mode over `dataloader` without gradients and return the average of the
    per-batch `criterion` values (each batch counts once, whatever its size).
    """
    net.eval()

    loss_total = 0
    n_batches = 0

    def _pair_to_device(pair):
        # Each input is a [src, mt] pair of tensors
        return [pair[0].to(device), pair[1].to(device)]

    with torch.no_grad():
        for batch in tqdm(dataloader):
            seq, attn_masks, token_type_ids, labels = batch
            labels = labels.to(device)
            logits = net(_pair_to_device(seq),
                         _pair_to_device(attn_masks),
                         _pair_to_device(token_type_ids))
            batch_loss = criterion(logits.squeeze(-1), labels.float())
            loss_total += batch_loss.item()
            n_batches += 1

    return loss_total / n_batches

In [None]:
# train_bert saves the best checkpoint under "models/", so make sure the folder exists.
print("Creation of the models' folder...")
!mkdir -p models

Creation of the models' folder...


In [None]:
def train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate, device=None, model_name=None):
    """
    Train `net` with mixed precision and gradient accumulation, validate after every epoch and
    save the weights of the epoch with the lowest validation loss under "models/".

    Args:
        net: model called as net(seq, attn_masks, token_type_ids), each a [src, mt] pair.
        criterion: loss applied to (logits.squeeze(-1), labels.float()).
        opti: optimizer.
        lr: learning rate; only used in the checkpoint file name.
        lr_scheduler: scheduler stepped once per optimizer step.
        train_loader / val_loader: loaders yielding (seq, attn_masks, token_type_ids, labels).
        epochs: number of passes over `train_loader`.
        iters_to_accumulate: number of batches whose gradients are summed per optimizer step.
        device: device the batches are moved to; defaults to the device of `net`'s parameters.
        model_name: checkpoint file-name prefix; defaults to the notebook-level `model_name`
            variable (or "model" if that is not defined).

    Note:
        `net` is left with its last-epoch weights; only the saved file holds the best epoch.
    """
    if device is None:
        device = next(net.parameters()).device
    if model_name is None:
        model_name = globals().get("model_name", "model")

    best_loss = np.inf
    best_ep = 1
    best_state = None  # state_dict snapshot of the best epoch so far
    nb_iterations = len(train_loader)
    # Print the training loss ~5 times per epoch; at least every batch for tiny loaders
    # (nb_iterations < 5 previously gave print_every == 0 and a ZeroDivisionError).
    print_every = max(1, nb_iterations // 5)

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        # NOTE(review): when nb_iterations is not a multiple of iters_to_accumulate, the
        # gradients of the trailing batches are not stepped at epoch end and stay accumulated
        # into the first optimizer step of the next epoch.
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            # Move the batch to the training device
            labels = labels.to(device)
            seq = [seq[0].to(device), seq[1].to(device)]
            attn_masks = [attn_masks[0].to(device), attn_masks[1].to(device)]
            token_type_ids = [token_type_ids[0].to(device), token_type_ids[1].to(device)]

            # Enables autocasting for the forward pass (model + loss)
            with autocast():
                logits = net(seq, attn_masks, token_type_ids)
                batch_loss = criterion(logits.squeeze(-1), labels.float())
                # Scale down so that the accumulated gradient is an average over the micro-batches
                loss = batch_loss / iters_to_accumulate

            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            if (it + 1) % iters_to_accumulate == 0:
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, opti.step() is then called,
                # otherwise, opti.step() is skipped.
                scaler.step(opti)
                # Updates the scale for next iteration.
                scaler.update()
                # Adjust the learning rate based on the number of iterations.
                lr_scheduler.step()
                # Clear gradients
                opti.zero_grad()

            # Log the un-scaled batch loss so it is comparable with the validation loss
            # (the accumulated-scaled value under-reported it by a factor iters_to_accumulate).
            running_loss += batch_loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            # Snapshot only the weights rather than deep-copying the whole module
            best_state = copy.deepcopy(net.state_dict())
            best_loss = val_loss
            best_ep = ep + 1

        torch.cuda.empty_cache()

    if best_state is None:
        # No epoch improved on inf (epochs == 0 or NaN validation loss): save the current weights
        print("No epoch improved the validation loss; saving the current weights instead.")
        best_state = net.state_dict()

    # Saving the model
    path_to_model='models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(model_name, lr, round(best_loss, 5), best_ep)
    torch.save(best_state, path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    torch.cuda.empty_cache()

In [None]:
# test_prediction writes its probabilities under "results/", so make sure the folder exists.
print("Creation of the results' folder...")
!mkdir -p results

Creation of the results' folder...


In [None]:
def get_probs_from_logits(logits):
    """
    Turn a tensor of logits into a NumPy array of sigmoid probabilities.

    A trailing singleton dimension is appended first, so an input of shape (N,) yields (N, 1).
    """
    expanded = logits.unsqueeze(-1)
    probabilities = torch.sigmoid(expanded)
    # Leave the autograd graph and the GPU before converting to NumPy
    return probabilities.detach().cpu().numpy()

def test_prediction(net, device, dataloader, with_labels=True, result_file="results/output.txt"):
    """
    Predict one probability per example of `dataloader` and write them to `result_file`, one per line.

    Args:
        net: model called as net(seq, attn_masks, token_type_ids), each a [src, mt] pair.
        device: device the batches are moved to.
        dataloader: yields (seq, attn_masks, token_type_ids, labels) when `with_labels`
            is True, otherwise (seq, attn_masks, token_type_ids). Labels are ignored.
        with_labels: whether the batches carry a trailing label element.
        result_file: path of the text file to (over)write.
    """
    net.eval()
    probs_all = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            if with_labels:
                seq, attn_masks, token_type_ids, _ = batch
            else:
                seq, attn_masks, token_type_ids = batch
            seq = [seq[0].to(device), seq[1].to(device)]
            attn_masks = [attn_masks[0].to(device), attn_masks[1].to(device)]
            token_type_ids = [token_type_ids[0].to(device), token_type_ids[1].to(device)]
            logits = net(seq, attn_masks, token_type_ids)
            probs = get_probs_from_logits(logits.squeeze(-1)).squeeze(-1)
            probs_all += probs.tolist()

    # Open the file only once every prediction succeeded, and let `with` close it
    # (a failed run no longer leaves a truncated file or an open handle behind).
    with open(result_file, 'w') as w:
        w.writelines(str(prob)+'\n' for prob in probs_all)

In [None]:
def evaluate_pred_test(labels_test, preds_test):
    """
    Print classification metrics for binary predictions.

    Accuracy is always printed. The confusion-matrix counts, F1, precision, recall and the
    chance-adjusted balanced accuracy are printed only when the confusion matrix is 2x2,
    i.e. both classes occur in the labels/predictions.

    Args:
        labels_test: true binary labels.
        preds_test: predicted binary labels, aligned with `labels_test`.
    """
    cnf = confusion_matrix(labels_test, preds_test)
    print("-----Evaluation-----")
    if cnf.shape == (2,2):
        print("TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}".format(tp=cnf[1][1], tn=cnf[0][0], fp=cnf[0][1], fn=cnf[1][0]))
        # Metrics are computed only where they are reported
        print("F1: ", f1_score(labels_test, preds_test))
        print("Precision: ", precision_score(labels_test, preds_test))
        print("Recall: ", recall_score(labels_test, preds_test))
        # Previously computed but never shown
        print("Balanced accuracy (adjusted): ", balanced_accuracy_score(labels_test, preds_test, adjusted=True))
    print("Accuracy: ", accuracy_score(labels_test, preds_test))

In [None]:
def get_pred_from_prob(labels_test, probs_test):
    """
    Binarize probabilities with the threshold that maximizes F1 on the given labels.

    Args:
        labels_test: true binary labels.
        probs_test: predicted probabilities (must support `>=` and `.astype`, e.g. a pandas
            Series or NumPy array).

    Returns:
        `probs_test >= best_threshold` cast to uint8.

    Note:
        The threshold is tuned on the very labels passed in, so metrics computed on the same
        data afterwards are optimistic.
    """
    from sklearn.metrics import precision_recall_curve
    precision, recall, thresholds = precision_recall_curve(labels_test, probs_test)
    # The curve ends with an extra (precision=1, recall=0) point that has no threshold;
    # drop it so every candidate index is valid for `thresholds`.
    precision, recall = precision[:-1], recall[:-1]
    denom = precision + recall
    # F1 is defined as 0 where precision + recall == 0 (0/0 would give NaN, which argmax picks)
    fscore = np.divide(2 * precision * recall, denom,
                       out=np.zeros_like(denom, dtype=float), where=denom > 0)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    threshold = thresholds[ix]
    preds_test = (probs_test>=threshold).astype('uint8')
    return preds_test

# For English-Japanese

## Loading the dataset

In [None]:
# English-Japanese pairs: train/validation come from the training pickle, test from the dev pickle.
df_train, df_val, df_test = dataset_splitting('en-ja')

Training set:  (7274, 7)
Validation set:  (383, 7)
Test set: (999, 7)


In [None]:
df_train.head()

Unnamed: 0,id,src,mt,list_scores,avg_scores,critical,language_pair
0,4103,All I did to Knives and Pens was change a capi...,Knives and Pens に し た の は 、 大 文字 を 小 文字 に 変え る...,"[0, 1, 0]",NOT,0,en-ja
1,613,"Quick, tell someone with real power to block m...",急 い で 、 本物 の 力 の あ る 人 に これ から 私 を ブロック する よう ...,"[0, 0, 0]",NOT,0,en-ja
2,6751,"Hey asshole, she's 18 years old. The briefing ...",ねえ お前 、 彼女 は 18 歳 だ ブリーフィング で 述べ た 。,"[0, 0, 0]",NOT,0,en-ja
3,6067,Anyone familiar at all with this article (by M...,この 記事 に 全く 慣れ て い る 人 ( マンゴー に よ る ) - アタチュルク ...,"[0, 0, 0]",NOT,0,en-ja
4,7436,"how it is, bad enough that I can't simply remo...",メロディック ・ デス ・ メタル の 部分 を 削除 する こと は でき ま せ ん 。,"[0, 0, 0]",NOT,0,en-ja


## Parameters

In [None]:
model_name = "japanese"  # prefix of the checkpoint file name written by train_bert
bert_model_src = "bert-base-uncased"  # English BERT model (encodes 'src')
bert_model_mt = "cl-tohoku/bert-base-japanese"  # Japanese BERT model (encodes 'mt')
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
maxlen = 128  # sentences are padded/truncated to this many tokens; presumably ~75% of sentences are shorter — TODO confirm
bs = 12  # batch size
iters_to_accumulate = 2  # gradients are accumulated over this many batches, i.e. an effective batch size of bs * iters_to_accumulate; set to 1 for the usual batch size
lr = 1e-5  # learning rate
epochs = 5  # number of training epochs

In [None]:
# increase weight for pos label for data imbalance
pos_weight = ((df_train['critical'] == 0).sum() / (df_train['critical'] == 1).sum())
pos_weight = torch.Tensor([pos_weight.item()])
print(pos_weight)

tensor([9.6657])


## Training and validation

Link for the AdamW optimizer and the learning rate scheduler :
https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [None]:
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model_src, bert_model_mt)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model_src, bert_model_mt)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model_src, bert_model_mt, freeze_bert=freeze_bert)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

Reading training data...
Reading validation data...


  cpuset_checked))
 20%|█▉        | 121/607 [00:26<01:44,  4.66it/s]


Iteration 121/607 of epoch 1 complete. Loss : 0.5924404476546059 


 40%|████      | 243/607 [00:54<01:19,  4.60it/s]


Iteration 242/607 of epoch 1 complete. Loss : 0.61461981687664 


 60%|█████▉    | 363/607 [01:21<00:53,  4.59it/s]


Iteration 363/607 of epoch 1 complete. Loss : 0.5719453309439431 


 80%|███████▉  | 485/607 [01:48<00:26,  4.53it/s]


Iteration 484/607 of epoch 1 complete. Loss : 0.6685479408945919 


100%|█████████▉| 605/607 [02:16<00:00,  4.52it/s]


Iteration 605/607 of epoch 1 complete. Loss : 0.5604158216271519 


100%|██████████| 607/607 [02:16<00:00,  4.44it/s]
100%|██████████| 32/32 [00:02<00:00, 11.01it/s]
  0%|          | 0/607 [00:00<?, ?it/s]


Epoch 1 complete! Validation Loss : 1.2318099504336715
Best validation loss improved from inf to 1.2318099504336715



 20%|█▉        | 121/607 [00:28<01:48,  4.48it/s]


Iteration 121/607 of epoch 2 complete. Loss : 0.520083511048112 


 40%|████      | 243/607 [00:56<01:21,  4.46it/s]


Iteration 242/607 of epoch 2 complete. Loss : 0.5296328578852425 


 60%|█████▉    | 363/607 [01:24<00:55,  4.42it/s]


Iteration 363/607 of epoch 2 complete. Loss : 0.4836163844697732 


 80%|███████▉  | 485/607 [01:52<00:27,  4.42it/s]


Iteration 484/607 of epoch 2 complete. Loss : 0.5926407634719344 


100%|█████████▉| 605/607 [02:20<00:00,  4.45it/s]


Iteration 605/607 of epoch 2 complete. Loss : 0.48757112605020037 


100%|██████████| 607/607 [02:21<00:00,  4.30it/s]
100%|██████████| 32/32 [00:02<00:00, 10.97it/s]
  0%|          | 0/607 [00:00<?, ?it/s]


Epoch 2 complete! Validation Loss : 1.2364204628393054


 20%|█▉        | 121/607 [00:28<01:49,  4.42it/s]


Iteration 121/607 of epoch 3 complete. Loss : 0.4213192188296436 


 40%|████      | 243/607 [00:57<01:22,  4.41it/s]


Iteration 242/607 of epoch 3 complete. Loss : 0.4092986031997302 


 60%|█████▉    | 363/607 [01:25<00:55,  4.42it/s]


Iteration 363/607 of epoch 3 complete. Loss : 0.37838239462907647 


 80%|███████▉  | 485/607 [01:53<00:27,  4.44it/s]


Iteration 484/607 of epoch 3 complete. Loss : 0.44762896062914004 


100%|█████████▉| 605/607 [02:21<00:00,  4.42it/s]


Iteration 605/607 of epoch 3 complete. Loss : 0.36670761900253535 


100%|██████████| 607/607 [02:22<00:00,  4.27it/s]
100%|██████████| 32/32 [00:02<00:00, 10.97it/s]
  0%|          | 0/607 [00:00<?, ?it/s]


Epoch 3 complete! Validation Loss : 1.4917262040544301


 20%|█▉        | 121/607 [00:28<01:50,  4.41it/s]


Iteration 121/607 of epoch 4 complete. Loss : 0.30222080293888887 


 40%|████      | 243/607 [00:57<01:22,  4.41it/s]


Iteration 242/607 of epoch 4 complete. Loss : 0.2844176034727865 


 60%|█████▉    | 363/607 [01:25<00:55,  4.43it/s]


Iteration 363/607 of epoch 4 complete. Loss : 0.2558854351605266 


 80%|███████▉  | 485/607 [01:53<00:27,  4.42it/s]


Iteration 484/607 of epoch 4 complete. Loss : 0.3247256304237468 


100%|█████████▉| 605/607 [02:21<00:00,  4.45it/s]


Iteration 605/607 of epoch 4 complete. Loss : 0.24658464003077224 


100%|██████████| 607/607 [02:22<00:00,  4.27it/s]
100%|██████████| 32/32 [00:02<00:00, 11.01it/s]
  0%|          | 0/607 [00:00<?, ?it/s]


Epoch 4 complete! Validation Loss : 1.6381291340803728


 20%|█▉        | 121/607 [00:28<01:50,  4.41it/s]


Iteration 121/607 of epoch 5 complete. Loss : 0.19670346049853593 


 40%|████      | 243/607 [00:57<01:22,  4.39it/s]


Iteration 242/607 of epoch 5 complete. Loss : 0.18304133332095857 


 60%|█████▉    | 363/607 [01:25<00:55,  4.42it/s]


Iteration 363/607 of epoch 5 complete. Loss : 0.19062894401102026 


 80%|███████▉  | 485/607 [01:53<00:27,  4.43it/s]


Iteration 484/607 of epoch 5 complete. Loss : 0.2651115078017239 


100%|█████████▉| 605/607 [02:21<00:00,  4.46it/s]


Iteration 605/607 of epoch 5 complete. Loss : 0.20711439416920843 


100%|██████████| 607/607 [02:22<00:00,  4.27it/s]
100%|██████████| 32/32 [00:02<00:00, 10.95it/s]



Epoch 5 complete! Validation Loss : 1.6235678367083892
The model has been saved in models/japanese_lr_1e-05_val_loss_1.23181_ep_1.pt


You can download the model saved in the folder "models" by browsing the files on the left of the colab notebook

In [None]:
# If you encounter a CUDA out of memory error:
# - uncomment the kill command below, run this cell, then comment it out again
# - reduce the batch size
# - then run all cells from the beginning

# If tqdm prints every iteration on its own line, follow the first and last steps above

printm()
# !kill -9 -1

Gen RAM Free: 11.5 GB  | Proc size: 6.1 GB
GPU RAM Free: 16280MB | Used: 0MB | Util   0% | Total 16280MB


## Prediction and Evaluation

In [None]:
path_to_output_file = 'results/output_japanese.txt'

print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model_src, bert_model_mt)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

# NOTE(review): `net` holds the weights of the last training epoch. train_bert saves the
# best-validation epoch to "models/" but does not load it back into `net`.
model = net
print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set with_labels to False if you want predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

Reading test data...


  cpuset_checked))
  0%|          | 0/84 [00:00<?, ?it/s]

Predicting on test data...


100%|██████████| 84/84 [00:06<00:00, 12.31it/s]


Predictions are available in : results/output_japanese.txt





You can download the predictions saved in the folder "results" by browsing the files on the left of the colab notebook

In [None]:
path_to_output_file = 'results/output_japanese.txt'  # path to the file with prediction probabilities

labels_test = df_test['critical']  # true labels

probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities, one per line
# NOTE(review): the decision threshold is chosen to maximize F1 on the test labels themselves,
# so the metrics reported below are optimistic.
preds_test = get_pred_from_prob(labels_test, probs_test)

Best Threshold=0.433105, F-Score=0.331


Link for the threshold choice problem : https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/

In [None]:
evaluate_pred_test(labels_test, preds_test)

-----Evaluation-----
TP: 47, TN: 762, FP: 141, FN: 49
F1:  0.33098591549295775
Precision:  0.25
Recall:  0.4895833333333333
Accuracy:  0.8098098098098098


# For English-Chinese

## Loading the dataset

In [None]:
# English-Chinese pairs. NOTE: this rebinds df_train/df_val/df_test, replacing the
# English-Japanese frames loaded earlier in the notebook.
df_train, df_val, df_test = dataset_splitting('en-zh')
df_train.head()

Training set:  (6515, 7)
Validation set:  (343, 7)
Test set: (999, 7)


Unnamed: 0,id,src,mt,list_scores,avg_scores,critical,language_pair
0,4082,Tylototriton or Tylotriton? I'm confused - is ...,"泰 洛特 里顿 还是 泰 洛特 里顿 ? 我 很 困惑 , 是 泰 洛特 里顿 还是 泰 洛...","[1, 0, 1]",ERR,1,en-zh
1,5608,"In any case, the articles be written such that...","无论如何 , 条款 都 应 写成 这样 , 使 每个 军队 的 使用 条件 都 得到 充分 ...","[0, 0, 0]",NOT,0,en-zh
2,3517,"Yes, I am on the list also. Maybe you can get ...","是 的 , 我 也 在 名单 上 。 也许 你 可以 和 他 的 一些 成员 联系 , 就 ...","[3, 0, 0]",NOT,0,en-zh
3,9744,"Dennis, stop using the discussion page for you...","丹尼斯 , 不要 再用 讨论 页 来 讨论 你 对 纽曼 的 挑衅 .","[0, 0, 0]",NOT,0,en-zh
4,9989,There has been a POV template in this article ...,"在 这个 文章 里 已有近 三年 的 POV 模板 , 但 没有 在 谈话 页 上 。 也许...","[0, 0, 0]",NOT,0,en-zh


## Parameters

In [None]:
model_name = "chinese"  # prefix of the checkpoint file name written by train_bert
bert_model_src = "bert-base-uncased"  # English BERT model (encodes 'src')
bert_model_mt = "bert-base-chinese"  # Chinese BERT model (encodes 'mt')
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
maxlen = 128  # sentences are padded/truncated to this many tokens; presumably ~75% of sentences are shorter — TODO confirm
bs = 12  # batch size
iters_to_accumulate = 2  # gradients are accumulated over this many batches, i.e. an effective batch size of bs * iters_to_accumulate; set to 1 for the usual batch size
lr = 1e-5  # learning rate
epochs = 5  # number of training epochs

In [None]:
# increase weight for pos label for data imbalance
pos_weight = ((df_train['critical'] == 0).sum() / (df_train['critical'] == 1).sum())
pos_weight = torch.Tensor([pos_weight.item()])
print(pos_weight)

tensor([5.1695])


## Training and Validation

In [None]:
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model_src, bert_model_mt)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model_src, bert_model_mt)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentencePairClassifier(bert_model_src, bert_model_mt, freeze_bert=freeze_bert)
if torch.cuda.device_count() > 1:  # if multiple GPUs
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    net = nn.DataParallel(net)

net.to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps
t_total = (len(train_loader) // iters_to_accumulate) * epochs  # Necessary to take into account Gradient accumulation
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

train_bert(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

Reading training data...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…


Reading validation data...


  cpuset_checked))


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




 20%|██        | 109/543 [00:23<01:32,  4.71it/s]


Iteration 108/543 of epoch 1 complete. Loss : 0.5746423902886885 


 40%|███▉      | 217/543 [00:47<01:09,  4.70it/s]


Iteration 216/543 of epoch 1 complete. Loss : 0.5584069082030544 


 60%|█████▉    | 325/543 [01:11<00:46,  4.67it/s]


Iteration 324/543 of epoch 1 complete. Loss : 0.5473480195634894 


 80%|███████▉  | 433/543 [01:35<00:23,  4.64it/s]


Iteration 432/543 of epoch 1 complete. Loss : 0.528438818399553 


100%|█████████▉| 541/543 [01:59<00:00,  4.64it/s]


Iteration 540/543 of epoch 1 complete. Loss : 0.5363861107163959 


100%|██████████| 543/543 [01:59<00:00,  4.53it/s]
100%|██████████| 29/29 [00:02<00:00, 11.43it/s]
  0%|          | 0/543 [00:00<?, ?it/s]


Epoch 1 complete! Validation Loss : 1.0282572918924793
Best validation loss improved from inf to 1.0282572918924793



 20%|██        | 109/543 [00:24<01:35,  4.55it/s]


Iteration 108/543 of epoch 2 complete. Loss : 0.4919968429538939 


 40%|███▉      | 217/543 [00:49<01:12,  4.48it/s]


Iteration 216/543 of epoch 2 complete. Loss : 0.5086771219416901 


 60%|█████▉    | 325/543 [01:13<00:48,  4.50it/s]


Iteration 324/543 of epoch 2 complete. Loss : 0.4460193612785251 


 80%|███████▉  | 433/543 [01:38<00:24,  4.53it/s]


Iteration 432/543 of epoch 2 complete. Loss : 0.457309630320028 


100%|█████████▉| 541/543 [02:03<00:00,  4.50it/s]


Iteration 540/543 of epoch 2 complete. Loss : 0.453517562813229 


100%|██████████| 543/543 [02:03<00:00,  4.38it/s]
100%|██████████| 29/29 [00:02<00:00, 11.10it/s]
  0%|          | 0/543 [00:00<?, ?it/s]


Epoch 2 complete! Validation Loss : 1.0236627685612645
Best validation loss improved from 1.0282572918924793 to 1.0236627685612645



 20%|██        | 109/543 [00:25<01:37,  4.45it/s]


Iteration 108/543 of epoch 3 complete. Loss : 0.3994307003363415 


 40%|███▉      | 217/543 [00:50<01:12,  4.47it/s]


Iteration 216/543 of epoch 3 complete. Loss : 0.39024090787602794 


 60%|█████▉    | 325/543 [01:15<00:48,  4.47it/s]


Iteration 324/543 of epoch 3 complete. Loss : 0.3174380586930999 


 80%|███████▉  | 433/543 [01:40<00:24,  4.44it/s]


Iteration 432/543 of epoch 3 complete. Loss : 0.32665915442285715 


100%|█████████▉| 541/543 [02:05<00:00,  4.48it/s]


Iteration 540/543 of epoch 3 complete. Loss : 0.3219937146813781 


100%|██████████| 543/543 [02:05<00:00,  4.32it/s]
100%|██████████| 29/29 [00:02<00:00, 11.02it/s]
  0%|          | 0/543 [00:00<?, ?it/s]


Epoch 3 complete! Validation Loss : 1.1789402072799617


 20%|██        | 109/543 [00:25<01:37,  4.43it/s]


Iteration 108/543 of epoch 4 complete. Loss : 0.2820303007201464 


 40%|███▉      | 217/543 [00:50<01:12,  4.47it/s]


Iteration 216/543 of epoch 4 complete. Loss : 0.2638240935349906 


 60%|█████▉    | 325/543 [01:15<00:49,  4.42it/s]


Iteration 324/543 of epoch 4 complete. Loss : 0.2045385887570403 


 80%|███████▉  | 433/543 [01:40<00:24,  4.46it/s]


Iteration 432/543 of epoch 4 complete. Loss : 0.20773854251537058 


100%|█████████▉| 541/543 [02:05<00:00,  4.48it/s]


Iteration 540/543 of epoch 4 complete. Loss : 0.19794288591516238 


100%|██████████| 543/543 [02:06<00:00,  4.31it/s]
100%|██████████| 29/29 [00:02<00:00, 10.97it/s]
  0%|          | 0/543 [00:00<?, ?it/s]


Epoch 4 complete! Validation Loss : 1.4725263740482002


 20%|██        | 109/543 [00:25<01:37,  4.46it/s]


Iteration 108/543 of epoch 5 complete. Loss : 0.19969332794210426 


 40%|███▉      | 217/543 [00:50<01:13,  4.46it/s]


Iteration 216/543 of epoch 5 complete. Loss : 0.15889315024294234 


 60%|█████▉    | 325/543 [01:15<00:48,  4.48it/s]


Iteration 324/543 of epoch 5 complete. Loss : 0.14190769559462313 


 80%|███████▉  | 433/543 [01:40<00:24,  4.46it/s]


Iteration 432/543 of epoch 5 complete. Loss : 0.1582438413709126 


100%|█████████▉| 541/543 [02:05<00:00,  4.48it/s]


Iteration 540/543 of epoch 5 complete. Loss : 0.15900073067664547 


100%|██████████| 543/543 [02:05<00:00,  4.31it/s]
100%|██████████| 29/29 [00:02<00:00, 10.83it/s]



Epoch 5 complete! Validation Loss : 1.6629903331912796
The model has been saved in models/chinese_lr_1e-05_val_loss_1.02366_ep_2.pt


In [None]:
# If you encounter a CUDA out of memory error:
# - uncomment the kill command below, run this cell, then comment it out again
# - reduce the batch size
# - then run all cells from the beginning

# If tqdm prints every iteration on its own line, follow the first and last steps above

printm()
# !kill -9 -1

## Prediction and Evaluation

In [None]:
path_to_output_file = 'results/output_chinese.txt'

print("Reading test data...")
test_set = CustomDataset(df_test, maxlen, bert_model_src, bert_model_mt)
test_loader = DataLoader(test_set, batch_size=bs, num_workers=5)

# NOTE(review): `net` holds the weights of the last training epoch. train_bert saves the
# best-validation epoch to "models/" but does not load it back into `net`.
model = net
print("Predicting on test data...")
test_prediction(net=model, device=device, dataloader=test_loader, with_labels=True,  # set with_labels to False if you want predictions on a dataset without labels
                result_file=path_to_output_file)
print()
print("Predictions are available in : {}".format(path_to_output_file))

Reading test data...


  cpuset_checked))
  0%|          | 0/84 [00:00<?, ?it/s]

Predicting on test data...


100%|██████████| 84/84 [00:06<00:00, 12.34it/s]


Predictions are available in : results/output_chinese.txt





In [None]:
path_to_output_file = 'results/output_chinese.txt'  # path to the file with prediction probabilities

labels_test = df_test['critical']  # true labels

probs_test = pd.read_csv(path_to_output_file, header=None)[0]  # prediction probabilities, one per line
# NOTE(review): the decision threshold is chosen to maximize F1 on the test labels themselves,
# so the metrics reported below are optimistic.
preds_test = get_pred_from_prob(labels_test, probs_test)

Best Threshold=0.131714, F-Score=0.334


In [None]:
evaluate_pred_test(labels_test, preds_test)

-----Evaluation-----
TP: 83, TN: 585, FP: 273, FN: 58
F1:  0.33400402414486924
Precision:  0.23314606741573032
Recall:  0.5886524822695035
Accuracy:  0.6686686686686687
