In [8]:
import pandas as pd
import torch

In [5]:
from transformers import AutoModel, AutoTokenizer
import os

# The Hugging Face access token is read from the HF_TOKEN environment variable so
# that no credential is stored in the notebook. When the variable is unset the
# value is None and no explicit token is passed to the hub.
# NOTE(review): the token that used to be hardcoded in this cell has been exposed
# through the notebook and should be revoked.
AUTH_TOKEN = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained('nguyenvulebinh/vi-mrc-base', use_auth_token=AUTH_TOKEN)
# Sanity check: print the token ids produced for a short Vietnamese phrase.
print(tokenizer.encode("thành phố hà nội"))



[0, 2781, 15902, 90348, 11332, 2]


In [3]:
# Load the ranking data; the preview below shows the columns
# text, queries, label and data_types.
df = pd.read_csv('train_ranking.csv')
# Bare last expression so the notebook renders the first rows.
df.head()

Unnamed: 0,text,queries,label,data_types
0,phạm văn đồng ( 1 tháng 3 năm 1906 – 29 tháng ...,Tên gọi nào được Phạm Văn Đồng sử dụng khi làm...,1,train
1,phạm văn đồng có vợ là bà phạm thị cúc và một ...,Tên gọi nào được Phạm Văn Đồng sử dụng khi làm...,0,train
2,"ông việt phương , nguyên thư_ký của thủ_tướng ...",Tên gọi nào được Phạm Văn Đồng sử dụng khi làm...,0,train
3,bình định là mảnh đất có bề dày lịch_sử với nề...,Tên gọi nào được Phạm Văn Đồng sử dụng khi làm...,0,train
4,"đầu năm 1126 , triều_kim của người nữ chân đã ...",Tên gọi nào được Phạm Văn Đồng sử dụng khi làm...,0,train


## Custom Dataset

In [4]:
from torch.utils.data import Dataset

class SiameseDataset(Dataset):
    """Pre-tokenized (query, passage, label) examples for one split of ``df``.

    Rows are selected by the ``data_types`` column: ``'train'`` when ``is_train``
    is truthy, ``'test'`` otherwise. Underscores in ``queries`` and ``text`` are
    replaced by spaces before tokenization, and both columns are encoded once,
    up front, with truncation to ``max_length`` ids each.

    NOTE(review): the two sides are truncated independently and concatenated
    later (see ``collate_fn``), so a pair can hold up to ``2 * max_length - 1``
    ids -- confirm this stays within the encoder's maximum sequence length.
    """

    def __init__(self, df, tokenizer, max_length, is_train):
        """Filter ``df`` to the requested split and tokenize its queries and passages."""
        if not is_train:
            print('is_train=',is_train)
        split_name = 'train' if is_train else 'test'
        self.df = df[df.data_types == split_name]

        self.max_length = max_length
        self.tokenizer = tokenizer
        self.content1 = self._encode(self.df.queries)
        self.content2 = self._encode(self.df.text)
        # Positional numpy array of labels, aligned with content1/content2.
        self.targets = self.df.label.values

    def _encode(self, column):
        """Return the ``input_ids`` lists for ``column`` with "_" replaced by " "."""
        cleaned = [value.replace("_", " ") for value in column]
        encoded = self.tokenizer.batch_encode_plus(cleaned, max_length=self.max_length, truncation=True)
        return encoded["input_ids"]

    def __len__(self):
        """Number of rows in the selected split."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return the ``index``-th example as a dict of tensors.

        ``ids1`` holds the full query encoding. ``ids2`` holds the passage encoding
        without its first id (presumably the leading special token, so that the
        concatenated pair does not contain it twice -- TODO confirm). ``target`` is
        the label as a float tensor.
        """
        query_ids = self.content1[index]
        passage_ids = self.content2[index][1:]
        label = self.targets[index]
        return {
            'ids1': torch.tensor(query_ids, dtype=torch.long),
            'ids2': torch.tensor(passage_ids, dtype=torch.long),
            'target': torch.tensor(label, dtype=torch.float),
        }


## Model

In [3]:
import torch.nn as nn
from transformers import AutoModel, AutoConfig

class MeanPooling(nn.Module):
    """Average hidden states along dimension 1, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` over dimension 1.

        ``attention_mask`` is broadcast over the last dimension and cast to float.
        The per-feature mask total is clamped to at least 1e-9 so that a fully
        masked row yields zeros instead of a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = torch.sum(last_hidden_state * mask, 1)
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / token_count

class PairwiseModel(nn.Module):
    """Pretrained encoder with a single-logit linear head on the first token.

    The encoder and its config are loaded with ``AutoModel`` / ``AutoConfig``
    using the notebook-level ``AUTH_TOKEN``. ``forward`` returns raw logits;
    apply a sigmoid to obtain probabilities.
    """

    def __init__(self, model_name):
        """Load the encoder named ``model_name`` and build the scoring head."""
        super(PairwiseModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)
        self.config = AutoConfig.from_pretrained(model_name, use_auth_token=AUTH_TOKEN)
        # NOTE(review): this dropout layer is defined but never applied in
        # forward(); it is kept so existing references to `self.drop` still work.
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the encoder config instead of a hard-coded 768 so the
        # class also works with encoders of a different width. For a 768-wide
        # encoder the layer shape (and therefore saved checkpoints) is unchanged.
        self.fc = nn.Linear(self.config.hidden_size, 1)

    def forward(self, ids, masks):
        """Score a batch of token-id sequences.

        Args:
            ids: token ids passed to the encoder as ``input_ids``.
            masks: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head applied to the hidden state at
            position 0 of every sequence (one logit per sequence).
        """
        out = self.model(input_ids=ids,
                         attention_mask=masks,
                         output_hidden_states=False).last_hidden_state
        # Keep only the hidden state of the first token of each sequence.
        out = out[:, 0]
        outputs = self.fc(out)
        return outputs


## Training

In [6]:
# Padding id read from the tokenizer; collate_fn uses it both to pad sequences
# and to derive the attention mask.
pad_token_id = tokenizer.pad_token_id
def collate_fn(batch, pad_id=None):
    """Merge dataset items into one right-padded batch.

    Each item's ``ids1`` and ``ids2`` are concatenated into a single sequence,
    sequences are right-padded to the longest one in the batch, and the mask
    is True wherever the id differs from the padding id.

    This version needs no numpy: the original called ``np.max`` although ``np``
    is only imported in a later cell of the notebook.

    Args:
        batch: list of dicts with 1-D long tensors ``ids1`` and ``ids2`` and a
            ``target`` tensor.
        pad_id: padding id; defaults to the notebook-level ``pad_token_id``.

    Returns:
        dict with ``ids`` (batch, max_len), boolean ``masks`` of the same shape
        and a flat ``target`` tensor with one entry per item.
    """
    if pad_id is None:
        pad_id = pad_token_id
    sequences = [torch.cat([item["ids1"], item["ids2"]]) for item in batch]
    ids = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_id)
    # The mask is derived by comparison, so a genuine token equal to pad_id
    # would also be masked out (same behaviour as before).
    masks = ids != pad_id
    targets = torch.stack([item["target"] for item in batch]).view(-1)
    return {
        "ids": ids,
        "masks": masks,
        "target": targets,
    }


In [7]:
def optimizer_scheduler(model, num_train_steps):
    """Build the AdamW optimizer and a linear warmup/decay schedule for ``model``.

    Parameters whose name contains "bias" or "LayerNorm.weight" get no weight
    decay; all others use a weight decay of 0.001. The learning rate is 3e-5
    and the first 5% of ``num_train_steps`` are warmup steps.

    NOTE(review): ``AdamW`` here is the one imported from ``transformers``,
    which that library deprecates in favour of ``torch.optim.AdamW``.

    Returns:
        ``(optimizer, scheduler)``.
    """
    no_decay_markers = ("bias", "LayerNorm.weight")
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay_markers):
            undecayed.append(param)
        else:
            decayed.append(param)

    param_groups = [
        {"params": decayed, "weight_decay": 0.001},
        {"params": undecayed, "weight_decay": 0.0},
    ]

    opt = AdamW(param_groups, lr=3e-5)
    warmup_steps = int(0.05*num_train_steps)
    sch = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_steps,
        last_epoch=-1,
    )
    return opt, sch


In [9]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import *

# Binary cross-entropy on raw logits; the model output is not passed through a sigmoid.
loss_fn = nn.BCEWithLogitsLoss()
epochs = 5
# Number of micro-batches whose gradients are summed before one optimizer update.
accumulation_steps = 8
# NOTE(review): the scaler is created but the training loop below never uses it
# (its autocast / scaler.update() lines are commented out).
scaler = torch.cuda.amp.GradScaler()
# NOTE(review): not referenced anywhere else in the visible notebook.
error_ids = None

model = PairwiseModel('nguyenvulebinh/vi-mrc-base')
# model.load_state_dict(torch.load(f"./outputs/pairwise_v2.bin"))
model.cuda()


# Both datasets read the same dataframe; the last argument selects the
# 'train' (True) or 'test' (False) rows, and 384 is the per-side max_length.
train_dataset = SiameseDataset(df, tokenizer, 384, True)
valid_dataset = SiameseDataset(df, tokenizer, 384, False)
# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=8, collate_fn=collate_fn,
                            num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
# Validation keeps dataset order and every example.
valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn,
                            num_workers=2, shuffle=False, pin_memory=True)

print('Done load dataset')


Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


is_train= False
Done load dataset


In [10]:
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np

In [11]:
# One optimizer update is made per group of `accumulation_steps` micro-batches,
# plus one for the shorter trailing group of each epoch. The scheduler length is
# therefore the rounded-up number of groups per epoch times the number of epochs,
# which matches the number of scheduler.step() calls made below.
steps_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
num_train_steps = steps_per_epoch * epochs
optimizer, scheduler = optimizer_scheduler(model, num_train_steps)

for epoch in tqdm(range(epochs)):
    # ---- training ----
    model.train()
    optimizer.zero_grad()
    bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    for step, data in bar:
        ids = data["ids"].cuda()
        masks = data["masks"].cuda()
        target = data["target"].cuda()
        preds = model(ids, masks)
        loss = loss_fn(preds.view(-1), target.view(-1))
        # Scale only what is back-propagated, so gradients summed over a full
        # group average the micro-batch losses; `loss` itself stays unscaled
        # for reporting.
        (loss / accumulation_steps).backward()
        is_group_end = (step + 1) % accumulation_steps == 0
        is_last_batch = (step + 1) == len(train_loader)
        # Also update on the last batch of the epoch. Otherwise the gradients of a
        # trailing partial group would leak into the next epoch's first update and
        # be dropped entirely after the final epoch. The trailing group is still
        # divided by `accumulation_steps`, so its update is proportionally smaller.
        if is_group_end or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
        # Show the true micro-batch loss, not the value divided for accumulation.
        bar.set_postfix(loss=loss.item())

    # ---- validation ----
    model.eval()
    with torch.no_grad():
        bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)
        targets = []
        all_preds = []
        for step, data in bar:
            ids = data["ids"].cuda()
            masks = data["masks"].cuda()
            target = data["target"].cuda()
            # Turn logits into probabilities before thresholding.
            preds = torch.sigmoid(model(ids, masks))
            all_preds.extend(preds.cpu().view(-1).numpy())
            targets.extend(target.cpu().view(-1).numpy())
        # Left as module-level arrays so later cells can compute further metrics.
        all_preds = np.array(all_preds)
        targets = np.array(targets)

        print(f"F1 {f1_score(targets, all_preds > 0.5)}")




  0%|          | 0/5 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible


	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/28685 [00:00<?, ?it/s]

In [13]:
# Recall on the validation predictions left by the last epoch of the training
# loop, thresholded at 0.5. The label used to read "F1" although the value
# printed is recall_score.
print(f"Recall {recall_score(np.array(targets), np.array(all_preds) > 0.5)}")

F1 0.7832579185520362


In [12]:
# Persist the trained weights (state dict only, not the whole module).
torch.save(model.state_dict(), f"./pairwise_v2.bin")


In [7]:
# Rebuild the architecture and restore the weights saved to ./pairwise_v2.bin.
model1 = PairwiseModel('nguyenvulebinh/vi-mrc-base')
# map_location="cpu": model1 is not moved to a GPU in this cell, so the checkpoint
# tensors are loaded on the CPU. This also lets the cell run on a machine without
# CUDA even though the weights were saved from a CUDA model.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model1.load_state_dict(torch.load("./pairwise_v2.bin", map_location="cpu"))


Some weights of the model checkpoint at nguyenvulebinh/vi-mrc-base were not used when initializing RobertaModel: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
# from transformers import *

# class MonoBERT(BertPreTrainedModel):
#     def __init__(self, config):
#         config.num_labels = 1
#         super(MonoBERT, self).__init__(config)
#         self.bert = BertForSequenceClassification(config)
#         self.init_weights()

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(input_ids, attention_mask, token_type_ids)
#         logits = outputs[0]
#         return logits


In [None]:
# import torch
# from torch.nn.functional import cross_entropy
# from transformers import AdamW

# model = MonoBERT.from_pretrained("bert-base-uncased")
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# optimizer.zero_grad()

# pos_text = "{} [SEP] {}".format(query, pos_doc) # query, pos_doc and neg_doc can be 
# neg_text = "{} [SEP] {}".format(query, neg_doc) # retrieved from the training triples

# pos_encoded = tokenizer.encode_plus(pos_text, return_tensors="pt")
# neg_encoded = tokenizer.encode_plus(neg_text, return_tensors="pt")

# pos_output = model.forward(**pos_encoded).squeeze(1)
# neg_output = model.forward(**neg_encoded).squeeze(1)

# labels = torch.zeros(1, dtype=torch.long)
# loss = cross_entropy(torch.stack((pos_output, neg_output), dim=1), labels)

# loss.backward()
# optimizer.step()


## Bert

In [None]:
# from transformers import AutoModel, AutoTokenizer
# import torch
# import pytorch_lightning as pl
# import torch_optimizer as optim
# from transformers import (BertForNextSentencePrediction, BertModel, get_linear_schedule_with_warmup)
# import torch.distributed as dist
# from torch import nn
# import pytrec_eval
# import gc


# class CrossEncoder(torch.nn.Module):
#     def __init__(self,
#                  encoder_name_or_dir,
#                  encoder_config=None,
#                  cache_dir=None):
#         super().__init__()
#         self.encoder = BertForNextSentencePrediction.from_pretrained(encoder_name_or_dir,
#                                                                      config=encoder_config,
#                                                                      cache_dir=cache_dir)

#     def forward(self, inputs, labels=None):
#         outputs = self.encoder(**inputs, labels=labels)
#         return outputs


# class BertReranker(pl.LightningModule):
#     def __init__(self,
#                  encoder_name_or_dir,
#                  encoder_config=None,
#                  cache_dir=None,
#                  optimizer="adam",
#                  lr=1e-5,
#                  warm_up_steps=1700,
#                  num_gpus=1,
#                  batch_size=64,
#                  num_epochs=2,
#                  train_set_size=532761,  # ms marco train size
#                  num_neg_per_pos=4
#                  ):
#         super().__init__()

#         self.save_hyperparameters()

#         self.encoder = CrossEncoder(encoder_name_or_dir,
#                                     encoder_config,
#                                     cache_dir)

#     def training_step(self, batch, batch_idx):
#         inputs, labels = batch
#         outputs = self.encoder(inputs, labels=labels)
#         loss = outputs.loss
#         self.log("train_loss", loss.item())
#         return loss

#     def forward(self, inputs):
#         outputs = self.encoder(inputs)
#         return outputs

#     def get_scores(self, inputs):
#         outputs = self.encoder(inputs)
#         logits = outputs.logits
#         scores = torch.softmax(logits, dim=1)[:, 1]

#         return scores

#     def configure_optimizers(self):
#         optimizer = None
#         lr = self.hparams.lr
#         if self.hparams.optimizer == 'adam':
#             optimizer = torch.optim.Adam(self.parameters(), lr=lr)

#         if self.hparams.optimizer == 'lamb':
#             optimizer = optim.Lamb(self.parameters(), lr=lr)
#         total_steps = self.hparams.num_epochs * \
#                       int(self.hparams.train_set_size / (self.hparams.batch_size * self.hparams.num_gpus))

#         # def lr_lambda(current_step):
#         #     if current_step < self.hparams.warm_up_step:
#         #         lr_scale = 0.1 * (current_step/self.hparams.warm_up_step)
#         #     else:
#         #         lr_scale = 0.1 * (0.90 ** (current_step - self.hparams.warm_up_step))
#         #         if lr_scale < self.hparams.lr:
#         #             lr_scale = self.hparams.lr
#         #     return lr_scale
#         #
#         # scheduler = torch.optim.lr_scheduler.LambdaLR(
#         #     optimizer,
#         #     lr_lambda=lr_lambda,
#         # )

#         if self.hparams.warm_up_steps == 0:
#             return optimizer

#         scheduler = get_linear_schedule_with_warmup(
#             optimizer, num_warmup_steps=self.hparams.warm_up_steps, num_training_steps=total_steps
#         )
#         schedulers = [{
#             'scheduler': scheduler,
#             'name': 'warm_up_lr',
#             'interval': 'step'
#         }]
#         optimizers = [optimizer]
#         return optimizers, schedulers