# Shopee Training paraphrase-distilroberta-base-v1

In [1]:
import sys
import time
import datetime
# Wall-clock reference point; the final cell subtracts it to report total runtime.
start_time = time.time()
print(datetime.datetime.now())

2021-05-18 12:05:42.065765


In [2]:
import os
import gc
import math
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import NearestNeighbors

import torch
from torch import nn 
import torch.nn.functional as F 
from transformers import AutoTokenizer, AutoModel

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Path of the competition training CSV read by the data-setup cell.
TRAIN_CSV = '../input/shopee-product-matching/train.csv'

# Plain namespace of hyper-parameters; attributes are read as CFG.<name> throughout the notebook.
# The class body runs once at definition time, so the print(device) below is a definition-time side effect.
class CFG:
    
    # NOTE(review): in the code visible here this flag is only printed, nothing branches on it.
    compute_cv = True  # set False to train model for submission

    ### BERT
    # Backbone checkpoint directory; the commented-out lines are alternatives that were tried.
#     bert_model_name = '../input/bertmodel/paraphrase-xlm-r-multilingual-v1'
#     bert_model_name = '../input/bertmodel/distilbert-base-indonesian'
#     bert_model_name = '../input/bertmodel/roberta-base' # very bad performance
    bert_model_name = '../input/bertmodel/paraphrase-distilroberta-base-v1'
#     bert_model_name = '../input/bert-model-pretrained/bert-base-multilingual-uncased/bert-base-multilingual-uncased'

    # Tokenizer truncation length (passed as max_length in ShopeeBertModel.extract_features).
    max_length = 128

    ### ArcFace
    scale = 30      # logit scale passed to ArcMarginProduct
    margin = 0.8    # angular margin passed to ArcMarginProduct
    fc_dim = 768    # output width of the Linear layer in front of the ArcFace head
    seed = 412      # passed to seed_everything
    # Number of rows in the ArcFace weight matrix (default n_classes of ShopeeBertModel).
    # NOTE(review): the training split printed by the setup cell has fewer distinct labels than this.
    classes = 11014
    
    # groupkfold
    N_SPLITS = 5
    TEST_FOLD = 0    # fold held out for the final test score
    VALID_FOLD = 1   # fold used for per-epoch validation and threshold/knn search
    
    ### Training
    batch_size = 16
    accum_iter = 1  # 1 if use_sam = True
    epochs = 8
    # Checkpoints are only written once epoch >= min_save_epoch (see the training loop).
    min_save_epoch = epochs // 3
    use_sam = True  # SAM (Sharpness-Aware Minimization for Efficiently Improving Generalization)
    use_amp = True  # Automatic Mixed Precision
    num_workers = 2  # On Windows, set 0 or export train_fn and TitleDataset as .py files for faster training.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    
    ### NearestNeighbors
    bert_knn = 50               # neighbours retrieved per query during validation
    bert_knn_threshold = 0.4  # Cosine distance threshold
    
    ### GradualWarmupSchedulerV2 (lr_start -> lr_max -> lr_min)
    scheduler_params = {
        "lr_start": 7.5e-6,
        "lr_max": 1e-4,
        "lr_min": 2.74e-5, # 1.5e-5,
    }
    # Warm-up multiplier: learning rates are ramped from their start value to start * multiplier.
    multiplier = scheduler_params['lr_max'] / scheduler_params['lr_start']
    eta_min = scheduler_params['lr_min']  # last minimum learning rate
    freeze_epo = 0   # only used here, to derive cosine_epo
    warmup_epo = 2   # number of warm-up epochs (total_epoch of the warm-up scheduler)
    cosine_epo = epochs - freeze_epo - warmup_epo
    
    ### save_model_path
    # Checkpoint file name encodes backbone name, epoch count and effective batch layout.
    save_model_path = f"./{bert_model_name.rsplit('/', 1)[-1]}_epoch{epochs}-bs{batch_size}x{accum_iter}.pt"

cuda:0


In [4]:
def seed_everything(seed, benchmark=True):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch (CPU and
            every visible CUDA device). It is also exported as PYTHONHASHSEED.
        benchmark: Value assigned to ``torch.backends.cudnn.benchmark``.
            The default ``True`` keeps the original speed-oriented setting;
            cuDNN autotuning may pick different kernels between runs, so pass
            ``False`` when run-to-run reproducibility matters more than speed.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = bool(benchmark)

# Seed all RNGs once, before any model or DataLoader is created.
seed_everything(CFG.seed)

# Classes and Functions

In [5]:
### Dataset

class TitleDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(cleaned_title, label_tensor)`` pairs.

    Titles are cleaned once, at construction time, so ``__getitem__`` is a
    plain list lookup.

    Args:
        df: DataFrame holding the text and label columns.
        text_column: Name of the column containing the raw title strings.
        label_column: Name of the column containing the class labels.
    """

    def __init__(self, df, text_column, label_column):
        self.labels = df[label_column].values
        self.titles = [self._clean_title(title) for title in df[text_column]]

    @staticmethod
    def _clean_title(title):
        """Resolve literal backslash escapes, drop non-ASCII characters, lower-case.

        A title containing a malformed escape (for example a trailing
        backslash) makes the ``unicode_escape`` codec raise. In that case the
        text is kept as far as it was successfully processed, which mirrors
        the tolerant handling in ``get_bert_embeddings`` instead of aborting
        dataset construction.
        """
        try:
            title = title.encode('utf-8').decode("unicode_escape")
            title = title.encode('ascii', 'ignore').decode("unicode_escape")
        except UnicodeError:
            pass
        return title.lower()

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        text = self.titles[idx]
        label = torch.tensor(self.labels[idx])
        return text, label

In [6]:
### SAM Optimizer 2020/1/16
# https://github.com/davda54/sam/blob/main/sam.py

class SAM(torch.optim.Optimizer):
    """Sharpness-Aware Minimization wrapper around a base optimizer.

    One update needs two forward/backward passes: ``first_step`` moves the
    weights along the normalised gradient by ``rho``, the caller recomputes the
    gradients at that perturbed point, and ``second_step`` restores the weights
    and lets the base optimizer apply the update.

    Args:
        params: Parameters or parameter-group dicts, as for any optimizer.
        base_optimizer: Optimizer *class* (not instance); it is instantiated
            here on this wrapper's parameter groups.
        rho: Non-negative radius of the perturbation.
        **kwargs: Forwarded to ``base_optimizer`` and stored as group defaults.
    """

    def __init__(self, params, base_optimizer, rho=0.05, **kwargs):
        assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}"

        defaults = dict(rho=rho, **kwargs)
        super(SAM, self).__init__(params, defaults)

        # Build the real optimizer on the same groups, then share its group list so
        # learning-rate changes made through either object are seen by both.
        self.base_optimizer = base_optimizer(self.param_groups, **kwargs)
        self.param_groups = self.base_optimizer.param_groups

    @torch.no_grad()
    def first_step(self, zero_grad=False):
        """Perturb every parameter that has a gradient by ``rho * grad / ||grad||``.

        The applied offset is stored in ``self.state[p]["e_w"]`` so that
        ``second_step`` can undo it. Gradients are cleared when ``zero_grad`` is True.
        """
        grad_norm = self._grad_norm()
        for group in self.param_groups:
            # 1e-12 guards against division by zero when all gradients vanish.
            scale = group["rho"] / (grad_norm + 1e-12)

            for p in group["params"]:
                if p.grad is None: continue
                e_w = p.grad * scale.to(p)
                p.add_(e_w)  # climb to the local maximum "w + e(w)"
                self.state[p]["e_w"] = e_w

        if zero_grad: self.zero_grad()

    @torch.no_grad()
    def second_step(self, zero_grad=False):
        """Undo the perturbation and run the base optimizer step.

        Expects the gradients currently stored on the parameters to have been
        computed at the perturbed weights. Gradients are cleared when
        ``zero_grad`` is True.
        """
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None: continue
                p.sub_(self.state[p]["e_w"])  # get back to "w" from "w + e(w)"

        self.base_optimizer.step()  # do the actual "sharpness-aware" update

        if zero_grad: self.zero_grad()

    @torch.no_grad()
    def step(self, closure=None):
        """Single-call variant: perturb, re-evaluate via ``closure``, then update.

        Raises:
            AssertionError: If ``closure`` is not provided.
        """
        assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided"
        closure = torch.enable_grad()(closure)  # the closure should do a full forward-backward pass

        self.first_step(zero_grad=True)
        closure()
        self.second_step()

    def _grad_norm(self):
        """Return the L2 norm over all parameter gradients of all groups (as one tensor)."""
        shared_device = self.param_groups[0]["params"][0].device  # put everything on the same device, in case of model parallelism
        norm = torch.norm(
                    torch.stack([
                        p.grad.norm(p=2).to(shared_device)
                        for group in self.param_groups for p in group["params"]
                        if p.grad is not None
                    ]),
                    p=2
               )
        return norm

In [7]:
### GradualWarmupScheduler
# https://github.com/ildoonet/pytorch-gradual-warmup-lr

from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau


class GradualWarmupScheduler(_LRScheduler):
    """Linearly warm up the learning rate, then hand over to another scheduler.

    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: Must be >= 1.0. If > 1.0, the learning rate ramps from the
            base lr up to ``base lr * multiplier``. If exactly 1.0, it ramps
            from 0 up to the base lr.
        total_epoch: Number of epochs over which the target rate is reached.
        after_scheduler: Scheduler used once warm-up is over
            (e.g. ReduceLROnPlateau); optional.

    Raises:
        ValueError: If ``multiplier`` is smaller than 1.
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        self.multiplier = multiplier
        if self.multiplier < 1.:
            raise ValueError('multiplier should be greater thant or equal to 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        # Becomes True the first time get_lr() runs after warm-up has ended.
        self.finished = False
        super(GradualWarmupScheduler, self).__init__(optimizer)

    def get_lr(self):
        """Return the per-group learning rates for the current ``last_epoch``."""
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    # Re-base the follow-up scheduler on the warmed-up rates, once.
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_last_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        if self.multiplier == 1.0:
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        else:
            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        """Step variant used when ``after_scheduler`` is a ReduceLROnPlateau.

        During warm-up the learning rates are written to the optimizer directly;
        afterwards the call is forwarded to ``after_scheduler.step`` with ``metrics``.
        """
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            # NOTE(review): `epoch` was replaced by a concrete value at the top of this
            # method, so the `epoch is None` branch below can never be taken.
            if epoch is None:
                self.after_scheduler.step(metrics, None)
            else:
                self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        """Advance the schedule by one epoch.

        For a ReduceLROnPlateau follow-up the call is routed to
        ``step_ReduceLROnPlateau``. Otherwise, once warm-up has finished and a
        follow-up scheduler exists, that scheduler is stepped and its rates are
        mirrored into ``_last_lr``; before that the base-class step is used.
        """
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    self.after_scheduler.step(epoch - self.total_epoch)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)

In [8]:
### GradualWarmupSchedulerV2

class GradualWarmupSchedulerV2(GradualWarmupScheduler):
    """Warm-up scheduler whose post-warm-up rates come from ``after_scheduler.get_lr()``.

    The constructor arguments are identical to ``GradualWarmupScheduler``; only
    ``get_lr`` differs, in that it queries the follow-up scheduler through
    ``get_lr()`` rather than ``get_last_lr()``.
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        super().__init__(optimizer, multiplier, total_epoch, after_scheduler)

    def get_lr(self):
        """Return the per-group learning rates for the current ``last_epoch``."""
        still_warming_up = self.last_epoch <= self.total_epoch

        if still_warming_up:
            # Linear ramp: 0 -> base_lr when multiplier is 1, else base_lr -> base_lr * multiplier.
            if self.multiplier == 1.0:
                ramp = float(self.last_epoch) / self.total_epoch
            else:
                ramp = (self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.
            return [base_lr * ramp for base_lr in self.base_lrs]

        warmed_up_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
        if not self.after_scheduler:
            # No follow-up scheduler: hold the warmed-up rate.
            return warmed_up_lrs

        if not self.finished:
            # First call after warm-up: re-base the follow-up scheduler exactly once.
            self.after_scheduler.base_lrs = warmed_up_lrs
            self.finished = True
        return self.after_scheduler.get_lr()

In [9]:
### Train one epoch

def train_fn(model, data_loader, optimizer, scheduler, use_sam, accum_iter, epoch, device, use_amp):
    """Train ``model`` for one epoch and step the learning-rate scheduler once.

    Args:
        model: Module called as ``model(texts, labels)``; in training mode it
            must return a ``(logits, loss)`` pair.
        data_loader: Iterable yielding ``(texts, labels)`` batches.
        optimizer: A ``SAM`` instance when ``use_sam`` is True (its
            ``first_step`` / ``second_step`` are used), otherwise a regular
            optimizer.
        scheduler: Epoch-level scheduler; ``step()`` is called once at the end.
        use_sam: Use the two-pass Sharpness-Aware Minimization update.
        accum_iter: Number of batches whose gradients are accumulated before
            an optimizer step. Only honoured when ``use_sam`` is False.
        epoch: Zero-based epoch index, used for the progress-bar label only.
        device: Unused; kept so existing callers keep working.
        use_amp: Run the forward passes under ``torch.cuda.amp.autocast``.

    Returns:
        ``(model, mean_loss)`` where ``mean_loss`` is the average first-pass
        loss over the batches of this epoch.
    """
    model.train()
    # With enabled=False the scaler and autocast are pass-throughs, so a single
    # code path serves both mixed-precision and full-precision training.
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    fin_loss = 0.0
    tk = tqdm(data_loader, desc = "Training epoch: " + str(epoch+1), ncols=100)

    # True while gradients have been accumulated but not yet applied.
    pending_grads = False

    for t, (texts, labels) in enumerate(tk):
        texts = list(texts)

        if use_sam:
            # Pass 1: gradients at the current weights, then move to the perturbed point.
            with torch.cuda.amp.autocast(enabled=use_amp):
                _, loss = model(texts, labels)
            loss.mean().backward()
            optimizer.first_step(zero_grad=True)
            fin_loss += loss.item()
            # Pass 2: gradients at the perturbed weights drive the real update.
            with torch.cuda.amp.autocast(enabled=use_amp):
                _, loss_second = model(texts, labels)
            loss_second.mean().backward()
            # zero_grad=True already clears the gradients; no extra zero_grad() needed.
            optimizer.second_step(zero_grad=True)

        else:
            with torch.cuda.amp.autocast(enabled=use_amp):
                _, loss = model(texts, labels)
            scaler.scale(loss).backward()
            fin_loss += loss.item()
            pending_grads = True
            # mini-batch accumulation
            if (t + 1) % accum_iter == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                pending_grads = False

        tk.set_postfix({'loss' : '%.6f' %float(fin_loss/(t+1)), 'LR' : optimizer.param_groups[0]['lr']})

    # When the number of batches is not a multiple of accum_iter, apply the
    # leftover gradients now instead of letting them leak into the next epoch.
    if pending_grads:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    scheduler.step()
    return model, fin_loss / len(data_loader)

In [10]:
### Validation

def getMetric(col):
    """Build a row-wise F1 scorer.

    Args:
        col: Name of the row field holding the predicted ids.

    Returns:
        A function taking a row (with a ``target`` attribute and a ``col``
        item) and returning ``2 * |target ∩ pred| / (|target| + |pred|)``.
    """
    def f1score(row):
        predicted = row[col]
        actual = row.target
        n_common = np.intersect1d(actual, predicted).size
        return 2 * n_common / (len(actual) + len(predicted))
    return f1score


def get_bert_embeddings(df, column, model, chunk=32):
    """Embed every title in ``df[column]`` with ``model`` in inference mode.

    Titles get the same cleaning as in training (resolve literal backslash
    escapes, drop non-ASCII characters, lower-case). They are fed to the model
    ``chunk`` rows at a time, under autocast when ``CFG.use_amp`` is set.

    Args:
        df: DataFrame holding the titles.
        column: Name of the text column.
        model: Module that returns a ``(batch, dim)`` tensor when called with a
            list of strings in eval mode. It is switched to eval mode here.
        chunk: Number of titles per forward pass.

    Returns:
        Float tensor of shape ``(len(df), dim)`` on ``CFG.device``, rows in the
        order of ``df``. ``dim`` is taken from the model output rather than
        being hard-coded. An empty frame yields an empty ``(0, 0)`` tensor.
    """
    model.eval()

    n_rows = df.shape[0]
    raw_titles = df[column].values
    bert_embeddings = None

    # range(0, n_rows, chunk) already covers every row (the last slice is simply
    # shorter), so no extra pass over the final chunk is required.
    for i in tqdm(range(0, n_rows, chunk), desc="get_bert_embeddings", ncols=80):
        titles = []
        for title in raw_titles[i : i + chunk]:
            try:
                title = title.encode('utf-8').decode("unicode_escape")
                title = title.encode('ascii', 'ignore').decode("unicode_escape")
            except UnicodeError:
                # Malformed escape sequence: keep the title as far as it was processed.
                pass
            titles.append(title.lower())

        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=CFG.use_amp):
                model_output = model(titles)

        if bert_embeddings is None:
            # Allocate lazily so the width follows whatever the model emits.
            bert_embeddings = torch.zeros((n_rows, model_output.shape[1]), device=CFG.device)
        bert_embeddings[i : i + chunk] = model_output
        del model_output

    if bert_embeddings is None:
        bert_embeddings = torch.zeros((0, 0), device=CFG.device)

    gc.collect()
    torch.cuda.empty_cache()

    return bert_embeddings


def get_neighbors(df, embeddings, knn=50, threshold=0.0):
    """Predict matching postings for every row via cosine nearest neighbours.

    Args:
        df: DataFrame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: 2-D array, one embedding per row of ``df``.
        knn: Maximum number of neighbours retrieved per row. Values larger
            than the number of rows are capped, because ``kneighbors`` refuses
            to return more neighbours than there are fitted samples.
        threshold: A neighbour is kept only if its cosine distance is strictly
            below this value.

    Returns:
        List with one array of ``posting_id`` values per row of ``df``.
    """
    n_rows = embeddings.shape[0]
    knn = min(knn, n_rows)

    model = NearestNeighbors(n_neighbors=knn, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Pull the id column out once instead of on every iteration.
    posting_ids = df['posting_id'].values
    preds = [posting_ids[indices[k, distances[k] < threshold]] for k in range(n_rows)]

    del model, distances, indices
    gc.collect()
    return preds

In [11]:
### ArcFace
class ArcMarginProduct(nn.Module):
    """ArcFace head: additive angular margin on the target-class logit.

    Args:
        in_features: Dimension of the incoming embeddings.
        out_features: Number of classes (rows of the weight matrix).
        scale: Factor applied to the cosine logits before the loss.
        margin: Angular margin (radians) added to the target-class angle.
        easy_margin: If True, apply the margin only where ``cos(theta) > 0``;
            otherwise use the ``cos(theta) - mm`` fallback beyond ``pi - margin``.
        ls_eps: Label-smoothing factor for the one-hot target mask (0 disables).
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input, label):
        """Return ``(scaled_logits, cross_entropy_loss)`` for a batch.

        Args:
            input: ``(batch, in_features)`` embeddings.
            label: ``(batch,)`` integer class indices.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        if cosine.dtype in (torch.float16, torch.bfloat16):
            # Mixed-precision forward: do the margin arithmetic in float32.
            cosine = cosine.float()
        # Clamp before the square root: rounding can push 1 - cos^2 to or below
        # zero, which would yield NaN values or infinite gradients.
        sine = torch.sqrt((1.0 - cosine.pow(2)).clamp(min=1e-7, max=1.0))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Follow the logits' device instead of a global setting; the loss needs int64 targets.
        label = label.to(device=cosine.device, dtype=torch.long)
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        # Margin-adjusted logit for the target class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output, self.criterion(output, label)

In [12]:
### BERT

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions where ``attention_mask`` is set.

    Args:
        model_output: Sequence whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding dimension and cast to float.

    Returns:
        One pooled vector per sequence. The divisor is clamped to 1e-9, so a
        fully masked sequence yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts


class ShopeeBertModel(nn.Module):
    """Text encoder (tokenizer + transformer backbone) with an ArcFace training head.

    In training mode ``forward`` returns what the ArcFace head returns, i.e. a
    ``(logits, loss)`` pair. In eval mode it returns the pooled embeddings only.

    Note that the dropout / Linear / BatchNorm projection is applied only while
    ``self.training`` is True (see ``extract_features``), so eval-mode embeddings
    are the mean-pooled backbone outputs *without* that projection.

    Default argument values are read from ``CFG`` once, when the class is defined.
    """

    def __init__(
        self,
        n_classes = CFG.classes,
        model_name = CFG.bert_model_name,
        fc_dim = CFG.fc_dim,
        margin = CFG.margin,
        scale = CFG.scale,
        use_fc = True
    ):

        super(ShopeeBertModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name).to(CFG.device)

        # NOTE(review): assumes the backbone's hidden size is 768 — confirm when swapping checkpoints.
        in_features = 768
        self.use_fc = use_fc
        
        if use_fc:
            # p=0.0 makes the dropout layer a no-op as configured.
            self.dropout = nn.Dropout(p=0.0)
            self.classifier = nn.Linear(in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            in_features = fc_dim
            
        self.final = ArcMarginProduct(
            in_features,
            n_classes,
            scale = scale,
            margin = margin,
            easy_margin = False,
            ls_eps = 0.0
        )

    def _init_params(self):
        """Initialise the projection: Xavier-normal weights, zero bias, identity BatchNorm."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, texts, labels=torch.tensor([0])):
        """Encode ``texts``; return ArcFace ``(logits, loss)`` when training, embeddings otherwise.

        ``labels`` is only used in training mode; the default is a placeholder
        so that eval-mode calls can omit it.
        """
        features = self.extract_features(texts)
        if self.training:
            logits = self.final(features, labels.to(CFG.device))
            return logits
        else:
            return features
        
    def extract_features(self, texts):
        """Tokenize ``texts``, run the backbone and mean-pool the token embeddings.

        The dropout / Linear / BatchNorm projection is applied only when
        ``use_fc`` is set *and* the module is in training mode.
        """
        encoding = self.tokenizer(texts, padding=True, truncation=True,
                             max_length=CFG.max_length, return_tensors='pt').to(CFG.device)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        embedding = self.backbone(input_ids, attention_mask=attention_mask)
        x = mean_pooling(embedding, attention_mask)
        
        if self.use_fc and self.training:
            x = self.dropout(x)
            x = self.classifier(x)
            x = self.bn(x)
        
        return x

# Setup

In [13]:
### Create Dataloader

print("Compute CV =", CFG.compute_cv)

df = pd.read_csv(TRAIN_CSV)
# Ground truth per row: the unique posting_ids that share the row's label_group.
df['target'] = df.label_group.map(df.groupby('label_group').posting_id.agg('unique').to_dict())

# Replace the raw label_group values with consecutive integer codes.
labelencoder= LabelEncoder()
df['label_group'] = labelencoder.fit_transform(df['label_group'])

# Assign folds grouped by label_group, so all rows of a group get the same fold.
gkf = GroupKFold(n_splits=CFG.N_SPLITS)
df['fold'] = -1
for i, (train_idx, valid_idx) in enumerate(gkf.split(X=df, groups=df['label_group'])):
    df.loc[valid_idx, 'fold'] = i

# Train on every fold except TEST_FOLD and VALID_FOLD.
train_df = df[df['fold']!=CFG.TEST_FOLD].reset_index(drop=True)
train_df = train_df[train_df['fold']!=CFG.VALID_FOLD].reset_index(drop=True)
valid_df = df[df['fold']==CFG.VALID_FOLD].reset_index(drop=True)
test_df = df[df['fold']==CFG.TEST_FOLD].reset_index(drop=True)

# force label_group to be integers from 0 to (n_class - 1)
# (the encoder is re-fitted here, so it no longer maps the full-dataset labels)
train_df['label_group'] = labelencoder.fit_transform(train_df['label_group'])

print("train_df length =", len(train_df))
print("train_df classes =", len(train_df['label_group'].unique()))
print("valid_df length =", len(valid_df))
print("valid_df classes =", len(valid_df['label_group'].unique()))
print("test_df length =", len(test_df))
print("test_df classes =", len(test_df['label_group'].unique()))

# Training loader: shuffled, incomplete final batch dropped.
train_dataset = TitleDataset(train_df, 'title', 'label_group')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = CFG.batch_size,
    num_workers = CFG.num_workers,
    pin_memory = True,
    shuffle = True,
    drop_last = True
)

# NOTE(review): the validation and test loaders below are not referenced by the later
# cells shown in this notebook; evaluation embeds valid_df / test_df directly.
valid_dataset = TitleDataset(valid_df, 'title', 'label_group')
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size = CFG.batch_size,
    num_workers = CFG.num_workers,
    pin_memory = True,
    shuffle = False,
    drop_last = False
)

test_dataset = TitleDataset(test_df, 'title', 'label_group')
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = CFG.batch_size,
    num_workers = CFG.num_workers,
    pin_memory = True,
    shuffle = False,
    drop_last = False
)

Compute CV = True
train_df length = 20550
train_df classes = 6609
valid_df length = 6849
valid_df classes = 2202
test_df length = 6851
test_df classes = 2203


In [14]:
### Create Model

# Build the model with the CFG defaults and move every sub-module to the target device.
# The trailing ';' suppresses the module repr in the notebook output.
model = ShopeeBertModel()
model.to(CFG.device);

Building Model Backbone for ../input/bertmodel/paraphrase-distilroberta-base-v1 model


In [15]:
### Create Optimizer

from transformers import AdamW

# One parameter group per sub-module: the backbone starts at lr_start,
# the projection, batch-norm and ArcFace head start at twice that rate.
optimizer_grouped_parameters = [
    {'params': module.parameters(), 'lr': CFG.scheduler_params['lr_start'] * lr_factor}
    for module, lr_factor in (
        (model.backbone, 1),
        (model.classifier, 2),
        (model.bn, 2),
        (model.final, 2),
    )
]

# SAM receives the optimizer class and instantiates it itself; otherwise use AdamW directly.
optimizer = (
    SAM(optimizer_grouped_parameters, AdamW)
    if CFG.use_sam
    else AdamW(optimizer_grouped_parameters)
)

print("lr_start")
print("-" * 30)
for i, group in enumerate(optimizer.param_groups):
    print(f'Parameter Group {i} :', group["lr"])

lr_start
------------------------------
Parameter Group 0 : 7.5e-06
Parameter Group 1 : 1.5e-05
Parameter Group 2 : 1.5e-05
Parameter Group 3 : 1.5e-05


In [16]:
### Create Scheduler

# Cosine decay towards eta_min, used once warm-up is over.
# NOTE(review): T_max is cosine_epo - 2 rather than cosine_epo; in the logged run the
# learning rate reaches lr_min (2.74e-5) in the final epoch, so the offset looks tuned to
# the warm-up hand-over — re-check it when changing epochs or warmup_epo.
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=CFG.cosine_epo-2, eta_min=CFG.eta_min, last_epoch=-1)
# Linear warm-up of every group from its start rate to start rate * multiplier over
# warmup_epo epochs, then delegation to the cosine schedule above.
scheduler = GradualWarmupSchedulerV2(optimizer, multiplier=CFG.multiplier, total_epoch=CFG.warmup_epo,
                                     after_scheduler=scheduler_cosine)

# Training and Validation

In [17]:
# Echo the configured number of training epochs before the loop starts.
print("Training epochs =", CFG.epochs)

Training epochs = 8


In [18]:
# Best validation F1 seen so far; a checkpoint is written whenever it improves
# (but only from epoch CFG.min_save_epoch onwards).
max_f1_valid = 0.

# Neighbour count for validation retrieval, capped by the size of the frame that
# is actually searched (valid_df) rather than by the size of the full dataset.
valid_knn = min(CFG.bert_knn, len(valid_df))

for epoch in range(CFG.epochs):
    model, avg_loss_train = train_fn(model, train_dataloader, optimizer, scheduler,
                                     CFG.use_sam, CFG.accum_iter, epoch, CFG.device, CFG.use_amp)

    # Score the epoch: embed the validation titles and retrieve neighbours below the distance threshold.
    valid_embeddings = get_bert_embeddings(valid_df, 'title', model)
    valid_predictions = get_neighbors(valid_df, valid_embeddings.detach().cpu().numpy(),
                                      knn=valid_knn, threshold=CFG.bert_knn_threshold)

    valid_df['oof'] = valid_predictions
    valid_df['f1'] = valid_df.apply(getMetric('oof'), axis=1)
    valid_f1 = valid_df.f1.mean()
    print('Valid f1 score =', valid_f1)

    if (epoch >= CFG.min_save_epoch) and (valid_f1 > max_f1_valid):
        print(f"[{datetime.datetime.now()}] Valid f1 score improved. Saving model weights to {CFG.save_model_path}")
        max_f1_valid = valid_f1
        torch.save(model.state_dict(), CFG.save_model_path)


Training epoch: 1: 100%|█████████████| 1284/1284 [02:23<00:00,  8.92it/s, loss=30.767563, LR=7.5e-6]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.29it/s]
Training epoch: 2:   0%|                                                   | 0/1284 [00:00<?, ?it/s]

Valid f1 score = 0.27305284174922023


Training epoch: 2: 100%|████████████| 1284/1284 [02:23<00:00,  8.98it/s, loss=27.716828, LR=5.38e-5]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.44it/s]
Training epoch: 3:   0%|                                                   | 0/1284 [00:00<?, ?it/s]

Valid f1 score = 0.6724363880470948


Training epoch: 3: 100%|█████████████| 1284/1284 [02:24<00:00,  8.91it/s, loss=22.825797, LR=0.0001]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.41it/s]


Valid f1 score = 0.7540630163718208
[2021-05-18 12:13:37.804986] Valid f1 score improved. Saving model weights to ./paraphrase-distilroberta-base-v1_epoch8-bs16x1.pt


Training epoch: 4: 100%|█████████████| 1284/1284 [02:24<00:00,  8.90it/s, loss=17.794455, LR=0.0001]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.34it/s]


Valid f1 score = 0.7869485839531154
[2021-05-18 12:16:10.904809] Valid f1 score improved. Saving model weights to ./paraphrase-distilroberta-base-v1_epoch8-bs16x1.pt


Training epoch: 5: 100%|████████████| 1284/1284 [02:25<00:00,  8.84it/s, loss=13.834280, LR=8.94e-5]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.11it/s]


Valid f1 score = 0.7901976736808723
[2021-05-18 12:18:45.268527] Valid f1 score improved. Saving model weights to ./paraphrase-distilroberta-base-v1_epoch8-bs16x1.pt


Training epoch: 6: 100%|████████████| 1284/1284 [02:25<00:00,  8.80it/s, loss=10.672974, LR=6.37e-5]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.46it/s]
Training epoch: 7:   0%|                                                   | 0/1284 [00:00<?, ?it/s]

Valid f1 score = 0.7861159566215353


Training epoch: 7: 100%|██████████████| 1284/1284 [02:25<00:00,  8.80it/s, loss=8.474598, LR=3.8e-5]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.44it/s]
Training epoch: 8:   0%|                                                   | 0/1284 [00:00<?, ?it/s]

Valid f1 score = 0.7824868021294097


Training epoch: 8: 100%|█████████████| 1284/1284 [02:24<00:00,  8.87it/s, loss=7.352768, LR=2.74e-5]
get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.37it/s]


Valid f1 score = 0.7840610590570171


# Best Threshold and KNN Search

In [19]:
print("Searching best threshold...")

# Candidate cosine-distance thresholds 0.10 .. 0.49, expressed in hundredths.
search_space = np.arange(10, 50, 1)

# Evaluate the best checkpoint saved during training, not the last-epoch weights.
model.load_state_dict(torch.load(CFG.save_model_path, map_location=CFG.device))
valid_embeddings = get_bert_embeddings(valid_df, 'title', model)

# The embeddings do not change inside the loop: convert them to NumPy once.
valid_embeddings_np = valid_embeddings.detach().cpu().numpy()
# Cap the neighbour count by the size of the frame that is actually searched.
valid_knn = min(CFG.bert_knn, len(valid_df))

best_f1_valid = 0.
best_threshold = 0.

for i in search_space:
    threshold = i / 100
    valid_predictions = get_neighbors(valid_df, valid_embeddings_np,
                                      knn=valid_knn, threshold=threshold)

    valid_df['oof'] = valid_predictions
    valid_df['f1'] = valid_df.apply(getMetric('oof'), axis=1)
    valid_f1 = valid_df.f1.mean()
    print(f"threshold = {threshold} -> f1 score = {valid_f1}")

    if (valid_f1 > best_f1_valid):
        best_f1_valid = valid_f1
        best_threshold = threshold

print("Best threshold =", best_threshold)
print("Best f1 score =", best_f1_valid)
BEST_THRESHOLD = best_threshold

Searching best threshold...


get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.28it/s]


threshold = 0.1 -> f1 score = 0.5800641594697847
threshold = 0.11 -> f1 score = 0.5907177741679782
threshold = 0.12 -> f1 score = 0.6004316672997441
threshold = 0.13 -> f1 score = 0.6085586129276395
threshold = 0.14 -> f1 score = 0.6178010528808617
threshold = 0.15 -> f1 score = 0.6265293586822434
threshold = 0.16 -> f1 score = 0.6356487387283115
threshold = 0.17 -> f1 score = 0.645094375394771
threshold = 0.18 -> f1 score = 0.6556491949668578
threshold = 0.19 -> f1 score = 0.6646982625868257
threshold = 0.2 -> f1 score = 0.6725240740499075
threshold = 0.21 -> f1 score = 0.6812922081234917
threshold = 0.22 -> f1 score = 0.6905991151912595
threshold = 0.23 -> f1 score = 0.698632926827469
threshold = 0.24 -> f1 score = 0.7042046951628836
threshold = 0.25 -> f1 score = 0.710534822813064
threshold = 0.26 -> f1 score = 0.7185089639100462
threshold = 0.27 -> f1 score = 0.7257756066306198
threshold = 0.28 -> f1 score = 0.732240261122238
threshold = 0.29 -> f1 score = 0.7390960779856748
thresh

In [20]:
print("Searching best knn...")

# Candidate neighbour counts 40, 42, ..., 78.
search_space = np.arange(40, 80, 2)

# The embeddings do not change inside the loop: convert them to NumPy once.
valid_embeddings_np = valid_embeddings.detach().cpu().numpy()

best_f1_valid = 0.
# Start from the first candidate so the result is always a usable neighbour
# count, even if no candidate scores above zero.
best_knn = search_space[0]

for knn in search_space:

    valid_predictions = get_neighbors(valid_df, valid_embeddings_np,
                                      knn=knn, threshold=BEST_THRESHOLD)

    valid_df['oof'] = valid_predictions
    valid_df['f1'] = valid_df.apply(getMetric('oof'), axis=1)
    valid_f1 = valid_df.f1.mean()
    print(f"knn = {knn} -> f1 score = {valid_f1}")

    if (valid_f1 > best_f1_valid):
        best_f1_valid = valid_f1
        best_knn = knn

# Publish the result only after the loop, so BEST_KNN is always defined.
BEST_KNN = int(best_knn)

print("Best knn =", BEST_KNN)
print("Best f1 score =", best_f1_valid)

Searching best knn...
knn = 40 -> f1 score = 0.7915905013207277
knn = 42 -> f1 score = 0.7917559169411896
knn = 44 -> f1 score = 0.7919143676934218
knn = 46 -> f1 score = 0.7920602634984976
knn = 48 -> f1 score = 0.7921456361603253
knn = 50 -> f1 score = 0.7921341182907695
knn = 52 -> f1 score = 0.7921136708613354
knn = 54 -> f1 score = 0.7921310811717941
knn = 56 -> f1 score = 0.7921310811717941
knn = 58 -> f1 score = 0.7921310811717941
knn = 60 -> f1 score = 0.7921310811717941
knn = 62 -> f1 score = 0.7921310811717941
knn = 64 -> f1 score = 0.7921310811717941
knn = 66 -> f1 score = 0.7921310811717941
knn = 68 -> f1 score = 0.7921310811717941
knn = 70 -> f1 score = 0.7921310811717941
knn = 72 -> f1 score = 0.7921310811717941
knn = 74 -> f1 score = 0.7921310811717941
knn = 76 -> f1 score = 0.7921310811717941
knn = 78 -> f1 score = 0.7921310811717941
Best knn = 48
Best f1 score = 0.7921456361603253


# Find Test F1 Score

In [21]:
# Final check on the held-out test fold, using the threshold and knn selected on the
# validation fold. `model` still holds the checkpoint loaded by the threshold-search cell.
test_embeddings = get_bert_embeddings(test_df, 'title', model)
test_predictions = get_neighbors(test_df, test_embeddings.detach().cpu().numpy(),
                                      knn=BEST_KNN, threshold=BEST_THRESHOLD)

# Per-row F1 between predicted and true posting ids, averaged over the fold.
test_df['oof'] = test_predictions
test_df['f1'] = test_df.apply(getMetric('oof'), axis=1)
test_f1 = test_df.f1.mean()
print("Test f1 score =", test_f1)

get_bert_embeddings: 100%|████████████████████| 216/216 [00:05<00:00, 39.96it/s]


Test f1 score = 0.7972597830699942


In [22]:
# Total wall-clock time since start_time was recorded in the first cell.
time_elapsed = time.time() - start_time
print('Elapsed time: {:.0f} min {:.0f} sec'.format(time_elapsed // 60, time_elapsed % 60))
print(datetime.datetime.now())

Elapsed time: 24 min 19 sec
2021-05-18 12:30:01.026686


End