In [23]:
import os
import gc
import math
import random
import time
import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import NearestNeighbors

import torch
from torch import nn 
import torch.nn.functional as F 
from transformers import AutoTokenizer, AutoModel
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AdamW

import warnings
warnings.filterwarnings('ignore')

# Location of the competition training table read by the data-preparation cell.
TRAIN_CSV = '../input/shopee-product-matching/train.csv'

# Seed every RNG this notebook can draw from. Seeding only Python's `random`
# leaves torch unseeded, and torch is what drives DataLoader shuffling and the
# ArcFace weight initialisation, so runs would not be repeatable.
random.seed(512)
np.random.seed(512)
torch.manual_seed(512)

### Configuration

In [24]:
# Path of the pretrained checkpoint; default `model_name` of BertModel.
bert_model_name = '../input/transformers/bert-base-uncased'

# Passed to the tokenizer as `max_length` (with truncation=True) in BertModel.forward.
max_length = 128

# ArcFace head settings, forwarded to ArcMarginProduct through BertModel's defaults.
scale = 30
margin = 0.5
# Accepted by BertModel.__init__ but not read inside it (the head width is set there separately).
fc_dim = 768
# NOTE(review): not referenced by any visible cell; the import cell calls random.seed(512) instead.
seed = 412
# Default number of output classes of the ArcFace head (BertModel's `n_classes`).
classes = 11014

# Number of GroupKFold splits; the data-preparation cell trains on all folds except fold 0.
n_splits = 4 
batch_size = 16
# Number of batches whose gradients are accumulated before an optimizer step in train_fn.
accum_iter = 1
epochs = 12
# Checkpoints are only written from this (0-based) epoch index onwards.
min_save_epoch = epochs // 3
num_workers = 2
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Neighbour count and cosine-distance cut-off used for validation matching during training.
bert_knn = 50
bert_knn_threshold = 0.4
# NOTE(review): this is the `_LRScheduler` class itself, not an instance. It is passed to
# train_fn, which never calls it, so the learning rate stays at the optimizer's initial
# values in the visible cells.
scheduler = _LRScheduler

# "lr_start" seeds the optimizer's parameter groups. "lr_max" and "lr_min" only feed
# `multiplier` and `eta_min` below.
scheduler_params = {
    "lr_start": 7.5e-6,
    "lr_max": 1e-4,
    "lr_min": 2.74e-5
}
# NOTE(review): multiplier, eta_min, freeze_epo, warmup_epo and cosine_epo are not read by
# any visible cell; presumably meant for a warm-up + cosine schedule -- TODO confirm.
multiplier = scheduler_params['lr_max'] / scheduler_params['lr_start']
eta_min = scheduler_params['lr_min']
freeze_epo = 0
warmup_epo = 2
cosine_epo = epochs - freeze_epo - warmup_epo

# Checkpoint file name built from the last path component of the model name plus the run settings.
save_model_path = f"./{bert_model_name.rsplit('/', 1)[-1]}_epoch{epochs}-bs{batch_size}x{accum_iter}.pt"

### Dataset

In [25]:
class TitleDataset(torch.utils.data.Dataset):
    """Dataset of (cleaned title, label) pairs built from a DataFrame.

    Titles are cleaned once at construction time: literal escape sequences
    are decoded, non-ASCII characters are dropped and the text is lower-cased.

    Args:
        df: DataFrame holding the text and label columns.
        text_column: name of the column containing the title strings.
        label_column: name of the column containing the labels.
    """

    def __init__(self, df, text_column, label_column):
        self.labels = df[label_column].values
        self.titles = [self._clean_title(title) for title in df[text_column]]

    @staticmethod
    def _clean_title(title):
        """Decode escape sequences, strip non-ASCII characters and lower-case."""
        try:
            title = title.encode('utf-8').decode("unicode_escape")
            title = title.encode('ascii', 'ignore').decode("unicode_escape")
        except UnicodeError:
            # Malformed escapes (e.g. a trailing backslash) cannot be decoded.
            # Keep whatever was decoded so far instead of aborting the whole
            # dataset build, as get_bert_embeddings does for the same cleaning.
            pass
        return title.lower()

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        """Return the cleaned title string and its label as a tensor."""
        text = self.titles[idx]
        label = torch.tensor(self.labels[idx])
        return text, label

### Training function and metrics

In [26]:
def train_fn(model, data_loader, optimizer, scheduler, accum_iter, epoch, device):
    """Run one training epoch with gradient accumulation.

    Args:
        model: callable returning ``(output, loss)`` for ``(texts, labels)``.
        data_loader: sized iterable of ``(texts, labels)`` batches.
        optimizer: optimizer whose ``step`` / ``zero_grad`` are driven here.
        scheduler: accepted for interface compatibility; not used.
        accum_iter: number of batches accumulated per optimizer step.
        epoch: 0-based epoch index, used only for the progress-bar label.
        device: accepted for interface compatibility; not used.

    Returns:
        ``(model, mean_loss)`` where ``mean_loss`` is the average unscaled
        batch loss, or NaN when the loader yields no batches.
    """
    model.train()
    # Start from clean gradients so nothing left over from earlier work
    # contaminates the first accumulation window.
    optimizer.zero_grad()

    fin_loss = 0.0
    n_batches = len(data_loader)
    tk = tqdm(data_loader, desc = "Training epoch: " + str(epoch+1), ncols=100)

    for t, (texts, labels) in enumerate(tk):
        texts = list(texts)

        _, loss = model(texts, labels)
        fin_loss += loss.item()
        # Average over the accumulation window so the update magnitude does
        # not grow with accum_iter. The reported loss stays unscaled.
        (loss / accum_iter).backward()

        # Step on every full window and also on the last batch, so a trailing
        # partial window is applied instead of leaking into the next epoch
        # (or being dropped after the final one).
        if (t + 1) % accum_iter == 0 or (t + 1) == n_batches:
            optimizer.step()
            optimizer.zero_grad()

        tk.set_postfix({'loss' : '%.6f' %float(fin_loss/(t+1)), 'LR' : optimizer.param_groups[0]['lr']})

    if n_batches == 0:
        # Nothing was trained on; report NaN rather than dividing by zero.
        return model, float('nan')
    return model, fin_loss / n_batches

In [27]:
def getMetric(col):
    """Build a row-wise F1 scorer for the predictions stored in column `col`.

    The returned function reads the ground truth from ``row.target`` and the
    predictions from ``row[col]`` and returns
    ``2 * |intersection| / (|target| + |prediction|)``.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        # intersect1d returns the unique common elements as a 1-D array.
        overlap = len(np.intersect1d(truth, predicted))
        denominator = len(truth) + len(predicted)
        return 2 * overlap / denominator
    return f1score


def get_bert_embeddings(df, column, model, chunk=32):
    """Embed the titles in ``df[column]`` with `model`, `chunk` rows at a time.

    The model is switched to eval mode and called without gradient tracking.
    Titles get the same cleaning as at training time: escape sequences are
    decoded, non-ASCII characters dropped and the text lower-cased.

    Args:
        df: DataFrame holding the titles.
        column: name of the title column.
        model: callable mapping a list of strings to a 2-D tensor with one
            row per string; must provide ``eval()``.
        chunk: number of titles encoded per forward pass.

    Returns:
        Tensor of shape ``(len(df), embedding_dim)`` on the device of the
        model output, or an empty ``(0, 0)`` tensor when `df` has no rows.
    """
    model.eval()

    n_rows = df.shape[0]
    bert_embeddings = None
    titles, model_output = [], None

    # range() already covers the final, possibly shorter, chunk, so no extra
    # pass over the last `chunk` rows is needed.
    for i in tqdm(list(range(0, n_rows, chunk)), desc="get_bert_embeddings", ncols=80):
        titles = []
        for title in df[column].iloc[i : i + chunk].values:
            try:
                title = title.encode('utf-8').decode("unicode_escape")
                title = title.encode('ascii', 'ignore').decode("unicode_escape")
            except UnicodeError:
                # Malformed escape sequence: keep what was decoded so far.
                pass
            title = title.lower()
            titles.append(title)

        with torch.no_grad():
            model_output = model(titles)

        if bert_embeddings is None:
            # Size the buffer from the first batch so the function follows the
            # encoder's actual width and device instead of assuming 768.
            bert_embeddings = torch.zeros(
                (n_rows, model_output.shape[1]), device=model_output.device
            )
        bert_embeddings[i : i + chunk] = model_output

    # Drop local references before asking the allocator to release memory.
    del model, titles, model_output
    gc.collect()
    torch.cuda.empty_cache()

    if bert_embeddings is None:
        return torch.zeros((0, 0))
    return bert_embeddings


def get_neighbors(df, embeddings, knn=50, threshold=0.0):
    """Predict matching postings via cosine nearest neighbours.

    Args:
        df: DataFrame with a ``posting_id`` column, row-aligned with `embeddings`.
        embeddings: 2-D array with one embedding per row of `df`.
        knn: maximum number of neighbours considered per row; capped at the
            number of rows so small frames do not make the search fail.
        threshold: neighbours with cosine distance strictly below this value
            are kept.

    Returns:
        List with one array of ``posting_id`` values per row of `embeddings`
        (empty list when there are no rows).
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        return []

    # kneighbors() rejects n_neighbors > n_samples, so never ask for more
    # neighbours than there are rows.
    knn = min(knn, n_samples)

    model = NearestNeighbors(n_neighbors=knn, metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # Resolve the id column once instead of once per row.
    posting_ids = df['posting_id'].values

    preds = []
    for k in range(n_samples):
        keep = distances[k] < threshold
        preds.append(posting_ids[indices[k, keep]])

    del model, distances, indices
    gc.collect()
    return preds

In [28]:
class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head with a built-in loss.

    Args:
        in_features: width of the incoming embeddings.
        out_features: number of classes (rows of the weight matrix).
        scale: factor applied to the final logits.
        margin: angular margin (radians) added to the target-class angle.
        ls_eps: when > 0, the one-hot targets used to mix the margin and
            plain cosine logits are smoothed by this amount.
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.scale = scale
        self.margin = margin
        self.ls_eps = ls_eps
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        # and for the fallback used when theta + m would exceed pi.
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input, label):
        """Return ``(scaled_logits, cross_entropy_loss)`` for a batch.

        Args:
            input: float tensor of shape ``(batch, in_features)``.
            label: integer class indices, one per batch row.
        """
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        # Make the labels usable by both scatter_ and the loss, whatever
        # device, shape or integer dtype they arrive in.
        label = label.to(cosine.device).view(-1).long()

        # Rounding can push cos^2 marginally above 1; sqrt of a negative
        # number is NaN and its gradient at exactly 0 is infinite. A tiny
        # positive floor keeps both the value and the gradient finite.
        sine = torch.sqrt(torch.clamp(1.0 - torch.pow(cosine, 2), min=1e-12))
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # Allocate next to the logits instead of on a module-level device.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features

        # Margin logit for the target class, plain cosine everywhere else.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale
        return output, self.criterion(output, label)

### Model

In [29]:
class BertModel(nn.Module):
    """Transformer encoder with masked mean pooling and an ArcFace head.

    Args:
        n_classes: number of classes of the ArcFace head.
        model_name: name or path handed to ``from_pretrained`` for both the
            tokenizer and the encoder.
        fc_dim: accepted for interface compatibility; not used.
        margin: angular margin of the ArcFace head.
        scale: logit scale of the ArcFace head.
        use_fc: accepted for interface compatibility; not used.
    """

    def __init__(
        self,
        n_classes = classes,
        model_name = bert_model_name,
        fc_dim = fc_dim,
        margin = margin,
        scale = scale,
        use_fc = True
    ):
        super(BertModel,self).__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)

        # Take the pooled-feature width from the loaded encoder so checkpoints
        # other than the 768-wide base model get a correctly sized head.
        # 768 remains the fallback when the config does not expose it.
        in_features = getattr(self.model.config, 'hidden_size', 768)

        self.final = ArcMarginProduct(
            in_features,
            n_classes,
            scale = scale,
            margin = margin,
            ls_eps = 0.0
        )

    def forward(self, texts, labels=torch.tensor([0])):
        """Encode `texts`; return head output when training, features otherwise.

        Args:
            texts: list of raw strings, tokenized here with padding and
                truncation to `max_length`.
            labels: class indices for the batch; only read in training mode.

        Returns:
            In training mode, whatever the ArcFace head returns, i.e. the
            tuple ``(scaled_logits, loss)``. In eval mode, the mean-pooled
            sentence features.
        """
        encoding = self.tokenizer(texts, padding=True, truncation=True,
                             max_length=max_length, return_tensors='pt').to(device)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        embedding = self.model(input_ids, attention_mask=attention_mask)

        # Masked mean pooling: average the token embeddings over the positions
        # the attention mask marks as real tokens.
        token_embeddings = embedding[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # The clamp guards against division by zero for an all-padding row.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        features = sum_embeddings / sum_mask
        if self.training:
            logits = self.final(features, labels.to(device))
            return logits
        else:
            return features

In [30]:
df = pd.read_csv(TRAIN_CSV)

# Ground truth per row: every posting_id that shares the row's label_group.
postings_by_group = df.groupby('label_group').posting_id.agg('unique').to_dict()
df['target'] = df.label_group.map(postings_by_group)

# Grouped folds keep all postings of one label_group inside a single fold.
df['fold'] = -1
gkf = GroupKFold(n_splits=n_splits)
fold_splits = gkf.split(X=df, groups=df['label_group'])
for fold_id, (_, holdout_idx) in enumerate(fold_splits):
    df.loc[holdout_idx, 'fold'] = fold_id

# Replace the raw group ids by contiguous integer class indices.
labelencoder = LabelEncoder()
df['label_group'] = labelencoder.fit_transform(df['label_group'])

# Fold 0 is held out for validation; the remaining folds are used for training.
fold = 0
in_valid_fold = df['fold'] == fold
train_df = df[~in_valid_fold].reset_index(drop=True)
valid_df = df[in_valid_fold].reset_index(drop=True)
print("train_df length =", len(train_df))
print("train_df classes =", len(train_df['label_group'].unique()))
print("valid_df length =", len(valid_df))
print("valid_df classes =", len(valid_df['label_group'].unique()))

# Settings shared by both loaders.
shared_loader_args = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=True,
)

train_dataset = TitleDataset(train_df, 'title', 'label_group')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, shuffle=True, drop_last=True, **shared_loader_args
)

valid_dataset = TitleDataset(valid_df, 'title', 'label_group')
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset, shuffle=False, drop_last=False, **shared_loader_args
)

In [31]:
# Build the network and place every sub-module on the configured device.
model = BertModel().to(device)

# Two parameter groups: the pretrained encoder starts at `lr_start`, the
# freshly initialised ArcFace head at twice that rate.
encoder_group = {
    'params': model.model.parameters(),
    'lr': scheduler_params['lr_start'],
}
head_group = {
    'params': model.final.parameters(),
    'lr': scheduler_params['lr_start'] * 2,
}
optimizer_grouped_parameters = [encoder_group, head_group]
optimizer = AdamW(optimizer_grouped_parameters)

In [33]:
# Best validation F1 among the epochs that are allowed to write a checkpoint.
max_f1_valid = 0.

for epoch in range(epochs):
    # One pass over the training data.
    model, avg_loss_train = train_fn(
        model, train_dataloader, optimizer, scheduler, accum_iter, epoch, device
    )

    # Score the held-out fold: embed its titles, then match by nearest neighbours.
    valid_embeddings = get_bert_embeddings(valid_df, 'title', model)
    # NOTE(review): the size check looks at the full frame `df`, not `valid_df`.
    valid_predictions = get_neighbors(
        valid_df,
        valid_embeddings.detach().cpu().numpy(),
        knn=(bert_knn if len(df) > 3 else 3),
        threshold=bert_knn_threshold,
    )

    valid_df['oof'] = valid_predictions
    valid_df['f1'] = valid_df.apply(getMetric('oof'), axis=1)
    valid_f1 = valid_df['f1'].mean()
    print('Valid f1 score =', valid_f1)

    # Checkpoint only once the warm-up epochs are over and the score improved.
    past_min_epoch = epoch >= min_save_epoch
    if past_min_epoch and (valid_f1 > max_f1_valid):
        print(f"[{datetime.datetime.now()}] Valid f1 score improved. Saving model weights to {save_model_path}")
        max_f1_valid = valid_f1
        torch.save(model.state_dict(), save_model_path)

In [38]:
print("Searching best threshold...")

# Candidate cosine-distance thresholds 0.30 .. 0.49, expressed in hundredths.
search_space = np.arange(30, 50, 1)
# `tresholds` (sic) and `f1` are read by the plotting cell, so the names are kept.
tresholds = []
f1 = []

# Re-embed the validation fold with the best checkpoint written during training.
model.load_state_dict(torch.load(save_model_path, map_location=device))
valid_embeddings = get_bert_embeddings(valid_df, 'title', model)
# The embeddings are the same for every threshold: move them to the CPU once
# instead of repeating the device-to-host copy on each iteration.
valid_embeddings_np = valid_embeddings.detach().cpu().numpy()

best_f1_valid = 0.
best_threshold = 0.

for i in search_space:
    threshold = i / 100
    valid_predictions = get_neighbors(valid_df, valid_embeddings_np,
                                      knn=bert_knn if len(df) > 3 else 3, threshold=threshold)

    valid_df['oof'] = valid_predictions
    valid_df['f1'] = valid_df.apply(getMetric('oof'), axis=1)
    valid_f1 = valid_df.f1.mean()
    tresholds.append(threshold)
    f1.append(valid_f1)
    print(f"threshold = {threshold} -> f1 score = {valid_f1}")

    # Strict comparison: on ties the smallest threshold wins.
    if (valid_f1 > best_f1_valid):
        best_f1_valid = valid_f1
        best_threshold = threshold

print("Best threshold =", best_threshold)
print("Best f1 score =", best_f1_valid)
BEST_THRESHOLD = best_threshold

In [40]:
import plotly.graph_objects as go
import numpy as np

# Mean validation F1 for each candidate threshold tried by the search cell.
fig = go.Figure()
fig.add_trace(go.Scatter(x=tresholds, y=f1,
                    mode='lines+markers',
                    name='Threshold selection'))
# Single layout call; the axis titles let the figure be read on its own.
fig.update_layout(
    title_text='Threshold selection',
    title_x=0.5,
    xaxis_title='Cosine distance threshold',
    yaxis_title='Mean validation F1',
    legend=dict(
        yanchor="top",
        xanchor="right",
        y = 0.3,
        x = 0.99
    ),
)
fig.show()