# Shopee Training BERT

In [1]:
import sys
sys.path.insert(0,'../input/shopee-competition-utils')

from sklearn.externals import joblib
from config import CFG

2021-05-17 14:49:24.801855


In [2]:
import os
import gc
import math
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import NearestNeighbors

import torch
from torch import nn 
import torch.nn.functional as F 
from transformers import AutoTokenizer, AutoModel

import warnings
warnings.filterwarnings('ignore')

# Classes and Functions

In [10]:
### Validation

def getMetric(col):
    """Build a row-wise F1 scorer for the predictions stored in column ``col``.

    Parameters
    ----------
    col : hashable
        Label used to look up the predicted collection on each row
        (``row[col]``).

    Returns
    -------
    callable
        A function taking a single row and returning
        ``2 * |target ∩ prediction| / (len(target) + len(prediction))``,
        where the ground truth is read from ``row.target``.

    Notes
    -----
    The overlap is counted with ``np.intersect1d``, which counts each common
    value once, while the denominator uses the raw lengths of both
    collections. The scorer raises ``ZeroDivisionError`` if both collections
    are empty.
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        # Number of distinct values present in both collections.
        overlap = np.intersect1d(truth, predicted).size
        combined_size = len(truth) + len(predicted)
        return 2 * overlap / combined_size
    return f1score

def get_neighbors(df, embeddings, knn=50, threshold=0.0):
    """Predict, for every row, the posting ids of its close neighbours.

    A cosine-distance ``NearestNeighbors`` index is fitted on ``embeddings``
    and queried with the same embeddings. For each row, the neighbours whose
    distance is strictly below ``threshold`` are kept and mapped to
    ``df['posting_id']`` by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``posting_id`` column. Rows are addressed positionally
        (``iloc``), so row ``k`` of ``embeddings`` must describe row ``k`` of
        ``df``.
    embeddings : array-like with ``.shape``
        One embedding per row of ``df``.
    knn : int, default 50
        Maximum number of neighbours retrieved per row. It is clamped to the
        number of rows, because ``kneighbors`` raises ``ValueError`` when asked
        for more neighbours than there are fitted samples.
    threshold : float, default 0.0
        Strict upper bound on the cosine distance of kept neighbours. With the
        default of 0.0 and the strict ``<`` comparison, a neighbour is only
        kept if its reported distance is negative.

    Returns
    -------
    list
        One array of posting ids per row of ``embeddings``; an empty list when
        ``embeddings`` has no rows.
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        # Nothing to fit or query; the per-row loop below would yield no entries.
        return []

    model = NearestNeighbors(n_neighbors=min(knn, n_samples), metric='cosine')
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    # The column lookup is loop-invariant, so do it once.
    posting_id_col = df['posting_id']

    preds = []
    for k in range(n_samples):
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k, idx]
        preds.append(posting_id_col.iloc[ids].values)

    # Release the index and the neighbour matrices before returning.
    del model, distances, indices
    gc.collect()
    return preds

# Setup

In [16]:
### Create Scheduler

# Cosine-annealing schedule; it is handed to the warm-up wrapper below as the
# scheduler to use after warm-up.
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=CFG.COSINE_EPO-2,
    eta_min=CFG.ETA_MIN,
    last_epoch=-1,
)

# GradualWarmupSchedulerV2 is defined outside the visible cells; presumably it
# scales the learning rate for CFG.warmup_epo epochs and then delegates to
# `after_scheduler` -- TODO confirm against its definition.
scheduler = GradualWarmupSchedulerV2(
    optimizer,
    multiplier=CFG.MULTIPLIER,
    total_epoch=CFG.warmup_epo,
    after_scheduler=scheduler_cosine,
)

# Training and Validation

# Best Threshold Search

In [19]:
print("Searching best threshold...")

# Candidate cut-offs in hundredths: each value is divided by 100 below,
# giving thresholds 0.10, 0.11, ..., 0.49.
search_space = np.arange(10, 50, 1)

# Restore the weights stored at CFG.save_model_path before embedding the
# validation titles.
model.load_state_dict(torch.load(CFG.save_model_path, map_location=CFG.DEVICE))
valid_embeddings = get_bert_embeddings(valid_df, 'title', model)

# The embeddings do not change between thresholds, so convert them to NumPy
# once instead of repeating detach/cpu/numpy on every loop iteration.
valid_embeddings_np = valid_embeddings.detach().cpu().numpy()

best_f1_valid = 0.
best_threshold = 0.

for i in search_space:
    threshold = i / 100
    valid_predictions = get_neighbors(valid_df, valid_embeddings_np,
                                      50, threshold=threshold)

    # NOTE: these columns are overwritten on every iteration, so after the
    # loop they hold the results for the LAST threshold tried, not the best.
    valid_df['oof'] = valid_predictions
    valid_df['f1'] = valid_df.apply(getMetric('oof'), axis=1)
    valid_f1 = valid_df.f1.mean()
    print(f"threshold = {threshold} -> f1 score = {valid_f1}")

    # Strict comparison: on ties the smaller (earlier) threshold is kept.
    if (valid_f1 > best_f1_valid):
        best_f1_valid = valid_f1
        best_threshold = threshold

print("Best threshold =", best_threshold)
print("Best f1 score =", best_f1_valid)
BEST_THRESHOLD = best_threshold

Searching best threshold...


get_bert_embeddings: 100%|████████████████████| 216/216 [00:09<00:00, 23.11it/s]


threshold = 0.1 -> f1 score = 0.6088018626286464
threshold = 0.11 -> f1 score = 0.6239264999692185
threshold = 0.12 -> f1 score = 0.6374309987550659
threshold = 0.13 -> f1 score = 0.6519164067766097
threshold = 0.14 -> f1 score = 0.6669998480146297
threshold = 0.15 -> f1 score = 0.6804091274071279
threshold = 0.16 -> f1 score = 0.6924048056064657
threshold = 0.17 -> f1 score = 0.7048821298395557
threshold = 0.18 -> f1 score = 0.7164033690657718
threshold = 0.19 -> f1 score = 0.7290526003092075
threshold = 0.2 -> f1 score = 0.7407476913322213
threshold = 0.21 -> f1 score = 0.7509635308095594
threshold = 0.22 -> f1 score = 0.7607051479873749
threshold = 0.23 -> f1 score = 0.7690491227955439
threshold = 0.24 -> f1 score = 0.7753313570585402
threshold = 0.25 -> f1 score = 0.7815356863179782
threshold = 0.26 -> f1 score = 0.7874806888416356
threshold = 0.27 -> f1 score = 0.7917585807417954
threshold = 0.28 -> f1 score = 0.7944463061121809
threshold = 0.29 -> f1 score = 0.7969448545006332
th

# Find Test F1 Score

In [21]:
# Embed the test titles and retrieve neighbours using the threshold chosen on
# the validation set.
# NOTE(review): BEST_KNN is not defined in the visible cells -- confirm it is
# assigned earlier in the notebook.
test_embeddings = get_bert_embeddings(test_df, 'title', model)
test_predictions = get_neighbors(
    test_df,
    test_embeddings.detach().cpu().numpy(),
    knn=BEST_KNN,
    threshold=BEST_THRESHOLD,
)

# Score every row against its target, then average the per-row F1 values.
test_df['oof'] = test_predictions
test_df['f1'] = test_df.apply(getMetric('oof'), axis=1)
test_f1 = test_df['f1'].mean()
print("Test f1 score =", test_f1)

get_bert_embeddings: 100%|████████████████████| 216/216 [00:09<00:00, 23.54it/s]


Test f1 score = 0.8135798639912429
