In [1]:
import sys

sys.path.append('../input/shopee-competition-utils')
sys.path.insert(0,'../input/pytorch-image-models')

In [2]:
import numpy as np # np.set_printoptions(suppress = True)
import pandas as pd

import torch
from torch import nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

from custom_scheduler import ShopeeScheduler
from custom_activation import replace_activations, Mish
from custom_optimizer import Ranger

import math
import cv2
import timm
import os
import random
import gc

from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm

In [3]:
class CFG:
    """Notebook-wide settings, used as a plain namespace of class attributes.

    Several attributes are reassigned at runtime by later cells:
    ``LOSS_MODULE``, ``MODEL_NAME``, ``MARGIN`` and ``MODEL_PATH`` are
    overwritten when a specific ensemble member is selected, and
    ``BEST_THRESHOLD`` / ``BEST_THRESHOLD_MIN2`` are overwritten by
    ``search_voting_threshold``.
    """

    # ---- input locations -------------------------------------------------
    TRAIN_CSV = '../input/shopee-product-matching/train.csv'
    DATA_DIR = '../input/shopee-product-matching/train_images'
    MODEL_PATH_PREFIX = '../input/image-model-trained/'
    EMB_PATH_PREFIX = '../input/image-embeddings/'

    # ---- reproducibility -------------------------------------------------
    SEED = 2021

    # ---- data augmentation -----------------------------------------------
    IMG_SIZE = 512
    # NOTE(review): these look like the usual ImageNet channel statistics -- confirm.
    MEAN = [0.485, 0.456, 0.406]
    STD = [0.229, 0.224, 0.225]

    # ---- data split (fold ids produced by GroupKFold in read_dataset) ----
    N_SPLITS = 5
    TEST_FOLD = 0
    VALID_FOLD = 1

    # ---- training loop / hardware ----------------------------------------
    EPOCHS = 8
    BATCH_SIZE = 8
    NUM_WORKERS = 4
    DEVICE = 'cuda:1'

    # ---- margin-based head -----------------------------------------------
    # NOTE(review): CLASSES presumably equals the number of label groups in
    # the training folds -- verify against read_dataset() output.
    CLASSES = 6609
    SCALE = 30
    MARGINS = [0.5,0.6,0.7,0.8,0.9]
    MARGIN = 0.5
    FC_DIM = 512
    USE_ARCFACE = True

    # ---- neighbour-matching thresholds (defaults; re-tuned at runtime) ---
    BEST_THRESHOLD = 0.19
    BEST_THRESHOLD_MIN2 = 0.225

    # ---- model / loss selection ------------------------------------------
    MODEL_NAMES = ['resnet50','resnext50_32x4d','densenet121','efficientnet_b3','eca_nfnet_l0']
    MODEL_NAME = 'resnet50'
    LOSS_MODULES = ['arc','curricular']
    LOSS_MODULE = 'arc'
    USE_EMBEDDING = True
    # Checkpoint file name derived from the defaults above; later cells
    # rebuild it after changing MODEL_NAME / LOSS_MODULE / MARGIN.
    MODEL_PATH = '{}_{}_face_epoch_8_bs_8_margin_{}.pt'.format(MODEL_NAME, LOSS_MODULE, MARGIN)

    # ---- keyword arguments for the learning-rate scheduler ---------------
    SCHEDULER_PARAMS = dict(
        lr_start=1e-5,
        lr_max=1e-5 * 32,
        lr_min=1e-6,
        lr_ramp_ep=5,
        lr_sus_ep=0,
        lr_decay=0.8,
    )

In [4]:
def seed_everything(seed, cudnn_benchmark=True):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN.

    Args:
        seed: Integer seed applied to all generators.
        cudnn_benchmark: Value assigned to ``torch.backends.cudnn.benchmark``.
            The default ``True`` keeps the original behaviour (faster, because
            cuDNN auto-tunes its algorithms), but the algorithm picked by the
            auto-tuner may differ between runs. Pass ``False`` when strict
            run-to-run reproducibility matters more than speed.
    """
    random.seed(seed)
    # Only affects Python interpreters started after this point (for example
    # spawned worker processes); the current process has already fixed its
    # hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one: the notebook is
    # configured for 'cuda:1', which is not the default current device.
    # This is a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = cudnn_benchmark

# Seed all RNGs once, up front, with the notebook-wide seed from CFG.
seed_everything(CFG.SEED)

In [5]:
def read_dataset():
    """Load the training CSV and split it into train / validation / test frames.

    Steps:
      1. Read ``CFG.TRAIN_CSV``.
      2. Add a ``matches`` column: the space-joined ``posting_id`` values of
         every row sharing the row's ``label_group`` (the ground truth used
         by the metric functions).
      3. Assign each row a ``fold`` id with ``GroupKFold`` grouped on
         ``label_group``, so a label group never spans two folds.
      4. Label-encode ``label_group`` over the whole frame, then split by
         ``CFG.TEST_FOLD`` / ``CFG.VALID_FOLD``; the remaining folds are the
         training set, whose labels are re-encoded so they are contiguous
         within the training set.

    Returns:
        tuple: ``(train_df, valid_df, test_df)``, each with a fresh 0..n-1 index.
    """
    full_df = pd.read_csv(CFG.TRAIN_CSV)

    # Ground-truth match string per row, built from a group -> posting ids lookup.
    ids_by_group = full_df.groupby('label_group').posting_id.agg('unique').to_dict()
    full_df['matches'] = [
        ' '.join(group_ids) for group_ids in full_df['label_group'].map(ids_by_group)
    ]

    # Fold assignment: each row is the hold-out part of exactly one split.
    full_df['fold'] = -1
    splitter = GroupKFold(n_splits=CFG.N_SPLITS)
    fold_iter = splitter.split(X=full_df, groups=full_df['label_group'])
    for fold_no, (_, holdout_rows) in enumerate(fold_iter):
        full_df.loc[holdout_rows, 'fold'] = fold_no

    full_df['label_group'] = LabelEncoder().fit_transform(full_df['label_group'])

    fold = full_df['fold']
    in_test = fold == CFG.TEST_FOLD
    in_valid = fold == CFG.VALID_FOLD

    train_df = full_df[~in_test & ~in_valid].reset_index(drop=True)
    valid_df = full_df[in_valid].reset_index(drop=True)
    test_df = full_df[in_test].reset_index(drop=True)

    # Re-encode so training labels form a gap-free 0..K-1 range.
    train_df['label_group'] = LabelEncoder().fit_transform(train_df['label_group'])

    return train_df, valid_df, test_df

In [6]:
def precision_score(y_true, y_pred):
    """Row-wise precision between ground-truth and predicted match strings.

    Each element of both inputs is a whitespace-separated string of ids; it
    is converted to a set and precision is ``|true & pred| / |pred|``.

    Rows whose prediction is empty get a precision of ``0.0``. Previously
    they produced ``0 / 0 = NaN`` (with a NumPy warning), and because
    ``Series.mean()`` skips NaN by default those rows silently dropped out
    of the averaged metric.

    Args:
        y_true: pandas Series of space-separated ground-truth ids.
        y_pred: pandas Series of space-separated predicted ids, same length.

    Returns:
        numpy.ndarray: float array with one precision value per row.
    """
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    intersection = np.array([len(t & p) for t, p in zip(y_true, y_pred)], dtype=float)
    len_y_pred = np.array([len(p) for p in y_pred], dtype=float)
    # Divide only where the denominator is non-zero; other rows stay at 0.0.
    precision = np.zeros_like(intersection)
    np.divide(intersection, len_y_pred, out=precision, where=len_y_pred > 0)
    return precision

def recall_score(y_true, y_pred):
    """Row-wise recall between ground-truth and predicted match strings.

    Every element is a whitespace-separated string of ids. For each row the
    strings are turned into sets and recall is ``|true & pred| / |true|``.

    Args:
        y_true: pandas Series of space-separated ground-truth ids.
        y_pred: pandas Series of space-separated predicted ids, same length.

    Returns:
        numpy.ndarray: one recall value per row.
    """
    def _as_id_sets(series):
        # "id1 id2 id3" -> {"id1", "id2", "id3"}
        return series.apply(lambda text: set(text.split()))

    truth = _as_id_sets(y_true)
    guess = _as_id_sets(y_pred)

    hits = np.array([len(t & g) for t, g in zip(truth, guess)])
    truth_sizes = truth.apply(len).values
    return hits / truth_sizes

def f1_score(y_true, y_pred):
    """Row-wise F1 between ground-truth and predicted match strings.

    Every element is a whitespace-separated string of ids. For each row the
    strings are turned into sets and the score is
    ``2 * |true & pred| / (|pred| + |true|)``.

    Args:
        y_true: pandas Series of space-separated ground-truth ids.
        y_pred: pandas Series of space-separated predicted ids, same length.

    Returns:
        numpy.ndarray: one F1 value per row.
    """
    def _as_id_sets(series):
        # "id1 id2 id3" -> {"id1", "id2", "id3"}
        return series.apply(lambda text: set(text.split()))

    truth = _as_id_sets(y_true)
    guess = _as_id_sets(y_pred)

    hits = np.array([len(t & g) for t, g in zip(truth, guess)])
    combined_sizes = guess.apply(len).values + truth.apply(len).values
    return 2 * hits / combined_sizes

In [7]:
def get_voting_neighbors(df, distances, indices, threshold = 0.2, min2 = False):
    """Turn neighbour distances into predicted matches and score them.

    For row ``k`` every neighbour whose distance is below the cut-off is
    predicted as a match.

    * ``min2=False``: the cut-off is ``threshold``.
    * ``min2=True``: the cut-off is ``CFG.BEST_THRESHOLD``; additionally, when
      at most one neighbour passed it and the neighbour in column 1 is closer
      than ``threshold``, that neighbour is appended to the prediction.

    Args:
        df: DataFrame with ``posting_id`` and ``matches`` columns, whose row
            order matches ``distances`` / ``indices``. It is modified in place.
        distances: 2-D array of neighbour distances, one row per ``df`` row.
        indices: 2-D array of neighbour row positions, aligned with ``distances``.
        threshold: Distance cut-off (or the "second neighbour" cut-off when
            ``min2`` is true).
        min2: Enables the second-neighbour rule described above.

    Returns:
        The same ``df`` with ``pred_matches``, ``f1``, ``recall`` and
        ``precision`` columns written.
    """
    posting_id_col = df['posting_id']
    # In min2 mode the primary cut-off is the previously tuned global threshold.
    cutoff = CFG.BEST_THRESHOLD if min2 else threshold

    predictions = []
    for k in range(len(distances)):
        row_dist = distances[k]
        row_idx = indices[k]

        ids = row_idx[np.where(row_dist < cutoff)[0]]
        if min2 and len(ids) <= 1 and row_dist[1] < threshold:
            ids = np.append(ids, row_idx[1])

        predictions.append(' '.join(posting_id_col.iloc[ids].values))

    df['pred_matches'] = predictions
    truth, guess = df['matches'], df['pred_matches']
    df['f1'] = f1_score(truth, guess)
    df['recall'] = recall_score(truth, guess)
    df['precision'] = precision_score(truth, guess)

    return df

In [8]:
def search_voting_threshold(valid_df, distances, indices):
    """Grid-search both matching thresholds on the validation set.

    Phase 1 tries thresholds 0.10, 0.11, ..., 0.49 with the plain rule of
    ``get_voting_neighbors`` and stores the one with the highest mean F1 in
    ``CFG.BEST_THRESHOLD``.

    Phase 2 then tries "min2" thresholds from ``CFG.BEST_THRESHOLD`` up to
    (but excluding) ``CFG.BEST_THRESHOLD + 0.20`` in steps of 0.005 with
    ``min2=True`` and stores the best one in ``CFG.BEST_THRESHOLD_MIN2``.

    Progress and the mean F1 / recall / precision of each candidate are
    printed. ``valid_df`` is passed to ``get_voting_neighbors`` for every
    candidate, which rewrites its prediction and score columns.

    Args:
        valid_df: Validation DataFrame (``posting_id`` and ``matches`` columns).
        distances: 2-D neighbour distance array for ``valid_df``.
        indices: 2-D neighbour index array aligned with ``distances``.

    Returns:
        None. Results are written to ``CFG``.
    """
    def _sweep(candidates, label, min2):
        # Evaluate every candidate (given in percent) and keep the best mean F1.
        frame = valid_df
        top_f1, top_threshold = 0., 0.
        for percent in candidates:
            threshold = percent / 100
            frame = get_voting_neighbors(frame, distances, indices, threshold=threshold, min2=min2)

            valid_f1 = frame['f1'].mean()
            valid_recall = frame['recall'].mean()
            valid_precision = frame['precision'].mean()
            print(f"{label} = {threshold} -> f1 score = {valid_f1}, recall = {valid_recall}, precision = {valid_precision}")

            if valid_f1 > top_f1:
                top_f1, top_threshold = valid_f1, threshold
        return top_threshold, top_f1

    # phase 1: plain distance threshold
    print("Searching best threshold...")
    best_threshold, best_f1_valid = _sweep(np.arange(10, 50, 1), "threshold", False)
    print("Best threshold =", best_threshold)
    print("Best f1 score =", best_f1_valid)
    # Must be stored before phase 2: the min2 rule reads it as its primary cut-off.
    CFG.BEST_THRESHOLD = best_threshold

    # phase 2: second-neighbour ("min2") threshold, searched above the phase-1 optimum
    print("________________________________")
    print("Searching best min2 threshold...")
    lower = CFG.BEST_THRESHOLD * 100
    best_threshold, best_f1_valid = _sweep(
        np.arange(lower, CFG.BEST_THRESHOLD * 100 + 20, 0.5), "min2 threshold", True
    )
    print("Best min2 threshold =", best_threshold)
    print("Best f1 score after min2 =", best_f1_valid)
    CFG.BEST_THRESHOLD_MIN2 = best_threshold


In [9]:
def get_voting_result(df, distances, indices):
    """Predict matches with the tuned thresholds, score them and print the means.

    Neighbours closer than ``CFG.BEST_THRESHOLD`` are predicted as matches;
    when at most one neighbour qualifies and the neighbour in column 1 is
    closer than ``CFG.BEST_THRESHOLD_MIN2``, it is appended as well. This is
    exactly the ``min2=True`` rule of ``get_voting_neighbors`` with
    ``threshold=CFG.BEST_THRESHOLD_MIN2``, so the work is delegated to it.

    Args:
        df: DataFrame with ``posting_id`` and ``matches`` columns, row-aligned
            with ``distances`` / ``indices``. It is modified in place.
        distances: 2-D array of neighbour distances.
        indices: 2-D array of neighbour row positions.

    Returns:
        The same ``df`` with ``pred_matches``, ``f1``, ``recall`` and
        ``precision`` columns written.
    """
    df = get_voting_neighbors(
        df, distances, indices, threshold=CFG.BEST_THRESHOLD_MIN2, min2=True
    )

    f1, recall, precision = (df[col].mean() for col in ('f1', 'recall', 'precision'))
    print(f'f1 score after voting = {f1}, recall = {recall}, precision = {precision}')

    return df

In [10]:
# Ensemble member 1: first backbone, first loss module ('arc'), first margin.
# The CFG attributes are reassigned so the checkpoint name can be rebuilt from them.
CFG.LOSS_MODULE, CFG.MODEL_NAME, CFG.MARGIN = CFG.LOSS_MODULES[0], CFG.MODEL_NAMES[0], CFG.MARGINS[0]
CFG.MODEL_PATH = '{}_{}_face_epoch_8_bs_8_margin_{}.pt'.format(CFG.MODEL_NAME, CFG.LOSS_MODULE, CFG.MARGIN)

# Pre-computed embeddings are stored as CSV files named after the checkpoint
# (with its '.pt' suffix dropped) plus a split-specific suffix.
TEST_EMBEDDING_PATH = f'{CFG.EMB_PATH_PREFIX}{CFG.MODEL_PATH[:-3]}_test_embed.csv'
test_embeddings_1 = np.loadtxt(TEST_EMBEDDING_PATH, delimiter=',')
VALID_EMBEDDING_PATH = f'{CFG.EMB_PATH_PREFIX}{CFG.MODEL_PATH[:-3]}_valid_embed.csv'
valid_embeddings1 = np.loadtxt(VALID_EMBEDDING_PATH, delimiter=',')


In [11]:
# Ensemble member 2: first backbone, second loss module ('curricular'), first margin.
# The CFG attributes are reassigned so the checkpoint name can be rebuilt from them.
CFG.LOSS_MODULE, CFG.MODEL_NAME, CFG.MARGIN = CFG.LOSS_MODULES[1], CFG.MODEL_NAMES[0], CFG.MARGINS[0]
CFG.MODEL_PATH = '{}_{}_face_epoch_8_bs_8_margin_{}.pt'.format(CFG.MODEL_NAME, CFG.LOSS_MODULE, CFG.MARGIN)

# Pre-computed embeddings are stored as CSV files named after the checkpoint
# (with its '.pt' suffix dropped) plus a split-specific suffix.
TEST_EMBEDDING_PATH = f'{CFG.EMB_PATH_PREFIX}{CFG.MODEL_PATH[:-3]}_test_embed.csv'
test_embeddings_2 = np.loadtxt(TEST_EMBEDDING_PATH, delimiter=',')
VALID_EMBEDDING_PATH = f'{CFG.EMB_PATH_PREFIX}{CFG.MODEL_PATH[:-3]}_valid_embed.csv'
valid_embeddings2 = np.loadtxt(VALID_EMBEDDING_PATH, delimiter=',')

In [17]:
def get_voting_nns(embeddings_dict, top_k=50):
    """Nearest neighbours from the averaged cosine similarity of several models.

    Each embedding matrix is row-normalised with ``normalize`` and multiplied
    with its own transpose to get a pairwise similarity matrix. The matrices
    are averaged across models, and for every row the ``top_k`` most similar
    rows are returned in order of decreasing similarity.

    Args:
        embeddings_dict: Mapping with keys ``'emb_0'``, ``'emb_1'``, ... (one
            per model), each a 2-D array with one row per sample. All arrays
            must have the same number of rows.
        top_k: Non-negative number of neighbours to keep per row (default 50,
            the value that used to be hard-coded). Fewer are returned when
            there are fewer samples than ``top_k``.

    Returns:
        tuple: ``(distances, indices)``, both of shape
        ``(n_samples, min(top_k, n_samples))``. ``distances`` is
        ``1 - averaged similarity`` and ``indices`` holds the matching row
        positions.

    Raises:
        ValueError: If ``embeddings_dict`` is empty.
        KeyError: If the keys are not ``emb_0 .. emb_{n-1}``.
    """
    embs_num = len(embeddings_dict)
    if embs_num == 0:
        # Fail with a clear message instead of a bare division-by-zero below.
        raise ValueError('embeddings_dict must contain at least one embedding matrix.')

    similarities_sum = 0.
    for i in range(embs_num):
        try:
            emb = normalize(embeddings_dict[f'emb_{i}'])
        except KeyError:
            raise KeyError('Please use keys emb_0, emb_1, etc in embeddings dict.')
        similarities_sum += emb.dot(emb.T)
    similarities_mean = similarities_sum / embs_num

    # Sort the N x N matrix once: argsort is ascending, so the reversed tail
    # gives the top_k most similar columns, best first. The matching values
    # are then gathered with those indices instead of sorting a second time.
    indices = np.argsort(similarities_mean)[:, :-(top_k + 1):-1]
    similarities = np.take_along_axis(similarities_mean, indices, axis=1)
    distances = 1 - similarities

    return distances, indices

In [18]:
# Build the splits, blend both models' validation embeddings into one
# neighbour table, and tune the thresholds on it. The search writes its
# results to CFG.BEST_THRESHOLD and CFG.BEST_THRESHOLD_MIN2.
train_df, valid_df, test_df = read_dataset()
valid_embeddings_dict = dict(emb_0=valid_embeddings1, emb_1=valid_embeddings2)
distances, indices = get_voting_nns(valid_embeddings_dict)
search_voting_threshold(valid_df, distances, indices)

Searching best threshold...
threshold = 0.1 -> f1 score = 0.6583605472178857, recall = 0.5462449245329842, precision = 0.9974290176655487
threshold = 0.11 -> f1 score = 0.6647615872403874, recall = 0.5545829400956374, precision = 0.9969934309618936
threshold = 0.12 -> f1 score = 0.6709709133232828, recall = 0.5621778330318576, precision = 0.9968943549758269
threshold = 0.13 -> f1 score = 0.677426470744603, recall = 0.5701223634345831, precision = 0.9962908605484166
threshold = 0.14 -> f1 score = 0.6829447917562086, recall = 0.5773563741156131, precision = 0.9960085808968858
threshold = 0.15 -> f1 score = 0.6886507180804735, recall = 0.5848942887597608, precision = 0.9953122679141078
threshold = 0.16 -> f1 score = 0.6933861548490088, recall = 0.5908956843116335, precision = 0.9948148811815041
threshold = 0.17 -> f1 score = 0.6981658529824145, recall = 0.5975966784377, precision = 0.9933025476170466
threshold = 0.18 -> f1 score = 0.7050687551802866, recall = 0.6069615555931833, precision

In [20]:
# Score the held-out test fold with the thresholds tuned on validation.
# `distances` / `indices` are rebound to the test-set neighbour table here.
test_embeddings_dict = dict(emb_0=test_embeddings_1, emb_1=test_embeddings_2)
distances, indices = get_voting_nns(test_embeddings_dict)
get_voting_result(test_df, distances, indices)

f1 score after voting = 0.7689225530271497, recall = 0.7507355965401381, precision = 0.880131053311615


Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,fold,pred_matches,f1,recall,precision
0,train_1802986387,00144a49c56599d45354a1c28104c039.jpg,f815c9bb833ab4c8,Jubah anak size 1-12 thn,4687,train_1802986387 train_1396161074 train_713073...,0,train_1802986387 train_885703853,0.222222,0.142857,0.500000
1,train_1598329973,001d7f5d9a2fac714f4d5f37b3baffb4.jpg,bec8d09693634b4b,Atasan Rajut Wanita LISDIA SWEATER,6347,train_1598329973 train_841015183 train_4224502769,0,train_1598329973 train_3538342341,0.400000,0.333333,0.500000
2,train_4196427721,002039aaf8618627a0442d5e89e5dda6.jpg,e98c873acc65946e,Korek Kuping LED untuk balita CherryBabyKidsSh...,897,train_4196427721 train_1482447822 train_234660...,0,train_4196427721 train_2221959828 train_375787...,0.545455,0.375000,1.000000
3,train_2985955659,002f978c58a44a00aadfca71c3cad2bb.jpg,bf38f0e083d7c710,HnKfashion Sweater Hoodie WHO Printing BabyTer...,8795,train_2985955659 train_3916258742 train_415673...,0,train_2985955659 train_4156734436 train_391625...,1.000000,1.000000,1.000000
4,train_3466601092,004076b57135e761ab8b41d84acc4c94.jpg,aa2c0ee4eb6ba0cd,[Shiyan] mainan gigitan bayi set pack baby tee...,7555,train_3466601092 train_354147588,0,train_3466601092 train_354147588,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
6846,train_259196128,ffcd708bcca72f3f9aacd2a50a8827f3.jpg,e7930c6e96635394,Power Bank VEGER Ultimate X10 PRO 10.000Mah LE...,1560,train_3219916478 train_259196128,0,train_259196128 train_3219916478,1.000000,1.000000,1.000000
6847,train_3074398993,ffd6946ca482bf8ddcc2171997c395e8.jpg,f2e00f95790f702e,Bebelac 4 Vanila Susu Bubuk 800 gr,5965,train_2919333796 train_3074398993,0,train_3074398993 train_2919333796 train_189425...,0.800000,1.000000,0.666667
6848,train_3296417563,ffda9710b76ca85f004518e7f243f3fc.jpg,af26e0f0d3d2f02c,Gunting Kuku Bayi Reliable 2in1 /Baby Nail Cli...,5317,train_1569930350 train_3296417563,0,train_3296417563,0.666667,0.500000,1.000000
6849,train_945815402,fff1222750374ddbed82b17c8bd2766d.jpg,b0cbce308fcdcc64,Baterai Batre Battery Xiaomi Redmi Note 3 BM46...,7971,train_601331882 train_945815402,0,train_945815402 train_601331882,1.000000,1.000000,1.000000
