In [1]:
import sys
import importlib
from types import SimpleNamespace
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.special import softmax
from joblib import Parallel, delayed
# import seaborn as sns
import scipy as sp

import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Makes the project modules (models, loss, run, utils) importable from an
# absolute, machine-specific directory.
sys.path.append("/home/hhl/바탕화면/dacon/dacon21/model/using_cofing")
# sys.argv = ['--config', 'config1_try']
# Overrides the process arguments before the project modules are imported.
# NOTE(review): presumably one of those modules parses `--config` at import
# time (the cell prints "Using config file config1") -- confirm in the project.
sys.argv = ['--config', 'config1']

# NOTE(review): names used later without an explicit import in this notebook
# (np, pd, os, cv2, torch, F, A, Dataset, DataLoader, Net, set_seed, pickle)
# presumably come from these star imports -- confirm against the modules.
from models import *
from loss import *
from run import *
from utils import *

Using config file config1


In [2]:
# Plain attribute namespace describing one trained model. In this notebook it
# is passed to `Net(config6)` and its `model` attribute is the checkpoint path
# given to `torch.load`. How the remaining attributes are consumed is decided
# by the project code, which is not visible here.
class config6:
    DEBUG=False
    num_workers=8
    gpus='0'
    distributed_backend=None
    sync_batchnorm=True
    gradient_accumulation_steps=4
    precision=16
    warmup_epo=1
    cosine_epo=19
    lr=0.002
    weight_decay=0.0001
    p_trainable=True
    crit='bce'
    # NOTE(review): the names below suggest backbone / pooling / ArcFace head
    # settings read by `Net` -- confirm against the model definition.
    backbone='tf_efficientnet_b1_ns'
    embedding_size=512
    pool='gem'
    arcface_s=45.0
    arcface_m=0.4
    neck='option-D'
    head='arc_margin'
    pretrained_weights=None
    optim='sgd'
    batch_size=9
    n_splits=5
    fold=0
    seed=9999
    # Same device string that the notebook cells hard-code ('cuda:0').
    device='cuda:0'
    out_dim=1049
    n_classes=1049
    class_weights='log'
    class_weights_norm='batch'
    normalization='imagenet'
    # NOTE(review): the notebook's own augmentation resizes to 512x512, not
    # 448 -- confirm whether this value is used at inference time.
    crop_size=448
    # Absolute, machine-specific path of the checkpoint loaded into the model.
    model='/home/hhl/바탕화면/dacon/dacon21/model/config6/tf_efficientnet_b1_ns_best_fold_0.pth'

In [3]:
class LMDataset(Dataset):
    """Dataset that loads images listed in a DataFrame's `filepath` column.

    Each item is a dict with:
      * 'idx'   -- the integer position of the sample as a long tensor
      * 'input' -- the augmented, normalised image as a channel-first tensor

    The module-level `val_aug2` pipeline is looked up when an item is fetched,
    so it must be defined before the dataset is iterated.
    """

    def __init__(self, csv):
        """Store the file paths from `csv.filepath` as an array."""
        self.csv = csv.filepath.values

    def __getitem__(self, index):
        """Load, augment and normalise the image at position `index`.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file. `cv2.imread`
                signals failure by returning None instead of raising, which
                would otherwise surface as an opaque error in `cvtColor`.
        """
        img_path = self.csv[index]
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"cv2.imread could not read image: {img_path}")
        # OpenCV decodes to BGR; the normalisation constants are applied in RGB order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        img2 = val_aug2(image=img)['image'].astype(np.float32)
        img2 = self.normalize_img(img2)
        tensor2 = self.to_torch_tensor(img2)
        feature_dict2 = {'idx': torch.tensor(index).long(), 'input': tensor2}

        return feature_dict2

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.csv)

    def normalize_img(self, img):
        """Return a float32 copy of `img` with per-channel mean/std normalisation.

        The constants are broadcast over the last axis, so `img` is expected
        to have three channels last. The input array is not modified.
        """
        mean = np.array([123.675, 116.28 , 103.53 ], dtype=np.float32)
        std = np.array([58.395   , 57.120, 57.375 ], dtype=np.float32)
        img = img.astype(np.float32)  # astype copies, so the caller's array is untouched
        img -= mean
        img *= np.reciprocal(std, dtype=np.float32)
        return img

    def to_torch_tensor(self, img):
        """Convert an (H, W, C) array to a (C, H, W) torch tensor."""
        return torch.from_numpy(img.transpose((2, 0, 1)))

In [4]:
# Inference-time image pipeline used by LMDataset: a near-lossless JPEG
# re-compression (quality 99-100) followed by a resize to 512x512.
_val_transforms = [
    A.ImageCompression(quality_lower=99, quality_upper=100),
    A.Resize(512, 512),
]
val_aug2 = A.Compose(_val_transforms)

In [5]:
set_seed(0)

# Training metadata; images live under ../data/train/<landmark_id>/<id>.JPG
train = pd.read_csv('../data/public/train.csv')
train['filepath'] = [
    os.path.join('../data/train', str(landmark), str(image_id) + '.JPG')
    for landmark, image_id in zip(train['landmark_id'], train['id'])
]

# Test metadata; images are bucketed by the first character of their id.
sub = pd.read_csv('../data/public/sample_submission.csv')
sub['filepath'] = [
    os.path.join('../data/public/test', image_id[0], image_id + '.JPG')
    for image_id in sub['id']
]

# shuffle=False keeps loader output in DataFrame row order, which the
# embedding matrix built from `train_loader` relies on.
_loader_kwargs = dict(batch_size=36, num_workers=8, shuffle=False, pin_memory=True)

train_dataset = LMDataset(train)
train_loader = DataLoader(dataset=train_dataset, **_loader_kwargs)

test_dataset = LMDataset(sub)
test_loader = DataLoader(dataset=test_dataset, **_loader_kwargs)

In [6]:
# Build the network from config6, move it to the first GPU and restore the
# trained weights from the checkpoint path stored on the config.
model6 = Net(config6).to('cuda:0')
_checkpoint = torch.load(config6.model)
model6.load_state_dict(_checkpoint)

<All keys matched successfully>

In [7]:
from tqdm import tqdm

# Top-k sizes read by the score-fusion cell further down.
CLS_TOP_K = 1
TOP_K = 1

model6.eval()

# Embed every training image, one batch at a time, without tracking gradients.
with torch.no_grad():
    feats = []
    for batch2 in tqdm(train_loader):
        batch2['input'] = batch2['input'].to('cuda:0')
        output_dict6 = model6(batch2, get_embeddings=True)
        # Park each batch on the CPU so GPU memory is not held while looping.
        feats.append(output_dict6['embeddings'].detach().cpu())

    # The loop never breaks, so this tail always runs after the last batch:
    # stack the batches, move the matrix back to the GPU and normalise it.
    feats = F.normalize(torch.cat(feats).to('cuda:0'))

# ---------------------------------------------------------------------------
# Disabled test-set inference, kept for reference. PRODS / PREDS / PRODS_M /
# PREDS_M used by later cells are only produced by this loop.
# NOTE(review): as written it would fail if re-enabled -- `logits +=` uses a
# name never initialised here, and it reads output_dict['embedding'] while the
# live loop above reads 'embeddings'.
# ---------------------------------------------------------------------------
#     PRODS = []
#     PREDS = []
#     PRODS_M = []
#     PREDS_M = []
#
#     for batch2 in tqdm(test_loader):
#         batch2['input'] = batch2['input'].to('cuda:0')
#
#         output_dict = model6(batch2, get_embeddings=True) ; logits += output_dict['logits'] ; embeddings6 = output_dict['embedding']
#         feat = embeddings6
#         feat = F.normalize(feat)
#
#         (values, indices) = torch.topk(logits, CLS_TOP_K, dim=1)
#         probs_m = values
#         preds_m = indices
#
#         PRODS_M.append(probs_m.detach().cpu())
#         PREDS_M.append(preds_m.detach().cpu())
#
#         distance = feat.mm(feats.t())
#         (values, indices) = torch.topk(distance, TOP_K, dim=1)
#         probs = values
#         preds = indices
#         PRODS.append(probs.detach().cpu())
#         PREDS.append(preds.detach().cpu())
#
#     PRODS = torch.cat(PRODS).numpy()
#     PREDS = torch.cat(PREDS).numpy()
#     PRODS_M = torch.cat(PRODS_M).numpy()
#     PREDS_M = torch.cat(PREDS_M).numpy()

100%|██████████| 2448/2448 [06:08<00:00,  6.64it/s]


In [14]:
def cos_similarity_matrix(a, b, eps=1e-8):
    """Pairwise cosine similarity between the rows of two 2-D tensors.

    Both inputs are moved to the CPU first. Row norms are floored at `eps`
    so an all-zero row gives similarity 0 instead of dividing by zero.

    Returns a tensor of shape (a.shape[0], b.shape[0]).
    """
    def _unit_rows(x):
        # Scale every row to unit L2 length, with the norm floored at eps.
        x = x.to('cpu')
        row_norms = x.norm(dim=1, keepdim=True)
        return x / row_norms.clamp(min=eps)

    return torch.mm(_unit_rows(a), _unit_rows(b).t())

In [15]:
# Per-landmark image counts, laid out in order of first appearance in `train`.
#
# The counts are attached by reindexing rather than by merging on the
# value_counts() result: the merged column name depends on the name of that
# Series, which is 'landmark_id' on older pandas but 'count' from pandas 2.0,
# so the later `df['landmark_id']` lookup would raise KeyError there.
#
# Resulting columns (same as before):
#   0              -- landmark id
#   'landmark_id'  -- number of training images for that landmark
#   'count_cumsum' -- running total of those counts
t_c = train['landmark_id'].value_counts()
_class_order = train['landmark_id'].unique()
df = pd.DataFrame({
    0: _class_order,
    'landmark_id': t_c.reindex(_class_order).values,
})
df['count_cumsum'] = df['landmark_id'].cumsum()

In [60]:
# Cosine-similarity matrix among the training embeddings of each landmark.
#
# Rows are selected by label instead of by slicing `feats` with cumulative
# class counts. The slicing was only correct when every landmark's rows are
# contiguous in `train`; label-based selection gives the same result in that
# case and stays correct otherwise. Row i of `feats` corresponds to row i of
# `train` because the training loader does not shuffle. Within a landmark the
# rows keep `train` order, which is the order the later filtering cell uses
# when it picks rows with `train['landmark_id'] == id`.
csm_dict = {}
_train_labels = train['landmark_id'].values
for _landmark_id in train['landmark_id'].unique():
    _rows = torch.as_tensor(np.flatnonzero(_train_labels == _landmark_id), device=feats.device)
    _class_feats = feats[_rows]
    csm_dict[_landmark_id] = cos_similarity_matrix(_class_feats, _class_feats)

In [66]:
def _outlier_positions(sim, threshold):
    """Return the ORIGINAL row positions to drop from one similarity matrix.

    Repeatedly removes the row/column with the fewest similarities above
    `threshold` (ties go to the lowest position) until every remaining pair
    is above it.

    The previous implementation recorded each index relative to the
    already-shrunk matrix. Those indices are later applied to the full
    per-class row list, so after the first deletion they pointed at the wrong
    images. Positions are tracked against the original ordering here.
    """
    above = np.asarray(sim) > threshold
    alive = np.arange(above.shape[0])  # original position of each surviving row
    removed = []
    while not above.all():
        worst = int(np.argmin(above.sum(axis=1)))
        removed.append(int(alive[worst]))
        alive = np.delete(alive, worst)
        above = np.delete(np.delete(above, worst, axis=0), worst, axis=1)
    return removed


# Landmark id -> positions (within that landmark's rows) flagged for removal.
# csm_dict is left untouched, so re-running this cell gives the same result.
threshold = 0.9
rm_sm = {
    landmark_id: _outlier_positions(sim, threshold)
    for landmark_id, sim in csm_dict.items()
}

In [67]:
# Total number of images flagged for removal, summed over all landmarks.
sum(len(removed) for removed in rm_sm.values())

36039

In [68]:
# Build the filtered training table: for every landmark keep the rows whose
# position (within that landmark's rows, in `train` order) is not in rm_sm.
#
# The pieces are collected in a list and concatenated once. Growing a frame
# with pd.concat inside the loop re-copies everything on each iteration and
# starts from an empty frame, which newer pandas warns about.
_kept_frames = []
for _landmark_id, _removed_positions in rm_sm.items():
    trn_idx = train[train['landmark_id'] == _landmark_id].index
    _kept_frames.append(train.loc[np.delete(trn_idx, _removed_positions)])

# pd.concat rejects an empty list; fall back to an empty frame as before.
new_train = (
    pd.concat(_kept_frames).reset_index(drop=True)
    if _kept_frames
    else pd.DataFrame()
)

In [70]:
# Spot check: number of files on disk for landmark 398, to compare with the
# count kept for that landmark in new_train.
len(os.listdir('../data/train/398'))

90

In [71]:
# Images remaining per landmark after filtering.
new_train['landmark_id'].value_counts()

398     85
973     85
422     85
1032    84
586     84
        ..
291     43
103     43
428     42
744     41
8       39
Name: landmark_id, Length: 1049, dtype: int64

In [72]:
# Persist the filtered training table (includes the `filepath` column added earlier).
new_train.to_csv('../data/public/new_train.csv', index=False)

In [6]:
sub = pd.read_csv('../submit/baseline_9.csv')

# Hand-made label corrections, applied strictly in the order listed.
# Each entry is (row selector, landmark_id); a selector is either one row
# label or an inclusive label slice, exactly as accepted by `DataFrame.loc`.
# Row 1825 appears twice (159, then 606): the later entry wins.
_corrections = [
    (37931, 901),
    (299, 275),
    (301, 275),
    (605, 120),
    (939, 666),
    (956, 671),
    (slice(1705, 1730), 751),
    (1776, 909),
    (1825, 159),
    (2065, 348),
    (1745, 513),
    (slice(3154, 3163), 487),
    (slice(3213, 3221), 470),
    (slice(3811, 3847), 1046),
    (1825, 606),
    (2487, 646),
    (slice(2892, 2927), 988),
    (1902, 579),
    (slice(12223, 12262), 543),
    (2744, 812),
    (2488, 646),
    (3202, 470),
    (3008, 292),
    (slice(2155, 2164), 617),
    (4259, 578),
    (3252, 219),
    (3807, 838),
    (slice(4368, 4383), 868),
    (4777, 64),
    (slice(5382, 5401), 312),
    (3510, 826),
    (slice(4086, 4094), 398),
    (5204, 589),
    (5836, 1023),
    (3289, 109),
    (34109, 2),
    (6628, 795),
    (4866, 176),
    (6882, 27),
    (slice(16638, 16671), 40),
    (slice(16009, 16043), 892),
    (slice(16048, 16076), 861),
    (21474, 66),
    (slice(21489, 21528), 502),
    (slice(36193, 36212), 236),
    (slice(19344, 19358), 488),
    (11882, 1008),
    (slice(36980, 36987), 29),
    (24477, 10),
    (slice(16323, 16338), 136),
    (7392, 437),
    (slice(14800, 14806), 616),
    (19331, 488),
    (slice(37440, 37474), 38),
    (19373, 757),
    (slice(15640, 15675), 371),
    (slice(25803, 25833), 771),
    (slice(30633, 30678), 763),
    (17764, 539),
    (17172, 105),
    (19420, 258),
    (20568, 710),
    (13187, 538),
    (slice(28293, 28296), 90),
    (37072, 607),
    (slice(35413, 35435), 595),
    (34776, 297),
    (slice(33846, 33848), 32),
    (37556, 998),
    (35557, 982),
    (33040, 867),
    (33174, 107),
    (30500, 246),
    (27948, 328),
    (33198, 107),
    (31883, 153),
    (31885, 153),
    (26452, 447),
    (25295, 253),
    (slice(1879, 1880), 579),
]
for _rows, _label in _corrections:
    sub.loc[_rows, 'landmark_id'] = _label


# sub2: the first row of every run of consecutive identical labels.
sub2 = sub[sub['landmark_id'].diff() != 0]
# sub3: run starts whose label begins more than one run, i.e. labels that
# show up in several separate stretches of the submission.
_run_start_counts = sub2['landmark_id'].value_counts()
sub3 = sub2[sub2['landmark_id'].isin(_run_start_counts[_run_start_counts != 1].index)]

In [718]:
# Display `sub` without the rows held in sub3. The result is not assigned,
# so `sub` itself is unchanged.
sub.drop(sub3.index)

Unnamed: 0,id,landmark_id,conf
0,xlf1tgh2ih,956,0.504824
1,68a3ot4osk,956,0.469569
2,si2lek4u0a,956,0.209661
3,rmtqxhipnv,956,0.364817
4,2flmjdud0e,956,0.599650
...,...,...,...
37959,8nlfrrdnwk,901,0.385468
37960,k0w00aa3iy,901,0.428705
37961,xrp8d0pb85,901,0.499425
37962,uobnsz7na9,901,0.256282


In [None]:
# Literal list of integers that is displayed but never bound to a name.
# NOTE(review): looks like pasted row indices of `sub` (the values fall in
# the same range as its index) -- confirm their origin; the cell was never run.
[ 2465,  2928,  3012,  3154,  3227,  3467,  3811,  3966,  4083,
 6549,  6627,  6647,  6648,  6649,  6848,  7615,  8276,  8383,
 8389,  8390,  9254, 10412, 10506, 12295, 12302, 12303, 12393,
12415, 12416, 12419, 12420, 12707, 12893, 13516, 13539, 13540,
14783, 14820, 16638, 17754, 17797, 17798, 19256, 19396, 19407,
19408, 20360, 21053, 21488, 25288, 25302, 25303, 25871, 26154,
27780, 27801, 27802, 27803, 27820, 28217, 28248, 28249, 28387,
28388, 30633, 33507, 33634, 34952, 36193, 37617]

In [455]:
# Let pandas render up to 400 rows before truncating displayed frames.
pd.options.display.max_rows = 400
# pd.options.display.max_s = 100

In [17]:
# Display the retrieval similarity scores.
# NOTE(review): no live cell visible in this notebook assigns PRODS; the only
# code that builds it is the commented-out test loop in the embedding cell.
PRODS

array([[0.9681861 , 0.9588182 , 0.95706064, 0.95588136, 0.95040643],
       [0.96667993, 0.9472121 , 0.9447556 , 0.9408512 , 0.94078714],
       [0.9377258 , 0.912168  , 0.90437233, 0.88918334, 0.88405263],
       ...,
       [0.9671086 , 0.96655184, 0.9575849 , 0.9565214 , 0.94599533],
       [0.9174435 , 0.90726936, 0.90146875, 0.89430594, 0.89088726],
       [0.9650006 , 0.9605202 , 0.9572787 , 0.94926447, 0.94199556]],
      dtype=float32)

In [18]:
# Display the retrieved indices; a later cell maps them through
# train['landmark_id'], so they are treated as row positions in `train`.
# NOTE(review): PREDS is only produced by the commented-out test loop.
PREDS

array([[46165, 46223, 46177, 46178, 46214],
       [46212, 46166, 46164, 46178, 46193],
       [46179, 46204, 46238, 46212, 46206],
       ...,
       [87205, 87197, 87263, 87269, 87247],
       [87230, 87197, 87205, 87269, 87263],
       [87269, 87263, 87205, 87197, 87196]])

In [19]:
# Display the classifier top-k scores.
# NOTE(review): PRODS_M is only produced by the commented-out test loop.
PRODS_M

array([[0.8262315 , 0.17035033, 0.16999641, 0.16363013, 0.16199456],
       [0.8278582 , 0.16406849, 0.1602645 , 0.15817578, 0.15451196],
       [0.79496634, 0.1899417 , 0.18392396, 0.17450395, 0.16595139],
       ...,
       [0.82473266, 0.24279721, 0.17697482, 0.1700898 , 0.16491812],
       [0.8146967 , 0.22368771, 0.16808186, 0.15954326, 0.15771452],
       [0.81549466, 0.2523113 , 0.16957906, 0.16657   , 0.16497181]],
      dtype=float32)

In [20]:
# Display the classifier top-k class indices.
# NOTE(review): PREDS_M is only produced by the commented-out test loop.
PREDS_M

array([[ 956,  547,  674,  524,  184],
       [ 956,  547,  524,  849, 1040],
       [ 956,  184,  557,  547,  496],
       ...,
       [ 901,  675,  555,   94,  655],
       [ 901,  675,  555,  613,   52],
       [ 901,  675,  325,  856,  555]])

In [21]:
# Persist all four prediction arrays for later ensembling.
# The original wrote PREDS_M.npy twice and never saved PRODS_M; the classifier
# scores are now written to their own file.
np.save('ensembles/PRODS.npy', PRODS)
np.save('ensembles/PREDS.npy', PREDS)
np.save('ensembles/PRODS_M.npy', PRODS_M)
np.save('ensembles/PREDS_M.npy', PREDS_M)

In [22]:
# Translate retrieved row positions into landmark ids: PREDS2[i, k] is the
# landmark_id of the train row whose position is PREDS[i, k].
# NOTE(review): this assumes PREDS indexes rows of `train` in the same order
# as the embedding matrix `feats` -- confirm if the training table changes.
landmark = train['landmark_id'].values
PREDS2 = landmark[PREDS]
# PREDS_M = np.vectorize(idx2landmark_id.get)(PREDS_M)

In [None]:
# Load the class-index -> landmark-id mapping and build its inverse.
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# from a trusted source.
# NOTE(review): `model_dir` is not defined in any visible cell, and this cell
# was never executed.
with open(os.path.join(model_dir, 'idx2landmark_id.pkl'), 'rb') as fp:
    idx2landmark_id = pickle.load(fp)

landmark_id2idx = {landmark_id: idx for idx, landmark_id in idx2landmark_id.items()}

# NOTE(review): in this notebook df['landmark_id'] holds per-class image
# counts, not landmark ids -- confirm which `df` this line was written for.
pred_mask = pd.Series(df.landmark_id.unique()).map(landmark_id2idx).values

In [24]:
# Builds an element-wise wrapper around the dict lookup; the wrapper is
# neither called nor stored, so this cell has no lasting effect.
np.vectorize(idx2landmark_id.get)

In [25]:
# Range check: both label arrays should span the same set of class values.
PREDS2.min(), PREDS2.max(), PREDS_M.min(), PREDS_M.max()

(0, 1048, 0, 1048)

In [26]:
# Display the classifier top-k class indices again (same array as shown earlier).
PREDS_M

array([[ 956,  547,  674,  524,  184],
       [ 956,  547,  524,  849, 1040],
       [ 956,  184,  557,  547,  496],
       ...,
       [ 901,  675,  555,   94,  655],
       [ 901,  675,  555,  613,   52],
       [ 901,  675,  325,  856,  555]])

In [58]:
# First three rows of the retrieval-derived landmark ids.
PREDS2[:3,:]

array([[956, 956, 956],
       [956, 956, 956],
       [956, 956, 956]])

In [27]:
# First three rows of the classifier class indices.
PREDS_M[:3,:]

array([[ 956,  547,  674,  524,  184],
       [ 956,  547,  524,  849, 1040],
       [ 956,  184,  557,  547,  496]])

In [28]:
# First three rows of the retrieval similarity scores.
PRODS[:3,:]

array([[0.9681861 , 0.9588182 , 0.95706064, 0.95588136, 0.95040643],
       [0.96667993, 0.9472121 , 0.9447556 , 0.9408512 , 0.94078714],
       [0.9377258 , 0.912168  , 0.90437233, 0.88918334, 0.88405263]],
      dtype=float32)

In [29]:
# First three rows of the classifier scores.
PRODS_M[:3,:]

array([[0.8262315 , 0.17035033, 0.16999641, 0.16363013, 0.16199456],
       [0.8278582 , 0.16406849, 0.1602645 , 0.15817578, 0.15451196],
       [0.79496634, 0.1899417 , 0.18392396, 0.17450395, 0.16595139]],
      dtype=float32)

In [30]:
# Fuse retrieval and classifier evidence into one label and score per row.
# Each of the TOP_K retrieved neighbours votes for its landmark with
#   similarity ** 9 * classifier_score ** 10
# where the classifier score falls back to 1e-8 when that landmark is not
# among the classifier's top CLS_TOP_K classes. Votes for the same landmark
# are summed and the best-scoring landmark wins.
PRODS_F = []
PREDS_F = []
for row in tqdm(range(PREDS2.shape[0])):
    cls_score = {PREDS_M[row, c]: PRODS_M[row, c] for c in range(CLS_TOP_K)}
    votes = {}
    for rank in range(TOP_K):
        lid = PREDS2[row, rank]
        vote = float(PRODS[row, rank]) ** 9 * cls_score.get(lid, 1e-8) ** 10
        votes[lid] = votes.get(lid, 0.) + vote
    best_lid, best_score = max(votes.items(), key=lambda item: item[1])
    PREDS_F.append(best_lid)
    PRODS_F.append(best_score)

100%|██████████| 37964/37964 [00:00<00:00, 46493.55it/s]


In [31]:
# Write the fused predictions and their scores into the submission frame
# (modifies `sub` in place).
sub['landmark_id']=PREDS_F
sub['conf']=PRODS_F

In [32]:
# Display the submission frame after the fused predictions were written.
sub

Unnamed: 0,id,landmark_id,conf,filepath
0,xlf1tgh2ih,956,0.504824,../data/public/test/x/xlf1tgh2ih.JPG
1,68a3ot4osk,956,0.469569,../data/public/test/6/68a3ot4osk.JPG
2,si2lek4u0a,956,0.209661,../data/public/test/s/si2lek4u0a.JPG
3,rmtqxhipnv,956,0.364817,../data/public/test/r/rmtqxhipnv.JPG
4,2flmjdud0e,956,0.599650,../data/public/test/2/2flmjdud0e.JPG
...,...,...,...,...
37959,8nlfrrdnwk,901,0.385468,../data/public/test/8/8nlfrrdnwk.JPG
37960,k0w00aa3iy,901,0.428705,../data/public/test/k/k0w00aa3iy.JPG
37961,xrp8d0pb85,901,0.499425,../data/public/test/x/xrp8d0pb85.JPG
37962,uobnsz7na9,901,0.256282,../data/public/test/u/uobnsz7na9.JPG


In [33]:
# Save the submission without the helper `filepath` column.
sub.drop(columns=['filepath']).to_csv('../submit/baseline_9.csv', index=False)

In [72]:
# Display `sub` again. The saved output shows different `conf` values from
# the earlier display, so it comes from a different run of the fusion step.
sub

Unnamed: 0,id,landmark_id,conf,filepath
0,xlf1tgh2ih,956,0.335313,../data/public/test/x/xlf1tgh2ih.JPG
1,68a3ot4osk,956,0.302072,../data/public/test/6/68a3ot4osk.JPG
2,si2lek4u0a,956,0.187452,../data/public/test/s/si2lek4u0a.JPG
3,rmtqxhipnv,956,0.250135,../data/public/test/r/rmtqxhipnv.JPG
4,2flmjdud0e,956,0.401532,../data/public/test/2/2flmjdud0e.JPG
...,...,...,...,...
37959,8nlfrrdnwk,901,0.276970,../data/public/test/8/8nlfrrdnwk.JPG
37960,k0w00aa3iy,901,0.294003,../data/public/test/k/k0w00aa3iy.JPG
37961,xrp8d0pb85,901,0.362963,../data/public/test/x/xrp8d0pb85.JPG
37962,uobnsz7na9,901,0.195852,../data/public/test/u/uobnsz7na9.JPG


In [76]:
# Load an earlier submission to compare its labels with the current ones.
prior_sub = pd.read_csv('../submit/baseline_3.csv')

In [78]:
# Labels of the earlier submission.
prior_sub['landmark_id']

0        956
1        956
2        956
3        956
4        956
        ... 
37959    901
37960    901
37961    901
37962    901
37963    901
Name: landmark_id, Length: 37964, dtype: int64

In [84]:
# Number of rows where the earlier and current submissions agree on the label.
# NOTE(review): the saved output is a boolean Series rather than a count, so
# it appears to predate the `.sum()` -- re-run to refresh it.
(prior_sub['landmark_id']==sub['landmark_id']).sum()

0        True
1        True
2        True
3        True
4        True
         ... 
37959    True
37960    True
37961    True
37962    True
37963    True
Name: landmark_id, Length: 37964, dtype: bool

In [4]:
# Load saved model outputs stored as a pickled object array.
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# files from a trusted source.
sub_npy = np.load('../submit/val_outputs_best.npy', allow_pickle=True)

In [36]:
# Flatten the per-batch 'preds_conf' tensors into one confidence per row.
sub2['conf'] = [
    conf
    for batch_output in sub_npy
    for conf in batch_output['preds_conf'].detach().cpu().numpy()
]

In [40]:
# Overwrite sub2's confidences with those of `sub` (aligned on the index).
sub2['conf'] = sub['conf']

In [28]:
# Independent copy of the submission frame.
# NOTE(review): this cell has execution count 28 but sits below cells 36 and
# 40 that already modify sub2, so the notebook does not run top to bottom.
sub2 = sub.copy()

In [41]:
# Display sub2; the saved output includes a `new_id` column.
sub2

Unnamed: 0,id,landmark_id,conf,new_id
0,xlf1tgh2ih,956,0.504824,956
1,68a3ot4osk,956,0.469569,956
2,si2lek4u0a,956,0.209661,956
3,rmtqxhipnv,956,0.364817,956
4,2flmjdud0e,956,0.599650,956
...,...,...,...,...
37959,8nlfrrdnwk,901,0.385468,901
37960,k0w00aa3iy,901,0.428705,901
37961,xrp8d0pb85,901,0.499425,901
37962,uobnsz7na9,901,0.256282,901


In [30]:
# Replace the labels with the `new_id` column.
# NOTE(review): no visible cell creates `new_id`; it comes from state that is
# not in this notebook -- confirm how it was produced.
sub2['landmark_id']=sub2['new_id']

In [42]:
# Save the first three columns by position (id, landmark_id, conf in the
# displayed frame). Positional selection depends on the column order.
sub2.iloc[:, :3].to_csv('../submit/best_submission2.csv', index=False)