In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import numpy as np
from func import display_df, path, path_img, get_data
from func_torch import ShopeeDataset, device, torch, f1_score_cal
from torch.utils.data import DataLoader
from efficientnet_pytorch import EfficientNet
import gc
import faiss

In [2]:
# Input: read the training metadata through the project helper `get_data`,
# handing it the CSV location and `path_img`, then preview the first rows.
train_csv_path = path / 'train.csv'
df = get_data(train_csv_path, path_img)
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,filepath,target,title_edit
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,C:\Users\PC\OneDrive - Seagroup\computer_vison...,"[train_2278313361, train_129225211]",paper bag victoria secret
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,C:\Users\PC\OneDrive - Seagroup\computer_vison...,"[train_3386243561, train_3423213080]","double tape 3m vhb 12 mm x 4,5 m original / do..."
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,C:\Users\PC\OneDrive - Seagroup\computer_vison...,"[train_2288590299, train_3803689425]",maling tts canned pork luncheon meat 397 gr
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,C:\Users\PC\OneDrive - Seagroup\computer_vison...,"[train_3342059966, train_2406599165]",daster batik lengan pendek - motif acak / camp...
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,C:\Users\PC\OneDrive - Seagroup\computer_vison...,"[train_3369186413, train_921438619]",nescafe \xc3\x89clair latte 220ml


In [None]:
# Batch settings for the embedding pass.
BATCH_SIZE = 16
NUM_WORKERS = 4

dataset_data = ShopeeDataset(csv=df, train=True)
# shuffle=False is already the DataLoader default; it is spelled out because the
# later cells map embedding row i back to df row i, so batch order must follow df order.
data_loader = DataLoader(
    dataset_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# Sanity check: number of samples and the shape of the first item's image.
print(f"Dataset Len: {len(dataset_data):,}\nImage Shape [0]: {dataset_data[0][0].shape}")

In [16]:
# Image embeddings: either run every image through EfficientNet and cache the
# result as a .npy file, or reload the previously cached array.
model_name = 'efficientnet-b3'
image_embeddings_path = path / f'image_embed_{model_name}.npy'
# Set to False to reuse the array cached at `image_embeddings_path`.
rerun = True

if rerun:
    # NOTE(review): `from_name` only builds the architecture; per the
    # efficientnet_pytorch README it is `from_pretrained` that loads trained
    # weights. Confirm that randomly initialised weights are intended here.
    model_effnet = EfficientNet.from_name(model_name).to(device)
    # Inference only: put BatchNorm / Dropout layers in evaluation mode so an
    # image's embedding does not depend on the other images in its batch or on
    # random dropout masks. Embeddings cached before this change were produced
    # in training mode, so the distance thresholds used later may need re-tuning.
    model_effnet.eval()

    print(model_name, device)

    embeddings = []
    with torch.no_grad():
        for image, label in tqdm(data_loader):
            image = image.to(device)
            # No .detach() needed: nothing is tracked for autograd under no_grad().
            img_embeddings = model_effnet(image).cpu().numpy()
            embeddings.append(img_embeddings)
    # Stack the per-batch arrays into one (num_images, embedding_dim) array.
    all_image_embeddings = np.concatenate(embeddings)

    # Save
    np.save(image_embeddings_path, all_image_embeddings)

    # Clean memory: drop the model, then hand cached GPU blocks back to the driver
    # (gc.collect() alone leaves them held by PyTorch's caching allocator).
    del model_effnet
    _ = gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

else:
    # load
    all_image_embeddings = np.load(image_embeddings_path)

print(f"image_embeddings shape: {all_image_embeddings.shape}")

In [4]:
# Exact nearest-neighbour search over the image embeddings with FAISS.
# FAISS requires C-contiguous float32 input; this is a no-op when the array
# already has that layout and guards against a cached file with another dtype.
faiss_vectors = np.ascontiguousarray(all_image_embeddings, dtype=np.float32)

index = faiss.IndexFlatL2(faiss_vectors.shape[1])
index.add(faiss_vectors)
print(index.ntotal)

# Number of neighbours retrieved per image. Clamped to the index size because
# FAISS pads missing results with label -1, which would silently select the
# last row if used as a positional index.
k = min(50, index.ntotal)
# IndexFlatL2 reports *squared* Euclidean distances (see the FAISS docs);
# each query's own vector is part of the index, so it appears among its neighbours.
distances, indices = index.search(faiss_vectors, k)

100%|████████████████████████████████████████████████████████████████████████| 34250/34250 [00:00<00:00, 285415.80it/s]


In [27]:
# Sweep distance thresholds: for each one, predict for every image the posting
# ids of its retrieved neighbours closer than the threshold, score the
# predictions against `target` with f1_score_cal, and keep the best threshold.
thresholds = list(np.arange(0.001, 0.1, 0.01))
target = df['target'].values.tolist()
# Pull the ids out of pandas once; indexing a NumPy array inside the loop avoids
# one `.iloc` call per image per threshold.
posting_ids = df['posting_id'].to_numpy()
# threshold -> mean F1 / per-row F1 list / per-row predicted id lists
scores, plot_check, pred_lst = {}, {}, {}

for num in thresholds:
    # Boolean (num_images, num_neighbours) mask of neighbours within the threshold.
    within_threshold = distances < num
    # The loop names deliberately avoid `k`, which holds the neighbour count
    # used by the FAISS search cell.
    predictions = [
        posting_ids[neighbour_rows[keep]].tolist()
        for neighbour_rows, keep in zip(indices, within_threshold)
    ]
    f1_score_lst = [f1_score_cal(i, v) for i, v in zip(target, predictions)]

    score = np.mean(f1_score_lst)
    scores.update({num: score})
    plot_check.update({num: f1_score_lst})
    pred_lst.update({num: predictions})
    print(f'--- F1 score for threshold {num} is {score:.3f} ---\n')


# `max` over the dict returns the *threshold* whose mean F1 is highest.
best_threshold = max(scores, key=scores.get)
# Historical name kept so code that still reads `best_score` keeps working;
# note that it holds the threshold, not the score.
best_score = best_threshold
print(f'--- Best score is {round(scores[best_threshold], 4)} and has a threshold {best_threshold} ---')
# Attach the winning predictions and their per-row F1 to the frame.
df['img_pred'] = pred_lst[best_threshold]
df['f1_score'] = plot_check[best_threshold]

array([[    0, 33042,  3963, 17471,  5831]], dtype=int64)