In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import os.path as osp
import yaml
import sys
import logging
from glob import glob
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import average_precision_score
sys.path.append("../src")


# Logger named after this module's `__name__`; the evaluation cells use it for
# dataset-size messages via `logger.info`.
logger = logging.getLogger(__name__)


In [2]:
from dataset_utils import get_datasets
from lit_utils import LitModel


In [3]:
# Run directories grouped by experiment.
#
# The notebook used to assign `paths` four times in a row, so only the last
# list ever took effect and switching experiments meant reordering or
# commenting out code. All groups now live in one table and `EXPERIMENT`
# selects the active one; `paths` keeps its original name and final value.
EXPERIMENT_PATHS = {
    # This group carried no label in the original notebook.
    "early_runs": [
        "../outputs/train_model_Clothing_Shoes_and_Jewelry_20211027_102744",
        "../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211020_172806/train_model_Clothing_Shoes_and_Jewelry_20211020_172806_0",
        # "../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211020_172806/train_model_Clothing_Shoes_and_Jewelry_20211020_172806_1",
        "../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211020_172806/train_model_Clothing_Shoes_and_Jewelry_20211020_172806_2",
        # "../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211020_172806/train_model_Clothing_Shoes_and_Jewelry_20211020_172806_3",
    ],
    # Cosine loss
    "cosine_loss": [
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211027_141006/train_model_Clothing_Shoes_and_Jewelry_20211027_141006_0',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211027_141006/train_model_Clothing_Shoes_and_Jewelry_20211027_141006_1',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211027_141006/train_model_Clothing_Shoes_and_Jewelry_20211027_141006_2',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211027_141006/train_model_Clothing_Shoes_and_Jewelry_20211027_141006_3',
    ],
    # Inner product
    "inner_product": [
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211028_090148/train_model_Clothing_Shoes_and_Jewelry_20211028_090148_0',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211028_090148/train_model_Clothing_Shoes_and_Jewelry_20211028_090148_1',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211028_090148/train_model_Clothing_Shoes_and_Jewelry_20211028_090148_2',
    ],
    # Less labeled data
    "less_labeled_data": [
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_0',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_1',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_2',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_3',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_4',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_0',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_1',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_2',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_3',
        '../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_4',
    ],
}

# Experiment evaluated by the cells below; change this key to switch groups.
EXPERIMENT = "less_labeled_data"
paths = EXPERIMENT_PATHS[EXPERIMENT]


In [5]:
# Evaluate every run directory in `paths` on its test split.
#
# For each run this cell reads the Hydra config, rebuilds the datasets, loads
# a checkpoint, scores the whole test set and stores two top-k metrics per run
# in `recall_dict` / `hit_dict`:
#   * recall: mean over labeled samples of (#labels found in top-k) / (#labels)
#   * hit:    mean over labeled samples of (#labels found in top-k)
# Samples without any positive label are counted in `no_labels` and skipped.
#
# The cell stays flat on purpose: later cells inspect the variables it leaves
# behind (`preds`, `labels`, `pred_idx`, `label_idx`, `recall_i`).
TOPK = 5  # number of highest-scoring classes considered per sample

res_dict = {}
recall_dict, hit_dict = {}, {}

# Fall back to CPU so the notebook also runs on a machine without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

for path in paths:
    print(path)
    config_path = osp.join(path, ".hydra", "config.yaml")

    # glob order is filesystem-dependent; sort so the chosen checkpoint is
    # deterministic, and fail with a clear message when none exists.
    ckpt_candidates = sorted(glob(osp.join(path, "epoch=*.ckpt*")))
    if not ckpt_candidates:
        raise FileNotFoundError(f"No 'epoch=*.ckpt*' checkpoint found in {path}")
    model_path = ckpt_candidates[0]

    # Load config file
    with open(config_path, "r") as f:
        cfg = yaml.safe_load(f)
    cfg["is_pretrained"] = False
    cfg["batch_size"] = 128
    print(f'{cfg["cf_weight"]=} {cfg["labeled_ratio"]=}')

    # Load dataset
    train_dataset, test_dataset, dataset_meta, pos_weight = get_datasets(
        cfg["train_df_path"],
        cfg["test_df_path"],
        cfg["cf_vector_df_path"],
        cfg["labeled_ratio"],
    )
    logger.info(
        "Sizes [trainset testset num_classes]=[{} {} {}]".format(
            dataset_meta["train_set_size"],
            dataset_meta["test_set_size"],
            dataset_meta["num_classes"],
        )
    )

    testloader = DataLoader(
        test_dataset, batch_size=cfg["batch_size"], num_workers=cfg["num_workers"]
    )

    # Load model
    lit_h = LitModel.load_from_checkpoint(model_path)
    lit_h = lit_h.to(device)
    lit_h = lit_h.eval()

    # Get predictions. Inference only: no_grad avoids building an autograd
    # graph for every batch, which the original paid for and then detached.
    label_list, pred_list = [], []
    with torch.no_grad():
        for batch in tqdm(testloader):
            imgs, _, labels, _ = batch

            imgs = imgs.to(device)
            y_hat, _ = lit_h(imgs)
            preds = torch.sigmoid(y_hat)

            pred_list.append(preds.cpu())
            label_list.append(labels.cpu())
    preds = torch.vstack(pred_list).numpy()
    labels = torch.vstack(label_list).numpy()

    # Score: one average-precision value per class (average=None).
    ap = average_precision_score(labels, preds, average=None)

    # Recall
    recall, hit = 0, 0
    items = 0
    no_labels = 0
    for pred, label in zip(torch.tensor(preds), torch.tensor(labels)):
        _, pred_idx = torch.topk(pred, k=TOPK)
        label_idx = torch.where(label == 1)[0]

        if len(label_idx) == 0:
            no_labels += 1
            continue

        # Both index sets hold unique class ids, so "labels found in top-k"
        # and "top-k entries that are labels" are the same count; compute the
        # overlap once with a set lookup instead of two tensor scans.
        top_set = set(pred_idx.tolist())
        n_overlap = sum(idx in top_set for idx in label_idx.tolist())

        recall_i = n_overlap / len(label_idx)
        recall += recall_i

        hit_i = n_overlap
        hit += hit_i

        items += 1

    if items:
        recall /= items
        hit /= items
    else:
        # No labeled sample at all: report NaN instead of dividing by zero.
        recall = hit = float("nan")

    run_key = f'cf_weight={cfg["cf_weight"]} labeled_ratio={cfg["labeled_ratio"]}'
    recall_dict[run_key] = recall
    hit_dict[run_key] = hit

    print(f"{no_labels=}")
    print(f'recall={np.round(recall,3)} ap={np.round(ap.mean(),3)}')


../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_0
cfg["cf_weight"]=0.0 cfg["labeled_ratio"]=0.5


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.691 ap=0.341
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_1
cfg["cf_weight"]=0.0 cfg["labeled_ratio"]=0.6


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.689 ap=0.348
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_2
cfg["cf_weight"]=0.0 cfg["labeled_ratio"]=0.7


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.687 ap=0.352
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_3
cfg["cf_weight"]=0.0 cfg["labeled_ratio"]=0.8


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.686 ap=0.359
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_4
cfg["cf_weight"]=0.0 cfg["labeled_ratio"]=0.9


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.229 ap=0.13
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_0
cfg["cf_weight"]=2.0 cfg["labeled_ratio"]=0.5


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.693 ap=0.347
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_1
cfg["cf_weight"]=2.0 cfg["labeled_ratio"]=0.6


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.693 ap=0.356
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_2
cfg["cf_weight"]=2.0 cfg["labeled_ratio"]=0.7


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.692 ap=0.36
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_3
cfg["cf_weight"]=2.0 cfg["labeled_ratio"]=0.8


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.692 ap=0.36
../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211030_080601/train_model_Clothing_Shoes_and_Jewelry_20211030_080601_4
cfg["cf_weight"]=2.0 cfg["labeled_ratio"]=0.9


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.691 ap=0.367


In [6]:
# Display the mean top-5 recall per run collected by the evaluation loop,
# keyed by "cf_weight=<w> labeled_ratio=<r>".
recall_dict

{'cf_weight=0.0 labeled_ratio=0.5': 0.6911625390745033,
 'cf_weight=0.0 labeled_ratio=0.6': 0.6885688226997471,
 'cf_weight=0.0 labeled_ratio=0.7': 0.6869218466509665,
 'cf_weight=0.0 labeled_ratio=0.8': 0.6858133034092401,
 'cf_weight=0.0 labeled_ratio=0.9': 0.22890082703852438,
 'cf_weight=2.0 labeled_ratio=0.5': 0.692607163758405,
 'cf_weight=2.0 labeled_ratio=0.6': 0.6933390608221298,
 'cf_weight=2.0 labeled_ratio=0.7': 0.692049154340351,
 'cf_weight=2.0 labeled_ratio=0.8': 0.6921553327535273,
 'cf_weight=2.0 labeled_ratio=0.9': 0.6909699380128276}

In [7]:
# Single-run variant of the evaluation loop: same steps as the full sweep, but
# the trailing `break` stops after the first entry of `paths`. It is used to
# leave `preds`, `labels`, `pred_idx`, `label_idx` and `recall_i` of that one
# run at notebook scope for the inspection cells that follow.
# NOTE: this resets `recall_dict` / `hit_dict`, discarding any earlier sweep
# results held under those names.
TOPK = 5  # number of highest-scoring classes considered per sample

res_dict = {}
recall_dict, hit_dict = {}, {}

# Fall back to CPU so the notebook also runs on a machine without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

for path in paths:
    print(path)
    config_path = osp.join(path, ".hydra", "config.yaml")

    # glob order is filesystem-dependent; sort so the chosen checkpoint is
    # deterministic, and fail with a clear message when none exists.
    ckpt_candidates = sorted(glob(osp.join(path, "epoch=*.ckpt*")))
    if not ckpt_candidates:
        raise FileNotFoundError(f"No 'epoch=*.ckpt*' checkpoint found in {path}")
    model_path = ckpt_candidates[0]

    # Load config file
    with open(config_path, "r") as f:
        cfg = yaml.safe_load(f)
    cfg["is_pretrained"] = False
    cfg["batch_size"] = 128
    print(f'{cfg["cf_weight"]=} {cfg["labeled_ratio"]=}')

    # Load dataset
    train_dataset, test_dataset, dataset_meta, pos_weight = get_datasets(
        cfg["train_df_path"],
        cfg["test_df_path"],
        cfg["cf_vector_df_path"],
        cfg["labeled_ratio"],
    )
    logger.info(
        "Sizes [trainset testset num_classes]=[{} {} {}]".format(
            dataset_meta["train_set_size"],
            dataset_meta["test_set_size"],
            dataset_meta["num_classes"],
        )
    )

    testloader = DataLoader(
        test_dataset, batch_size=cfg["batch_size"], num_workers=cfg["num_workers"]
    )

    # Load model
    lit_h = LitModel.load_from_checkpoint(model_path)
    lit_h = lit_h.to(device)
    lit_h = lit_h.eval()

    # Get predictions. Inference only: no_grad avoids building an autograd
    # graph for every batch.
    label_list, pred_list = [], []
    with torch.no_grad():
        for batch in tqdm(testloader):
            imgs, _, labels, _ = batch

            imgs = imgs.to(device)
            y_hat, _ = lit_h(imgs)
            preds = torch.sigmoid(y_hat)

            pred_list.append(preds.cpu())
            label_list.append(labels.cpu())
    preds = torch.vstack(pred_list).numpy()
    labels = torch.vstack(label_list).numpy()

    # Score: one average-precision value per class (average=None).
    ap = average_precision_score(labels, preds, average=None)

    # Recall
    recall, hit = 0, 0
    items = 0
    no_labels = 0
    for pred, label in zip(torch.tensor(preds), torch.tensor(labels)):
        _, pred_idx = torch.topk(pred, k=TOPK)
        label_idx = torch.where(label == 1)[0]

        if len(label_idx) == 0:
            no_labels += 1
            continue

        # Both index sets hold unique class ids, so the overlap count serves
        # as the recall numerator and as the hit count.
        top_set = set(pred_idx.tolist())
        n_overlap = sum(idx in top_set for idx in label_idx.tolist())

        recall_i = n_overlap / len(label_idx)
        recall += recall_i

        hit_i = n_overlap
        hit += hit_i

        items += 1

    if items:
        recall /= items
        hit /= items
    else:
        # No labeled sample at all: report NaN instead of dividing by zero.
        recall = hit = float("nan")

    run_key = f'cf_weight={cfg["cf_weight"]} labeled_ratio={cfg["labeled_ratio"]}'
    recall_dict[run_key] = recall
    hit_dict[run_key] = hit

    print(f"{no_labels=}")
    print(f'recall={np.round(recall,3)} ap={np.round(ap.mean(),3)}')
    # Debug run: only the first run directory is evaluated.
    break

../outputs/train_model_multirun_Clothing_Shoes_and_Jewelry_20211029_151409/train_model_Clothing_Shoes_and_Jewelry_20211029_151409_0
cfg["cf_weight"]=0.0 cfg["labeled_ratio"]=0.5


  0%|          | 0/53 [00:00<?, ?it/s]

no_labels=121
recall=0.691 ap=0.341


In [8]:
# Sanity check: recompute the per-sample recall from the `pred_idx` /
# `label_idx` left behind by the last row the evaluation loop visited.
sum(el in pred_idx for el in label_idx) / len(label_idx)



1.0

In [9]:
# Show the top-5 predicted class indices next to the positive-label indices
# for that same last row.
pred_idx, label_idx

(tensor([ 28,  32,  25, 195,  22]), tensor([22, 25, 28, 32]))

In [None]:
# NOTE(review): `res_df` is not created by any cell visible in this notebook;
# it must already exist in the kernel (one row per class is assumed from the
# 'count' column - confirm) or this cell raises NameError on a fresh run.
pd.set_option('display.max_rows', res_df.shape[0] + 1)

# Compare the two cf_weight settings and order rows by their difference.
# Built as one chain with `.assign` so no column is written onto a slice of
# `res_df` (the original `df['diff'] = ...` pattern triggers pandas'
# SettingWithCopyWarning) and `df` is bound exactly once.
df = (
    res_df[['cf_weight=0.0', 'cf_weight=2.0', 'count']]
    .assign(diff=lambda frame: frame['cf_weight=2.0'] - frame['cf_weight=0.0'])
    .sort_values(by='diff')
    .reset_index()  # keeps the previous index as a regular column
)

In [None]:
# Grouped bar chart of the two cf_weight columns, one group per row of `df`,
# drawn on a single wide axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 10))
bar_columns = ['cf_weight=0.0', 'cf_weight=2.0']
ax = df[bar_columns].plot.bar(ax=ax, rot=0)
plt.show()

In [None]:
# Line plot of the 'count' column in the row order `df` currently has
# (sorted by 'diff' in the cell that builds `df`).
plt.plot(df['count'])

In [None]:
topk = 5

# Indices of the `topk` largest entries along the last (class) axis.
# `argsort` is ascending, so reverse and truncate along that same axis.
# The original `arr.argsort()[::-1][:topk]` sliced axis 0 instead, which for
# the 2-D (samples x classes) arrays produced by the evaluation cell reversed
# the sample order and kept the last `topk` samples, not the top-k classes.
# Using `...` keeps the result unchanged for 1-D input.
idx_top = np.argsort(preds, axis=-1)[..., ::-1][..., :topk]
label_top = np.argsort(labels, axis=-1)[..., ::-1][..., :topk]

In [None]:
# Same top-k lookup done with torch: tensorize the numpy scores and keep only
# the index half of the (values, indices) result, taken over the last axis.
_, idx_top = torch.topk(
    torch.tensor(preds),
    k=topk,
    dim=-1,
)

In [None]:
# Peek at the first sample only: pick out its label values at the positions
# of its top-k predictions. Slicing to one row replaces the loop-and-break
# while leaving `n`, `idxs` and `label_in_idx_top` bound exactly as before.
for n, idxs in enumerate(idx_top[:1]):
    label_in_idx_top = labels[n][idxs]

In [None]:
from torchmetrics.functional import recall


In [None]:
# Cross-check with torchmetrics: `recall` here is the function imported in the
# previous cell (it shadows the float `recall` left by the evaluation loop).
# Per-class recall at top_k=5 is requested (average=None) and then averaged
# over classes with `.mean()`.
# NOTE(review): confirm that `labels` has the integer dtype torchmetrics
# expects for targets.
recall(torch.tensor(preds),torch.tensor(labels),top_k=5,average=None,num_classes=preds.shape[-1]).mean()

In [None]:
# Toy check of torchmetrics' `recall`: a single sample with 8 classes whose
# five positive classes all score 0.99 and whose three negatives score 0.
# Per-class values are requested (average=None) at top_k=5.
a = [[0.99,0.99,0.99,0.99,0.99,0,0,0]]
b = [[1,1,1,1,1,0,0,0]]
recall(torch.tensor(a),torch.tensor(b),top_k=5,average=None,num_classes=8)

In [None]:
# Top-5 class indices for the toy score row `a`; the values are discarded.
_, idx = torch.topk(
    torch.tensor(a),
    k=5,
)

In [None]:
# Re-display the top-k prediction indices and the positive-label indices left
# over from the evaluation loop's last visited row.
pred_idx,label_idx

In [None]:
# Per-sample recall of the last labeled row processed by the evaluation loop.
recall_i