In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import argparse
import torch
from trainer.trainer import Task
from tools.config_loader import get_config
from pathlib import Path
from data_handling.DataLoader import get_dataloader
# from tools.make_csvfile import make_csv

from lightning.pytorch import LightningModule, Trainer, seed_everything
from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.strategies import DDPStrategy
from lightning.pytorch.loggers import TensorBoardLogger

2023-05-15 17:22:16.230473: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-15 17:22:16.475149: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-15 17:22:17.404298: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.1/lib64:/usr/local/cuda-11.8/lib64::/usr/local/cuda-11.8/lib64
2023-

In [11]:
import numpy as np
import torch
import random
from sentence_transformers import util
from tools.file_io import load_pickle_file
from gensim.models.word2vec import Word2Vec

In [2]:
import easydict
from tools.config_loader import get_config
args = easydict.EasyDict({
    
    "dataset": "Clotho",
    "lr": 0.0001,
    "config": "settings",
    "loss": "weight",
    "freeze": False,
    "batch":24, 
    "margin":0.2,
    "seed":20
})

config = get_config(args.config)

In [3]:
#test_loader = get_dataloader('test', config)

In [3]:
device, device_name = ('cuda',
                           torch.cuda.get_device_name(torch.cuda.current_device())) \
        if torch.cuda.is_available() else ('cpu', platform.processor())
print(device)
print(device_name)

cuda
NVIDIA GeForce RTX 3090


In [7]:
import sys
import time
import numpy as np
import torch
from tqdm import tqdm
from pathlib import Path
from torchinfo import summary
#from loguru import logger
#from pprint import PrettyPrinter
#from torch.utils.tensorboard import SummaryWriter
from tools.utils import setup_seed, AverageMeter, a2t, t2a
from tools.loss import BiDirectionalRankingLoss, TripletLoss, NTXent, VICReg, InfoNCE, InfoNCE_VICReg
#from tools.InfoNCE import InfoNCE
from tools.make_csvfile import make_csv
import pickle

from models.ASE_model import ASE

import lightning.pytorch as pl

class Task(pl.LightningModule):

    def __init__(self, config):
        super().__init__()
        self.save_hyperparameters(config)
        self.config = config
        self.model = ASE(config)
        # self.return_ranks = config.training.csv
        self.pickle_output_path=Path(config.pickle_output_dir,'temporal_embeddings.pkl')
        self.train_step_outputs = []
        self.validate_step_outputs = []

        #Print SubModules of Task
        if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
            # do nothing, only run on main process
            None
        else:
            summary(self.model.audio_enc)
            summary(self.model.audio_linear)
            summary(self.model.text_enc)
            summary(self.model.text_linear)

        # #Set-up for CSV file
        # if config.training.csv:
        #     self.csv_output_dir = Path('outputs', config.folder_name, 'csv')
        #     self.csv_output_dir.mkdir(parents=True, exist_ok=True)


        '''
        This is for logger

        logger.add(sys.stdout, format='{time: YYYY-MM-DD at HH:mm:ss} | {message}', level='INFO',
                filter=lambda record: record['extra']['indent'] == 1)
        logger.add(config.log_output_dir.joinpath('output.txt'), format='{time: YYYY-MM-DD at HH:mm:ss} | {message}', level='INFO',
                filter=lambda record: record['extra']['indent'] == 1)

        self.main_logger = logger.bind(indent=1)

        # setup TensorBoard
        writer = SummaryWriter(log_dir=str(self.log_output_dir) + '/tensorboard')
    

        # print training settings
        printer = PrettyPrinter()
        self.main_logger.info('Training setting:\n'
                        f'{printer.pformat(config)}')
        '''

        # set up model
        # if torch.cuda.is_available():
        #     device, device_name = ('cuda',torch.cuda.get_device_name(torch.cuda.current_device()))
        # else: 
        #     device, device_name = ('cpu', None)
        # print(f'Process on {device}:{device_name}')

        # Set up Loss function
        if config.training.loss == 'triplet':
            self.criterion = TripletLoss(margin=config.training.margin)
        
        elif config.training.loss == 'ntxent':
            self.criterion = NTXent()
        
        elif config.training.loss == 'weight':
            self.criterion = WeightTriplet(margin=config.training.margin)
            
        elif config.training.loss == 'infonce':
            self.criterion = InfoNCE()

        elif config.training.loss == 'infonce+vicreg':
            self.criterion = VICReg()
            
        else: #config.training.loss == 'bidirect': 'contrastive'??
            self.criterion = BiDirectionalRankingLoss(margin=config.training.margin)

        ep = 1

        # resume from a checkpoint
        if config.training.resume:
            checkpoint = torch.load(config.path.resume_model)
            self.model.load_state_dict(checkpoint['model'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            ep = checkpoint['epoch']

    # def on_train_start(self):
    #     self.recall_sum =[]

    # def on_train_epoch_start(self):
    #     self.epoch_loss = AverageMeter()

    def training_step(self, batch, batch_idx):

        audios, captions, audio_ids, _, _ = batch

        audio_embeds, caption_embeds = self.model(audios, captions)

        loss = self.criterion(audio_embeds, caption_embeds, audio_ids)
        self.log('train_step_loss',loss, on_step=True, on_epoch=False, prog_bar=True, logger=True)
        self.train_step_outputs.append(loss)
        return loss
    
    def on_train_epoch_end(self):
        avg_loss = torch.stack(self.train_step_outputs).mean()
        self.log('train_epoch_loss', avg_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.train_step_outputs.clear()

    def configure_optimizers(self):
        # set up optimizer
        optimizer = torch.optim.Adam(params=self.model.parameters(), lr=self.config.training.lr)
        # set up scheduler
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min',factor=0.5,patience=5,threshold=0.005,threshold_mode='abs',min_lr=0.000001,verbose=True)
        return {"optimizer":optimizer, 
                "lr_scheduler":{"scheduler":scheduler,
                                "monitor": 'validation_epoch_loss',
                                "frequency": 1}}

    # def on_validation_start(self):
    #     self.audio_embs, self.cap_embs , self.audio_names_, self.caption_names= None, None, None, None
        
    def validation_step(self, batch, batch_idx):
        # Tensor(N,E), list, Tensor(N), array, list
        audios, captions, audio_ids, indexs, audio_names = batch
        data_size = self.config.data.val_datasets_size
        audio_embeds, caption_embeds = self.model(audios, captions)
            # if self.return_ranks:
            #     Task.audio_names_ = np.array([None for i in range(data_size)], dtype=object)
            #     Task.caption_names = np.array([None for i in range(data_size)], dtype=object)
        
        loss = self.criterion(audio_embeds, caption_embeds, audio_ids)
        self.log('validation_step_loss', loss, on_step=True, on_epoch=False, prog_bar=True, logger=True)
        self.validate_step_outputs.append(loss)
        return loss
    
    def on_validation_epoch_end(self):
        avg_loss = torch.stack(self.validate_step_outputs).mean()
        self.log('validation_epoch_loss', avg_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.validate_step_outputs.clear()

    # def on_test_start(self):
    #     self.on_validation_start()
    '''
    def on_test_epoch_start(self):
        self.on_validation_epoch_start()
    
    def test_step(self, batch, batch_idx):
        audios, captions, audio_ids, indexs, audio_names = batch
        data_size = self.config.data.test_datasets_size
        audio_embeds, caption_embeds = self.model(audios, captions)

        if Task.audio_embs is None:
            Task.audio_embs = np.zeros((data_size, audio_embeds.shape[1]))
            Task.cap_embs = np.zeros((data_size, caption_embeds.shape[1]))
            if self.return_ranks:
                Task.audio_names_ = np.array([None for i in range(data_size)],dtype=object)
                Task.caption_names = np.array([None for i in range(data_size)],dtype=object)
        
        loss = self.criterion(audio_embeds, caption_embeds, audio_ids)
        self.log('test_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        Task.audio_embs[indexs] = audio_embeds.cpu().numpy()
        Task.cap_embs[indexs] = caption_embeds.cpu().numpy()

        if self.return_ranks:
            Task.audio_names_[indexs] = np.array(audio_names)
            Task.caption_names[indexs] = np.array(captions)
        return loss

    def on_test_end(self):
        if self.return_ranks:
            r1, r5, r10, mAP10, medr, meanr, ranks, Task.top10 = t2a(Task.audio_embs, Task.cap_embs, return_ranks=True)
            print("Top10 Shape:",Task.top10.shape,"Audio Embeddings:",Task.audio_embs.shape)
        else:
            r1, r5, r10, mAP10, medr, meanr = t2a(Task.audio_embs, Task.cap_embs)
        self.logger.experiment.add_scalars('test_metric',{'r1':r1, 'r5':r5, 'r10':r10, 'mAP10':mAP10, 'medr':medr, 'meanr':meanr})
    
    def on_after_backward(self):
        # call on_test_end() only once after accumulating the results of each process
        if self.trainer.local_rank == 0:
            self.on_test_end()
'''
    def on_test_start(self):
        temporal_dict={'audio_embs':None, 'cap_embs':None, 'audio_names_':None, 'caption_names':None}
        with open(self.pickle_output_path, 'wb') as f:  
            pickle.dump(temporal_dict,f, protocol=pickle.HIGHEST_PROTOCOL)
        
    def test_step(self, batch, batch_idx):
        with open(self.pickle_output_path, 'rb') as f:  
            temporal_dict=pickle.load(f)
        # Tensor(N,E), list, Tensor(N), array, list
        audios, captions, audio_ids, indexs, audio_names = batch
        data_size = self.config.data.val_datasets_size
        audio_embeds, caption_embeds = self.model(audios, captions)
        if temporal_dict['audio_embs'] is None:
            temporal_dict['audio_embs'] = np.zeros((data_size, audio_embeds.shape[1]))
            temporal_dict['cap_embs'] = np.zeros((data_size, caption_embeds.shape[1]))
            # if self.return_ranks:
            #     Task.audio_names_ = np.array([None for i in range(data_size)], dtype=object)
            #     Task.caption_names = np.array([None for i in range(data_size)], dtype=object)
        temporal_dict['audio_embs'][indexs] = audio_embeds.cpu().numpy()
        temporal_dict['cap_embs'][indexs] = caption_embeds.cpu().numpy()

        with open(self.pickle_output_path, 'wb') as f:  
            pickle.dump(temporal_dict,f, protocol=pickle.HIGHEST_PROTOCOL)
    
    def on_test_end(self):
        with open(self.pickle_output_path, 'rb') as f:  
            temporal_dict=pickle.load(f)
        r1, r5, r10, mAP10, medr, meanr = t2a(temporal_dict['audio_embs'], temporal_dict['cap_embs'])
        print(f'r1:{r1}, r5:{r5}, r10:{r10}, mAP10:{mAP10}')
        self.logger.experiment.add_scalars('metric',{'r1':r1, 'r5':r5, 'r10':r10, 'mAP10':mAP10, 'medr':medr, 'meanr':meanr},self.current_epoch)

In [31]:
#baseline model
ckp_path1 = '/home/user/audio-text_retrieval/outputs/0511_freeze_True_lr_0.0001_seed_20/models/best_checkpoint.ckpt'
model1 = torch.load(ckp_path1)
for name, param in model1['state_dict'].items(): 
    print(name) 

model.audio_enc.audio_enc.bn0.weight
model.audio_enc.audio_enc.bn0.bias
model.audio_enc.audio_enc.bn0.running_mean
model.audio_enc.audio_enc.bn0.running_var
model.audio_enc.audio_enc.bn0.num_batches_tracked
model.audio_enc.audio_enc.spectrogram_extractor.stft.conv_real.weight
model.audio_enc.audio_enc.spectrogram_extractor.stft.conv_imag.weight
model.audio_enc.audio_enc.logmel_extractor.melW
model.audio_enc.audio_enc.conv_block1.conv1.weight
model.audio_enc.audio_enc.conv_block1.conv2.weight
model.audio_enc.audio_enc.conv_block1.bn1.weight
model.audio_enc.audio_enc.conv_block1.bn1.bias
model.audio_enc.audio_enc.conv_block1.bn1.running_mean
model.audio_enc.audio_enc.conv_block1.bn1.running_var
model.audio_enc.audio_enc.conv_block1.bn1.num_batches_tracked
model.audio_enc.audio_enc.conv_block1.bn2.weight
model.audio_enc.audio_enc.conv_block1.bn2.bias
model.audio_enc.audio_enc.conv_block1.bn2.running_mean
model.audio_enc.audio_enc.conv_block1.bn2.running_var
model.audio_enc.audio_enc.conv_

----

In [64]:
train_loader = get_dataloader('train', config) 
len(train_loader)

299

In [4]:
test_loader = get_dataloader('val', config) #val_loader
len(test_loader)

In [99]:
pickle_path3 = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'

with open(pickle_path3, 'rb') as f:  
    temporal_dict3=pickle.load(f)

In [100]:
temporal_dict3

{'audio_embs': None,
 'cap_embs': None,
 'audio_names_': None,
 'caption_names': None}

In [8]:
cc = '/home/user/audio-text_retrieval/outputs/cnn14triplet_weight/pickle/temporal_embeddings.pkl'
with open(cc, 'rb') as f:  
    temp3=pickle.load(f)
temp3

{'audio_embs': array([[ 0.00584552,  0.01597925, -0.01030066, ..., -0.0022582 ,
         -0.00109217, -0.00844152],
        [ 0.00584552,  0.01597925, -0.01030066, ..., -0.0022582 ,
         -0.00109217, -0.00844152],
        [ 0.00584552,  0.01597925, -0.01030066, ..., -0.0022582 ,
         -0.00109217, -0.00844152],
        ...,
        [-0.00783311, -0.01209926, -0.00834842, ...,  0.00567435,
          0.03848082,  0.00422601],
        [-0.00783311, -0.01209926, -0.00834842, ...,  0.00567435,
          0.03848082,  0.00422601],
        [-0.00783311, -0.01209926, -0.00834842, ...,  0.00567435,
          0.03848082,  0.00422601]]),
 'cap_embs': array([[ 0.02950856,  0.01023548, -0.01135472, ...,  0.0001325 ,
         -0.01058777,  0.00497012],
        [ 0.02425566,  0.02924298, -0.00770269, ..., -0.02199307,
         -0.02147528,  0.00692573],
        [ 0.02111989,  0.02728247, -0.00739679, ..., -0.01995897,
         -0.01361072,  0.00140775],
        ...,
        [-0.02159963,  0.001

---

In [323]:
def t2a(audio_embs, cap_embs, return_ranks=False):
    # caption to audio retrieval
    num_audios = int(audio_embs.shape[0] / 5)

    audios = np.array([audio_embs[i]for i in range(0, audio_embs.shape[0], 5)])

    ranks = np.zeros(5 * num_audios)
    top10 = np.zeros([5 * num_audios,10])

    for index in range(num_audios):

        # get query captions
        queries = cap_embs[5 * index: 5 * index + 5]

        # compute scores
        # queries @ audio.T
        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()

        inds = np.zeros(d.shape)
        for i in range(len(inds)):
            inds[i] = np.argsort(d[i])[::-1]
            ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
            top10[5 * index + i] = inds[i][0:10]

    # compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    mAP10 = 100.0 * np.sum(1 / (ranks[np.where(ranks < 10)[0]] + 1)) / len(ranks)
    
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return r1, r5, r10, mAP10, medr, meanr, ranks, top10
    else:
        return r1, r5, r10, mAP10, medr, meanr

In [143]:
def t2a(audio_embs, cap_embs):
    # Get the number of audios
    num_audios = int(audio_embs.shape[0] / 5)

    audios = np.array([audio_embs[i] for i in range(0, audio_embs.shape[0], 5)])

    top10 = np.zeros([5 * num_audios, num_audios])

    for index in range(num_audios):
        queries = cap_embs[5 * index: 5 * index + 5]

        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()

        inds = np.argsort(d, axis=1)[:,::-1] #(5,1045)
        #print(inds)


        #top 10 유사도 점수에 해당하면 1, else 0
        top10_indices = inds[:,:10] #(5,10)
        audio_similarities = np.zeros((5, num_audios)) #(5,1045)
        for i in range(5):
            audio_similarities[i, top10_indices[i].astype(int)] = 1
        top10[5 * index: 5 * index + 5] = audio_similarities

    return audio_similarities

In [146]:
t2a(temp3['audio_embs'],temp3['cap_embs'])

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [139]:
np.where(t2a(temp3['audio_embs'],temp3['cap_embs']))==1

[[603 761 540 ... 733 353 594]
 [599 108  65 ... 675 626 334]
 [108 599  65 ... 675 626 334]
 [974 480 285 ... 625 874 981]
 [761 224 716 ... 846 855 594]]
[[  1 702 471 ... 775 222 485]
 [685 702 601 ... 941 334 436]
 [685 765   1 ... 436 334 532]
 [  1 702 354 ... 568 485 222]
 [702   1 354 ... 436 222 568]]
[[ 702  601  685 ...  832  975  222]
 [ 702  685  354 ...   32  975  222]
 [ 702  685  354 ...  532  775  136]
 [ 601  689    2 ...  573  975  253]
 [ 702  601  685 ...  536  373 1044]]
[[  63  251  195 ...  594  788  888]
 [ 980  902   40 ...  825  608  386]
 [ 571   63  776 ...   44  449  499]
 [  59  571  340 ...  920   71  772]
 [  63    3  571 ...  952 1044  818]]
[[  57  648  199 ...  525  353  594]
 [  60  306  810 ...  757  857  373]
 [ 810   60  310 ...   71  373  857]
 [ 897  419  277 ...  772  750  482]
 [1017  505  561 ...  321  539  355]]
[[ 709 1036  856 ...  901  546  828]
 [ 796  331  979 ...  938  203  320]
 [ 796  709    5 ...  206   39  320]
 [ 796  709    5 ..

False

In [None]:
## t2a(temp3['audio_embs'],temp3['cap_embs'])

-----
-----

In [187]:
def t2a(audio_embs, cap_embs, return_ranks=False):
    # caption to audio retrieval
    num_audios = int(audio_embs.shape[0] / 5)

    audios = np.array([audio_embs[i]for i in range(0, audio_embs.shape[0], 5)])

    ranks = np.zeros(5 * num_audios)
    top10 = np.zeros([5 * num_audios,10])

    for index in range(num_audios):

        # get query captions
        queries = cap_embs[5 * index: 5 * index + 5]

        # compute scores
        # queries @ audio.T
        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()

        inds = np.zeros(d.shape)
        for i in range(len(inds)):
            inds[i] = np.argsort(d[i])[::-1]
            ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
            top10[5 * index + i] = inds[i][0:10]

    # compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    mAP10 = 100.0 * np.sum(1 / (ranks[np.where(ranks < 10)[0]] + 1)) / len(ranks)
    
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    return top10

In [159]:
_,_,_,_,_,_,_,top10=t2a(temp3['audio_embs'],temp3['cap_embs'],return_ranks=True)

In [175]:
matrix=np.zeros((5225,5225))

In [176]:
for i,index in enumerate(top10):
    matrix[i][index.astype(int)]=1

In [183]:
np.where(matrix[0]==1)

(array([ 58, 161, 540, 603, 646, 676, 761, 790, 902, 942]),)

In [184]:
top10[0]

array([603., 761., 540., 161., 676., 790., 942., 902., 646.,  58.])

In [194]:
top10_1=t2a(temporal_dict1['audio_embs'],temporal_dict1['cap_embs'])
top10_2=t2a(temporal_dict2['audio_embs'],temporal_dict2['cap_embs'])
top10_3=t2a(temporal_dict3['audio_embs'],temporal_dict3['cap_embs'])
top10_4=t2a(temporal_dict4['audio_embs'],temporal_dict4['cap_embs'])
top10_5=t2a(temporal_dict5['audio_embs'],temporal_dict5['cap_embs'])
top10_6=t2a(temporal_dict6['audio_embs'],temporal_dict6['cap_embs'])
top10_7=t2a(temporal_dict7['audio_embs'],temporal_dict7['cap_embs'])
top10_8=t2a(temporal_dict8['audio_embs'],temporal_dict8['cap_embs'])
top10_9=t2a(temporal_dict9['audio_embs'],temporal_dict9['cap_embs'])

In [199]:
all_matrix=[]
for j in range(9):
    matrix=np.zeros((5225,5225))
    for i,index in enumerate(globals()['top10_%s' % (j+1)]):
        matrix[i][index.astype(int)]=np.argsort(new_matrix[0])[::-1]
    all_matrix.append(matrix)

In [219]:
new_matrix=all_matrix[0] + all_matrix[1]+all_matrix[2]+all_matrix[3]+all_matrix[4]+all_matrix[5]+all_matrix[6]+all_matrix[7]+all_matrix[8]

In [251]:
np.where(np.argsort(new_matrix[0])[::-1] == 0 )[0][0]

10

In [276]:
np.argsort(new_matrix[0])[::-1]

array([ 761,  603,   70, ..., 3485, 3486, 2612])

In [272]:
ranks=[]
for index in range(new_matrix.shape[0]):
    ranks.append(np.where(np.argsort(new_matrix[index])[::-1] == index )[0][0])

In [273]:
ranks=np.array(ranks)

In [274]:
r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
mAP10 = 100.0 * np.sum(1 / (ranks[np.where(ranks < 10)[0]] + 1)) / len(ranks)

medr = np.floor(np.median(ranks)) + 1
meanr = ranks.mean() + 1

In [275]:
print(r1,r5,r10,r50,mAP10)

0.0 0.05741626794258373 0.09569377990430622 0.7272727272727273 0.024720893141945772


----

In [156]:
def t2a(audio_embs, cap_embs):
    top10_index = []
    # Get the number of audios
    num_audios = int(audio_embs.shape[0] / 5)

    audios = np.array([audio_embs[i] for i in range(0, audio_embs.shape[0], 5)])

    top10 = np.zeros([5 * num_audios, num_audios])

    for index in range(num_audios):
        queries = cap_embs[5 * index: 5 * index + 5]

        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()

        inds = np.argsort(d, axis=1)[:,::-1] #(5,1045)
        #print(inds)


        #top 10 유사도 점수에 해당하면 1, else 0
        top10_indices = inds[:,:10] #(5,10)
        print(top10_indices)
        top10_index.append(top10_indices)
        audio_similarities = np.zeros((5, num_audios)) #(5,1045)
        for i in range(5):
            audio_similarities[i, top10_indices[i].astype(int)] = 1
        top10[5 * index: 5 * index + 5] = audio_similarities

    return top10_indices
top10_index=t2a(temp3['audio_embs'],temp3['cap_embs'])

[[ 603  761  540  161  676  790  942  902  646   58]
 [ 599  108   65  193  363  109  533  576  347   24]
 [ 108  599   65  109  193  347  182  533  894  866]
 [ 974  480  285   84  690  465  445   18  621 1026]
 [ 761  224  716  676  603   58  540  866  161  942]]
[[   1  702  471  685  354  355  701  970  305  197]
 [ 685  702  601    2  358    1  122   58  765  385]
 [ 685  765    1  702  970  269  122 1009  354  369]
 [   1  702  354  399  731  685  355  471  166  970]
 [ 702    1  354  471  399 1030  731  685  666  355]]
[[702 601 685 358   1 385 354   2  58 689]
 [702 685 354 471   1 124 405 601 452 227]
 [702 685 354 124   1 601 405 471 452 207]
 [601 689   2 702 385  58 217 227 685 358]
 [702 601 685   1  58 385 358   2 204 354]]
[[ 63 251 195 571 454  59 160 232 776 707]
 [980 902  40 690 347 881 925 767 296 621]
 [571  63 776 232 251 205 160 291 897 925]
 [ 59 571 340  63 590 319 251 649 310 866]
 [ 63   3 571 251 707 340 160 252 195 603]]
[[  57  648  199  712   54  718  294

In [157]:
top10_index

array([[ 696,  959,  818,  115,  772,  993, 1044,  920,  895,  275],
       [ 155,  174,  977,  552,  275,   90,  555,  818, 1044, 1009],
       [ 977, 1044,  174,  555,  275,  619,  670,  964,  400,  152],
       [ 977,  555, 1044,  174,  275,  670,  619,  400,  964,  152],
       [ 696,  818, 1044,  959,  115,  772,  275,  993,  920,  174]])

-----

In [105]:
for index in range(1): #num_audios=1045
    queries = cap_embs[5 * index: 5 * index + 5] #(5,300)
    #print(queries)

    d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy() #(5,1045)
    print("d:",d)
    inds = np.zeros(d.shape) #(5,1045)
    print("inds:",inds) 
    print()
    for i in range(len(inds)): #0~4 
        inds[i] = np.argsort(d[i])[::-1] 
        print("inds:   ",inds[i]) #1045개 sorting함.
        ranks[5 * index + i] = np.where(inds[i] == index)[0][0] 
        print("ranks[]:",ranks[5 * index + i])
        top10[5 * index + i] = inds[i][0:10]
        print("top10[]:",top10[5 * index + i])
        print()


d: [[0.45075485 0.12083632 0.36690924 ... 0.12209399 0.13808487 0.07424631]
 [0.43359673 0.24229638 0.3381359  ... 0.34180066 0.29695374 0.15492585]
 [0.40268844 0.23143923 0.32050887 ... 0.3333536  0.27535573 0.1480293 ]
 [0.43700394 0.23226264 0.35124463 ... 0.20106594 0.19870071 0.01656111]
 [0.41626367 0.1285492  0.35412353 ... 0.14907838 0.1402225  0.05710289]]
inds: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

inds:    [603. 761. 540. ... 733. 353. 594.]
ranks[]: 32.0
top10[]: [603. 761. 540. 161. 676. 790. 942. 902. 646.  58.]

inds:    [599. 108.  65. ... 675. 626. 334.]
ranks[]: 32.0
top10[]: [599. 108.  65. 193. 363. 109. 533. 576. 347.  24.]

inds:    [108. 599.  65. ... 675. 626. 334.]
ranks[]: 41.0
top10[]: [108. 599.  65. 109. 193. 347. 182. 533. 894. 866.]

inds:    [974. 480. 285. ... 625. 874. 981.]
ranks[]: 46.0
top10[]: [ 974.  480.  285.   84.  690.  465.  445.   18.  621. 1026.]

inds

In [None]:
top10_indices = inds[i][0:10]
audio_similarities = np.zeros(num_audios)
audio_similarities[top10_indices] = 1
top10[5 * index + i] = audio_similarities

In [115]:
selections

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [119]:
# caption to audio retrieval
num_audios = int(audio_embs.shape[0] / 5)  
selections = np.zeros((5 * num_audios, audio_embs.shape[0]), dtype=int)  # 선택 결과를 저장할 배열

for index in range(num_audios):
    # get query captions
    queries = cap_embs[5 * index: 5 * index + 5]

    # compute scores
    d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()

    # select top 10 audios for each caption
    inds = np.argsort(d, axis=1)[:, ::-1]
    top10 = inds[:, :10]

    # update selections array
    for i in range(len(top10)):
        audio_indices = top10[i]
        selections[5 * index + i, audio_indices] = 1


In [121]:
selections.shape

(5225, 5225)

In [122]:
np.where(selections==1)

(array([   0,    0,    0, ..., 5224, 5224, 5224]),
 array([  58,  161,  540, ...,  959,  993, 1044]))

---

In [40]:
arr = np.array([[603., 761., 540., 902., 646., 58.],
                [599., 108., 65., 576., 347., 24.],
                [108., 599., 65., 533., 894., 866.],
                [977., 1044., 174., 964., 400., 152.],
                [977., 555., 1044., 400., 964., 152.],
                [696., 818., 1044., 993., 920., 174.]])

max_values = arr.max(axis=1)
max_indices = arr.argmax(axis=1)

for i in range(len(max_values)):
    print("최대값: {:.2f}, 위치: ({}, {})".format(max_values[i], i, max_indices[i]))
    

최대값: 902.00, 위치: (0, 3)
최대값: 599.00, 위치: (1, 0)
최대값: 894.00, 위치: (2, 4)
최대값: 1044.00, 위치: (3, 1)
최대값: 1044.00, 위치: (4, 2)
최대값: 1044.00, 위치: (5, 2)


In [129]:
import numpy as np
import torch
from sentence_transformers import util

def t2a(audio_embs, cap_embs, return_ranks=False):
    # 오디오 개수
    num_audios = int(audio_embs.shape[0] / 5) ##1045개 validation 기준

    # Extract the audio embeddings for each caption
    audios = np.array([audio_embs[i] for i in range(0, audio_embs.shape[0], 5)]) #(1045, 300)

    #각 caption에 대해서 오디오 rank 저장할 array
    ranks = np.zeros(5 * num_audios) # (5225,)

    # Create an array to store the top-10 audio indices for each query caption
    top10 = np.zeros([5 * num_audios, 10]) # (5225,10)

    for index in range(num_audios):
        # Get the query captions for the current audio
        queries = cap_embs[5 * index: 5 * index + 5] # (5,300)

        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy() #queries:(5,300), audios:(1045,300)
        # (d.shape) : (5,1045)

        #각 caption에 대해서 유사도 점수로 정렬
        inds = np.zeros(d.shape) 
        for i in range(len(inds)):
            inds[i] = np.argsort(d[i])[::-1]

            
        # 각 쿼리에 대해서 correct 오디오 rank 찾기
        for i in range(len(inds)): 
            rank = np.where(inds[i] == index)[0][0]
            ranks[5 * index + i] = rank

            #top10 audio에 대해서 similarity=1, top10이 아니면 0으로 설정
            top10_indices = inds[i][0:10]
            audio_similarities = np.zeros(num_audios)
            audio_similarities[top10_indices] = 1
            top10[5 * index + i] = audio_similarities

    # Compute the retrieval metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    mAP10 = 100.0 * np.sum(1 / (ranks[np.where(ranks < 10)[0]] + 1)) / len(ranks)

    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    if return_ranks:
        # Return the retrieval metrics, the ranks of the correct audio for each query caption, and the top-10 audio indices
        return r1, r5, r10, mAP10, medr, meanr, ranks, top10, audio_similarities
    else:
        # Return only the retrieval metrics
        return r1, r5, r10, mAP10, medr, meanr, audio_similarities


In [155]:
t2a(temp3['audio_embs'], temp3['cap_embs'])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [157]:
temp3['cap_embs']

array([[ 0.02950856,  0.01023548, -0.01135472, ...,  0.0001325 ,
        -0.01058777,  0.00497012],
       [ 0.02425566,  0.02924298, -0.00770269, ..., -0.02199307,
        -0.02147528,  0.00692573],
       [ 0.02111989,  0.02728247, -0.00739679, ..., -0.01995897,
        -0.01361072,  0.00140775],
       ...,
       [-0.02159963,  0.00124152, -0.01530368, ..., -0.01862541,
         0.04651717,  0.00181126],
       [-0.02171858,  0.0009191 , -0.01271556, ..., -0.01491472,
         0.0524971 , -0.00010781],
       [-0.00528172, -0.02056348,  0.01314805, ..., -0.02382456,
         0.03031994,  0.01696127]])

In [125]:
import numpy as np
import torch
import random
from sentence_transformers import util

def t2a(audio_embs, cap_embs, return_ranks=False):
    # caption to audio retrieval
    num_audios = int(audio_embs.shape[0] / 5) #1045개 validation

    audios = np.array([audio_embs[i]for i in range(0, audio_embs.shape[0], 5)]) #(1045, 300)

    ranks = np.zeros(5 * num_audios) # (5225,)
    
    top1 = np.zeros(5 * num_audios)
    top10 = np.zeros([5 * num_audios,10]) # (5225,10)
    top10_scores = np.zeros([5 * num_audios,10])

    for index in range(num_audios):

        # get query captions
        queries = cap_embs[5 * index: 5 * index + 5] # (5,300)

        # compute scores
        # queries @ audio.T
        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()
        # (d.shape) : (5,1045)

        inds = np.zeros(d.shape) 
        for i in range(len(inds)):
            inds[i] = np.argsort(d[i])[::-1]
            ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
            top10[5 * index + i] = inds[i][0:10]
            #top1[5 * index + i] = inds[i][0]
            
            # # check if current index is in top 10 and set corresponding element in top10_scores
            # if i in inds[i][0:10]:
            #     top10_scores[5 * i + i][np.where(inds[i][0:10] == i)[0][0]] = 1
            
            # set similarity scores
        for i in range(5 * index, 5 * index + 5):
            if i in np.where(top10 == i)[0]:
                audio_similarities = np.zeros(5 * num_audios)
                audio_similarities[i] = 1
            else:
                audio_similarities = np.zeros(5 * num_audios)
    print(audio_similarities)
    print(audio_similarities.shape)

    # 각 row에 있는 1 찾기 
    # ones_indices = [np.where(row == 1)[0] for row in top10_scores]
    # 각 row에 있는 ones_indices 찍기
    # print(f"Ones indices for row {index}: {ones_indices}")
            
                
    #print(top10)
    #print(top10.shape)
    #print(top10_scores) 
    #print(ranks) # [32. 32. 41. ...  1.  2.  2.]
    #print(len(ranks)) #5225


    # compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    mAP10 = 100.0 * np.sum(1 / (ranks[np.where(ranks < 10)[0]] + 1)) / len(ranks)

    
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    
    if return_ranks:
        return r1, r5, r10, mAP10, medr, meanr, ranks, top1
    
    else:
        return r1, r5, r10, mAP10, top10_scores


In [193]:
# t2a(temp3['audio_embs'],temp3['cap_embs'])

(5, 1045)
[[603 761 540 ... 733 353 594]
 [599 108  65 ... 675 626 334]
 [108 599  65 ... 675 626 334]
 [974 480 285 ... 625 874 981]
 [761 224 716 ... 846 855 594]]
(5, 1045)
[[  1 702 471 ... 775 222 485]
 [685 702 601 ... 941 334 436]
 [685 765   1 ... 436 334 532]
 [  1 702 354 ... 568 485 222]
 [702   1 354 ... 436 222 568]]
(5, 1045)
[[ 702  601  685 ...  832  975  222]
 [ 702  685  354 ...   32  975  222]
 [ 702  685  354 ...  532  775  136]
 [ 601  689    2 ...  573  975  253]
 [ 702  601  685 ...  536  373 1044]]
(5, 1045)
[[  63  251  195 ...  594  788  888]
 [ 980  902   40 ...  825  608  386]
 [ 571   63  776 ...   44  449  499]
 [  59  571  340 ...  920   71  772]
 [  63    3  571 ...  952 1044  818]]
(5, 1045)
[[  57  648  199 ...  525  353  594]
 [  60  306  810 ...  757  857  373]
 [ 810   60  310 ...   71  373  857]
 [ 897  419  277 ...  772  750  482]
 [1017  505  561 ...  321  539  355]]
(5, 1045)
[[ 709 1036  856 ...  901  546  828]
 [ 796  331  979 ...  938  203  3

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [151]:
t2a(temp3['audio_embs'],temp3['cap_embs'])

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [131]:
np.where(t2a(temp3['audio_embs'],temp3['cap_embs'])==1)

(array([   0,    0,    0, ..., 5224, 5224, 5224]),
 array([  58,  161,  540, ...,  959,  993, 1044]))

---

In [113]:
cc = '/home/user/audio-text_retrieval/outputs/cnn14triplet_weight/pickle/temporal_embeddings.pkl'
with open(cc, 'rb') as f:  
    temp3=pickle.load(f)
temp3

{'audio_embs': array([[ 0.00584552,  0.01597925, -0.01030066, ..., -0.0022582 ,
         -0.00109217, -0.00844152],
        [ 0.00584552,  0.01597925, -0.01030066, ..., -0.0022582 ,
         -0.00109217, -0.00844152],
        [ 0.00584552,  0.01597925, -0.01030066, ..., -0.0022582 ,
         -0.00109217, -0.00844152],
        ...,
        [-0.00783311, -0.01209926, -0.00834842, ...,  0.00567435,
          0.03848082,  0.00422601],
        [-0.00783311, -0.01209926, -0.00834842, ...,  0.00567435,
          0.03848082,  0.00422601],
        [-0.00783311, -0.01209926, -0.00834842, ...,  0.00567435,
          0.03848082,  0.00422601]]),
 'cap_embs': array([[ 0.02950856,  0.01023548, -0.01135472, ...,  0.0001325 ,
         -0.01058777,  0.00497012],
        [ 0.02425566,  0.02924298, -0.00770269, ..., -0.02199307,
         -0.02147528,  0.00692573],
        [ 0.02111989,  0.02728247, -0.00739679, ..., -0.01995897,
         -0.01361072,  0.00140775],
        ...,
        [-0.02159963,  0.001

In [112]:
t2a(temp3['audio_embs'],temp3['cap_embs'])

(13.741626794258373,
 36.30622009569378,
 49.186602870813395,
 23.522503227766386,
 11.0,
 38.463923444976075)

In [147]:
aa = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
with open(aa, 'rb') as f:  
    temp1=pickle.load(f)
temp1

{'audio_embs': array([[ 0.00657598, -0.00107832, -0.00325419, ...,  0.02274671,
         -0.00794175, -0.00115105],
        [ 0.00657598, -0.00107832, -0.00325419, ...,  0.02274671,
         -0.00794175, -0.00115105],
        [ 0.00657598, -0.00107832, -0.00325419, ...,  0.02274671,
         -0.00794175, -0.00115105],
        ...,
        [-0.01338939, -0.02280666,  0.01228649, ...,  0.00759742,
          0.01252393,  0.0011347 ],
        [-0.01338939, -0.02280666,  0.01228649, ...,  0.00759742,
          0.01252393,  0.0011347 ],
        [-0.01338939, -0.02280666,  0.01228649, ...,  0.00759742,
          0.01252393,  0.0011347 ]]),
 'cap_embs': array([[ 3.03832311e-02,  8.33719969e-05,  2.21480872e-03, ...,
          1.16159320e-02, -1.72435623e-02,  2.08924115e-02],
        [ 5.39791072e-03,  2.30839998e-02, -6.37858408e-04, ...,
         -1.54549349e-02, -3.88105884e-02,  9.45706014e-03],
        [ 4.25532646e-03,  1.82949379e-02,  1.16008311e-03, ...,
         -1.43107139e-02, -3.1

In [148]:
temp1['audio_embs'].shape

(5225, 300)

## 앙상블 validation 성능 확인

In [191]:
ckp_path1 = '/home/user/audio-text_retrieval/outputs/0511_freeze_True_lr_0.0001_seed_20/models' #baseline model(cnn+sbert+ntxent) (0.227)
ckp_path2 = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/models' #baseline+diff_seed (0.229)
ckp_path3 = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/models' #cnn+sbert+triplet_weighted(0.235) ***
ckp_path4 = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/models' #cnn+sbert+triplet-max (0.23)
ckp_path5 = '/home/user/audio-text_retrieval/0512_resnetweight_freeze_True_lr_0.0001_seed_1234/models' #Resnet+SBert+t-weighted(0.22)
ckp_path6 = '/home/user/audio-text_retrieval/outputs/0512_WLCNN_weight_freeze_True_lr_0.0001_seed_1234/models' #wavgramcnn14+sbert+t-weighted(0.213)


pickle_path1 = '/home/user/audio-text_retrieval/outputs/0511_freeze_True_lr_0.0001_seed_20/pickle/temporal_embeddings.pkl'
pickle_path2 = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
#pickle_path3 = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
pickle_path3 = '/home/user/audio-text_retrieval/outputs/cnn14triplet_weight/pickle/temporal_embeddings.pkl'

pickle_path4 = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
pickle_path5 = '/home/user/audio-text_retrieval/outputs/0512_resnetweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
pickle_path6 = '/home/user/audio-text_retrieval/outputs/0512_WLCNN_weight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
pickle_path7 = '/home/user/audio-text_retrieval/outputs/0511_resnet_freeze_True_lr_0.0001_seed_20/pickle/temporal_embeddings.pkl'
pickle_path8 = '/home/user/audio-text_retrieval/outputs/0515_cnn14infovic_freeze_True_lr_0.0001_seed_42/pickle/temporal_embeddings.pkl'
pickle_path9 = '/home/user/audio-text_retrieval/outputs/0515_cnn14info_freeze_True_lr_0.0001_seed_42/pickle/temporal_embeddings.pkl'

In [193]:
with open(pickle_path1, 'rb') as f:  
    temporal_dict1=pickle.load(f)
    
with open(pickle_path2, 'rb') as f:  
    temporal_dict2=pickle.load(f)
    
with open(pickle_path3, 'rb') as f:  
    temporal_dict3=pickle.load(f)
    
with open(pickle_path4, 'rb') as f:  
    temporal_dict4=pickle.load(f)
    
with open(pickle_path5, 'rb') as f:  
    temporal_dict5=pickle.load(f)
    
with open(pickle_path6, 'rb') as f:  
    temporal_dict6=pickle.load(f)

with open(pickle_path7, 'rb') as f:  
    temporal_dict7=pickle.load(f)

with open(pickle_path8, 'rb') as f:  
    temporal_dict8=pickle.load(f)
with open(pickle_path9, 'rb') as f:  
    temporal_dict9=pickle.load(f)

In [130]:
print(t2a(temporal_dict1['audio_embs'],temporal_dict1['cap_embs']))
print(t2a(temporal_dict2['audio_embs'],temporal_dict2['cap_embs']))
print(t2a(temporal_dict3['audio_embs'],temporal_dict3['cap_embs']))
print(t2a(temporal_dict4['audio_embs'],temporal_dict4['cap_embs']))
print(t2a(temporal_dict5['audio_embs'],temporal_dict5['cap_embs']))
print(t2a(temporal_dict6['audio_embs'],temporal_dict6['cap_embs']))

(13.24401913875598, 35.196172248803826, 48.76555023923445, 22.687271208323843, 11.0, 38.03196172248804)
(13.397129186602871, 35.291866028708135, 48.61244019138756, 22.868504594820383, 11.0, 38.2044019138756)
(13.85645933014354, 36.07655502392345, 49.24401913875598, 23.538406622617146, 11.0, 38.47406698564593)
(13.741626794258373, 35.25358851674641, 48.44019138755981, 22.998048150679733, 11.0, 42.82468899521531)
(13.205741626794259, 34.10526315789474, 47.13875598086124, 22.334176349965823, 12.0, 41.948516746411485)
(11.866028708133971, 33.41626794258373, 46.354066985645936, 21.30792891319207, 12.0, 42.48133971291866)


---

In [147]:
audio_mean5 = np.mean([temporal_dict2['audio_embs'], temporal_dict3['audio_embs'],
                      temporal_dict4['audio_embs'], temporal_dict5['audio_embs'],temporal_dict6['audio_embs']], axis=0)

caption_mean5 = np.mean([temporal_dict2['cap_embs'], temporal_dict3['cap_embs'],
                      temporal_dict4['cap_embs'], temporal_dict5['cap_embs'],temporal_dict6['cap_embs']], axis=0)

In [148]:
t2a(audio_mean5,caption_mean5)

(14.736842105263158,
 37.588516746411486,
 50.67942583732057,
 24.46102377155009,
 10.0,
 37.58411483253589)

---

`baseline+ntxent (0.229) `  
`cnn+sbert+triplet_weighted(0.235) ***`   
`cnn+sbert+triplet-max (0.23)  `  
-> 0.24


In [156]:
audio_mean234 = np.mean([temporal_dict2['audio_embs'], temporal_dict3['audio_embs'],
                      temporal_dict4['audio_embs'],], axis=0)

caption_mean234 = np.mean([temporal_dict2['cap_embs'], temporal_dict3['cap_embs'],
                      temporal_dict4['cap_embs'],], axis=0)

t2a(audio_mean234,caption_mean234)

(14.258373205741627,
 37.43540669856459,
 50.143540669856456,
 24.00644034328245,
 10.0,
 37.516746411483254)

In [157]:
audio_mean123 = np.mean([temporal_dict2['audio_embs'], temporal_dict3['audio_embs'],
                      temporal_dict1['audio_embs'],], axis=0)

caption_mean123 = np.mean([temporal_dict2['cap_embs'], temporal_dict3['cap_embs'],
                      temporal_dict1['cap_embs'],], axis=0)

t2a(audio_mean123,caption_mean123)

(13.645933014354068,
 35.483253588516746,
 49.22488038277512,
 23.13321941216678,
 11.0,
 37.747751196172246)

In [168]:
audio_mean4567 = np.mean([temporal_dict4['audio_embs'], temporal_dict5['audio_embs'],
                      temporal_dict6['audio_embs'], temporal_dict7['audio_embs']], axis=0)

caption_mean4567 = np.mean([temporal_dict4['cap_embs'], temporal_dict5['cap_embs'],
                      temporal_dict6['cap_embs'], temporal_dict7['cap_embs']], axis=0)

t2a(audio_mean4567, caption_mean4567)

(13.894736842105264,
 36.30622009569378,
 49.645933014354064,
 23.59407609933926,
 11.0,
 39.90315789473684)

----
### same model different seeds

In [151]:
same_pickle1 = '/home/user/audio-text_retrieval/outputs/0511_freeze_True_lr_0.0001_seed_20/pickle/temporal_embeddings.pkl'
same_pickle2 = '/home/user/audio-text_retrieval/outputs/0511_diff1_freeze_True_lr_0.0001_seed_42/pickle/temporal_embeddings.pkl'
same_pickle3 = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
same_pickle4 = '/home/user/audio-text_retrieval/outputs/0511_diff3_freeze_True_lr_0.0001_seed_0/pickle/temporal_embeddings.pkl'

with open(same_pickle1, 'rb') as f:  
    same_pick1=pickle.load(f)
    
with open(same_pickle2, 'rb') as f:  
    same_pick2=pickle.load(f)
    
with open(same_pickle3, 'rb') as f:  
    same_pick3=pickle.load(f)
    
with open(same_pickle4, 'rb') as f:  
    same_pick4=pickle.load(f)

print(t2a(same_pick1['audio_embs'],same_pick1['cap_embs']))
print(t2a(same_pick2['audio_embs'],same_pick2['cap_embs']))
print(t2a(same_pick3['audio_embs'],same_pick3['cap_embs']))
print(t2a(same_pick4['audio_embs'],same_pick4['cap_embs']))


(13.24401913875598, 35.196172248803826, 48.76555023923445, 22.687271208323843, 11.0, 38.03196172248804)
(12.114832535885167, 34.39234449760765, 48.248803827751196, 21.7010101010101, 11.0, 37.78832535885167)
(13.397129186602871, 35.291866028708135, 48.61244019138756, 22.868504594820383, 11.0, 38.2044019138756)
(13.090909090909092, 34.622009569377994, 48.248803827751196, 22.415128730918205, 11.0, 38.15598086124402)


In [152]:
audio_diff_seed = np.mean([same_pick1['audio_embs'], same_pick2['audio_embs'],
                      same_pick3['audio_embs'], same_pick4['audio_embs']], axis=0)

caption_diff_seed = np.mean([same_pick1['cap_embs'], same_pick2['cap_embs'],
                      same_pick3['cap_embs'], same_pick4['cap_embs']], axis=0)

t2a(audio_diff_seed,caption_diff_seed)

(12.267942583732058,
 33.473684210526315,
 47.73205741626794,
 21.696012759170657,
 12.0,
 40.57531100478469)

----
### same model, different loss

In [154]:
pickle_path_a = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
#different loss
pickle_path_b = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
pickle_path_c= '/home/user/audio-text_retrieval/outputs/0512_tripletsum_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'
pickle_path_d = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_embeddings.pkl'

with open(pickle_path_a, 'rb') as f:  
    pickle_a=pickle.load(f)
    
with open(pickle_path_b, 'rb') as f:  
    pickle_b=pickle.load(f)
    
with open(pickle_path_c, 'rb') as f:  
    pickle_c=pickle.load(f)
    
with open(pickle_path_d, 'rb') as f:  
    pickle_d=pickle.load(f)

print(t2a(pickle_a['audio_embs'],pickle_a['cap_embs']))
print(t2a(pickle_b['audio_embs'],pickle_b['cap_embs']))
print(t2a(pickle_c['audio_embs'],pickle_c['cap_embs']))
print(t2a(pickle_d['audio_embs'],pickle_d['cap_embs']))

(13.397129186602871, 35.291866028708135, 48.61244019138756, 22.868504594820383, 11.0, 38.2044019138756)
(13.741626794258373, 35.25358851674641, 48.44019138755981, 22.998048150679733, 11.0, 42.82468899521531)
(11.578947368421053, 31.770334928229666, 45.89473684210526, 20.42282980177717, 13.0, 39.433875598086125)
(13.85645933014354, 36.07655502392345, 49.24401913875598, 23.538406622617146, 11.0, 38.47406698564593)


In [155]:
audio_4loss = np.mean([pickle_a['audio_embs'], pickle_b['audio_embs'],
                      pickle_c['audio_embs'], pickle_d['audio_embs']], axis=0)

caption_4loss = np.mean([pickle_a['cap_embs'], pickle_b['cap_embs'],
                      pickle_c['cap_embs'], pickle_d['cap_embs']], axis=0)
t2a(audio_4loss,caption_4loss)

(14.16267942583732,
 37.014354066985646,
 49.8755980861244,
 23.736424394319133,
 11.0,
 36.688421052631575)

In [322]:
pickle_path_e = '/home/user/audio-text_retrieval/outputs/0515_cnn14infovic_freeze_True_lr_0.0001_seed_42/pickle/temporal_embeddings.pkl'
pickle_path_f = '/home/user/audio-text_retrieval/outputs/0515_cnn14info_freeze_True_lr_0.0001_seed_42/pickle/temporal_embeddings.pkl'

with open(pickle_path_e, 'rb') as f:  
    pickle_e=pickle.load(f)
    
with open(pickle_path_f, 'rb') as f:  
    pickle_f=pickle.load(f)

In [324]:
audio_2loss = np.mean([pickle_e['audio_embs'], pickle_f['audio_embs']], axis=0)

caption_2loss = np.mean([pickle_e['cap_embs'], pickle_f['cap_embs']], axis=0)
t2a(audio_2loss,caption_2loss)

(10.583732057416269,
 30.545454545454547,
 43.4066985645933,
 19.291767297030457,
 14.0,
 43.324976076555025)

In [325]:
t2a(pickle_e['audio_embs'],pickle_e['cap_embs'])


(10.583732057416269,
 30.62200956937799,
 43.291866028708135,
 19.299718994455837,
 14.0,
 43.36631578947368)

In [326]:
t2a(pickle_f['audio_embs'],pickle_f['cap_embs'])


(10.813397129186603,
 30.679425837320576,
 43.21531100478469,
 19.3444976076555,
 14.0,
 43.32382775119617)

In [323]:
def t2a(audio_embs, cap_embs, return_ranks=False):
    # caption to audio retrieval
    num_audios = int(audio_embs.shape[0] / 5)

    audios = np.array([audio_embs[i]for i in range(0, audio_embs.shape[0], 5)])

    ranks = np.zeros(5 * num_audios)
    top10 = np.zeros([5 * num_audios,10])

    for index in range(num_audios):

        # get query captions
        queries = cap_embs[5 * index: 5 * index + 5]

        # compute scores
        # queries @ audio.T
        d = util.cos_sim(torch.Tensor(queries), torch.Tensor(audios)).numpy()

        inds = np.zeros(d.shape)
        for i in range(len(inds)):
            inds[i] = np.argsort(d[i])[::-1]
            ranks[5 * index + i] = np.where(inds[i] == index)[0][0]
            top10[5 * index + i] = inds[i][0:10]

    # compute metrics
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    r50 = 100.0 * len(np.where(ranks < 50)[0]) / len(ranks)
    mAP10 = 100.0 * np.sum(1 / (ranks[np.where(ranks < 10)[0]] + 1)) / len(ranks)
    
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    if return_ranks:
        return r1, r5, r10, mAP10, medr, meanr, ranks, top10
    else:
        return r1, r5, r10, mAP10, medr, meanr

----

## 제출용 csv 파일 만들기

### 1번째

In [279]:
pickle_path1 = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path2 = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path3 = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path4 = '/home/user/audio-text_retrieval/outputs/0512_resnetweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path5 = '/home/user/audio-text_retrieval/outputs/0512_WLCNN_weight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'

with open(pickle_path1, 'rb') as f:  
    pickle1=pickle.load(f)
    
with open(pickle_path2, 'rb') as f:  
    pickle2=pickle.load(f)
    
with open(pickle_path3, 'rb') as f:  
    pickle3=pickle.load(f)
    
with open(pickle_path4, 'rb') as f:  
    pickle4=pickle.load(f)
    
with open(pickle_path5, 'rb') as f:  
    pickle5=pickle.load(f)

In [289]:
pickle1.keys()

dict_keys(['audio_embs', 'cap_embs', 'audio_names_', 'caption_names'])

In [286]:
audio_ensemble1 = np.mean([pickle1['audio_embs'], pickle2['audio_embs'],
                      pickle3['audio_embs'], pickle4['audio_embs'], pickle5['audio_embs']], axis=0)

caption_ensemble1 = np.mean([pickle1['cap_embs'], pickle2['cap_embs'],
                      pickle3['cap_embs'], pickle4['cap_embs'], pickle5['cap_embs']], axis=0)
    
#t2a(audio_mean23456,caption_mean23456) #24.46


In [284]:
def t2a_retrieval(audio_embs, cap_embs, return_ranks=True):
    # caption to audio retrieval
    num_audios = int(audio_embs.shape[0])

    audios = np.array([audio_embs[i]for i in range(num_audios)])

    ranks = np.zeros(num_audios)
    top10 = np.zeros([num_audios,10])

    for index in range(num_audios):

        # get query captions
        query = cap_embs[index]

        # queries @ audio.T
        d = util.cos_sim(torch.Tensor(query), torch.Tensor(audios)).numpy()

        inds = np.argsort(d[0])[::-1]
        top10[index] = inds[0:10]

    return top10

In [285]:
import pandas as pd
import numpy as np
from pathlib import Path


def make_csv(caption_names, audio_names_, top10_index, csv_output_path):
    df_rank=pd.DataFrame([audio_names_[i.astype(int)] for i in top10_index],index=caption_names)
    df_rank.columns=["fname_1","fname_2","fname_3","fname_4","fname_5","fname_6","fname_7","fname_8","fname_9","fname_10"]
    df_rank.to_csv(csv_output_path, index=True)
    print("Saved CSV file at {}".format(csv_output_path))


In [287]:
top10 = t2a_retrieval(audio_ensemble1, caption_ensemble1,return_ranks=True)


In [290]:
#top10 = t2a_retrieval(audio_ensemble1, caption_ensemble1,return_ranks=True)
make_csv(pickle1['caption_names'], pickle1['audio_names_'], top10, csv_output_path='ensemble1.csv')

Saved CSV file at ensemble1.csv


### 2번째

In [281]:
pickle_path6 = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path7 = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path8 = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'

with open(pickle_path6, 'rb') as f:  
    pickle6=pickle.load(f)
    
with open(pickle_path7, 'rb') as f:  
    pickle7=pickle.load(f)
    
with open(pickle_path8, 'rb') as f:  
    pickle8=pickle.load(f)
    



In [291]:
audio_ensemble2 = np.mean([pickle6['audio_embs'], pickle7['audio_embs'],pickle8['audio_embs']], axis=0)
caption_ensemble2 = np.mean([pickle6['cap_embs'], pickle7['cap_embs'],pickle8['cap_embs']], axis=0)
    
#t2a(audio_mean23456,caption_mean23456) #24.46

In [292]:
top10 = t2a_retrieval(audio_ensemble2, caption_ensemble2,return_ranks=True)
make_csv(pickle1['caption_names'], pickle1['audio_names_'], top10, csv_output_path='ensemble2.csv')

Saved CSV file at ensemble2.csv


### 3번째

Cnn14, Resnet38, Wavegram-Logmel-Cnn14
, sbert
triplet-weight

In [282]:
pickle_path9 = '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path10 = '/home/user/audio-text_retrieval/outputs/0512_resnetweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path11= '/home/user/audio-text_retrieval/outputs/0512_WLCNN_weight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path12 = '/home/user/audio-text_retrieval/outputs/0511_resnet_freeze_True_lr_0.0001_seed_20/pickle/temporal_csv_embeddings.pkl'


with open(pickle_path9, 'rb') as f:  
    pickle9=pickle.load(f)
    
with open(pickle_path10, 'rb') as f:  
    pickle10=pickle.load(f)
    
with open(pickle_path11, 'rb') as f:  
    pickle11=pickle.load(f)

with open(pickle_path12, 'rb') as f:  
    pickle12=pickle.load(f)
    

In [293]:
audio_ensemble3 = np.mean([pickle9['audio_embs'], pickle10['audio_embs'],pickle11['audio_embs'],pickle12['audio_embs']], axis=0)
caption_ensemble3 = np.mean([pickle9['cap_embs'], pickle10['cap_embs'],pickle11['cap_embs'],pickle12['cap_embs']], axis=0)

In [294]:
top10 = t2a_retrieval(audio_ensemble3, caption_ensemble3, return_ranks=True)
make_csv(pickle1['caption_names'], pickle1['audio_names_'], top10, csv_output_path='ensemble3.csv')

Saved CSV file at ensemble3.csv


### 4번째

In [283]:
pickle_path13 = '/home/user/audio-text_retrieval/outputs/0511_diff2_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path14= '/home/user/audio-text_retrieval/outputs/0512_triplet_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path15 = '/home/user/audio-text_retrieval/outputs/0512_tripletsum_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'
pickle_path16 = '/home/user/audio-text_retrieval/outputs/0512_tripletweight_freeze_True_lr_0.0001_seed_1234/pickle/temporal_csv_embeddings.pkl'

with open(pickle_path13, 'rb') as f:  
    pickle13=pickle.load(f)
    
with open(pickle_path14, 'rb') as f:  
    pickle14=pickle.load(f)
    
with open(pickle_path15, 'rb') as f:  
    pickle15=pickle.load(f)

with open(pickle_path16, 'rb') as f:  
    pickle16=pickle.load(f)


In [295]:
audio_ensemble4 = np.mean([pickle13['audio_embs'], pickle14['audio_embs'],pickle15['audio_embs'],pickle16['audio_embs']], axis=0)
caption_ensemble4 = np.mean([pickle13['cap_embs'], pickle14['cap_embs'],pickle15['cap_embs'],pickle16['cap_embs']], axis=0)

In [296]:
top10 = t2a_retrieval(audio_ensemble4, caption_ensemble4, return_ranks=True)
make_csv(pickle1['caption_names'], pickle1['audio_names_'], top10, csv_output_path='ensemble4.csv')

Saved CSV file at ensemble4.csv


### 5번째

In [298]:
pickle_path18 = '/home/user/audio-text_retrieval/outputs/0515_cnn14infovic_freeze_True_lr_0.0001_seed_42/pickle/temporal_csv_embeddings.pkl'
pickle_path19 = '/home/user/audio-text_retrieval/outputs/0515_cnn14info_freeze_True_lr_0.0001_seed_42/pickle/temporal_csv_embeddings.pkl'

with open(pickle_path18, 'rb') as f:  
    pickle18=pickle.load(f)

with open(pickle_path19, 'rb') as f:  
    pickle19=pickle.load(f)

In [299]:
audio_ensemble5 = np.mean([pickle18['audio_embs'], pickle19['audio_embs']], axis=0)
caption_ensemble5 = np.mean([pickle18['cap_embs'], pickle19['cap_embs']], axis=0)

In [300]:
top10 = t2a_retrieval(audio_ensemble5, caption_ensemble5, return_ranks=True)
make_csv(pickle1['caption_names'], pickle1['audio_names_'], top10, csv_output_path='ensemble5.csv')

Saved CSV file at ensemble5.csv


------

## changing column name

In [1]:
import pandas as pd

def save_csv(file_name, change_csv_name):
    a = pd.read_csv(file_name)
    a.rename(columns = {'Unnamed: 0' : 'caption'}, inplace=True)
    a.to_csv(change_csv_name, index=False)
    return 

In [321]:
save_csv('ensemble1.csv', 'Park_CAU_task6b_1.output.csv')
save_csv('ensemble2.csv', 'Park_CAU_task6b_2.output.csv')
save_csv('ensemble3.csv', 'Park_CAU_task6b_3.output.csv')
save_csv('ensemble4.csv', 'Park_CAU_task6b_4.output.csv')
save_csv('ensemble5.csv', 'Park_CAU_task6b_5.output.csv')

In [2]:
save_csv('results-Copy1.csv', 'Park_CAU_task6b_6.output.csv')