In [2]:
import os
# Expose only physical GPU 2 to this process. The variable is assigned before
# `import torch` below, so it is already in place when CUDA is first queried.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

import torch

# Report which CUDA devices this process can see.
if not torch.cuda.is_available():
    print("CUDA is not available. Only CPU will be used.")
else:
    # How many devices are exposed to this process.
    num_gpus = torch.cuda.device_count()
    print("Number of available GPUs:", num_gpus)

    # One line per visible device: local index and device name.
    for i in range(num_gpus):
        print("GPU", i, ":", torch.cuda.get_device_name(i))

Number of available GPUs: 1
GPU 0 : NVIDIA GeForce RTX 2080 Ti


In [3]:
import os
# NOTE(review): this overrides the '2' chosen in the previous cell. When both
# cells run in the same kernel, the earlier cell has already queried CUDA
# (torch.cuda.get_device_name), and CUDA_VISIBLE_DEVICES is presumably only
# honoured when CUDA is first initialised -- so this assignment likely has no
# effect there and 'cuda:0' would still be physical GPU 2. Confirm which GPU
# is intended and set the variable once, before the first CUDA call.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

from easydict import EasyDict
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import torch
import os.path as osp
from tqdm.auto import tqdm as tqdm
from languagebind import LanguageBind, to_device, transform_dict, LanguageBindImageTokenizer
from vl_ret.metrics import compute_metrics
import sys


class MyMSRVTT_DataLoader(Dataset):
    """Map-style dataset yielding (caption, video file path) pairs for MSRVTT.

    Despite its name this is a ``Dataset``; wrap it in a ``DataLoader`` to get
    batches. No video is opened here -- only the path string is built.

    Args:
        csv_path: CSV file read with ``pandas.read_csv``; it must provide the
            ``video_id`` and ``sentence`` columns.
        features_path: Directory in which ``<video_id>.mp4`` is looked up.
    """

    def __init__(self, csv_path, features_path):
        self.data = pd.read_csv(csv_path)
        self.features_path = features_path

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sentence, video_path)`` for the row at position ``idx``."""
        video_ids = self.data['video_id'].values
        captions = self.data['sentence'].values

        video_file = "{}.mp4".format(video_ids[idx])
        return captions[idx], os.path.join(self.features_path, video_file)

def get_args_msrvtt():
    """Assemble the settings used by the MSRVTT retrieval evaluation.

    Returns:
        EasyDict: attribute-accessible mapping with the keys ``val_csv``,
        ``features_path``, ``batch_size_val``, ``num_thread_reader`` and
        ``cache_dir``.
    """
    settings = dict(
        # Caption CSV and the directory that holds the video files.
        val_csv='/raid/1moritz/datasets//MSRVTT/MSRVTT_JSFUSION_test.csv',
        features_path='/raid/1moritz/datasets//MSRVTT/MSRVTT_Videos',
        # Batch size and worker count for the evaluation DataLoader.
        batch_size_val=8,
        num_thread_reader=1,
        # Cache directory handed to the model and tokenizer loaders.
        cache_dir='/raid/1moritz/models/languagebind/downloaded_weights',
    )
    return EasyDict(settings)

def run_msrvtt_eval(model:LanguageBind, tokenizer:LanguageBindImageTokenizer, dataloader:DataLoader, modality_transform: dict, device: torch.device):
    """Embed every (caption, video) batch and report retrieval metrics.

    Args:
        model: Called with a dict holding 'video' and 'language' inputs; its
            result is indexed with 'language' and 'video' to get embeddings.
        tokenizer: Turns the list of captions into padded/truncated tensors
            (``max_length=77``).
        dataloader: Yields ``(sentences, video_paths)`` batches.
        modality_transform: Mapping whose 'video' entry converts a list of
            video paths into model inputs.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        tuple: ``(tv_metrics, vt_metrics)`` -- the ``compute_metrics`` results
        for the text-to-video matrix and for its transpose. The same numbers
        are also printed.
    """
    batch_sentences_embeddings, batch_videos_embeddings = [], []

    # Calculate embeddings batch by batch; they are kept as returned by the
    # model until the similarity matrix is built.
    for sentences, video_paths in tqdm(dataloader, total=len(dataloader)):
        # The batch fields may arrive as non-list sequences; the transform and
        # tokenizer are fed plain lists.
        sentences = list(sentences)
        video_paths = list(video_paths)

        inputs = {
            'video': to_device(modality_transform['video'](video_paths), device),
            'language': to_device(
                tokenizer(sentences, max_length=77, padding='max_length',
                          truncation=True, return_tensors='pt'),
                device,
            ),
        }

        with torch.no_grad():
            embeddings = model(inputs)

        batch_sentences_embeddings.append(embeddings['language'])
        batch_videos_embeddings.append(embeddings['video'])

    # Create similarity matrix (rows: texts, columns: videos)
    sim_matrix = create_sim_matrix(batch_sentences_embeddings, batch_videos_embeddings)

    # Log metrics; the transpose gives the video-to-text direction.
    print(f"MSRVTT sim matrix size: {sim_matrix.shape[0]}, {sim_matrix.shape[1]}")
    tv_metrics = compute_metrics(sim_matrix)
    vt_metrics = compute_metrics(sim_matrix.T)
    print('\t Length-T: {}, Length-V:{}'.format(len(sim_matrix), len(sim_matrix[0])))

    print("MSRVTT Text-to-Video:")
    print('\t>>>  R@1: {:.1f} - R@5: {:.1f} - R@10: {:.1f} - Median R: {:.1f} - Mean R: {:.1f}'.
                format(tv_metrics['R1'], tv_metrics['R5'], tv_metrics['R10'], tv_metrics['MR'], tv_metrics['MeanR']))
    print("MSRVTT Video-to-Text:")
    print('\t>>>  V2T$R@1: {:.1f} - V2T$R@5: {:.1f} - V2T$R@10: {:.1f} - V2T$Median R: {:.1f} - V2T$Mean R: {:.1f}'.
                format(vt_metrics['R1'], vt_metrics['R5'], vt_metrics['R10'], vt_metrics['MR'], vt_metrics['MeanR']))

    # Hand the metrics back so callers can use them programmatically.
    return tv_metrics, vt_metrics

def create_sim_matrix(batch_sentences_embeddings, batch_videos_embeddings):
    """Build the text-by-video similarity matrix and download it to the CPU.

    Similarity is the plain matrix product of text and video embeddings; no
    normalisation is applied here.

    Args:
        batch_sentences_embeddings: Sequence of per-batch text embedding
            tensors, each expected to be 2-D ``(items_in_batch, dim)``.
        batch_videos_embeddings: Sequence of per-batch video embedding
            tensors, each expected to be 2-D ``(items_in_batch, dim)``.

    Returns:
        numpy.ndarray: ``sim_matrix`` (Text X Video), i.e. one row per text
        and one column per video, in the order the batches were given.

    Raises:
        ValueError: If either sequence is empty.
    """
    if len(batch_sentences_embeddings) == 0 or len(batch_videos_embeddings) == 0:
        raise ValueError(
            "create_sim_matrix needs at least one batch of sentence and video embeddings"
        )

    # Stack all video embeddings once instead of once per text batch, so each
    # text batch needs a single matmul and a single device-to-host copy rather
    # than one per (text batch, video batch) pair.
    all_videos_t = torch.cat(tuple(batch_videos_embeddings), dim=0).T

    row_blocks = [
        (sentence_batch @ all_videos_t).detach().cpu().numpy()
        for sentence_batch in batch_sentences_embeddings
    ]
    return np.concatenate(row_blocks, axis=0)

def main():
    """Load LanguageBind, build the MSRVTT loader and run the retrieval eval.

    Returns:
        Whatever ``run_msrvtt_eval`` returns for this run.
    """
    # Fall back to the CPU when no CUDA device is visible instead of failing
    # on a hard-coded 'cuda:0'.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): only the 'video' transform is used by the evaluation, but
    # the set and order of entries may influence how LanguageBind assembles
    # its encoders -- confirm before pruning or reordering.
    clip_type = {
        'video': 'LanguageBind_Video_FT',  # also LanguageBind_Video
        'audio': 'LanguageBind_Audio_FT',  # also LanguageBind_Audio
        'image': 'LanguageBind_Image',
    }
    args = get_args_msrvtt()

    model = LanguageBind(clip_type=clip_type, cache_dir=args.cache_dir)
    model = model.to(device)
    model.eval()

    pretrained_ckpt = 'LanguageBind/LanguageBind_Image'
    tokenizer = LanguageBindImageTokenizer.from_pretrained(pretrained_ckpt, cache_dir=osp.join(args.cache_dir, 'tokenizer_cache_dir'))
    # One input transform per modality, configured from the loaded model.
    modality_transform = {c: transform_dict[c](model.modality_config[c]) for c in clip_type.keys()}

    # Fixed order and no dropped samples, so every row of the CSV is evaluated
    # in file order.
    dataloader_msrvtt = DataLoader(
        MyMSRVTT_DataLoader(csv_path=args.val_csv, features_path=args.features_path),
        batch_size=args.batch_size_val,
        num_workers=args.num_thread_reader,
        shuffle=False,
        drop_last=False,
    )
    return run_msrvtt_eval(model, tokenizer, dataloader_msrvtt, modality_transform, device)


# Entry point: run the evaluation when this cell/script is executed directly
# (the captured output below shows the guard is satisfied in the notebook).
if __name__ == '__main__':
    main()




  0%|          | 0/125 [00:00<?, ?it/s]

MSRVTT sim matrix size: 1000, 1000
	 Length-T: 1000, Length-V:1000
MSRVTT Text-to-Video:
	>>>  R@1: 44.9 - R@5: 67.9 - R@10: 77.2 - Median R: 2.0 - Mean R: 22.3
MSRVTT Video-to-Text:
	>>>  V2T$R@1: 41.6 - V2T$R@5: 66.2 - V2T$R@10: 76.5 - V2T$Median R: 2.0 - V2T$Mean R: 22.5
