In [1]:
import argparse
import gdown
import cv2
import numpy as np
import os
import sys
sys.path.append(sys.path[0]+"/tracker")
sys.path.append(sys.path[0]+"/tracker/model")
from track_anything import TrackingAnything
from track_anything import parse_augment
import requests
import json
import torchvision
import torch 
from tools.painter import mask_painter
import psutil
import time
try: 
    from mmcv.cnn import ConvModule
except:
    os.system("mim install mmcv")
import matplotlib.pyplot as plt
from pycocotools import mask as maskUtils
from PIL import Image
from dataset.dataset import DAVIS_MO_Test
from dataset.longdataset import LongVideoDataset
import pandas as pd
import warnings
import random
import string

In [2]:
# Dataset locations. POSIX hosts use paths relative to the notebook; any other
# OS falls back to the absolute Windows layout below.
if os.name == 'posix':
    ovis_anotations = '../data.nosync/OVIS/annotations/'
    ovis_images = '../data.nosync/OVIS/train_images/'
    davis_root = '../data.nosync/DAVIS2017'
    longdataset_root = '../data.nosync/LongDataset/'
else:
    ovis_anotations = 'D:/HADA/data/OVIS/annotations/'
    ovis_images = 'D:/HADA/data/OVIS/train_images/'
    # Raw string: same value as before, without relying on invalid escape sequences.
    davis_root = r'D:\HADA\data\DAVIS'
    # Previously undefined on this branch, which made the LongDataset cell fail
    # with a NameError.
    # NOTE(review): location guessed from the sibling entries above - confirm.
    longdataset_root = 'D:/HADA/data/LongDataset/'

# CSV file to which each metrics-enabled run appends one summary row.
all_tests_csv = './result/all_tests.csv'

In [3]:
def cargarDatos(ruta_ann):
    """Load the train/valid/test annotation files stored in ``ruta_ann``.

    The folder must contain ``annotations_train.json``,
    ``annotations_valid.json`` and ``annotations_test.json``. The path is
    built with ``os.path.join`` so it works with or without a trailing
    separator.

    Args:
        ruta_ann: Folder that holds the three annotation files.

    Returns:
        Tuple ``(clases, vidTrain, annTrain, vidValid, annValid, vidTest,
        annTest)`` where ``clases`` is the ``'categories'`` entry of the train
        file and the remaining items are the ``'videos'`` / ``'annotations'``
        entries of each split.
    """
    def _load_split(split):
        # One JSON document per split.
        with open(os.path.join(ruta_ann, f'annotations_{split}.json')) as f:
            return json.load(f)

    annotationsTrain = _load_split('train')
    annotationsValid = _load_split('valid')
    annotationsTest = _load_split('test')

    clases = annotationsTrain['categories']
    vidTrain = annotationsTrain['videos']
    annTrain = annotationsTrain['annotations']
    vidValid = annotationsValid['videos']
    annValid = annotationsValid['annotations']
    vidTest = annotationsTest['videos']
    annTest = annotationsTest['annotations']

    return clases, vidTrain, annTrain, vidValid, annValid, vidTest, annTest

# Load the OVIS annotation splits once. The resulting names are module-level
# globals; helper functions in later cells read vidTrain / annTrain directly.
clases, vidTrain, annTrain, vidValid, annValid, vidTest, annTest = cargarDatos(ovis_anotations)         

In [4]:
def annToRLE(ann, frameId):
    """
    Convert the segmentation of one frame (polygons, uncompressed RLE or
    RLE) to RLE.

    :param ann: annotation dict with 'height', 'width' and a per-frame
                'segmentations' list
    :param frameId: index into ann['segmentations']
    :return: RLE object for that frame, or None when the frame has no
             segmentation
    """
    h, w = ann['height'], ann['width']
    segm = ann['segmentations'][frameId]
    if segm is None:
        return None
    # isinstance() instead of comparing the type object with the string "list":
    # that comparison was always False, so the first two branches never ran.
    if isinstance(segm, list):
        # polygon -- a single object might consist of multiple parts
        # we merge all parts into one mask rle code
        rles = maskUtils.frPyObjects(segm, h, w)
        rle = maskUtils.merge(rles)
    elif isinstance(segm['counts'], list):
        # uncompressed RLE
        rle = maskUtils.frPyObjects(segm, h, w)
    else:
        # rle
        rle = segm
    return rle


def annToMask(ann, frameId):
    """
    Decode the segmentation of one frame of an annotation into a mask.

    :param ann: annotation dict, forwarded to annToRLE
    :param frameId: frame index, forwarded to annToRLE
    :return: result of maskUtils.decode on the frame's RLE, or None when
             annToRLE yields no RLE for that frame
    """
    encoded = annToRLE(ann, frameId)
    if encoded is None:
        return None
    return maskUtils.decode(encoded)



def combineMasks(masks, width, height):
    """Merge several masks into a single 0/1 mask of shape (height, width).

    The masks are summed into a uint8 accumulator; every position whose sum
    is greater than zero becomes 1, all others 0.
    """
    accumulator = np.zeros(shape=(height, width), dtype=np.uint8)
    for layer in masks:
        accumulator += layer

    # Threshold the sum to obtain one binary mask
    return np.where(accumulator > 0, 1, 0)

def unifyMasks(masks, width, height):
    """Sum several masks into one uint8 array of shape (height, width).

    Unlike combineMasks the sum is returned as-is (no thresholding), so
    masks that were pre-multiplied by an object id keep that id wherever
    they do not overlap.
    """
    canvas = np.zeros((height, width), dtype=np.uint8)
    for layer in masks:
        # In-place accumulation into the uint8 canvas
        np.add(canvas, layer, out=canvas)
    return canvas

In [5]:
def load_images_from_folder(path,image_files):
    """Read the given image files from ``path`` and return them as RGB arrays.

    Args:
        path: Folder the file names are relative to.
        image_files: Iterable of file names (joined onto ``path``).

    Returns:
        List with one image per file, converted from OpenCV's BGR order to RGB.

    Raises:
        FileNotFoundError: If a file is missing or cannot be decoded
            (``cv2.imread`` returns ``None`` in that case).
    """
    images = []
    for file in image_files:
        file_path = os.path.join(path, file)
        img = cv2.imread(file_path)
        if img is None:
            # Fail with the offending path instead of an opaque cvtColor error
            raise FileNotFoundError(f'Could not read image: {file_path}')
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    return images

def load_all_images_davis(loader,video_info):
    """Load every frame and per-object mask stack of one video through ``loader``.

    Args:
        loader: Object exposing ``load_single_image(name, index)`` that returns
            a ``(frame, masks)`` pair.
        video_info: Tuple ``(name, frames, objects)``: video name, number of
            frames to load and number of objects.

    Returns:
        ``(images, object_masks)``. Each image is ``frame[:, 0]`` moved to
        channel-last order, scaled by 255 and cast to uint8; each mask entry
        is ``masks[1:objects + 1, 0]`` cast to uint8 (index 0 is skipped).
    """
    name, frames, objects = video_info
    rgb_frames = []
    object_masks = []
    for frame_idx in range(frames):
        frame_tensor, mask_tensor = loader.load_single_image(name, frame_idx)

        channel_last = np.array(frame_tensor[:, 0]).transpose(1, 2, 0) * 255.
        rgb_frames.append(channel_last.astype(np.uint8))

        per_object = np.array(mask_tensor[1:objects + 1, 0])
        object_masks.append(per_object.astype(np.uint8))
    return rgb_frames, object_masks

def load_all_initial_masks_from_dataset():
    """Build the first-frame label mask of every video in ``vidTrain``.

    For each video, annotation ``i`` (in ``annTrain`` order, restricted to that
    video) contributes its frame-0 mask multiplied by ``i + 1``; annotations
    without a frame-0 mask are skipped but still consume their index.

    Returns:
        List with one unified mask per video, in ``vidTrain`` order.
    """
    # Group annotations per video once instead of rescanning annTrain per video
    ann_by_video = {}
    for a in annTrain:
        ann_by_video.setdefault(a['video_id'], []).append(a)

    all_masks = []
    for video in vidTrain:
        masks = []
        for i, a in enumerate(ann_by_video.get(video['id'], [])):
            # Decode each mask a single time (it used to be decoded twice)
            first_mask = annToMask(a, 0)
            if first_mask is not None:
                masks.append(first_mask * (i + 1))
        all_masks.append(unifyMasks(masks, video['width'], video['height']))
    return all_masks

def load_all_masks_for_video(video,num_masks, annotations=None):
    """Build the ground-truth label mask of every frame of ``video``.

    Only objects that have a segmentation in frame 0 are used, and object
    ``i`` of the video's annotation list keeps label ``i + 1``. That is the
    same labelling run_model_on_ovis_set uses for its initial mask, so
    predicted and ground-truth labels stay aligned even when an earlier
    annotation has no frame-0 segmentation. When the first ``num_masks``
    annotations all have one, the result equals the previous behaviour.

    Args:
        video: Video dict (keys read: ``id``, ``length``, ``width``, ``height``).
        num_masks: Maximum number of objects to include.
        annotations: Annotation dicts to search; defaults to the global
            ``annTrain``.

    Returns:
        List with one unified mask per frame.
    """
    if annotations is None:
        annotations = annTrain
    ann = [a for a in annotations if a['video_id'] == video['id']]

    # (label, annotation) pairs of the objects visible in the first frame.
    # annToMask returns None exactly when the frame's segmentation is None.
    tracked = [(i + 1, a) for i, a in enumerate(ann) if a['segmentations'][0] is not None]
    tracked = tracked[:num_masks]

    all_masks  = []
    for image_num in range(0,video['length']):
        masks = []
        for label, a in tracked:
            annot = annToMask(a, image_num)
            if annot is not None:
                masks.append(annot * label)
        all_masks.append(unifyMasks(masks, video['width'], video['height']))
    return all_masks

def pad_to_divisible_by_two(frames):
    """Zero-pad frames so that height and width are even.

    The target size is the largest height/width found in ``frames``, rounded
    up to the next even number; every frame is padded at the bottom and right
    up to that size. The third axis is left untouched.

    Returns:
        List of padded frames, in input order.
    """
    tallest = max(f.shape[0] for f in frames)
    widest = max(f.shape[1] for f in frames)
    # Adding the remainder bumps odd sizes by one and leaves even sizes alone
    target_h = tallest + (tallest % 2)
    target_w = widest + (widest % 2)

    return [
        np.pad(
            f,
            ((0, target_h - f.shape[0]), (0, target_w - f.shape[1]), (0, 0)),
            mode='constant',
        )
        for f in frames
    ]

def generate_video_from_frames(frames, output_path, fps=30):
    """Encode ``frames`` as a libx264 video at ``output_path``.

    Args:
        frames: Sequence of frames; converted with ``np.asarray`` and
            ``torch.from_numpy`` before being passed to
            ``torchvision.io.write_video``.
        output_path: Destination file. Its parent folder is created if needed.
        fps: Frames per second of the output video.

    Returns:
        ``output_path``.
    """
    frames = torch.from_numpy(np.asarray(frames))
    output_dir = os.path.dirname(output_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    torchvision.io.write_video(output_path, frames, fps=fps, video_codec="libx264")
    return output_path

def select_smaller_set(videos, size = 100):
    """Return the ``size`` videos with the smallest footprint, smallest first.

    The footprint of a video is ``width * height * number of frames``.

    Fixes over the previous version: ``size`` is honoured (it was shadowed and
    a hard-coded 100 was used), the frame area is ``width * height`` (it was
    ``height * height``), and the result is taken from ``videos`` itself
    instead of the global ``vidTrain``.

    Args:
        videos: List of video dicts (keys read: ``width``, ``height``,
            ``file_names``).
        size: Maximum number of videos to return.
    """
    def _footprint(video):
        return video['width'] * video['height'] * len(video['file_names'])

    return sorted(videos, key=_footprint)[:size]

def select_bigger_set(videos, size = 100):
    """Return the ``size`` videos with the largest footprint, largest first.

    The footprint of a video is ``width * height * number of frames``.

    Fixes over the previous version: ``size`` is honoured (it was shadowed and
    a hard-coded 100 was used), the frame area is ``width * height`` (it was
    ``height * height``), and the result is taken from ``videos`` itself
    instead of the global ``vidTrain``.

    Args:
        videos: List of video dicts (keys read: ``width``, ``height``,
            ``file_names``).
        size: Maximum number of videos to return.
    """
    def _footprint(video):
        return video['width'] * video['height'] * len(video['file_names'])

    # Ascending sort then reverse, as before, so ties keep the same order
    return sorted(videos, key=_footprint)[::-1][:size]

In [6]:
def calculate_iou(mask1, mask2):
    """Intersection-over-union between two label masks.

    Label 0 is treated as background and ignored; every other value found in
    either mask is scored as one object.

    Args:
        mask1, mask2: Integer label arrays of identical shape.

    Returns:
        Tuple ``(iou, iou_mean_object, iou_per_label)``:
            * ``iou``: summed intersections over summed unions of all labels.
            * ``iou_mean_object``: unweighted mean of the per-label IoUs.
            * ``iou_per_label``: dict ``label -> IoU``.
        When neither mask contains a non-zero label the result is
        ``(nan, nan, {})``; db_statistics aggregates with ``np.nanmean``, so
        such frames are skipped there.
    """
    # Ensure both masks have the same shape
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    # Drop background explicitly. Slicing off the first unique value removed a
    # real object whenever no background pixel was present.
    present = np.unique(np.concatenate((mask1, mask2)))
    labels = present[present != 0]
    if labels.size == 0:
        return np.nan, np.nan, {}

    iou_per_label = {}
    total_intersection = 0
    total_union = 0

    for label in labels:
        mask1_label = mask1 == label
        mask2_label = mask2 == label
        n_intersection = np.logical_and(mask1_label, mask2_label).sum()
        # Never zero: the label occurs in at least one of the masks
        n_union = np.logical_or(mask1_label, mask2_label).sum()
        total_intersection += n_intersection
        total_union += n_union
        iou_per_label[label] = n_intersection / n_union

    # IoU over the whole mask, accumulated as exact integer pixel counts
    iou = total_intersection / total_union

    # IoU as mean over objects
    iou_mean_object = sum(iou_per_label.values()) / len(iou_per_label)

    return iou,iou_mean_object, iou_per_label

def compute_f_score(true_positives, false_positives, false_negatives):
    """F-measure (harmonic mean of precision and recall) from raw counts.

    Any ratio whose denominator is zero is taken as 0, so all-zero counts
    give an F-measure of 0.
    """
    predicted_positives = true_positives + false_positives
    if predicted_positives != 0:
        precision = true_positives / predicted_positives
    else:
        precision = 0

    actual_positives = true_positives + false_negatives
    if actual_positives != 0:
        recall = true_positives / actual_positives
    else:
        recall = 0

    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * (precision * recall) / denominator

def compute_f_measure(mask1, mask2):
    """Region F-measure between two label masks.

    ``mask1`` is treated as the prediction and ``mask2`` as the reference:
    pixels set only in ``mask1`` count as false positives, pixels set only in
    ``mask2`` as false negatives. Label 0 is background and ignored.

    Args:
        mask1, mask2: Integer label arrays of identical shape.

    Returns:
        Tuple ``(overall_f_measure, f_mean_object, f_measure_per_label)``:
            * ``overall_f_measure``: F-measure of the counts summed over labels.
            * ``f_mean_object``: unweighted mean of the per-label F-measures.
            * ``f_measure_per_label``: dict ``label -> F-measure``.
        When neither mask contains a non-zero label the result is
        ``(nan, nan, {})``; db_statistics aggregates with ``np.nanmean``, so
        such frames are skipped there.
    """
    # Ensure both masks have the same shape
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    # Drop background explicitly. Slicing off the first unique value removed a
    # real object whenever no background pixel was present.
    present = np.unique(np.concatenate((mask1, mask2)))
    labels = present[present != 0]
    if labels.size == 0:
        return np.nan, np.nan, {}

    f_measure_per_label = {}
    add_true_positives = 0
    add_false_positives = 0
    add_false_negatives = 0

    for label in labels:
        mask1_label = mask1 == label
        mask2_label = mask2 == label

        true_positives = np.logical_and(mask1_label, mask2_label).sum()
        false_positives = np.logical_and(mask1_label, np.logical_not(mask2_label)).sum()
        false_negatives = np.logical_and(np.logical_not(mask1_label), mask2_label).sum()

        add_true_positives += true_positives
        add_false_positives += false_positives
        add_false_negatives += false_negatives

        f_measure_per_label[label] = compute_f_score(true_positives, false_positives, false_negatives)

    # F-measure over the whole mask
    overall_f_measure = compute_f_score(add_true_positives, add_false_positives, add_false_negatives)

    # F-measure as mean over objects
    f_mean_object = sum(f_measure_per_label.values()) / len(f_measure_per_label)

    return overall_f_measure,f_mean_object,f_measure_per_label

def split_dict_list_to_lists(dict_list):
    """Regroup a list of dicts into one value list per key.

    Returns:
        ``(result, keys_in_order)`` where ``keys_in_order`` lists the keys in
        order of first appearance and ``result[i]`` holds, in input order, the
        values found under ``keys_in_order[i]``. Dicts lacking a key simply
        contribute nothing to that key's list.
    """
    grouped = {}
    for entry in dict_list:
        for key, value in entry.items():
            grouped.setdefault(key, []).append(value)

    keys_in_order = list(grouped)
    result = [grouped[key] for key in keys_in_order]
    return result, keys_in_order

def add_dict(list_of_dicts):
    """Average the values of several dicts key by key.

    Each key's values are summed and divided by the number of dicts that
    contain that key.

    Returns:
        Dict ``key -> mean value``, keys in order of first appearance.
    """
    totals, occurrences = {}, {}
    for entry in list_of_dicts:
        for key in entry:
            totals[key] = totals.get(key, 0) + entry[key]
            occurrences[key] = occurrences.get(key, 0) + 1

    # Turn the running sums into means
    for key, count in occurrences.items():
        totals[key] /= count
    return totals

def db_statistics(per_frame_values):
    """ Compute mean,recall and decay from per-frame evaluation.
    Arguments:
        per_frame_values (ndarray): per-frame evaluation

    Returns:
        M,O,D (float,float,float):
            return evaluation statistics: mean,recall,decay.
            Mean ignores NaN entries, recall is the fraction of values
            above 0.5, and decay is the mean of the first of four temporal
            bins minus the mean of the last one.
    """

    # strip off nan values
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        M = np.nanmean(per_frame_values)
        O = np.nanmean(per_frame_values > 0.5)

    N_bins = 4
    ids = np.round(np.linspace(1, len(per_frame_values), N_bins + 1) + 1e-10) - 1
    # int64 rather than uint8: an 8-bit index wraps around for sequences longer
    # than 256 frames and corrupts the bin boundaries.
    ids = ids.astype(np.int64)

    D_bins = [per_frame_values[ids[i]:ids[i + 1] + 1] for i in range(N_bins)]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        D = np.nanmean(D_bins[0]) - np.nanmean(D_bins[-1])

    return M, O, D
def compute_statistics_per_label(args): 
    """Run db_statistics on each label's series of per-frame values.

    Args:
        args: Pair ``(series_per_label, keys)`` as returned by
            split_dict_list_to_lists.

    Returns:
        Dict mapping ``str(key)`` to the ``(mean, recall, decay)`` tuple of
        the matching series.
    """
    series_per_label, label_keys = args
    return {
        f'{key}': db_statistics(np.array(series))
        for series, key in zip(series_per_label, label_keys)
    }

def compute_all_metrics(name,masks,ground_truth_masks,df_per_frame_metrics):
    """Score predicted masks against ground truth, frame by frame.

    One row per frame is appended in place to ``df_per_frame_metrics``. The
    row is ordered ``[video, frame, J, F, J-object-mean, F-object-mean,
    J-per-label, F-per-label]`` so that it lines up with the callers' column
    headers, which list the J (IoU) column before the F column in each pair.
    Previously the F values were written into the J columns and vice versa.

    Args:
        name: Video name stored in every appended row.
        masks: Predicted label masks.
        ground_truth_masks: Reference label masks; paired with ``masks`` by
            ``zip``, so the shorter sequence decides how many frames are scored.
        df_per_frame_metrics: DataFrame with eight columns, mutated in place.

    Returns:
        ``(f_statistics, j_statistics, f_statistics_object,
        j_statistics_object, f_statistics_per_label, j_statistics_per_label)``.
        The first four are ``(mean, recall, decay)`` tuples from db_statistics;
        the last two are dicts of such tuples keyed by label.
    """
    f_measure_lst,f_measure_object_lst, f_measure_per_label_lst, iou_lst, iou_object_lst, iou_per_label_lst  =  [], [], [], [], [], []
    for i,(mask_infered, mask_gt) in enumerate(zip(masks,ground_truth_masks)):
        f_measure,f_measure_object, f_measure_per_label = compute_f_measure(mask_infered,mask_gt)
        iou,iou_object, iou_per_label = calculate_iou(mask_infered,mask_gt)

        # dtype=object keeps the dicts and numbers as they are
        df_per_frame_metrics.loc[len(df_per_frame_metrics)] = np.array(
            [name,i,iou,f_measure,iou_object,f_measure_object,iou_per_label,f_measure_per_label],
            dtype=object,
        )

        f_measure_lst.append(f_measure)
        f_measure_object_lst.append(f_measure_object)
        f_measure_per_label_lst.append(f_measure_per_label)

        iou_lst.append(iou)
        iou_object_lst.append(iou_object)
        iou_per_label_lst.append(iou_per_label)

    f_statistics = db_statistics(np.array(f_measure_lst))
    j_statistics = db_statistics(np.array(iou_lst))

    f_statistics_object = db_statistics(np.array(f_measure_object_lst))
    j_statistics_object = db_statistics(np.array(iou_object_lst))

    f_statistics_per_label = compute_statistics_per_label(split_dict_list_to_lists(f_measure_per_label_lst))
    j_statistics_per_label = compute_statistics_per_label(split_dict_list_to_lists(iou_per_label_lst))
    return f_statistics,j_statistics,f_statistics_object,j_statistics_object, f_statistics_per_label, j_statistics_per_label




            

In [7]:
def run_model_on_ovis_set(name, model,path_set,videos, annotations, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Track every video of an OVIS-style set and optionally score/export it.

    For each video the frame-0 annotations are merged into one label mask
    (object ``i`` of the video's annotation list gets label ``i + 1``) which
    is handed to ``model.generator`` together with all frames of the video.

    Fixes over the previous version:
        * all frames are tracked (only ``images[0:10]`` used to be passed on);
        * the summary header contains ``F-Decay-Obj``, so it matches the
          15-value row that is appended;
        * whole-mask means and per-object means are no longer swapped in the
          summary row, and the row keeps numeric values instead of strings;
        * painted frames are saved in one sub-folder per video instead of
          overwriting each other;
        * an empty ``videos`` no longer fails at the final ``return``.

    Args:
        name: Run identifier; outputs go to ``./result/{name}``.
        model: Object exposing ``xmem.clear_memory()`` and
            ``generator(images=..., template_mask=...)`` returning
            ``(masks, logits, painted_images)``.
        path_set: Folder the entries of ``video['file_names']`` are relative to.
        videos: Iterable of video dicts (keys read here: ``id``, ``width``,
            ``height``, ``file_names``).
        annotations: Annotation dicts; those whose ``video_id`` equals the
            video ``id`` build the initial mask.
        compute_metrics: Compute J/F statistics, write ``per_object_metrics.csv``
            and ``whole_metrics.csv`` and append a row to ``all_tests_csv``.
        save_masks: Save the painted frames as PNG files.
        compute_video: Encode the painted frames as an mp4 per video.
        verbose: Print progress information.

    Returns:
        ``(masks, logits, painted_images)`` of the last processed video, or
        ``(None, None, None)`` when ``videos`` is empty.
    """
    df_whole_metrics,df_per_frame_metrics = None,None
    if compute_metrics:
        g_measures_by_video = ['Video','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay','J-Statiscts-Object','F-Statiscts-Object']
        df_whole_metrics = pd.DataFrame(columns=g_measures_by_video)
        df_per_frame_metrics_col = ['Video','Frame','J','F','J-Mean','F-Mean','J-Object','F-Object']
        df_per_frame_metrics = pd.DataFrame(columns=df_per_frame_metrics_col)

    folder_path = f'./result/{name}'
    if compute_metrics or compute_video or save_masks:
        os.makedirs(folder_path, exist_ok=True)
        if compute_video:
            path_to_videos = folder_path + '/videos'
            os.makedirs(path_to_videos, exist_ok=True)

    # Defined up-front so the final return works even when `videos` is empty
    masks, logits, painted_images = None, None, None

    for video in videos:
        # Load all images as np.array
        if verbose: print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
        video_folder = video["file_names"][0].split("/")[0]
        if verbose: print(f'Tracking Video {video_folder} with dimensions {video["width"]}x{video["height"]}')
        if verbose: print('Loading dataset images')
        images = load_images_from_folder(path_set,video['file_names'])

        # Turn the first-frame annotations into one usable label mask
        if verbose: print('Creating first annotated mask for VOS model')
        ann = [a for a in annotations if a['video_id'] == video['id']]
        first_frame_masks = []
        for i, a in enumerate(ann):
            first_mask = annToMask(a, 0)  # decoded once per annotation
            if first_mask is not None:
                first_frame_masks.append(first_mask * (i + 1))
        number_masks = len(first_frame_masks)
        if verbose: print(f'Initial loaded MASKS {number_masks}')
        initial_mask = unifyMasks(first_frame_masks, video['width'], video['height'])

        # Compute masks for all images
        if verbose:print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images = model.generator(images=images, template_mask=initial_mask)
        model.xmem.clear_memory()

        if compute_metrics:
            if verbose: print('Computing Metrics')
            ground_truth_masks = load_all_masks_for_video(video,number_masks)
            # Frame 0 is the given template, so it is excluded from scoring
            (f_mean, f_recall, f_decay),(j_mean, j_recall, j_decay),\
            (f_mean_obj, f_recall_obj, f_decay_obj),(j_mean_obj, j_recall_obj, j_decay_obj),\
            f_frame,j_frame = compute_all_metrics(video_folder,masks[1:],ground_truth_masks[1:],df_per_frame_metrics)

            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_folder,(f_mean+j_mean)/2,j_mean,j_recall,j_decay,f_mean,f_recall,f_decay,j_frame,f_frame])
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_folder + '_object',(f_mean_obj+j_mean_obj)/2,j_mean_obj,j_recall_obj,j_decay_obj,f_mean_obj,f_recall_obj,f_decay_obj,j_frame,f_frame])

        if compute_video:
            if verbose: print('Generating video')
            if video['width'] % 2 != 0 or video['height'] % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{video['id']}_{video_folder}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            # One folder per video; a shared folder made every video overwrite the previous one
            path_to_masks = folder_path + '/masks/' + video_folder
            os.makedirs(path_to_masks, exist_ok=True)
            for i,mask in enumerate(painted_images):
                image = Image.fromarray(mask)
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(i + 1)))

    if compute_metrics:
        df_per_frame_metrics.to_csv(folder_path + '/per_object_metrics.csv',index=False)
        df_whole_metrics.to_csv(folder_path + '/whole_metrics.csv',index=False)

        all_test_metrics = None
        if os.path.exists(all_tests_csv):
            all_test_metrics = pd.read_csv(all_tests_csv,index_col = None)
        else:
            all_test_col = ['Test','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay',\
                                    'J&F-Mean-Obj', 'J-Mean-Obj', 'J-Recall-Obj', 'J-Decay-Obj', 'F-Mean-Obj', 'F-Recall-Obj','F-Decay-Obj']
            all_test_metrics = pd.DataFrame(columns=all_test_col)

        # Rows named '<video>_object' hold the per-object averages; the others
        # hold the whole-mask values. The last two columns are per-label dicts.
        is_object_row = df_whole_metrics['Video'].str.endswith('_object')
        numeric_part = df_whole_metrics.iloc[:, 1:-2].astype(float)
        normal_mean = numeric_part[~is_object_row].mean().tolist()
        per_object_mean = numeric_part[is_object_row].mean().tolist()
        # Plain list: np.array would turn every number into a string next to `name`
        all_test_metrics.loc[len(all_test_metrics)] = [name] + normal_mean + per_object_mean
        all_test_metrics.to_csv(all_tests_csv, index = False)

    return masks, logits, painted_images

In [8]:
def run_model_on_davis_set(name, model,videoLoader, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Track every video served by a DAVIS loader and optionally score/export it.

    The ground-truth masks of each frame are merged into one label mask
    (object ``i`` gets label ``i + 1``); the mask of frame 0 is the template
    handed to ``model.generator`` together with all frames.

    Fixes over the previous version:
        * whole-mask means and per-object means are no longer swapped in the
          summary row, and the row keeps numeric values instead of strings;
        * an empty loader no longer fails at the final ``return``.

    Args:
        name: Run identifier; outputs go to ``./result/{name}``.
        model: Object exposing ``xmem.clear_memory()`` and
            ``generator(images=..., template_mask=...)`` returning
            ``(masks, logits, painted_images)``.
        videoLoader: Iterable yielding ``(num_objects, info)`` pairs; also
            read here: ``resolution``, ``year`` and (via load_all_images_davis)
            ``load_single_image``.
        compute_metrics: Compute J/F statistics, write ``per_object_metrics.csv``
            and ``whole_metrics.csv`` and append a row to ``all_tests_csv``.
        save_masks: Save the painted frames as PNG files, one folder per video.
        compute_video: Encode the painted frames as an mp4 per video.
        verbose: Print progress information.

    Returns:
        ``(masks, logits, painted_images)`` of the last processed video, or
        ``(None, None, None)`` when the loader yields nothing.
    """
    df_whole_metrics,df_per_frame_metrics = None,None
    if compute_metrics:
        g_measures_by_video = ['Video','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay','J-Statiscts-Object','F-Statiscts-Object']
        df_whole_metrics = pd.DataFrame(columns=g_measures_by_video)
        df_per_frame_metrics_col = ['Video','Frame','J-Mean','F-Mean','J-Mean-Object','F-Mean-Object','J-Object','F-Object']
        df_per_frame_metrics = pd.DataFrame(columns=df_per_frame_metrics_col)

    folder_path = f'./result/{name}'
    if compute_metrics or compute_video or save_masks:
        os.makedirs(folder_path, exist_ok=True)
        if compute_video:
            path_to_videos = folder_path + '/videos'
            os.makedirs(path_to_videos, exist_ok=True)

    # Defined up-front so the final return works even when the loader is empty
    masks, logits, painted_images = None, None, None

    for video in list(videoLoader):

        num_objects, info = video
        video_name = info['name']
        num_frames = info['num_frames']
        # NOTE(review): `height`/`width` are passed below to unifyMasks in
        # (height, width) order although its signature is (masks, width, height),
        # so the arrays are shaped (width, height) in terms of these variables.
        # The names appear to hold swapped values - confirm against the loader.
        if videoLoader.resolution == '480p':
            height = info['size_480p'][1]
            width = info['size_480p'][0]
        else:
            if videoLoader.year == '2017':
                height = 3840
                width = 2026
            else:
                height = 1920
                width = 1080
        if verbose: print(f'Tracking Video {video_name} with dimensions {width}x{height}')
        if verbose: print('Loading dataset images and masks')
        images,ground_truth_masks = load_all_images_davis(videoLoader,(video_name,num_frames,num_objects.item()))
        if verbose: print('Creating first annotated mask for VOS model')
        # Give each object its own label, then flatten to one mask per frame
        combined_masks = [[mask * (i+1) for i, mask in enumerate(frameMask)] for frameMask in ground_truth_masks]
        ground_truth_masks = [unifyMasks(mask, height, width) for mask in combined_masks]
        initial_mask = ground_truth_masks[0]

        # Compute masks for all images
        if verbose:print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images = model.generator(images=images, template_mask=initial_mask)
        model.xmem.clear_memory()

        if compute_metrics:
            if verbose: print('Computing Metrics')
            # Frame 0 is the given template, so it is excluded from scoring
            (f_mean, f_recall, f_decay),(j_mean, j_recall, j_decay),\
            (f_mean_obj, f_recall_obj, f_decay_obj),(j_mean_obj, j_recall_obj, j_decay_obj),\
            f_frame,j_frame = compute_all_metrics(video_name,masks[1:],ground_truth_masks[1:],df_per_frame_metrics)

            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_name,(f_mean+j_mean)/2,j_mean,j_recall,j_decay,f_mean,f_recall,f_decay,j_frame,f_frame])
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_name + '_object',(f_mean_obj+j_mean_obj)/2,j_mean_obj,j_recall_obj,j_decay_obj,f_mean_obj,f_recall_obj,f_decay_obj,j_frame,f_frame])

        if compute_video:
            if verbose: print('Generating video')
            if width % 2 != 0 or height % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{video_name}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            path_to_masks = folder_path + '/masks/' + video_name
            os.makedirs(path_to_masks, exist_ok=True)
            for i,mask in enumerate(painted_images):
                image = Image.fromarray(mask)
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(i + 1)))

    if compute_metrics:
        df_per_frame_metrics.to_csv(folder_path + '/per_object_metrics.csv',index=False)
        df_whole_metrics.to_csv(folder_path + '/whole_metrics.csv',index=False)

        all_test_metrics = None
        if os.path.exists(all_tests_csv):
            all_test_metrics = pd.read_csv(all_tests_csv,index_col = None)
        else:
            all_test_col = ['Test','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay',\
                                    'J&F-Mean-Obj', 'J-Mean-Obj', 'J-Recall-Obj', 'J-Decay-Obj', 'F-Mean-Obj', 'F-Recall-Obj','F-Decay-Obj']
            all_test_metrics = pd.DataFrame(columns=all_test_col)

        # Rows named '<video>_object' hold the per-object averages; the others
        # hold the whole-mask values. The last two columns are per-label dicts.
        is_object_row = df_whole_metrics['Video'].str.endswith('_object')
        numeric_part = df_whole_metrics.iloc[:, 1:-2].astype(float)
        normal_mean = numeric_part[~is_object_row].mean().tolist()
        per_object_mean = numeric_part[is_object_row].mean().tolist()
        # Plain list: np.array would turn every number into a string next to `name`
        all_test_metrics.loc[len(all_test_metrics)] = [name] + normal_mean + per_object_mean
        all_test_metrics.to_csv(all_tests_csv, index = False)

    return masks, logits, painted_images

In [14]:
def run_model_on_longdata_set(name, model,videoLoader, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Track every sequence of a long-video dataset and optionally score/export it.

    All JPEG frames of a sequence are tracked starting from the first
    ground-truth mask. Only the frames that have a ground-truth mask are
    scored: their position in the sorted frame list is looked up from the
    mask ids returned by the loader.

    Fixes over the previous version:
        * whole-mask means and per-object means are no longer swapped in the
          summary row, and the row keeps numeric values instead of strings;
        * a loader without sequences no longer fails at the final ``return``;
        * the frame shape is unpacked as (height, width, channels).

    Args:
        name: Run identifier; outputs go to ``./result/{name}``.
        model: Object exposing ``xmem.clear_memory()`` and
            ``generator(images=..., template_mask=...)`` returning
            ``(masks, logits, painted_images)``.
        videoLoader: Object exposing ``get_sequences()``,
            ``get_all_masks(seq, True)`` and ``root_folder``.
        compute_metrics: Compute J/F statistics, write ``per_object_metrics.csv``
            and ``whole_metrics.csv`` and append a row to ``all_tests_csv``.
        save_masks: Save the painted frames as PNG files, one folder per sequence.
        compute_video: Encode the painted frames as an mp4 per sequence.
        verbose: Print progress information.

    Returns:
        ``(masks, logits, painted_images)`` of the last processed sequence, or
        ``(None, None, None)`` when there are no sequences.
    """
    df_whole_metrics,df_per_frame_metrics = None,None
    if compute_metrics:
        g_measures_by_video = ['Video','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay','J-Statiscts-Object','F-Statiscts-Object']
        df_whole_metrics = pd.DataFrame(columns=g_measures_by_video)
        df_per_frame_metrics_col = ['Video','Frame','J-Mean','F-Mean','J-Mean-Object','F-Mean-Object','J-Object','F-Object']
        df_per_frame_metrics = pd.DataFrame(columns=df_per_frame_metrics_col)

    folder_path = f'./result/{name}'
    if compute_metrics or compute_video or save_masks:
        os.makedirs(folder_path, exist_ok=True)
        if compute_video:
            path_to_videos = folder_path + '/videos'
            os.makedirs(path_to_videos, exist_ok=True)

    # Defined up-front so the final return works even when there are no sequences
    masks, logits, painted_images = None, None, None

    for seq in list(videoLoader.get_sequences()):

        all_gt_masks, _, all_masks_id = videoLoader.get_all_masks(seq, True)
        images_root = os.path.join(videoLoader.root_folder ,'JPEGImages',seq)
        file_names = sorted(os.listdir(images_root))
        # Numeric frame id = file name without extension
        file_ids = [int(file_name.split('.')[0]) for file_name in file_names]
        # Positions (in the sorted frame list) of the annotated frames after the first
        test_ids = [file_ids.index(int(mask_id)) for mask_id in all_masks_id[1:]]
        all_frames = load_images_from_folder(images_root,file_names)
        initial_mask = all_gt_masks[0,0,:,:]
        # Image arrays are (rows, columns, channels)
        height,width,_= all_frames[0].shape

        # Compute masks for all images
        if verbose:print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images = model.generator(images=all_frames, template_mask=initial_mask)
        model.xmem.clear_memory()

        print(f'Num Frames {len(masks)}')
        if compute_metrics:
            if verbose: print('Computing Metrics')
            (f_mean, f_recall, f_decay),(j_mean, j_recall, j_decay),\
            (f_mean_obj, f_recall_obj, f_decay_obj),(j_mean_obj, j_recall_obj, j_decay_obj),\
            f_frame,j_frame = compute_all_metrics(seq, [masks[i] for i in test_ids],all_gt_masks[0,1:,:,:],df_per_frame_metrics)

            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([seq,(f_mean+j_mean)/2,j_mean,j_recall,j_decay,f_mean,f_recall,f_decay,j_frame,f_frame])
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([seq + '_object',(f_mean_obj+j_mean_obj)/2,j_mean_obj,j_recall_obj,j_decay_obj,f_mean_obj,f_recall_obj,f_decay_obj,j_frame,f_frame])

        if compute_video:
            if verbose: print('Generating video')
            if width % 2 != 0 or height % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{seq}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            path_to_masks = folder_path + '/masks/' + seq
            os.makedirs(path_to_masks, exist_ok=True)
            for i,mask in enumerate(painted_images):
                image = Image.fromarray(mask)
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(i + 1)))

    if compute_metrics:
        df_per_frame_metrics.to_csv(folder_path + '/per_object_metrics.csv',index=False)
        df_whole_metrics.to_csv(folder_path + '/whole_metrics.csv',index=False)

        all_test_metrics = None
        if os.path.exists(all_tests_csv):
            all_test_metrics = pd.read_csv(all_tests_csv,index_col = None)
        else:
            all_test_col = ['Test','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay',\
                                    'J&F-Mean-Obj', 'J-Mean-Obj', 'J-Recall-Obj', 'J-Decay-Obj', 'F-Mean-Obj', 'F-Recall-Obj','F-Decay-Obj']
            all_test_metrics = pd.DataFrame(columns=all_test_col)

        # Rows named '<sequence>_object' hold the per-object averages; the others
        # hold the whole-mask values. The last two columns are per-label dicts.
        is_object_row = df_whole_metrics['Video'].str.endswith('_object')
        numeric_part = df_whole_metrics.iloc[:, 1:-2].astype(float)
        normal_mean = numeric_part[~is_object_row].mean().tolist()
        per_object_mean = numeric_part[is_object_row].mean().tolist()
        # Plain list: np.array would turn every number into a string next to `name`
        all_test_metrics.loc[len(all_test_metrics)] = [name] + normal_mean + per_object_mean
        all_test_metrics.to_csv(all_tests_csv, index = False)

    return masks, logits, painted_images

In [10]:
# Checkpoint files handed to TrackingAnything in the run cells below.
# The default SAM checkpoint is replaced by the HQ one when the text after the
# last '.' in sys.prefix contains 'HQ'.
SAM_checkpoint = "./checkpoints/sam_vit_h_4b8939.pth"
if 'HQ' in sys.prefix.split('.')[-1]:SAM_checkpoint = "./checkpoints/sam_hq_vit_h.pth"
xmem_checkpoint = "./checkpoints/XMem-s012.pth"
#e2fgvi_checkpoint = "./checkpoints/E2FGVI-HQ-CVPR22.pth"


LONG DATASET

In [None]:
# LongDataset, 'val' file list, with 'use_refinement' disabled (run name starts with 'XMEM').
runtimeargs = {
    'DatasetArgs': {'Dataset': 'LongDataset', 'Set': 'val'},
    'TrackingAnythingArgs': {'use_refinement': False},
}

VideoLoader = LongVideoDataset(
    root_folder=longdataset_root,
    filelist=runtimeargs['DatasetArgs']['Set'] + '.txt',
)
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, runtimeargs['TrackingAnythingArgs'])

# Run name layout: <Refined|XMEM>_<HQ|SAM>[_<refinement_mode>]_<dataset tag>_<5 random letters>
runname = 'Refined' if runtimeargs['TrackingAnythingArgs']['use_refinement'] else 'XMEM'
runname += '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
if runtimeargs['TrackingAnythingArgs']['use_refinement']:
    runname += '_' + runtimeargs['TrackingAnythingArgs']['refinement_mode']
if runtimeargs['DatasetArgs']['Dataset'] == 'Davis':
    runname += '_Davis_{}_{}_{}'.format(
        runtimeargs['DatasetArgs']['Year'],
        runtimeargs['DatasetArgs']['Set'],
        runtimeargs['DatasetArgs']['Resolution'],
    )
else:
    # Any non-Davis dataset is tagged with its own name.
    runname += '_' + runtimeargs['DatasetArgs']['Dataset']
# Random suffix keeps repeated runs of the same configuration apart.
runname += '_' + ''.join(random.choice(string.ascii_letters) for _ in range(5))

print(f'Running Test: {runname}')
masks, logits, painted_images = run_model_on_longdata_set(
    name=runname,
    model=model,
    videoLoader=VideoLoader,
    compute_metrics=True,
    save_masks=True,
    compute_video=True,
    verbose=False,
)

DAVIS 

In [None]:
# DAVIS 2016 ('2016/val.txt'), 1080p, single-object loader, 'use_refinement' disabled.
runtimeargs = {
    'DatasetArgs': {'Dataset': 'Davis', 'Year': 16, 'Set': 'val', 'Resolution': '1080p'},
    'TrackingAnythingArgs': {'use_refinement': False},
}

VideoLoader = DAVIS_MO_Test(
    davis_root,
    resolution=runtimeargs['DatasetArgs']['Resolution'],
    imset=f"20{runtimeargs['DatasetArgs']['Year']}/{runtimeargs['DatasetArgs']['Set']}.txt",
    # Only the 2016 split is loaded in single-object mode.
    single_object=(runtimeargs['DatasetArgs']['Year'] == 16),
)
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, runtimeargs['TrackingAnythingArgs'])

# Run name layout: <Refined|XMEM>_<HQ|SAM>[_<refinement_mode>]_<dataset tag>_<5 random letters>
runname = 'Refined' if runtimeargs['TrackingAnythingArgs']['use_refinement'] else 'XMEM'
runname += '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
if runtimeargs['TrackingAnythingArgs']['use_refinement']:
    runname += '_' + runtimeargs['TrackingAnythingArgs']['refinement_mode']
if runtimeargs['DatasetArgs']['Dataset'] == 'Davis':
    runname += '_Davis_{}_{}_{}'.format(
        runtimeargs['DatasetArgs']['Year'],
        runtimeargs['DatasetArgs']['Set'],
        runtimeargs['DatasetArgs']['Resolution'],
    )
else:
    runname += '_Ovis'
# Random suffix keeps repeated runs of the same configuration apart.
runname += '_' + ''.join(random.choice(string.ascii_letters) for _ in range(5))

print(f'Running Test: {runname}')
masks, logits, painted_images = run_model_on_davis_set(
    name=runname,
    model=model,
    videoLoader=VideoLoader,
    compute_metrics=True,
    save_masks=True,
    compute_video=True,
    verbose=False,
)

In [None]:
# DAVIS 2017 ('2017/val.txt'), 1080p, multi-object loader, 'use_refinement' disabled.
runtimeargs = {
    'DatasetArgs': {'Dataset': 'Davis', 'Year': 17, 'Set': 'val', 'Resolution': '1080p'},
    'TrackingAnythingArgs': {'use_refinement': False},
}

VideoLoader = DAVIS_MO_Test(
    davis_root,
    resolution=runtimeargs['DatasetArgs']['Resolution'],
    imset=f"20{runtimeargs['DatasetArgs']['Year']}/{runtimeargs['DatasetArgs']['Set']}.txt",
    # Only the 2016 split is loaded in single-object mode; Year 17 gives False here.
    single_object=(runtimeargs['DatasetArgs']['Year'] == 16),
)
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, runtimeargs['TrackingAnythingArgs'])

# Run name layout: <Refined|XMEM>_<HQ|SAM>[_<refinement_mode>]_<dataset tag>_<5 random letters>
runname = 'Refined' if runtimeargs['TrackingAnythingArgs']['use_refinement'] else 'XMEM'
runname += '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
if runtimeargs['TrackingAnythingArgs']['use_refinement']:
    runname += '_' + runtimeargs['TrackingAnythingArgs']['refinement_mode']
if runtimeargs['DatasetArgs']['Dataset'] == 'Davis':
    runname += '_Davis_{}_{}_{}'.format(
        runtimeargs['DatasetArgs']['Year'],
        runtimeargs['DatasetArgs']['Set'],
        runtimeargs['DatasetArgs']['Resolution'],
    )
else:
    runname += '_Ovis'
# Random suffix keeps repeated runs of the same configuration apart.
runname += '_' + ''.join(random.choice(string.ascii_letters) for _ in range(5))

print(f'Running Test: {runname}')
masks, logits, painted_images = run_model_on_davis_set(
    name=runname,
    model=model,
    videoLoader=VideoLoader,
    compute_metrics=True,
    save_masks=True,
    compute_video=True,
    verbose=False,
)

In [None]:
# DAVIS 2016 ('2016/val.txt'), 480p, single-object loader, with 'use_refinement'
# enabled and refinement_mode 'mask_bbox_pos_neg'.
runtimeargs = {
    'DatasetArgs' :{
        'Dataset' : 'Davis',
        'Year' : 16,
        'Set' : 'val',
        'Resolution': '480p'
    },
    'TrackingAnythingArgs' : {
            'use_refinement' : True,
            'refinement_mode' : 'mask_bbox_pos_neg'
                }
}

VideoLoader = DAVIS_MO_Test(davis_root, resolution=runtimeargs['DatasetArgs']['Resolution'], imset='20{}/{}.txt'.format(runtimeargs['DatasetArgs']['Year'],runtimeargs['DatasetArgs']['Set']), single_object=(runtimeargs['DatasetArgs']['Year']==16))
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None,runtimeargs['TrackingAnythingArgs'])

# Run name layout: <Refined|XMEM>_<HQ|SAM>[_<refinement_mode>]_<dataset tag>_<5 random letters>.
# The backbone tag is always written ('_SAM' when the environment is not an HQ one) and
# the refinement mode carries its own leading '_' so that switching 'use_refinement' off
# does not leave an empty '__' segment. This is the same layout the 1080p DAVIS and
# LongDataset cells use, which keeps the 'Test' names comparable across runs.
runname = '{}{}{}_{}_{}'.format(
    'Refined' if runtimeargs['TrackingAnythingArgs']['use_refinement']  else 'XMEM',
    '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM',
    '_'+ runtimeargs['TrackingAnythingArgs']['refinement_mode'] if runtimeargs['TrackingAnythingArgs']['use_refinement'] else '',
    'Davis_{}_{}_{}'.format(runtimeargs['DatasetArgs']['Year'],runtimeargs['DatasetArgs']['Set'],runtimeargs['DatasetArgs']['Resolution']) if runtimeargs['DatasetArgs']['Dataset'] == 'Davis' else 'Ovis',
    # Random suffix keeps repeated runs of the same configuration apart.
    ''.join(random.choice(string.ascii_letters) for _ in range(5))
)
print(f'Running Test: {runname}')
masks, logits, painted_images = run_model_on_davis_set(name = runname,model = model,videoLoader = VideoLoader,compute_metrics = True, save_masks=True, compute_video=True,verbose = False)

In [None]:
# DAVIS 2017 ('2017/val.txt'), 480p, multi-object loader, with 'use_refinement'
# enabled and refinement_mode 'mask_bbox_pos_neg'.
runtimeargs = {
    'DatasetArgs' :{
        'Dataset' : 'Davis',
        'Year' : 17,
        'Set' : 'val',
        'Resolution': '480p'
    },
    'TrackingAnythingArgs' : {
            'use_refinement' : True,
            'refinement_mode' : 'mask_bbox_pos_neg'
                }
}

VideoLoader = DAVIS_MO_Test(davis_root, resolution=runtimeargs['DatasetArgs']['Resolution'], imset='20{}/{}.txt'.format(runtimeargs['DatasetArgs']['Year'],runtimeargs['DatasetArgs']['Set']), single_object=(runtimeargs['DatasetArgs']['Year']==16))
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None,runtimeargs['TrackingAnythingArgs'])

# Run name layout: <Refined|XMEM>_<HQ|SAM>[_<refinement_mode>]_<dataset tag>_<5 random letters>.
# The backbone tag is always written ('_SAM' when the environment is not an HQ one) and
# the refinement mode carries its own leading '_' so that switching 'use_refinement' off
# does not leave an empty '__' segment. This is the same layout the 1080p DAVIS and
# LongDataset cells use, which keeps the 'Test' names comparable across runs.
runname = '{}{}{}_{}_{}'.format(
    'Refined' if runtimeargs['TrackingAnythingArgs']['use_refinement']  else 'XMEM',
    '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM',
    '_'+ runtimeargs['TrackingAnythingArgs']['refinement_mode'] if runtimeargs['TrackingAnythingArgs']['use_refinement'] else '',
    'Davis_{}_{}_{}'.format(runtimeargs['DatasetArgs']['Year'],runtimeargs['DatasetArgs']['Set'],runtimeargs['DatasetArgs']['Resolution']) if runtimeargs['DatasetArgs']['Dataset'] == 'Davis' else 'Ovis',
    # Random suffix keeps repeated runs of the same configuration apart.
    ''.join(random.choice(string.ascii_letters) for _ in range(5))
)
print(f'Running Test: {runname}')
masks, logits, painted_images = run_model_on_davis_set(name = runname,model = model,videoLoader = VideoLoader,compute_metrics = True, save_masks=True, compute_video=True,verbose = False)

OVIS

In [None]:
# Build vidToTest from the OVIS training videos (vidTrain) via select_smaller_set,
# called here without the second argument that the OVIS run cell passes (50).
# NOTE(review): select_smaller_set is defined elsewhere in the notebook; presumably the
# omitted argument falls back to a default subset size — confirm against its definition.
vidToTest = select_smaller_set(vidTrain)

In [None]:
# OVIS run with 'use_refinement' disabled. 'refinement_mode' is still handed to
# TrackingAnything inside the args dict, but it only appears in the run name when
# refinement is enabled.
runtimeargs = {
    'DatasetArgs' :{
        'Dataset' : 'Ovis'
    },
    'TrackingAnythingArgs' : {
            'use_refinement' : False,
            'refinement_mode' : 'mask_bbox'
                }
}

# NOTE(review): vidToTest is computed here but the run below is fed vidTrain[117:118]
# (a single video) instead; confirm which set is intended before relying on the metrics.
vidToTest = select_smaller_set(vidTrain,50)
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None,runtimeargs['TrackingAnythingArgs'])

# Run name layout: <Refined|XMEM>_<HQ|SAM>[_<refinement_mode>]_<dataset tag>_<5 random letters>.
# The '_HQ' / '_SAM' backbone tag is included so the name records which SAM checkpoint
# family the environment selected, as the DAVIS 1080p and LongDataset cells already do.
runname = '{}{}{}_{}_{}'.format(
    'Refined' if runtimeargs['TrackingAnythingArgs']['use_refinement']  else 'XMEM',
    '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM',
    '_' + runtimeargs['TrackingAnythingArgs']['refinement_mode'] if runtimeargs['TrackingAnythingArgs']['use_refinement'] else '',
    'Davis_{}_{}_{}'.format(runtimeargs['DatasetArgs']['Year'],runtimeargs['DatasetArgs']['Set'],runtimeargs['DatasetArgs']['Resolution']) if runtimeargs['DatasetArgs']['Dataset'] == 'Davis' else 'Ovis',
    # Random suffix keeps repeated runs of the same configuration apart.
    ''.join(random.choice(string.ascii_letters) for _ in range(5))

)
# Echo the generated name so the run can be matched to its output folder and CSV row.
print(f'Running Test: {runname}')
masks, logits, painted_images = run_model_on_ovis_set(name = runname,model = model, path_set = ovis_images,videos = vidTrain[117:118],annotations = annTrain,compute_metrics = True, save_masks=True, compute_video=True,verbose = False)

JUNK TESTING 