In [1]:
import argparse
import gdown
import cv2
import numpy as np
import os
import sys
sys.path.append(sys.path[0]+"/tracker")
sys.path.append(sys.path[0]+"/tracker/model")
from track_anything import TrackingAnything
from track_anything import parse_augment
import requests
import json
import torchvision
import torch 
from tools.painter import mask_painter
import psutil
import time
try: 
    from mmcv.cnn import ConvModule
except:
    os.system("mim install mmcv")
import matplotlib.pyplot as plt
from pycocotools import mask as maskUtils
from PIL import Image
from dataset.dataset import DAVIS_MO_Test
from dataset.longdataset import LongVideoDataset
from dataset.errorfunctions import db_eval_boundary,db_eval_iou,seg2bmap
import pandas as pd
import warnings
import random
import string
import cv2
import random

In [2]:
# Dataset locations. POSIX machines use paths relative to the notebook; any
# other platform falls back to the Windows drive layout below.
if os.name == 'posix':
    ovis_anotations = '../data.nosync/OVIS/annotations/'
    ovis_images = '../data.nosync/OVIS/train_images/'
    davis_root = '../data.nosync/DAVIS2017'
    longdataset_root = '../data.nosync/LongDataset/'
    longvos_root = '../data.nosync/LongVOS/test'
else:
    ovis_anotations = 'D:/HADA/data/OVIS/annotations/'
    ovis_images = 'D:/HADA/data/OVIS/train_images/'
    # Raw strings: '\H', '\d' and '\D' are invalid escape sequences in a normal
    # literal. The resulting path values are unchanged.
    davis_root = r'D:\HADA\data\DAVIS'
    longdataset_root = 'D:/HADA/data/LongDataset/'
    longvos_root = r'D:\HADA\data/LongVOS/test'


# CSV file that receives one summary row per evaluation run.
all_tests_csv = './result/all_tests.csv'

Carga de datos (anotaciones, máscaras e imágenes)

In [3]:
def cargarDatos(ruta_ann):
    """Load the train/valid/test annotation JSON files stored under ``ruta_ann``.

    The files ``annotations_train.json``, ``annotations_valid.json`` and
    ``annotations_test.json`` are read from that directory.

    :param ruta_ann: directory containing the three JSON files; a trailing
        path separator is optional.
    :return: tuple ``(clases, vidTrain, annTrain, vidValid, annValid, vidTest,
        annTest)`` where ``clases`` is the ``'categories'`` entry of the train
        file and the others are the ``'videos'`` / ``'annotations'`` entries of
        each split.
    :raises FileNotFoundError: if one of the files does not exist.
    :raises KeyError: if a file lacks one of the expected top-level keys.
    """
    def _load_split(split):
        # os.path.join works whether or not ruta_ann ends with a separator.
        with open(os.path.join(ruta_ann, f'annotations_{split}.json')) as f:
            return json.load(f)

    annotationsTrain = _load_split('train')
    annotationsValid = _load_split('valid')
    annotationsTest = _load_split('test')

    clases = annotationsTrain['categories']
    vidTrain = annotationsTrain['videos']
    annTrain = annotationsTrain['annotations']
    vidValid = annotationsValid['videos']
    annValid = annotationsValid['annotations']
    vidTest = annotationsTest['videos']
    annTest = annotationsTest['annotations']

    return clases, vidTrain, annTrain, vidValid, annValid, vidTest, annTest

# Load every OVIS annotation split once; vidTrain and annTrain are read as
# module-level globals by helper functions defined in later cells.
clases, vidTrain, annTrain, vidValid, annValid, vidTest, annTest = cargarDatos(ovis_anotations)

In [4]:
def annToRLE(ann, frameId):
    """
    Convert one frame's segmentation (polygons, uncompressed RLE or RLE) to RLE.

    :param ann: annotation dict with 'height', 'width' and a per-frame
        'segmentations' list
    :param frameId: index into ``ann['segmentations']``
    :return: the RLE object for that frame, or None when the frame has no
        segmentation for this annotation
    """
    h, w = ann['height'], ann['width']
    segm = ann['segmentations'][frameId]
    if segm is None:
        return None
    # isinstance is required here: comparing type(segm) with the string "list"
    # is always False and would make the first two branches unreachable.
    if isinstance(segm, list):
        # polygon -- a single object might consist of multiple parts
        # we merge all parts into one mask rle code
        rles = maskUtils.frPyObjects(segm, h, w)
        rle = maskUtils.merge(rles)
    elif isinstance(segm['counts'], list):
        # uncompressed RLE
        rle = maskUtils.frPyObjects(segm, h, w)
    else:
        # already-compressed RLE: returned unchanged
        rle = segm
    return rle


def annToMask(ann, frameId):
    """
    Decode the segmentation of one frame of an annotation.

    :param ann: annotation dict accepted by ``annToRLE``
    :param frameId: index of the frame inside the annotation
    :return: result of ``maskUtils.decode`` on the frame's RLE, or None when
        the frame has no segmentation
    """
    encoded = annToRLE(ann, frameId)
    if encoded is None:
        return None
    return maskUtils.decode(encoded)



def combineMasks(masks, width, height):
    """Merge several masks into one 0/1 mask of shape (height, width).

    :param masks: iterable of arrays that can be added in place to a uint8
        array of shape (height, width)
    :param width: number of columns of the output
    :param height: number of rows of the output
    :return: array holding 1 where the summed masks are positive and 0 elsewhere
    """
    # Start from an empty uint8 canvas and add every mask onto it.
    accumulator = np.zeros((height, width), dtype=np.uint8)
    for layer in masks:
        accumulator += layer

    # Threshold the sum so the result is binary.
    binary = np.where(accumulator > 0, 1, 0)
    return binary

def unifyMasks(masks, width, height):
    """Sum several masks into a single uint8 array of shape (height, width).

    No thresholding is applied, so pixels covered by more than one mask hold
    the sum of the overlapping values.

    :param masks: iterable of arrays that can be added in place to a uint8
        array of shape (height, width)
    :param width: number of columns of the output
    :param height: number of rows of the output
    :return: the accumulated uint8 array
    """
    canvas = np.zeros((height, width), dtype=np.uint8)
    for layer in masks:
        canvas += layer
    return canvas

In [5]:
def load_images_from_folder(path,image_files):
    """Read image files from ``path`` and return them converted from BGR to RGB.

    :param path: directory joined in front of every entry of ``image_files``
    :param image_files: iterable of file names relative to ``path``
    :return: list with one image array per file, in the given order
    :raises FileNotFoundError: when OpenCV cannot read one of the files
    """
    images = []
    for file in image_files:
        full_path = os.path.join(path, file)
        img = cv2.imread(full_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {full_path}')
        images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    return images

def load_all_images_davis(loader,video_info):
    """Load every frame and its per-object masks of one video through ``loader``.

    :param loader: object exposing ``load_single_image(name, index)`` that
        returns a ``(frame, masks)`` pair
    :param video_info: tuple ``(name, frames, objects)`` with the video name,
        the number of frames to load and the number of objects
    :return: ``(images, masks)``; ``images[i]`` is the frame moved to
        channel-last layout, multiplied by 255 and cast to uint8, and
        ``masks[i]`` holds entries ``1..objects`` of the loader's mask output
        cast to uint8
    """
    name, frames, objects = video_info
    rgb_frames = []
    object_masks = []
    for frame_idx in range(frames):
        frame_data, mask_data = loader.load_single_image(name, frame_idx)

        # Drop the second axis, go channel-last and scale to 8-bit.
        channel_first = np.array(frame_data[:, 0])
        rgb_frames.append((channel_first.transpose(1, 2, 0) * 255.).astype(np.uint8))

        # Entry 0 along the first axis is skipped; only 1..objects are kept.
        per_object = np.array(mask_data[1:objects + 1, 0])
        object_masks.append(per_object.astype(np.uint8))
    return rgb_frames, object_masks

def load_all_initial_masks_from_dataset():
    """Build the first-frame label mask of every video in the global ``vidTrain``.

    For each video, the annotations in the global ``annTrain`` with a matching
    ``video_id`` are decoded at frame 0. The i-th annotation of the video is
    written with value ``i + 1``; annotations without a mask in frame 0 are
    skipped but still consume their index.

    :return: list with one array per video, as produced by ``unifyMasks``
    """
    all_masks = []
    for video in vidTrain:
        ann = [a for a in annTrain if a['video_id'] == video['id']]
        masks = []
        for i, a in enumerate(ann):
            # Decode once and reuse the result for both the None check and
            # the labelled mask.
            first_frame_mask = annToMask(a, 0)
            if first_frame_mask is not None:
                masks.append(first_frame_mask * (i + 1))
        all_masks.append(unifyMasks(masks, video['width'], video['height']))
    return all_masks

def load_all_masks_for_video(video,num_masks,annotations=None):
    """Build one label mask per frame of ``video`` from its first ``num_masks`` annotations.

    :param video: video dict with 'id', 'length', 'width' and 'height'
    :param num_masks: how many of the video's annotations (in list order) are
        rendered; annotation ``i`` is written with value ``i + 1``
    :param annotations: annotation list to search; defaults to the global
        ``annTrain`` so existing callers keep their behaviour
    :return: list of ``video['length']`` arrays as produced by ``unifyMasks``
    """
    if annotations is None:
        annotations = annTrain
    ann = [a for a in annotations if a['video_id'] == video['id']]
    all_masks  = []
    for image_num in range(0,video['length']):
        masks = []
        for i in range(0, num_masks):
            annot = annToMask(ann[i], image_num)
            # Objects without a segmentation in this frame are simply left out.
            if annot is not None: masks.append(annot * (i + 1))
        single_mask = unifyMasks(masks, video['width'], video['height'])
        all_masks.append(single_mask)
    return all_masks

def pad_to_divisible_by_two(frames):
    """Zero-pad 3-D frames at the bottom/right up to a common even height and width.

    The target size is the largest height and width found in ``frames``, each
    rounded up to the next even number. The third axis is never padded.

    :param frames: non-empty sequence of arrays shaped (height, width, channels)
    :return: list of padded arrays, all sharing the same height and width
    """
    tallest = max(frame.shape[0] for frame in frames)
    widest = max(frame.shape[1] for frame in frames)
    # Adding the remainder bumps odd sizes by one and leaves even sizes alone.
    target_height = tallest + (tallest % 2)
    target_width = widest + (widest % 2)

    return [
        np.pad(
            frame,
            ((0, target_height - frame.shape[0]), (0, target_width - frame.shape[1]), (0, 0)),
            mode='constant',
        )
        for frame in frames
    ]

def generate_video_from_frames(frames, output_path, fps=30):
    """Encode ``frames`` as a libx264 video file at ``output_path``.

    :param frames: sequence of frames convertible with ``np.asarray``
    :param output_path: destination file; missing parent directories are created
    :param fps: frames per second passed to ``torchvision.io.write_video``
    :return: ``output_path``
    """
    frames = torch.from_numpy(np.asarray(frames))
    output_dir = os.path.dirname(output_path)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    torchvision.io.write_video(output_path, frames, fps=fps, video_codec="libx264")
    return output_path

def select_smaller_set(videos, number_videos = 100):
    """Return the ``number_videos`` smallest videos of ``videos``, smallest first.

    Size is ``height * width * number_of_frames``. The selection is taken from
    ``videos`` itself, so it works for any split rather than only the global
    training list.

    :param videos: list of video dicts with 'height', 'width' and 'file_names'
    :param number_videos: maximum number of videos to return
    :return: list of video dicts sorted by ascending size
    """
    def _volume(video):
        # Pixel count uses height * width (not height squared) times the frames.
        return video['height'] * video['width'] * len(video['file_names'])

    return sorted(videos, key=_volume)[:number_videos]

def select_bigger_set(videos, size = 100):
    """Return the ``size`` largest videos of ``videos``, largest first.

    Size is ``height * width * number_of_frames``. The selection is taken from
    ``videos`` itself and ``size`` is honoured instead of a fixed 100.

    :param videos: list of video dicts with 'height', 'width' and 'file_names'
    :param size: maximum number of videos to return
    :return: list of video dicts sorted by descending size
    """
    def _volume(video):
        # Pixel count uses height * width (not height squared) times the frames.
        return video['height'] * video['width'] * len(video['file_names'])

    ascending = sorted(videos, key=_volume)
    return ascending[::-1][:size]

MÉTRICAS

Video

In [6]:
def calculate_real_iou(mask1, mask2):
    """Score ``mask1`` against ``mask2`` with ``db_eval_iou`` for each value of ``mask2``.

    Every distinct value found in ``mask2`` is evaluated, the smallest one
    included.

    :param mask1: label array to evaluate
    :param mask2: reference label array of the same shape
    :return: ``(iou, iou_mean_object, iou_per_label)``: the NaN-ignoring mean
        of the per-label scores, their plain mean, and a dict label -> score
    """
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    iou_per_label = {
        label: db_eval_iou(mask1 == label, mask2 == label)
        for label in np.unique(mask2)
    }
    per_label_scores = list(iou_per_label.values())

    # NaN entries are ignored in the first figure only.
    iou = np.nanmean(per_label_scores)
    iou_mean_object = sum(per_label_scores) / len(per_label_scores)

    return iou, iou_mean_object, iou_per_label

def compute_f_score(true_positives, false_positives, false_negatives):
    """Return the F-measure (harmonic mean of precision and recall) for the given counts.

    Precision, recall and the final score each fall back to 0 when their
    denominator is 0.
    """
    predicted_total = true_positives + false_positives
    actual_total = true_positives + false_negatives

    precision = 0 if predicted_total == 0 else (true_positives / predicted_total)
    recall = 0 if actual_total == 0 else (true_positives / actual_total)

    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * (precision * recall) / denominator

def compute_f_measure(mask1, mask2):
    """Pixel-wise F-measure of ``mask1`` against ``mask2`` for each value of ``mask2``.

    Every distinct value found in ``mask2`` is evaluated, the smallest one
    included.

    :param mask1: label array to evaluate
    :param mask2: reference label array of the same shape
    :return: ``(overall_f_measure, f_mean_object, f_measure_per_label)``: the
        F-measure of the counts summed over all labels, the mean of the
        per-label F-measures, and a dict label -> F-measure
    """
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    f_measure_per_label = {}
    tp_total = 0
    fp_total = 0
    fn_total = 0

    for label in np.unique(mask2):
        predicted = mask1 == label
        reference = mask2 == label

        tp = np.logical_and(predicted, reference).sum()
        fp = np.logical_and(predicted, np.logical_not(reference)).sum()
        fn = np.logical_and(np.logical_not(predicted), reference).sum()

        tp_total += tp
        fp_total += fp
        fn_total += fn

        f_measure_per_label[label] = compute_f_score(tp, fp, fn)

    # F-measure over the pooled counts of every label.
    overall_f_measure = compute_f_score(tp_total, fp_total, fn_total)

    # Mean of the per-label F-measures.
    f_mean_object = sum(f_measure_per_label.values()) / len(f_measure_per_label)

    return overall_f_measure, f_mean_object, f_measure_per_label

In [7]:
def calculate_iou(mask1, mask2):
    """Intersection-over-union of ``mask1`` against ``mask2`` per non-zero label of ``mask2``.

    Label 0 is treated as background and skipped explicitly, so a reference
    mask without any background pixel still has all its objects evaluated.

    :param mask1: label array to evaluate
    :param mask2: reference label array of the same shape
    :return: ``(iou, iou_mean_object, iou_per_label)``: ``iou`` is the sum of
        the per-label intersections over the sum of the per-label unions (NaN
        when there is no label), ``iou_mean_object`` is the mean of the
        per-label IoUs (0 when there is no label), and ``iou_per_label`` maps
        label -> IoU
    """
    # Ensure both masks have the same shape
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    labels = np.unique(mask2)
    labels = labels[labels != 0]

    # Scalar pixel counts give the same totals as summing full-size
    # accumulator arrays, without allocating them.
    total_intersection = 0
    total_union = 0
    iou_per_label = {}

    for label in labels:
        mask1_label = mask1 == label
        mask2_label = mask2 == label
        c_intersection = np.logical_and(mask1_label, mask2_label).sum()
        # The union is never empty because the label occurs in mask2.
        c_union = np.logical_or(mask1_label, mask2_label).sum()
        total_intersection += c_intersection
        total_union += c_union
        iou_per_label[label] = c_intersection / c_union

    # IoU over the pooled pixel counts of every label
    iou = total_intersection / total_union if total_union != 0 else np.nan

    # IoU as the mean over objects
    if len(iou_per_label) > 0:
        iou_mean_object = sum(iou_per_label.values()) / len(iou_per_label)
    else:
        iou_mean_object = 0

    return iou,iou_mean_object, iou_per_label


def compute_real_f_measure(mask1, mask2): #(mask_infered,mask_gt)
    """Boundary F-measure of ``mask1`` against ``mask2`` per non-zero label of ``mask2``.

    Each label is scored with ``db_eval_boundary``, whose result is unpacked
    as ``(f_measure, precision, recall)``. Label 0 is treated as background
    and skipped explicitly, so a reference mask without any background pixel
    still has all its objects evaluated.

    :param mask1: inferred label array
    :param mask2: ground-truth label array of the same shape
    :return: ``(overall_f_measure, f_mean_object, f_measure_per_label)``: the
        NaN-ignoring mean of the per-label scores (NaN when there is no
        label), their plain mean (0.0 when there is no label), and a dict
        label -> score
    """
    # Ensure both masks have the same shape
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    labels = np.unique(mask2)
    labels = labels[labels != 0]

    f_measure_per_label = {}
    f_measures = []

    for label in labels:
        mask1_label = mask1 == label
        mask2_label = mask2 == label

        # Only the F-measure is used; precision and recall are discarded.
        f_measure, _precision, _recall = db_eval_boundary(mask1_label, mask2_label)
        f_measure_per_label[label] = f_measure
        f_measures.append(f_measure)

    # NaN-ignoring mean over the labels
    overall_f_measure = np.nanmean(f_measures) if len(f_measures) != 0 else np.nan

    # Plain mean over the labels
    f_mean_object = sum(f_measure_per_label.values()) / len(f_measure_per_label) if len(f_measure_per_label) != 0 else 0.0

    return overall_f_measure,f_mean_object,f_measure_per_label

def split_dict_list_to_lists(dict_list):
    """Regroup a list of dicts into one value list per key.

    :param dict_list: iterable of dicts
    :return: ``(result, keys_in_order)`` where ``keys_in_order`` lists the keys
        in order of first appearance and ``result[i]`` gathers, in input order,
        every value stored under ``keys_in_order[i]``
    """
    grouped = {}
    for entry in dict_list:
        for key, value in entry.items():
            grouped.setdefault(key, []).append(value)
    return list(grouped.values()), list(grouped)

def add_dict(list_of_dicts):
    """Average the values of several dicts key by key.

    Each key is averaged over the dicts that contain it; dicts that lack a key
    do not count towards that key's mean.

    :param list_of_dicts: iterable of dicts with numeric values
    :return: dict mapping every key seen to the mean of its values
    """
    totals = {}
    occurrences = {}

    for entry in list_of_dicts:
        for key, value in entry.items():
            totals[key] = totals.get(key, 0) + value
            occurrences[key] = occurrences.get(key, 0) + 1

    # Turn each running sum into a mean.
    for key, count in occurrences.items():
        totals[key] /= count
    return totals

def db_statistics(per_frame_values):
    """ Compute mean,recall and decay from per-frame evaluation.
    Arguments:
        per_frame_values (ndarray): per-frame evaluation

    Returns:
        M,O,D (float,float,float):
            return evaluation statistics: mean,recall,decay.
            M is the NaN-ignoring mean, O the fraction of frames above 0.5
            and D the mean of the first of four temporal bins minus the mean
            of the last one.
    """

    # nan values are ignored by nanmean; a NaN frame counts as "not above 0.5"
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        M = np.nanmean(per_frame_values)
        O = np.nanmean(per_frame_values > 0.5)

    N_bins = 4
    ids = np.round(np.linspace(1, len(per_frame_values), N_bins + 1) + 1e-10) - 1
    # A wide integer type is required: uint8 wraps around for sequences longer
    # than 255 frames and would select the wrong bins.
    ids = ids.astype(np.int64)

    D_bins = [per_frame_values[ids[i]:ids[i + 1] + 1] for i in range(N_bins)]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        D = np.nanmean(D_bins[0]) - np.nanmean(D_bins[N_bins - 1])

    return M, O, D
def compute_statistics_per_label(args):
    """Run ``db_statistics`` on every per-label series.

    :param args: pair ``(series_list, keys)`` as returned by
        ``split_dict_list_to_lists``
    :return: dict mapping ``str(key)`` to the ``db_statistics`` result of the
        matching series
    """
    per_label_series, label_keys = args
    return {
        f'{key}': db_statistics(np.array(series))
        for series, key in zip(per_label_series, label_keys)
    }

def compute_all_video_metrics(name,masks,ground_truth_masks,df_per_frame_metrics):
    """Compute J (IoU) and F (boundary) statistics for one video.

    Every inferred/ground-truth mask pair is scored with
    ``compute_real_f_measure`` and ``calculate_iou``; one row per frame is
    appended to ``df_per_frame_metrics`` (mutated in place).

    :param name: video identifier stored in the first column of each row
    :param masks: inferred label masks, one per frame
    :param ground_truth_masks: ground-truth label masks paired with ``masks``
    :param df_per_frame_metrics: DataFrame with eight columns; the callers in
        this file lay them out as Video, Frame and then three J/F pairs with J
        before F, so rows are written in that order
    :return: ``(f_statistics, j_statistics, f_statistics_object,
        j_statistics_object, f_statistics_per_label, j_statistics_per_label)``
        where the first four are ``db_statistics`` tuples and the last two are
        dicts label -> ``db_statistics`` tuple
    """
    f_measure_lst, f_measure_object_lst, f_measure_per_label_lst = [], [], []
    iou_lst, iou_object_lst, iou_per_label_lst = [], [], []

    for i,(mask_infered, mask_gt) in enumerate(zip(masks,ground_truth_masks)):
        f_measure,f_measure_object, f_measure_per_label = compute_real_f_measure(mask_infered,mask_gt)
        iou,iou_object, iou_per_label = calculate_iou(mask_infered,mask_gt)

        # J values go before F values so each number lands under the column
        # that names it. dtype=object keeps the dicts and floats untouched.
        df_per_frame_metrics.loc[len(df_per_frame_metrics)] = np.array(
            [name, i + 1, iou, f_measure, iou_object, f_measure_object, iou_per_label, f_measure_per_label],
            dtype=object,
        )

        f_measure_lst.append(f_measure)
        f_measure_object_lst.append(f_measure_object)
        f_measure_per_label_lst.append(f_measure_per_label)

        iou_lst.append(iou)
        iou_object_lst.append(iou_object)
        iou_per_label_lst.append(iou_per_label)

    f_statistics = db_statistics(np.array(f_measure_lst))
    j_statistics = db_statistics(np.array(iou_lst))

    f_statistics_object = db_statistics(np.array(f_measure_object_lst))
    j_statistics_object = db_statistics(np.array(iou_object_lst))

    f_statistics_per_label = compute_statistics_per_label(split_dict_list_to_lists(f_measure_per_label_lst))
    j_statistics_per_label = compute_statistics_per_label(split_dict_list_to_lists(iou_per_label_lst))
    return f_statistics,j_statistics,f_statistics_object,j_statistics_object, f_statistics_per_label, j_statistics_per_label




            

Imagen

In [8]:
def compute_iou(mask1, mask2):
    """Return intersection over union of two masks, treating non-zero entries as set."""
    first = np.asarray(mask1, dtype=bool)
    second = np.asarray(mask2, dtype=bool)
    overlap = (first & second).sum()
    covered = (first | second).sum()
    return overlap / covered

def compute_auc_interpolated(precisions, recalls):
    """Area under the interpolated precision-recall curve.

    Points are sorted by recall, each precision is replaced by the maximum
    precision found at that recall or any higher one, and the result is
    integrated with the trapezoidal rule.

    :param precisions: sequence of precision values
    :param recalls: sequence of recall values, same length as ``precisions``
    :return: the area as a float
    """
    sorted_indices = np.argsort(recalls)
    sorted_recalls = np.array(recalls)[sorted_indices]
    sorted_precisions = np.array(precisions)[sorted_indices]

    # Running maximum taken from the high-recall end.
    interpolated_precisions = np.maximum.accumulate(sorted_precisions[::-1])[::-1]

    # NumPy 2.0 renamed trapz to trapezoid; use whichever this version offers.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    area = integrate(interpolated_precisions, sorted_recalls)

    return area



def true_positives(gt_masks, pred_masks, scores):
    """Build the per-object true-positive table used for average precision.

    For each frame and each non-zero ground-truth label that also occurs in
    the predicted mask, one row is appended holding the 1-based frame number,
    the label, the size class, the confidence and a 0/1 flag for each IoU
    threshold 0.50, 0.55, ..., 0.95. Ground-truth objects missing from the
    prediction get no row but are still counted.

    Size classes follow the ground-truth pixel area: 0 when area <= 32*32,
    1 when area <= 96*96, 2 otherwise.

    :param gt_masks: sequence of ground-truth label masks
    :param pred_masks: sequence of predicted label masks paired with ``gt_masks``
    :param scores: ``scores[i][k]`` is read as the confidence of the k-th
        sorted non-zero label predicted in frame ``i``
    :return: ``(df_tp, [n_objects, n_objects_s, n_objects_m, n_objects_l])``
    """
    # The same rounded values label the columns and drive the comparison, so
    # an IoU of exactly 0.75 counts for the 0.75 column (raw np.arange values
    # drift to e.g. 0.7500000000000002).
    iou_thresholds = [round(v,2) for v in np.arange(0.5, 1.0, 0.05)]
    columns = ['Frame','Object','Object_Type','Confidences'] + iou_thresholds
    df_tp = pd.DataFrame(columns=columns)
    n_objects,n_objects_s,n_objects_m,n_objects_l = 0,0,0,0

    for i,(gt_mask, pred_mask) in enumerate(zip(gt_masks, pred_masks)):

        labels_gt = np.unique(gt_mask)
        labels_gt = labels_gt[labels_gt != 0]
        n_objects += len(labels_gt)
        labels_dt = np.unique(pred_mask)
        labels_dt = labels_dt[labels_dt != 0]

        for label in labels_gt:
            gt_mask_label = gt_mask == label
            area = int(np.count_nonzero(gt_mask_label))
            if area <= 32*32:
                object_type = 0
                n_objects_s += 1
            elif area <= 96*96:
                object_type = 1
                n_objects_m += 1
            else:
                object_type = 2
                n_objects_l += 1

            if label in labels_dt:
                pred_mask_label = pred_mask == label

                # IoU of the two binary masks; the union is non-empty because
                # the label occurs in both masks.
                intersection = np.logical_and(gt_mask_label, pred_mask_label).sum()
                union = np.logical_or(gt_mask_label, pred_mask_label).sum()
                iou = intersection / union

                tp_values = [1 if iou >= iou_threshold else 0 for iou_threshold in iou_thresholds]
                confidence = scores[i][np.where(labels_dt == label)[0].item()]
                df_tp.loc[len(df_tp)] = np.array([i + 1,label,object_type,confidence] + tp_values )

    return df_tp,[n_objects,n_objects_s,n_objects_m,n_objects_l]


def compute_AP_for_df(df,n):
    """Compute one average-precision value per IoU-threshold column of ``df``.

    Rows are ranked by descending 'Confidences'. For every column from the
    fifth onwards (the 0/1 true-positive flags), cumulative precision and
    recall are derived and integrated with ``compute_auc_interpolated``.

    :param df: table as returned by ``true_positives``; it is not modified
    :param n: number of ground-truth objects, the recall denominator
    :return: dict column label -> average precision
    """
    df = df.sort_values(by='Confidences', ascending=False)
    AP = {}
    for column in df.columns[4:].tolist():
        # Work on local series instead of inserting helper columns into the
        # frame, which previously were never removed again.
        acc_tp = df[column].cumsum()
        acc_fp = (1 - df[column]).cumsum()
        precision = acc_tp / (acc_tp + acc_fp)
        recall = acc_tp / n
        AP[column] = compute_auc_interpolated(precision.values, recall.values)
    return AP


def calculate_video_AP(gt_masks, pred_masks, scores):
    """Average precision of a video, overall and per object-size class.

    :param gt_masks: ground-truth label masks, one per frame
    :param pred_masks: predicted label masks paired with ``gt_masks``
    :param scores: per-frame confidences forwarded to ``true_positives``
    :return: ``(AP, AP_size)``: ``AP`` maps IoU threshold -> average precision
        over all objects; ``AP_size`` is a three-item list (size classes 0, 1
        and 2) holding the same kind of dict, or None for a class with no rows
    """
    df,object_counts = true_positives(gt_masks, pred_masks,scores)
    AP = compute_AP_for_df(df,object_counts[0])
    AP_size = []
    for object_type in range(0,3):
        if df['Object_Type'].isin([object_type]).any():
            subset = df[df['Object_Type'] == object_type]
            # Pass the class count as a plain number, like the overall call.
            AP_size.append(compute_AP_for_df(subset, object_counts[object_type + 1]))
        else:
            AP_size.append(None)
    return AP, AP_size

In [9]:
def run_model_on_davis_set(name, model,videoLoader, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Track every video yielded by ``videoLoader`` and store the results under ``./result/<name>``.

    For each video the frames and ground-truth masks are loaded, the first
    ground-truth mask is used as the template and ``model.generator`` produces
    masks for all frames. ``scores.csv`` is always written. Depending on the
    flags, metric CSVs, an mp4 of the painted frames and per-frame PNGs are
    written as well, and one summary row is appended to ``all_tests_csv``.

    :param name: run name; selects the result folder and labels the summary row
    :param model: tracker exposing ``generator(images=..., template_mask=...)``
        and ``xmem.clear_memory()`` / ``xmem.current_video``
    :param videoLoader: iterable of ``(num_objects, info)`` items that also
        provides ``resolution``, ``year`` and ``load_single_image``
    :param compute_metrics: compute J/F/AP metrics and write the metric CSVs
    :param save_masks: save every painted frame as a PNG
    :param compute_video: encode the painted frames as an mp4
    :param verbose: print progress messages
    :return: ``(masks, logits, painted_images)`` of the last processed video,
        or three ``None`` values when the loader yields nothing
    """
    df_whole_metrics,df_per_frame_metrics = None,None
    # Scores are recorded and written for every run, so this frame has to
    # exist even when compute_metrics is False.
    df_score = pd.DataFrame(columns=['Video','Scores'])

    if compute_metrics:
        g_measures_by_video = ['Video','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay','AP','AP@.5','AP@.75','AP_s','AP_m','AP_l','J-Statiscts-Object','F-Statiscts-Object']
        df_whole_metrics = pd.DataFrame(columns=g_measures_by_video)
        df_per_frame_metrics_col = ['Video','Frame','J-Mean','F-Mean','J-Mean-Object','F-Mean-Object','J-Object','F-Object']
        df_per_frame_metrics = pd.DataFrame(columns=df_per_frame_metrics_col)

    folder_path = f'./result/{name}'
    # scores.csv is written unconditionally, so the folder is always needed.
    os.makedirs(folder_path, exist_ok=True)
    if compute_video:
        path_to_videos = folder_path + '/videos'
        os.makedirs(path_to_videos, exist_ok=True)

    masks, logits, painted_images = None, None, None

    for video in list(videoLoader):

        num_objects, info = video
        video_name = info['name']
        num_frames = info['num_frames']
        if videoLoader.resolution == '480p':
            height = info['size_480p'][1]
            width = info['size_480p'][0]
        else:
            if videoLoader.year == '2017':
                height = 3840
                width = 2026
            else:
                height = 1920
                width = 1080
        if verbose: print(f'Tracking Video {video_name} with dimensions {width}x{height}')
        if verbose: print('Loading dataset images and masks')
        images,ground_truth_masks = load_all_images_davis(videoLoader,(video_name,num_frames,num_objects.item()))

        if verbose: print('Creating first annotated mask for VOS model')

        model.xmem.current_video = video_name

        # Give object k the value k+1, then merge each frame into one label mask.
        combined_masks = [[mask * (i+1) for i, mask in enumerate(frameMask)] for frameMask in ground_truth_masks]
        ground_truth_masks = [unifyMasks(mask, height, width) for mask in combined_masks]
        initial_mask = ground_truth_masks[0]

        # Compute masks for all images
        if verbose:print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images, scores = model.generator(images=images, template_mask=initial_mask)
        model.xmem.clear_memory()

        df_score.loc[len(df_score)] = [video_name, [item[0] for item in scores]]

        if compute_metrics:
            if verbose: print('Computing Metrics')
            # The first frame is the template, so it is excluded from scoring.
            (f_mean, f_recall, f_decay),(j_mean, j_recall, j_decay),\
            (f_mean_obj, f_recall_obj, f_decay_obj),(j_mean_obj, j_recall_obj, j_decay_obj),\
            f_frame,j_frame = compute_all_video_metrics(video_name,masks[1:],ground_truth_masks[1:],df_per_frame_metrics)

            # NOTE(review): scores is passed unsliced while the masks skip
            # frame 0 -- confirm the per-frame alignment with the generator.
            AP, AP_objectSize = calculate_video_AP(ground_truth_masks[1:],masks[1:], scores)

            AP_n = sum(AP.values())/len(AP)
            AP_5 = AP[0.5]
            AP_75 = AP[0.75]
            AP_s = sum(AP_objectSize[0].values())/len(AP_objectSize[0]) if AP_objectSize[0] is not None else np.nan
            AP_m = sum(AP_objectSize[1].values())/len(AP_objectSize[1]) if AP_objectSize[1] is not None else np.nan
            AP_l = sum(AP_objectSize[2].values())/len(AP_objectSize[2]) if AP_objectSize[2] is not None else np.nan

            # Two rows per video: frame-level statistics and the '_object' variant.
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_name,(f_mean+j_mean)/2,j_mean,j_recall,j_decay,f_mean,f_recall,f_decay,AP_n,AP_5,AP_75,AP_s,AP_m,AP_l,j_frame,f_frame])
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_name + '_object',(f_mean_obj+j_mean_obj)/2,j_mean_obj,j_recall_obj,j_decay_obj,f_mean_obj,f_recall_obj,f_decay_obj,AP_n,AP_5,AP_75,AP_s,AP_m,AP_l,j_frame,f_frame])

        if compute_video:
            if verbose: print('Generating video')
            if width % 2 != 0 or height % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{video_name}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            path_to_masks = folder_path + '/masks/' + video_name
            os.makedirs(path_to_masks, exist_ok=True)
            for i,mask in enumerate(painted_images):
                image = Image.fromarray(mask)
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(i)))

    if compute_metrics:
        df_per_frame_metrics.to_csv(folder_path + '/per_object_metrics.csv',index=False)
        df_whole_metrics.to_csv(folder_path + '/whole_metrics.csv',index=False)

        all_test_metrics = None
        if os.path.exists(all_tests_csv):
            all_test_metrics = pd.read_csv(all_tests_csv,index_col = None)
        else:
            all_test_col = ['Test','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay',\
                                    'AP-Mean','AP@.5-Mean','AP@.75-Mean','AP_s-Mean','AP_m-Mean','AP_l-Mean',\
                                    'J&F-Mean-Obj', 'J-Mean-Obj', 'J-Recall-Obj', 'J-Decay-Obj', 'F-Mean-Obj', 'F-Recall-Obj','F-Decay-Obj']
            all_test_metrics = pd.DataFrame(columns=all_test_col)
        # Columns 1..7 are the J/F statistics, columns 8..13 the AP values.
        normal_mean = df_whole_metrics[~df_whole_metrics['Video'].str.contains('_object')].iloc[:, 1:-8].mean().tolist()
        ap_mean = df_whole_metrics[df_whole_metrics['Video'].str.contains('_object')].iloc[:, 8:14].mean().tolist()
        per_object_mean = df_whole_metrics[df_whole_metrics['Video'].str.contains('_object')].iloc[:, 1:-8].mean().tolist()
        all_test_metrics.loc[len(all_test_metrics)] = np.array([name] + normal_mean + ap_mean + per_object_mean)
        all_test_metrics.to_csv(all_tests_csv, index = False)

    df_score.to_csv(folder_path + '/scores.csv',index=False)

    return masks, logits, painted_images

In [10]:
def run_model_on_ovis_set(name, model,path_set,videos, annotations, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Track every video in ``videos`` and store the results under ``./result/<name>``.

    For each video the frames are read from ``path_set``, the objects that have
    a mask in the first frame form the template, and ``model.generator``
    produces masks for all frames. Depending on the flags, metric CSVs, an mp4
    of the painted frames and per-frame PNGs are written, and one summary row
    is appended to ``all_tests_csv``.

    Ground truth is built from ``annotations`` for exactly the objects present
    in the template, reusing their template label values, so predicted and
    reference labels stay aligned even when an object is absent in frame 0.

    :param name: run name; selects the result folder and labels the summary row
    :param model: tracker exposing ``generator(images=..., template_mask=...)``
        and ``xmem.clear_memory()`` / ``xmem.current_video``
    :param path_set: directory that the videos' 'file_names' are relative to
    :param videos: list of video dicts ('id', 'file_names', 'width', 'height',
        'length')
    :param annotations: annotation dicts; those whose 'video_id' matches are used
    :param compute_metrics: compute J/F/AP metrics and write the metric CSVs
    :param save_masks: save every painted frame as a PNG
    :param compute_video: encode the painted frames as an mp4
    :param verbose: print progress messages
    :return: ``(masks, logits, painted_images)`` of the last processed video,
        or three ``None`` values when ``videos`` is empty
    """
    df_whole_metrics,df_per_frame_metrics = None,None
    if compute_metrics:
        g_measures_by_video = ['Video','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay','AP','AP@.5','AP@.75','AP_s','AP_m','AP_l','J-Statiscts-Object','F-Statiscts-Object']
        df_whole_metrics = pd.DataFrame(columns=g_measures_by_video)
        df_per_frame_metrics_col = ['Video','Frame','J','F','J-Mean','F-Mean','J-Object','F-Object']
        df_per_frame_metrics = pd.DataFrame(columns=df_per_frame_metrics_col)

    folder_path = f'./result/{name}'
    if compute_metrics or compute_video or save_masks:
        os.makedirs(folder_path, exist_ok=True)
        if compute_video:
            path_to_videos = folder_path + '/videos'
            os.makedirs(path_to_videos, exist_ok=True)

    masks, logits, painted_images = None, None, None

    for video in videos:
        # Load all images as np.array
        if verbose: print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
        video_folder = video["file_names"][0].split("/")[0]
        model.xmem.current_video = video_folder
        if verbose: print(f'Tracking Video {video_folder} with dimensions {video["width"]}x{video["height"]}')
        if verbose: print('Loading dataset images')
        images = load_images_from_folder(path_set,video['file_names'])

        # Turn the first-frame annotations into a usable label mask
        if verbose: print('Creating first annotated mask for VOS model')
        ann = [a for a in annotations if a['video_id'] == video['id']]
        # (label, annotation) of every object that has a mask in frame 0; the
        # label is the annotation's position in ``ann`` plus one.
        tracked_objects = []
        first_frame_masks = []
        for i, a in enumerate(ann):
            first_frame_mask = annToMask(a, 0)
            if first_frame_mask is not None:
                tracked_objects.append((i + 1, a))
                first_frame_masks.append(first_frame_mask * (i + 1))
        number_masks = len(first_frame_masks)
        if verbose: print(f'Initial loaded MASKS {number_masks}')
        initial_mask = unifyMasks(first_frame_masks, video['width'], video['height'])

        # Compute masks for all images
        if verbose:print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images,scores = model.generator(images=images, template_mask=initial_mask)
        model.xmem.clear_memory()

        if compute_metrics:
            if verbose: print('Computing Metrics')
            # One ground-truth label mask per frame, restricted to the tracked
            # objects and using the same labels as the template.
            ground_truth_masks = []
            for frame_idx in range(0, video['length']):
                frame_masks = []
                for label, a in tracked_objects:
                    frame_mask = annToMask(a, frame_idx)
                    if frame_mask is not None:
                        frame_masks.append(frame_mask * label)
                ground_truth_masks.append(unifyMasks(frame_masks, video['width'], video['height']))

            # The first frame is the template, so it is excluded from scoring.
            (f_mean, f_recall, f_decay),(j_mean, j_recall, j_decay),\
            (f_mean_obj, f_recall_obj, f_decay_obj),(j_mean_obj, j_recall_obj, j_decay_obj),\
            f_frame,j_frame = compute_all_video_metrics(video_folder,masks[1:],ground_truth_masks[1:],df_per_frame_metrics)

            # NOTE(review): scores is passed unsliced while the masks skip
            # frame 0 -- confirm the per-frame alignment with the generator.
            AP, AP_objectSize = calculate_video_AP(ground_truth_masks[1:],masks[1:], scores)

            AP_n = sum(AP.values())/len(AP)
            AP_5 = AP[0.5]
            AP_75 = AP[0.75]
            AP_s = sum(AP_objectSize[0].values())/len(AP_objectSize[0]) if AP_objectSize[0] is not None else np.nan
            AP_m = sum(AP_objectSize[1].values())/len(AP_objectSize[1]) if AP_objectSize[1] is not None else np.nan
            AP_l = sum(AP_objectSize[2].values())/len(AP_objectSize[2]) if AP_objectSize[2] is not None else np.nan

            # Two rows per video: frame-level statistics and the '_object' variant.
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_folder,(f_mean+j_mean)/2,j_mean,j_recall,j_decay,f_mean,f_recall,f_decay,AP_n,AP_5,AP_75,AP_s,AP_m,AP_l,j_frame,f_frame])
            df_whole_metrics.loc[len(df_whole_metrics)] = np.array([video_folder + '_object',(f_mean_obj+j_mean_obj)/2,j_mean_obj,j_recall_obj,j_decay_obj,f_mean_obj,f_recall_obj,f_decay_obj,AP_n,AP_5,AP_75,AP_s,AP_m,AP_l,j_frame,f_frame])

        if compute_video:
            if verbose: print('Generating video')
            if video['width'] % 2 != 0 or video['height'] % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{video['id']}_{video_folder}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            path_to_masks = folder_path + '/masks/' + video_folder
            os.makedirs(path_to_masks, exist_ok=True)
            for i,mask in enumerate(painted_images):
                image = Image.fromarray(mask)
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(i + 1)))

    if compute_metrics:
        df_per_frame_metrics.to_csv(folder_path + '/per_object_metrics.csv',index=False)
        df_whole_metrics.to_csv(folder_path + '/whole_metrics.csv',index=False)

        all_test_metrics = None
        if os.path.exists(all_tests_csv):
            all_test_metrics = pd.read_csv(all_tests_csv,index_col = None)
        else:
            all_test_col = ['Test','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay',\
                                    'AP-Mean','AP@.5-Mean','AP@.75-Mean','AP_s-Mean','AP_m-Mean','AP_l-Mean',\
                                    'J&F-Mean-Obj', 'J-Mean-Obj', 'J-Recall-Obj', 'J-Decay-Obj', 'F-Mean-Obj', 'F-Recall-Obj','F-Decay-Obj']
            all_test_metrics = pd.DataFrame(columns=all_test_col)
        # Columns 1..7 are the J/F statistics, columns 8..13 the AP values.
        normal_mean = df_whole_metrics[~df_whole_metrics['Video'].str.contains('_object')].iloc[:, 1:-8].mean().tolist()
        ap_mean = df_whole_metrics[df_whole_metrics['Video'].str.contains('_object')].iloc[:, 8:14].mean().tolist()
        per_object_mean = df_whole_metrics[df_whole_metrics['Video'].str.contains('_object')].iloc[:, 1:-8].mean().tolist()
        all_test_metrics.loc[len(all_test_metrics)] = np.array([name] + normal_mean + ap_mean + per_object_mean)
        all_test_metrics.to_csv(all_tests_csv, index = False)

    return masks, logits, painted_images

In [11]:
def run_model_on_longdata_set(name, model,videoLoader, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Run the tracker over every sequence of a long-video dataset.

    For each sequence returned by ``videoLoader.get_sequences()`` the first
    ground-truth mask is used as the template for ``model.generator`` over all
    frames found in ``<videoLoader.root_folder>/JPEGImages/<seq>``. The
    remaining ground-truth masks are the frames on which metrics are computed
    and whose painted images are saved.

    Args:
        name: Run name; every output is written below ``./result/<name>``.
        model: Tracker exposing ``generator(images=..., template_mask=...)``
            and an ``xmem`` attribute with ``current_video`` and
            ``clear_memory()``.
        videoLoader: Dataset exposing ``get_sequences()``,
            ``get_all_masks(seq, True)`` and ``root_folder``.
        compute_metrics: When true, J/F and AP metrics are computed, written to
            ``per_object_metrics.csv`` / ``whole_metrics.csv`` and a summary
            row is appended to the CSV named by the global ``all_tests_csv``.
        save_masks: When true, the painted images of the evaluated frames are
            saved as PNGs under ``masks/<seq>/testedmasks``.
        compute_video: When true, an mp4 of the painted frames is generated
            under ``videos/``.
        verbose: When true, progress messages are printed.

    Returns:
        ``(masks, logits, painted_images)`` of the LAST processed sequence, or
        ``(None, None, None)`` when the loader yields no sequence.
    """
    df_whole_metrics, df_per_frame_metrics = None, None
    if compute_metrics:
        g_measures_by_video = ['Video','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay','AP','AP@.5','AP@.75','AP_s','AP_m','AP_l','J-Statiscts-Object','F-Statiscts-Object']
        df_whole_metrics = pd.DataFrame(columns=g_measures_by_video)
        df_per_frame_metrics_col = ['Video','Frame','J-Mean','F-Mean','J-Mean-Object','F-Mean-Object','J-Object','F-Object']
        df_per_frame_metrics = pd.DataFrame(columns=df_per_frame_metrics_col)

    folder_path = f'./result/{name}'
    if compute_metrics or compute_video or save_masks:
        os.makedirs(folder_path, exist_ok=True)
        if compute_video:
            path_to_videos = folder_path + '/videos'
            os.makedirs(path_to_videos, exist_ok=True)

    # Bound up front so the final return is valid even when there are no sequences.
    masks, logits, painted_images = None, None, None

    for seq in list(videoLoader.get_sequences()):

        model.xmem.current_video = seq
        if verbose: print(f'Video: {seq}')

        all_gt_masks, _, all_masks_id = videoLoader.get_all_masks(seq, True)
        # On Windows the loader's ids come back with '\\'-separated prefixes; keep the last component.
        if os.name == 'nt': all_masks_id = [int(folder.split('\\')[-1]) for folder in all_masks_id]
        images_root = os.path.join(videoLoader.root_folder ,'JPEGImages',seq)
        file_names = sorted(os.listdir(images_root))
        file_ids = [int(file_name.split('.')[0]) for file_name in file_names]
        # Integer ids on every platform: they are looked up in file_ids and later
        # formatted with '{:05d}', which rejects strings.
        tested_mask_ids = [int(mask_id) for mask_id in all_masks_id[1:]]
        # Positions (within the sorted frame list) of the frames that have a ground-truth mask.
        test_ids = [file_ids.index(mask_id) for mask_id in tested_mask_ids]
        all_frames = load_images_from_folder(images_root,file_names)
        initial_mask = all_gt_masks[0,0,:,:]
        first_dim, second_dim, _ = all_frames[0].shape

        # Compute masks for all images
        if verbose: print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images, scores = model.generator(images=all_frames, template_mask=initial_mask)
        model.xmem.clear_memory()

        if compute_metrics:
            if verbose: print('Computing Metrics')
            masks_compute = [masks[i] for i in test_ids]
            gt_to_score = all_gt_masks[0,1:,:,:]
            (f_mean, f_recall, f_decay),(j_mean, j_recall, j_decay),\
            (f_mean_obj, f_recall_obj, f_decay_obj),(j_mean_obj, j_recall_obj, j_decay_obj),\
            f_frame,j_frame = compute_all_video_metrics(seq,masks_compute,gt_to_score,df_per_frame_metrics)

            AP, AP_objectSize = calculate_video_AP(gt_to_score,masks_compute, scores)

            AP_n = sum(AP.values())/len(AP)
            AP_5 = AP[0.5]
            AP_75 = AP[0.75]
            AP_s = sum(AP_objectSize[0].values())/len(AP_objectSize[0]) if AP_objectSize[0] is not None else np.nan
            AP_m = sum(AP_objectSize[1].values())/len(AP_objectSize[1]) if AP_objectSize[1] is not None else np.nan
            AP_l = sum(AP_objectSize[2].values())/len(AP_objectSize[2]) if AP_objectSize[2] is not None else np.nan

            # Rows are plain lists: wrapping mixed str/float/sequence values in
            # np.array() either stringifies the numbers or trips numpy's
            # ragged-array error, and the .mean() calls below need real numbers.
            ap_values = [AP_n, AP_5, AP_75, AP_s, AP_m, AP_l]
            df_whole_metrics.loc[len(df_whole_metrics)] = (
                [seq, (f_mean+j_mean)/2, j_mean, j_recall, j_decay, f_mean, f_recall, f_decay]
                + ap_values + [j_frame, f_frame]
            )
            df_whole_metrics.loc[len(df_whole_metrics)] = (
                [seq + '_object', (f_mean_obj+j_mean_obj)/2, j_mean_obj, j_recall_obj, j_decay_obj, f_mean_obj, f_recall_obj, f_decay_obj]
                + ap_values + [j_frame, f_frame]
            )

        if compute_video:
            if verbose: print('Generating video')
            # Frames are padded when either of their first two dimensions is odd.
            if first_dim % 2 != 0 or second_dim % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{seq}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            path_to_masks = folder_path + '/masks/' + seq + '/testedmasks'
            os.makedirs(path_to_masks, exist_ok=True)
            # Only the evaluated frames are written, named after their mask id.
            for frame_pos, mask_id in zip(test_ids, tested_mask_ids):
                image = Image.fromarray(painted_images[frame_pos])
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(mask_id)))

    if compute_metrics:
        df_per_frame_metrics.to_csv(folder_path + '/per_object_metrics.csv',index=False)
        df_whole_metrics.to_csv(folder_path + '/whole_metrics.csv',index=False)

        if os.path.exists(all_tests_csv):
            all_test_metrics = pd.read_csv(all_tests_csv,index_col = None)
        else:
            all_test_col = ['Test','J&F-Mean', 'J-Mean', 'J-Recall', 'J-Decay', 'F-Mean', 'F-Recall', 'F-Decay',\
                                    'AP-Mean','AP@.5-Mean','AP@.75-Mean','AP_s-Mean','AP_m-Mean','AP_l-Mean',\
                                    'J&F-Mean-Obj', 'J-Mean-Obj', 'J-Recall-Obj', 'J-Decay-Obj', 'F-Mean-Obj', 'F-Recall-Obj','F-Decay-Obj']
            all_test_metrics = pd.DataFrame(columns=all_test_col)

        # '<seq>_object' rows hold the per-object figures, the others the per-video ones.
        is_object_row = df_whole_metrics['Video'].str.contains('_object')
        normal_mean = df_whole_metrics[~is_object_row].iloc[:, 1:-8].mean().tolist()
        ap_mean = df_whole_metrics[is_object_row].iloc[:, 8:14].mean().tolist()
        per_object_mean = df_whole_metrics[is_object_row].iloc[:, 1:-8].mean().tolist()
        all_test_metrics.loc[len(all_test_metrics)] = [name] + normal_mean + ap_mean + per_object_mean
        all_test_metrics.to_csv(all_tests_csv, index = False)

    return masks, logits, painted_images

    

In [12]:
# Segment-Anything weights: the HQ checkpoint is picked when the last
# dot-separated piece of the interpreter prefix (sys.prefix) contains "HQ".
SAM_checkpoint = (
    "./checkpoints/sam_hq_vit_h.pth"
    if 'HQ' in sys.prefix.split('.')[-1]
    else "./checkpoints/sam_vit_h_4b8939.pth"
)
# XMem weights used by default.
xmem_checkpoint = "./checkpoints/XMem-s012.pth"
#e2fgvi_checkpoint = "./checkpoints/E2FGVI-HQ-CVPR22.pth"


In [13]:
# Switch matplotlib to non-interactive mode.
plt.ioff()

<contextlib.ExitStack at 0x2598005c750>

In [14]:
# Run configurations consumed by the "Loop ..." cells: one dict per run.
# NOTE(review): this entry has no 'Set', 'Year' or 'Resolution' under 'DatasetArgs',
# which the "Loop Long" and "Loop Davis" cells read from runtimeargs_lst[0].
runtimeargs_lst = [ 
{
    # 'Dataset' becomes part of the generated run name.
    'DatasetArgs' :{
        'Dataset' : 'Ovis'
    },
    # Passed as-is to the TrackingAnything constructor; 'use_refinement',
    # 'refinement_mode' and 'addArgs1' are also embedded in the run name.
    'TrackingAnythingArgs' : {
            'use_refinement' : True,
            'refinement_mode' : 'mask_bbox_pos_neg',
            'addArgs1':'C&Poly'
    }
}]


In [23]:
# Select the most occluded videos of the training set, preferring smaller
# videos when the occlusion score ties.
from collections import Counter

occlusion_levels = ['no_occlusion', 'slight_occlusion', 'severe_occlusion']
columns = ['video_id'] + occlusion_levels

# Accumulate the occlusion labels of every annotation per video (first-seen order).
occlusion_counts = {}
for ann in annTrain:
    occlusion_counts.setdefault(ann['video_id'], Counter()).update(ann['occlusion'])

# Build the frame once instead of growing it row by row.
df = pd.DataFrame(
    [[video_id] + [counts[level] for level in occlusion_levels]
     for video_id, counts in occlusion_counts.items()],
    columns=columns,
    index=list(occlusion_counts),
)

# Severe occlusion weighs 1.5x a slight one; the score is the weighted occluded share.
df['occlusion_value'] = (df['slight_occlusion'] + df['severe_occlusion'] * 1.5 )/(df['no_occlusion'] + df['slight_occlusion'] + df['severe_occlusion']* 1.5)
df['total_masks'] = (df['no_occlusion'] + df['slight_occlusion'] + df['severe_occlusion'])

# Attach video metadata by video id rather than by list position, so the rows
# stay aligned even if vidTrain is ordered differently from the annotations
# or contains videos without annotations.
video_by_id = {vid['id']: vid for vid in vidTrain}
annotated_videos = [video_by_id[video_id] for video_id in occlusion_counts]
df['frames'] = [len(vid['file_names']) for vid in annotated_videos]
df['width'] = [vid['width'] for vid in annotated_videos]
df['height'] = [vid['height'] for vid in annotated_videos]
# Pixel volume of the video: width * height * number of frames
# (previously height was multiplied by itself).
df['size'] = [vid['width']*vid['height']*len(vid['file_names']) for vid in annotated_videos]

listed = df.sort_values(by=['occlusion_value', 'size'], ascending=[False,True]).head(54)['video_id'].values
occludedTrain = [video_by_id[id_] for id_ in listed]

# Positions (within the ranked list) of videos excluded by hand.
# NOTE(review): these are positional, so re-check them if the ranking changes
# (the corrected 'size' can reorder videos whose occlusion_value ties).
remove = [18,19,38,43]
occludedTrain  = [item for i, item in enumerate(occludedTrain) if i not in remove]

Loop OVIS

In [None]:
# Run the OVIS evaluation once per configuration in runtimeargs_lst.
for runtimeargs in runtimeargs_lst:
    #vidToTest = select_smaller_set(vidTrain,25)
    tracking_args = runtimeargs['TrackingAnythingArgs']
    dataset_args = runtimeargs['DatasetArgs']

    # Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>
    name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
    sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
    mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
    extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
    if dataset_args['Dataset'] == 'Davis':
        dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
    else:
        dataset_tag = dataset_args['Dataset']
    random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
    runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

    print(f'Running Test: {runname}')
    model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder=runname)
    masks, logits, painted_images = run_model_on_ovis_set(
        name=runname,
        model=model,
        path_set=ovis_images,
        videos=occludedTrain,
        annotations=annTrain,
        compute_metrics=True,
        save_masks=True,
        compute_video=True,
        verbose=False,
    )

Loop Long

In [None]:
# The loader is built once from the first configuration's 'Set'
# (runtimeargs_lst[0]['DatasetArgs'] must therefore define 'Set').
filelist = runtimeargs_lst[0]['DatasetArgs']['Set']+'.txt'
VideoLoader = LongVideoDataset(root_folder=longdataset_root, filelist= filelist)
# Built with os.path.join: the previous literal 'checkpoints\XMem.pth' relied on
# an invalid '\X' escape and only resolved on Windows.
# NOTE: this rebinds the notebook-wide xmem_checkpoint for every later cell.
xmem_checkpoint = os.path.join('checkpoints', 'XMem.pth')

for runtimeargs in runtimeargs_lst:
    tracking_args = runtimeargs['TrackingAnythingArgs']
    dataset_args = runtimeargs['DatasetArgs']

    # Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>
    name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
    sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
    mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
    extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
    if dataset_args['Dataset'] == 'Davis':
        dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
    else:
        dataset_tag = dataset_args['Dataset']
    random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
    runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

    print(f'Running Test: {runname}')
    model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder = runname)
    masks, logits, painted_images = run_model_on_longdata_set(name = runname,model = model,videoLoader = VideoLoader,compute_metrics = True, save_masks=True, compute_video=True,verbose = False)


Loop Davis

In [None]:
# The DAVIS loader is built once from the first configuration's dataset settings.
first_dataset_args = runtimeargs_lst[0]['DatasetArgs']
resolution = first_dataset_args['Resolution']
year = first_dataset_args['Year']
data_set = first_dataset_args['Set']
VideoLoader = DAVIS_MO_Test(
    davis_root,
    resolution=resolution,
    imset='20{}/{}.txt'.format(year, data_set),
    single_object=(year == 16),
)

for runtimeargs in runtimeargs_lst:
    tracking_args = runtimeargs['TrackingAnythingArgs']
    dataset_args = runtimeargs['DatasetArgs']

    # Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>
    name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
    sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
    mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
    extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
    if dataset_args['Dataset'] == 'Davis':
        dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
    else:
        dataset_tag = dataset_args['Dataset']
    random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
    runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

    print(f'Running Test: {runname}')
    model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder=runname)
    masks, logits, painted_images = run_model_on_davis_set(
        name=runname,
        model=model,
        videoLoader=VideoLoader,
        compute_metrics=True,
        save_masks=True,
        compute_video=True,
        verbose=False,
    )


DAVIS 

In [None]:
# Single DAVIS run: configuration, loader, run name, model and evaluation.
runtimeargs = {
    'DatasetArgs': {
        'Dataset': 'Davis',
        'Year': 16,
        'Set': 'val',
        'Resolution': '480p',
    },
    'TrackingAnythingArgs': {
        'use_refinement': True,
        'refinement_mode': 'point',
        'addArgs1': '',
    },
}
tracking_args = runtimeargs['TrackingAnythingArgs']
dataset_args = runtimeargs['DatasetArgs']

VideoLoader = DAVIS_MO_Test(
    davis_root,
    resolution=dataset_args['Resolution'],
    imset='20{}/{}.txt'.format(dataset_args['Year'], dataset_args['Set']),
    single_object=(dataset_args['Year'] == 16),
)

# Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>
name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
if dataset_args['Dataset'] == 'Davis':
    dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
else:
    dataset_tag = dataset_args['Dataset']
random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

print(f'Running Test: {runname}')
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder=runname)
masks, logits, painted_images = run_model_on_davis_set(
    name=runname,
    model=model,
    videoLoader=VideoLoader,
    compute_metrics=True,
    save_masks=True,
    compute_video=True,
    verbose=True,
)

LONG DATASET

In [None]:
# Single LongDataset run with refinement disabled.
runtimeargs = {
    'DatasetArgs' :{
        'Dataset' : 'LongDataset',
        'Set' : 'val',
    },
    'TrackingAnythingArgs' : {
            'use_refinement' : False
                }
}
tracking_args = runtimeargs['TrackingAnythingArgs']
dataset_args = runtimeargs['DatasetArgs']

VideoLoader = LongVideoDataset(root_folder=longdataset_root, filelist= dataset_args['Set']+'.txt')

# Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>.
# 'refinement_mode' / 'addArgs1' are only read when 'use_refinement' is true,
# so they may be absent from this configuration.
name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
if dataset_args['Dataset'] == 'Davis':
    dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
else:
    dataset_tag = dataset_args['Dataset']
random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

print(f'Running Test: {runname}')
# Built with os.path.join: the previous literal 'checkpoints\XMem.pth' relied on
# an invalid '\X' escape and only resolved on Windows.
# NOTE: this rebinds the notebook-wide xmem_checkpoint for every later cell.
xmem_checkpoint = os.path.join('checkpoints', 'XMem.pth')
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder = runname)
masks, logits, painted_images = run_model_on_longdata_set(name = runname,model = model,videoLoader = VideoLoader,compute_metrics = True, save_masks=False, compute_video=False,verbose = False)

OVIS


In [None]:
# Single OVIS run on a reduced set of training videos, refinement disabled.
runtimeargs = {
    'DatasetArgs': {
        'Dataset': 'Ovis',
    },
    'TrackingAnythingArgs': {
        'use_refinement': False,
        'refinement_mode': 'bbox',
        'addArgs1': '',
    },
}
tracking_args = runtimeargs['TrackingAnythingArgs']
dataset_args = runtimeargs['DatasetArgs']

vidToTest = select_smaller_set(vidTrain,25)

# Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>
name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
if dataset_args['Dataset'] == 'Davis':
    dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
else:
    dataset_tag = dataset_args['Dataset']
random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

print(f'Running Test: {runname}')
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder=runname)
masks, logits, painted_images = run_model_on_ovis_set(
    name=runname,
    model=model,
    path_set=ovis_images,
    videos=vidToTest,
    annotations=annTrain,
    compute_metrics=True,
    save_masks=True,
    compute_video=True,
    verbose=False,
)

JUNK TESTING 

In [14]:
def run_model_on_longVOS_set(name, model,videoLoader, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    """Run the tracker over every LongVOS sequence and optionally export the results.

    For each sequence returned by ``videoLoader.get_sequences()`` the first
    ground-truth mask is used as the template for ``model.generator`` over all
    frames found in ``<videoLoader.root_folder>/JPEGImages/<seq>``. No metric
    is computed by this function.

    Args:
        name: Run name; every output is written below ``./resultLongVOS/<name>``.
        model: Tracker exposing ``generator(images=..., template_mask=...)``
            and an ``xmem`` attribute with ``current_video`` and
            ``clear_memory()``.
        videoLoader: Dataset exposing ``get_sequences()``,
            ``get_all_masks(seq, True)`` and ``root_folder``.
        compute_metrics: Only causes the output folder to be created; kept for
            signature parity with the other ``run_model_on_*`` functions.
        save_masks: When true, the painted images of the annotated frames are
            saved as PNGs under ``masks/<seq>/testedmasks``.
        compute_video: When true, an mp4 of the painted frames is generated
            under ``videos/``.
        verbose: When true, progress messages are printed.

    Returns:
        ``(masks, logits, painted_images)`` of the LAST processed sequence, or
        ``(None, None, None)`` when the loader yields no sequence.
    """
    folder_path = f'./resultLongVOS/{name}'
    if compute_metrics or compute_video or save_masks:
        os.makedirs(folder_path, exist_ok=True)
        if compute_video:
            path_to_videos = folder_path + '/videos'
            os.makedirs(path_to_videos, exist_ok=True)

    # Bound up front so the final return is valid even when there are no sequences.
    masks, logits, painted_images = None, None, None

    for seq in list(videoLoader.get_sequences()):

        model.xmem.current_video = seq
        if verbose: print(f'Video: {seq}')

        all_gt_masks, _, all_masks_id = videoLoader.get_all_masks(seq, True)
        # On Windows the loader's ids come back with '\\'-separated prefixes; keep the last component.
        if os.name == 'nt': all_masks_id = [int(folder.split('\\')[-1]) for folder in all_masks_id]
        images_root = os.path.join(videoLoader.root_folder ,'JPEGImages',seq)
        file_names = sorted(os.listdir(images_root))
        file_ids = [int(file_name.split('.')[0]) for file_name in file_names]
        # Integer ids on every platform: they are looked up in file_ids and later
        # formatted with '{:05d}', which rejects strings.
        tested_mask_ids = [int(mask_id) for mask_id in all_masks_id[1:]]
        # Positions (within the sorted frame list) of the frames that have a ground-truth mask.
        test_ids = [file_ids.index(mask_id) for mask_id in tested_mask_ids]
        all_frames = load_images_from_folder(images_root,file_names)
        initial_mask = all_gt_masks[0,0,:,:]
        first_dim, second_dim, _ = all_frames[0].shape

        # Compute masks for all images
        if verbose: print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images, scores = model.generator(images=all_frames, template_mask=initial_mask)
        model.xmem.clear_memory()

        if compute_video:
            if verbose: print('Generating video')
            # Frames are padded when either of their first two dimensions is odd.
            if first_dim % 2 != 0 or second_dim % 2 != 0:
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= path_to_videos + f"/{seq}.mp4", fps = 10)

        if save_masks:
            if verbose: print('Saving masks')
            path_to_masks = folder_path + '/masks/' + seq + '/testedmasks'
            os.makedirs(path_to_masks, exist_ok=True)
            # Only the annotated frames are written, named after their mask id.
            for frame_pos, mask_id in zip(test_ids, tested_mask_ids):
                image = Image.fromarray(painted_images[frame_pos])
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(mask_id)))

    return masks, logits, painted_images

    

In [15]:
# LongVOS run configurations: the first entry enables refinement, the second
# disables it ('use_refinement' False, so no refinement keys are needed).
runtimeargs_lst = [ 
    {
    # 'Dataset' becomes part of the run name; 'Set' + '.txt' names the file list given to the loader.
    'DatasetArgs' :{
        'Dataset' : 'LongVOS',
        'Set' : 'test',
    },
    # Passed as-is to the TrackingAnything constructor.
    'TrackingAnythingArgs' : {
            'use_refinement' : True,
            'refinement_mode' : 'mask_bbox_pos_neg',
            'addArgs1':'C&Poly'
    }
},
 {
    'DatasetArgs' :{
        'Dataset' : 'LongVOS',
        'Set' : 'test',
    },
    'TrackingAnythingArgs' : {
            'use_refinement' : False
                }
}]

In [17]:

# The LongVOS loader is built once from the first configuration's 'Set'.
VideoLoader = LongVideoDataset(
    root_folder=longvos_root,
    filelist=f"{runtimeargs_lst[0]['DatasetArgs']['Set']}.txt",
)

for runtimeargs in runtimeargs_lst:
    tracking_args = runtimeargs['TrackingAnythingArgs']
    dataset_args = runtimeargs['DatasetArgs']

    # Run name: <Refined|XMEM><_HQ|_SAM>[_mode][_extra]_<dataset>_<5 random letters>
    name_prefix = 'Refined' if tracking_args['use_refinement'] else 'XMEM'
    sam_tag = '_HQ' if 'HQ' in sys.prefix.split('.')[-1] else '_SAM'
    mode_tag = '_' + tracking_args['refinement_mode'] if tracking_args['use_refinement'] else ''
    extra_tag = '_' + tracking_args['addArgs1'] if tracking_args['use_refinement'] and tracking_args['addArgs1'] != '' else ''
    if dataset_args['Dataset'] == 'Davis':
        dataset_tag = 'Davis_{}_{}_{}'.format(dataset_args['Year'], dataset_args['Set'], dataset_args['Resolution'])
    else:
        dataset_tag = dataset_args['Dataset']
    random_suffix = ''.join(random.choice(string.ascii_letters) for _ in range(5))
    runname = f'{name_prefix}{sam_tag}{mode_tag}{extra_tag}_{dataset_tag}_{random_suffix}'

    print(f'Running Test: {runname}')
    model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, None, tracking_args, save_inner_masks_folder=runname)
    masks, logits, painted_images = run_model_on_longVOS_set(
        name=runname,
        model=model,
        videoLoader=VideoLoader,
        compute_metrics=False,
        save_masks=True,
        compute_video=True,
        verbose=False,
    )

Seq names: ['TVfoBiU2', 'R3UJGYwy', '81Oju1e7', 'JefVS4hq', 'FmzvB44A', 'shW04XAm', 'PhiOlFlA', 'w3XiaDdm', 'gUca15ef', 'gw84JOqH', 'bDv75KJl', 'irsQQbUO', 'B9fFIrgq', 'YMRntd88', 'T4pEUdxQ', 'UtEnRpiP', 'RCgebO9V', 'ealukzgh', 'LWVlKBlK', 'UBITNvZb', 'qeD7QcDW', 'Oj1z5fO0', 'Y6tOZtQV', 'UykV25jo', '6mNj04tC', 'GmfSNNa3', 'AIv9mv7T', 'aSNxf5ms', '58jpr19X', 'Kz1wgslb', '058oeZ2p', 'vlKROEmy', '5NImTLT6', 'T9woxlyM', '8kv99Cop', 'yxCvN6OJ', 'xL5OHe3Y', 'yfzVCnvU', 'HQ2UJdrW', 'Gz3GEPPC', 'tAyQIobn', 'yWnanBID', '7im4LEcb', '711gAS21', 'YHUoFe2u', '5VwnMLaz', 'MI0uwoMt', 'R4xwHrdP', 'GchnEETZ', 'PQPOuFNO']
Running Test: Refined_HQ_mask_bbox_pos_neg_C&Poly_LongVOS_ZhkQI
Initializing BaseSegmenter to cuda:0
<All keys matched successfully>
Hyperparameters read from the model weights: C^k=64, C^v=512, C^h=64
Single object mode: False
Sam Refinement ACTIVATED. Mode: mask_bbox_pos_neg


Tracking image: 100%|██████████| 498/498 [02:51<00:00,  2.90it/s]
Tracking image: 100%|██████████| 583/583 [01:57<00:00,  4.94it/s]
Tracking image: 100%|██████████| 553/553 [03:05<00:00,  2.98it/s]
Tracking image: 100%|██████████| 315/315 [01:23<00:00,  3.79it/s]
Tracking image: 100%|██████████| 402/402 [00:50<00:00,  7.93it/s]
Tracking image: 100%|██████████| 480/480 [01:00<00:00,  7.96it/s]
Tracking image: 100%|██████████| 975/975 [05:34<00:00,  2.92it/s]
Tracking image: 100%|██████████| 1965/1965 [47:15<00:00,  1.44s/it]
Tracking image: 100%|██████████| 422/422 [19:22<00:00,  2.76s/it]
Tracking image: 100%|██████████| 261/261 [00:33<00:00,  7.85it/s]
Tracking image: 100%|██████████| 263/263 [01:27<00:00,  2.99it/s]
Tracking image: 100%|██████████| 526/526 [02:59<00:00,  2.92it/s]
Tracking image: 100%|██████████| 352/352 [00:29<00:00, 11.99it/s]
Tracking image: 100%|██████████| 272/272 [01:34<00:00,  2.88it/s]
Tracking image: 100%|██████████| 321/321 [00:57<00:00,  5.62it/s]
Tracking

Running Test: XMEM_HQ_LongVOS_CZoob
Initializing BaseSegmenter to cuda:0
<All keys matched successfully>
Hyperparameters read from the model weights: C^k=64, C^v=512, C^h=64
Single object mode: False
Sam Refinement NOT ACTIVATED


Tracking image: 100%|██████████| 498/498 [00:43<00:00, 11.49it/s]
Tracking image: 100%|██████████| 583/583 [03:12<00:00,  3.03it/s]
Tracking image: 100%|██████████| 553/553 [03:11<00:00,  2.89it/s]
Tracking image: 100%|██████████| 315/315 [00:37<00:00,  8.39it/s]
Tracking image: 100%|██████████| 402/402 [01:15<00:00,  5.34it/s]
Tracking image: 100%|██████████| 480/480 [00:45<00:00, 10.48it/s]
Tracking image: 100%|██████████| 975/975 [05:12<00:00,  3.12it/s]
Tracking image: 100%|██████████| 1965/1965 [1:04:11<00:00,  1.96s/it]
Tracking image: 100%|██████████| 422/422 [14:15<00:00,  2.03s/it]
Tracking image: 100%|██████████| 261/261 [01:52<00:00,  2.32it/s]
Tracking image: 100%|██████████| 263/263 [01:22<00:00,  3.17it/s]
Tracking image: 100%|██████████| 526/526 [03:03<00:00,  2.86it/s]
Tracking image: 100%|██████████| 352/352 [01:13<00:00,  4.81it/s]
Tracking image: 100%|██████████| 272/272 [01:24<00:00,  3.22it/s]
Tracking image: 100%|██████████| 321/321 [01:15<00:00,  4.25it/s]
Tracki

In [24]:
# Read the LongVOS test metadata and list, for each video, every 5th frame file
# inside the frame range of object '1'.
with open(os.path.join(longvos_root,'test_meta.json'), 'r') as archivo:
    datos = json.load(archivo)

all_videos = []
for video, video_meta in datos['videos'].items():
    frame_range = video_meta['objects']['1']['frame_range']
    first_frame = int(frame_range['start'])
    last_frame = int(frame_range['end'])
    all_videos.append(
        [f"{video}/{frame_no:08d}.jpg" for frame_no in range(first_frame, last_frame + 1, 5)]
    )

# Only the frames of the first listed video are loaded here.
longvos_images = load_images_from_folder(os.path.join(longvos_root,'JPEGImages'),all_videos[0])

Seq names: ['TVfoBiU2', 'R3UJGYwy', '81Oju1e7', 'JefVS4hq', 'FmzvB44A', 'shW04XAm', 'PhiOlFlA', 'w3XiaDdm', 'gUca15ef', 'gw84JOqH', 'bDv75KJl', 'irsQQbUO', 'B9fFIrgq', 'YMRntd88', 'T4pEUdxQ', 'UtEnRpiP', 'RCgebO9V', 'ealukzgh', 'LWVlKBlK', 'UBITNvZb', 'qeD7QcDW', 'Oj1z5fO0', 'Y6tOZtQV', 'UykV25jo', '6mNj04tC', 'GmfSNNa3', 'AIv9mv7T', 'aSNxf5ms', '58jpr19X', 'Kz1wgslb', '058oeZ2p', 'vlKROEmy', '5NImTLT6', 'T9woxlyM', '8kv99Cop', 'yxCvN6OJ', 'xL5OHe3Y', 'yfzVCnvU', 'HQ2UJdrW', 'Gz3GEPPC', 'tAyQIobn', 'yWnanBID', '7im4LEcb', '711gAS21', 'YHUoFe2u', '5VwnMLaz', 'MI0uwoMt', 'R4xwHrdP', 'GchnEETZ', 'PQPOuFNO']


In [32]:
# Fetch the ground-truth masks and their ids for sequence 'TVfoBiU2'.
# NOTE(review): `videoLoader` (lower-case) is not assigned by any cell visible
# here; the loaders above are bound to `VideoLoader` — confirm which is intended.
all_gt_masks, _, all_masks_id = videoLoader.get_all_masks('TVfoBiU2', True)

In [None]:
from matplotlib.pyplot import figure


def print_curve(precision, recall,column):
    """Plot a precision-recall curve together with its interpolated version.

    Args:
        precision: Sequence of precision values (must support ``[::-1]``).
        recall: Recall values, same length as ``precision``, used as x axis.
        column: Unused; kept so existing calls keep working.

    The interpolated precision at each point is the maximum precision found at
    that point or at any later one (running maximum taken from the right).
    """
    figure(figsize=(12, 7), dpi=80)
    interpolated_precisions = np.maximum.accumulate(precision[::-1])[::-1]
    # Both lines carry a label so that plt.legend() has entries to show.
    plt.plot(recall, interpolated_precisions, color='red', linestyle='-', label='Interpolated precision')
    plt.plot(recall, precision, color='blue', linestyle='-', label='Precision')
    plt.legend()
    plt.title('Precision Recall Curve', fontsize=20)
    plt.xlabel('Recall', fontsize=14)
    plt.ylabel('Precision', fontsize=14)

    # The grid ends up disabled, as before (it was switched on and then off again).
    plt.grid(False)
    # Show the plot
    plt.show()