In [32]:
import argparse
import gdown
import cv2
import numpy as np
import os
import sys
sys.path.append(sys.path[0]+"/tracker")
sys.path.append(sys.path[0]+"/tracker/model")
from track_anything import TrackingAnything
from track_anything import parse_augment
import requests
import json
import torchvision
import torch 
from tools.painter import mask_painter
import psutil
import time
try: 
    from mmcv.cnn import ConvModule
except:
    os.system("mim install mmcv")
import matplotlib.pyplot as plt
from pycocotools import mask as maskUtils
from PIL import Image

In [2]:

if os.name == 'posix':
    ovis_anotations = '../data.nosync/OVIS/annotations/'
    ovis_images = '../data.nosync/OVIS/train_images/'
else:
    ovis_anotations = 'D:/HADA/data/OVIS/annotations/'
    ovis_images = 'D:/HADA/data/OVIS/train_images/'

In [3]:
def cargarDatos(ruta_ann):
    with open(ruta_ann + 'annotations_train.json') as f:
        annotationsTrain = json.load(f)

    with open(ruta_ann + 'annotations_valid.json') as f:
        annotationsValid = json.load(f)

    with open(ruta_ann + 'annotations_test.json') as f:
        annotationsTest = json.load(f)

    clases = annotationsTrain['categories']
    vidTrain = annotationsTrain['videos']
    annTrain = annotationsTrain['annotations']
    vidValid = annotationsValid['videos']
    annValid = annotationsValid['annotations']
    vidTest = annotationsTest['videos']
    annTest = annotationsTest['annotations']

    return clases, vidTrain, annTrain, vidValid, annValid, vidTest, annTest

clases, vidTrain, annTrain, vidValid, annValid, vidTest, annTest = cargarDatos(ovis_anotations) 

In [4]:
def annToRLE(ann, frameId):
    """
    Convert annotation which can be polygons, uncompressed RLE to RLE.
    :return: binary mask (numpy 2D array)
    """
    h, w = ann['height'], ann['width']
    segm = ann['segmentations'][frameId]
    if segm is None:
        return None
    if type(segm) == "list":
        # polygon -- a single object might consist of multiple parts
        # we merge all parts into one mask rle code
        rles = maskUtils.frPyObjects(segm, h, w)
        rle = maskUtils.merge(rles)
    elif type(segm['counts']) == "list":
        # uncompressed RLE
        rle = maskUtils.frPyObjects(segm, h, w)
    else:
        # rle
        rle = segm
    return rle


def annToMask(ann, frameId):
    """
    Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
    :return: binary mask (numpy 2D array)
    """
    rle = annToRLE(ann, frameId)
    if rle is not None:
        m = maskUtils.decode(rle)
        return m



def combineMasks(masks, width, height):
    # Crear una matriz vacía para la máscara combinada
    combined = np.zeros((height, width), dtype=np.uint8)

    # Combinar las máscaras en la matriz vacía
    for mask in masks:
        combined += mask  # Sumar la máscara a la máscara combinada

    # Aplicar umbral para obtener una única máscara binaria
    combined = np.where(combined > 0, 1, 0)
    return combined

def unifyMasks(masks, width, height):
    # Crear una matriz vacía para la máscara combinada
    unified = np.zeros((height, width), dtype=np.uint8)

    # Combinar las máscaras en la matriz vacía
    for mask in masks:
        unified += mask  # Sumar la máscara a la máscara combinada

    
    return unified

In [14]:
def load_images_from_folder(path,image_files):
    images = []
    for file in image_files:
        img = cv2.imread(os.path.join(path,file))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        images.append(img)
    return images

def load_all_initial_masks_from_dataset():
    all_masks = []
    for video in vidTrain:
        ann = [a for a in annTrain if a['video_id'] == video['id']]
        masks = [annToMask(a, 0) * (i + 1) for i, a in enumerate(ann) if annToMask(a, 0) is not None]
        all_masks.append(unifyMasks(masks, video['width'], video['height']))
    return all_masks

def load_all_masks_for_video(video):
    ann = [a for a in annTrain if a['video_id'] == video['id']]
    all_masks  = []
    for image_num in range(0,video['length']):
        masks = []
        for i, a in enumerate(ann):
            annot = annToMask(a, image_num)
            if annot is not None: masks.append(annot * (i + 1))
        single_mask = unifyMasks(masks, video['width'], video['height'])
        all_masks.append(single_mask)
    return all_masks

def pad_to_divisible_by_two(frames):
    max_height = max(frame.shape[0] for frame in frames)
    max_width = max(frame.shape[1] for frame in frames)
    new_height = max_height + 1 if max_height % 2 != 0 else max_height
    new_width = max_width + 1 if max_width % 2 != 0 else max_width

    padded_frames = []
    for frame in frames:
        height_pad = new_height - frame.shape[0]
        width_pad = new_width - frame.shape[1]
        padded_frame = np.pad(frame, ((0, height_pad), (0, width_pad), (0, 0)), mode='constant')
        padded_frames.append(padded_frame)

    return padded_frames

def generate_video_from_frames(frames, output_path, fps=30):
    frames = torch.from_numpy(np.asarray(frames))
    if not os.path.exists(os.path.dirname(output_path)):
        os.makedirs(os.path.dirname(output_path))
    torchvision.io.write_video(output_path, frames, fps=fps, video_codec="libx264")
    return output_path

def select_smaller_set(videos, size = 100):
    size = [(video['id'],video['height']*video['height']*len(video['file_names'])) for video in videos]
    #size = [(video['id'],video['height']*video['height']) for video in videos]
    sorted_list = sorted(size, key=lambda x: x[1])
    listed = [tup[0] for tup in sorted_list[:100]]
    filtered_list = [d for id_ in listed for d in vidTrain if d.get('id') == id_]
    return filtered_list

In [6]:
def calculate_iou(mask1, mask2):
    # Ensure both masks have the same shape
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    # Calculate intersection and union for each label
    labels = np.unique(np.concatenate((mask1, mask2)))[1:]
    intersection = np.zeros_like(mask1, dtype=np.float32)
    union = np.zeros_like(mask1, dtype=np.float32)
    iou_per_label = {}

    for label in labels:
        mask1_label = mask1 == label
        mask2_label = mask2 == label
        c_intersection = np.logical_and(mask1_label, mask2_label)
        c_union = np.logical_or(mask1_label, mask2_label)
        intersection += c_intersection
        union += c_union
        iou_per_label[label] = np.sum(c_intersection) / np.sum(c_union)

    # Calculate IoU
    iou = np.sum(intersection) / np.sum(union)

    return iou, iou_per_label

def compute_f_score(true_positives, false_positives, false_negatives):
    divider = (true_positives + false_positives)
    precision = (true_positives / divider) if divider != 0 else 0

    divider = (true_positives + false_negatives)
    recall = (true_positives / divider) if divider != 0 else 0

    divider = (precision + recall)
    f_measure = (2 * (precision * recall) / divider) if divider != 0 else 0
    return f_measure

def compute_f_measure(mask1, mask2):
    # Ensure both masks have the same shape
    assert mask1.shape == mask2.shape, "Mask shapes must be the same."

    # Calculate F-measure for each label
    labels = np.unique(np.concatenate((mask1, mask2)))[1:]
    f_measure_per_label = {}
    add_true_positives = 0
    add_false_positives = 0
    add_false_negatives = 0

    for label in labels:
        mask1_label = mask1 == label
        mask2_label = mask2 == label

        true_positives = np.logical_and(mask1_label, mask2_label).sum()
        false_positives = np.logical_and(mask1_label, np.logical_not(mask2_label)).sum()
        false_negatives = np.logical_and(np.logical_not(mask1_label), mask2_label).sum()

        add_true_positives += true_positives
        add_false_positives += false_positives
        add_false_negatives += false_negatives

        f_measure = compute_f_score(true_positives, false_positives, false_negatives)
        f_measure_per_label[label] = f_measure

    overall_f_measure = compute_f_score(add_true_positives, add_false_positives, add_false_negatives)
    return overall_f_measure,f_measure_per_label

def add_dict(list_of_dicts):
    mean_dict = {}
    key_counts = {}

    for d in list_of_dicts:
        for key, value in d.items():
            mean_dict[key] = mean_dict.get(key, 0) + value
            key_counts[key] = key_counts.get(key, 0) + 1

    for key in mean_dict:
        mean_dict[key] /= key_counts[key]
    return mean_dict

def compute_all_metrics(masks,ground_truth_masks):
    f_measure_lst, f_measure_per_label_lst, iou_lst, iou_per_label_lst  =  [], [], [], []
    for i,(mask_infered, mask_gt) in enumerate(zip(masks,ground_truth_masks)):
            f_measure, f_measure_per_label = compute_f_measure(mask_infered,mask_gt)
            iou, iou_per_label = calculate_iou(mask_infered,mask_gt)
            #print(f'Mask {i + 1}: f_mesure {f_measure}, per label {f_measure_per_label}, iou {iou}, per label {iou_per_label}')
            f_measure_lst.append(f_measure)
            f_measure_per_label_lst.append(f_measure_per_label)
            iou_lst.append(iou)
            iou_per_label_lst.append(iou_per_label)
    mean_f_measure = np.array(f_measure_lst).mean()
    mean_iou = np.array(iou_lst).mean()
    mean_f_measure_per_label  = add_dict(f_measure_per_label_lst)
    mean_iou_per_label_label  = add_dict(iou_per_label_lst)#{key: sum(d[key] for d in iou_per_label_lst)/len(iou_per_label_lst) for key in iou_per_label_lst[0]}
    return mean_f_measure,mean_iou, mean_f_measure_per_label, mean_iou_per_label_label




            

In [35]:
def run_model_on_ovis_set(name, model,path_set,videos, annotations, compute_metrics = False,save_masks = False, compute_video = False, verbose = True):
    for video in videos:
        # Load all images as np.array
        if verbose: print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
        video_folder = video["file_names"][0].split("/")[0]
        if verbose: print(f'Tracking Video {video_folder} with dimensions {video["width"]}x{video["height"]}')
        if verbose: print('Loading dataset images')
        images = load_images_from_folder(path_set,video['file_names'])

        # Load al poligon of first image to a usable mask
        if verbose: print('Creating first annotated mask for VOS model')
        ann = [a for a in annotations if a['video_id'] == video['id']]
        masks = [(annToMask(a, 0) * (i + 1))  for i, a in enumerate(ann) if annToMask(a, 0) is not None ]
        initial_mask = unifyMasks(masks, video['width'], video['height'])

        #Compute masks for all images
        if verbose:print('Computing all masks')
        model.xmem.clear_memory()
        masks, logits, painted_images = model.generator(images=images, template_mask=initial_mask)
        model.xmem.clear_memory()  

        folder_path = f'./result/{name}'
        if compute_metrics or compute_video or save_masks: 
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)

        if compute_metrics:
            if verbose: print('Computing Metrics')
            ground_truth_masks = load_all_masks_for_video(video)
            f_measure, iou, f_label, iou_label = compute_all_metrics(masks[1:],ground_truth_masks[1:])
            print(f_measure, iou, f_label, iou_label)
                
        if compute_video: 
            if verbose: print('Generating video')
            if video['width'] % 2 != 0 or video['height'] % 2 != 0: 
                painted_images = pad_to_divisible_by_two(painted_images)
            generate_video_from_frames(painted_images, output_path= folder_path + f"/{video['id']}_{video_folder}.mp4", fps = 10) 

        if save_masks:
            if verbose: print('Saving masks') 
            path_to_masks = folder_path + '/masks'
            if not os.path.exists(path_to_masks): os.makedirs(path_to_masks)
            for i,mask in enumerate(painted_images): 
                image = Image.fromarray(mask)
                image.save(os.path.join(path_to_masks, '{:05d}.png'.format(i + 1)))

                
    return masks, logits, painted_images

In [8]:
SAM_checkpoint = "./checkpoints/sam_vit_h_4b8939.pth"
SAM_hq_checkpoints = "./checkpoints/sam_hq_vit_h.pth"
xmem_checkpoint = "./checkpoints/XMem-s012.pth"
e2fgvi_checkpoint = "./checkpoints/E2FGVI-HQ-CVPR22.pth"
args = {
    'use_refinement' : False
        }
model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)

''' 
args = {
    'use_refinement' : True,
    'refinement_mode' : 'bbox'
         }
modelSamBbox = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)


args = {
   'use_refinement' : True,
   'refinement_mode' : 'point'
       }
modelSamPoint = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)



args = {
   'use_refinement' : True,
   'refinement_mode' : 'both'
       }
modelBoth = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)

'''
args = {
   'use_refinement' : True,
   'refinement_mode' : 'mask_bbox_pos_neg'
       }
modelBothNeg = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)


Initializing BaseSegmenter to cuda:0
Hyperparameters read from the model weights: C^k=64, C^v=512, C^h=64
Single object mode: False
Sam Refinement NOT ACTIVATED
Initializing BaseSegmenter to cuda:0
Hyperparameters read from the model weights: C^k=64, C^v=512, C^h=64
Single object mode: False
Sam Refinement ACTIVATED. Mode: mask_bbox_pos_neg


In [9]:
vidToTest = select_smaller_set(vidTrain,50)

In [11]:
masks, logits, painted_images = run_model_on_ovis_set(name = 'FirstName',model = model, path_set = ovis_images,videos = vidToTest[8:9],annotations = annTrain,compute_metrics = True, save_masks=False, compute_video=True,verbose = False)

Tracking image: 100%|██████████| 21/21 [00:00<00:00, 24.70it/s]


0.18012959829450154 0.11718963 {1: 0.16035959304449188, 2: 0.06073457267986593, 3: 0.1851564090402146, 4: 0.12183350504265537, 5: 0.0668668643459481, 6: 0.0403632372004753, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0} {1: 0.11090765322937288, 2: 0.03560684171917701, 3: 0.13267274252721278, 4: 0.08055986689446075, 5: 0.053229007279399644, 6: 0.024224088284481037, 7: 0.0, 8: 0.0, 9: 0.0, 10: 0.0}


In [36]:
masks, logits, painted_images = run_model_on_ovis_set(name = 'CCCC',model = modelBothNeg, path_set = ovis_images,videos = vidToTest[0:1],annotations = annTrain,compute_metrics = True, save_masks=True, compute_video=True,verbose = False)

Tracking image: 100%|██████████| 18/18 [00:06<00:00,  2.92it/s]


0.661423838068007 0.58558905 {1: 0.6622708741412114, 2: 0.52261799744606} {1: 0.6229903321812119, 2: 0.450181631876849}


In [17]:
painted_images[0].shape

(317, 640, 3)

In [20]:
pad_to_divisible_by_two(painted_images)[0].shape

(318, 640, 3)

JUNK TESTING 

In [None]:
images = load_images_from_folder(ovis_images,vidTrain[1]['file_names'])
plt.imshow(images[0])

In [None]:
count = 0
for video in vidTrain: 
    if video['width'] % 2 != 0 or video['height'] % 2 != 0:
        count+= 1
        print(video['width'], video['height'])
count