Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

save depth maps first, and then validate metrics on NYUv2, got rmse of 0.375, but rmse in paper is 0.27 #111

Open
suzdl opened this issue Mar 28, 2024 · 1 comment

Comments

@suzdl
Copy link

suzdl commented Mar 28, 2024

I load the model "ZoeD_M12_N.pt", and run the general following code with little modification
, to save metrics depth map first.

import torch
import os
import datetime

import numpy as np

from PIL import Image
from pathlib import Path
from torchvision import transforms

from zoedepth.models.builder import build_model
from zoedepth.utils.config import get_config

from zoedepth.data.preprocess import get_black_border

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

def main():
    datas_dir = 'nyu_scenes_split/test'

    method = 'mde_zoedepth_eval'

    crop_black_or_white_border_ = False

    # conf = get_config("zoedepth", "infer")
    conf = get_config("zoedepth", "eval")
    model_zoe_n = build_model(conf)
    zoe = model_zoe_n.to('cuda')

    dataset_path = Path(datas_dir)
    count = 0
    for img_dir in dataset_path.rglob('*.jpg'):
        img_dir = str(img_dir)
        if 'rgb_' in img_dir:
            count += 1

            
            if crop_black_or_white_border_:
                rgb_pil = Image.open(img_dir).convert('RGB')
                w, h = rgb_pil.size
                crop_params = get_black_border(np.array(rgb_pil, dtype=np.uint8))
                rgb_pil = rgb_pil.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom))
                rgb_array = np.array(rgb_pil)
                rgb_array = np.pad(rgb_array, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)), mode='reflect')
                rgb_pil = Image.fromarray(rgb_array)
            else:
                rgb_pil = Image.open(img_dir).convert('RGB')


            with torch.no_grad():
                mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")*1000
                # mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")

            mde_array = mde_array.astype(np.uint16)

            mde_pil = Image.fromarray(mde_array)
            save_dir = img_dir.replace('test', method)
            save_dir = save_dir.replace('jpg', 'png')
            save_dir = save_dir.replace('rgb', 'mde')
            img_name = save_dir.split('/')[-1]
            saves_dir = save_dir.replace(img_name, '')
            os.makedirs(saves_dir, exist_ok=True)
            mde_pil.save(save_dir)
            
            # save_dir = img_dir.replace('test', method)
            # save_dir = save_dir.replace('jpg', 'npy')
            # save_dir = save_dir.replace('rgb', 'mde')
            # img_name = save_dir.split('/')[-1]
            # saves_dir = save_dir.replace(img_name, '')
            # os.makedirs(saves_dir, exist_ok=True)
            # np.save(save_dir, mde_array)
        else:
            count += 0
            
    print(f'{method} has generated {count} mdes, saved to {saves_dir}')


if __name__ == '__main__':
    tm_begin = datetime.datetime.now()
    print('tm_begin: ', tm_begin)
    main()
    tm_end = datetime.datetime.now()
    print('tm_begin: ', tm_begin)
    print('tm_end: ', tm_end)

in this way, I will get the output metric depth in mm, and I test it by the following code, got bad rmse of 0.375 by the following code:

from PIL import Image
from pathlib import Path

import datetime
import numpy as np
import torch
import math
import torch.nn.functional as F

def gaussian(window_size, sigma):
    gauss = torch.Tensor([math.exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)])
    return gauss/gauss.sum()
def create_window(window_size, channel=1):
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
    window = _2D_window.expand(channel, 1, window_size, window_size).contiguous()
    return window
def compute_ssim(pre_gsmaps, gt_gsmaps, window_size=11, size_average=True, full=False, val_range=None, window=None):
    if val_range is None:
        if torch.max(pre_gsmaps) > 128:
            max_val = 255
        else:
            max_val = 1
        if torch.min(pre_gsmaps) < -0.5:
            min_val = -1
        else:
            min_val = 0
        L = max_val - min_val
    else:
        L = val_range
    padd = 0
    (_, channel, height, width) = pre_gsmaps.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channel).to(pre_gsmaps.device)

    mu1 = F.conv2d(pre_gsmaps, window, padding=padd, groups=channel)
    mu2 = F.conv2d(gt_gsmaps, window, padding=padd, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(pre_gsmaps * pre_gsmaps, window, padding=padd, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(gt_gsmaps * gt_gsmaps, window, padding=padd, groups=channel) - mu2_sq
    sigma12 = F.conv2d(pre_gsmaps * gt_gsmaps, window, padding=padd, groups=channel) - mu1_mu2
 
    C1 = (0.01 * L) ** 2
    C2 = (0.03 * L) ** 2
 
    v1 = 2.0 * sigma12 + C2
    v2 = sigma1_sq + sigma2_sq + C2
    cs = torch.mean(v1 / v2)  # contrast sensitivity
 
    ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)
 
    if size_average:
        ret = ssim_map.mean() # 无mean返回map
        # ret = ssim_map
    else:
        ret = ssim_map.mean(1).mean(1).mean(1)
 
    if full:
        return ret, cs
    return ret

def gen_1d_rand_pairs(gt_1d_array, point_num):
    if np.array(gt_1d_array.shape).shape != (1,):
        raise ValueError('array must be 1d')
    np.random.seed(int(gt_1d_array[0]))
    length = gt_1d_array.shape[0]
    pos = np.random.rand(2, point_num)
    pos *= np.array([[length], [length]])
    pos = np.floor(pos)
    return pos

def ranking_1d_eval(pre, gt, positions, t=0.03):
    idx1 = positions[0].astype(np.int64)
    idx2 = positions[1].astype(np.int64)

    z_1_pre = pre[idx1]
    z_2_pre = pre[idx2]
    rela_pre = z_1_pre/z_2_pre

    z_1_gt = gt[idx1]
    z_2_gt = gt[idx2]
    rela_gt = z_1_gt/z_2_gt

    mask1 = z_1_gt == 0
    mask2 = z_2_gt == 0
    mask = mask1 * mask2

    z_1_pre[mask == True] = 1
    z_2_pre[mask == True] = 1
    z_1_gt[mask == True] = 1
    z_2_gt[mask == True] = 1
    
    mask_list_pre = np.zeros_like(rela_pre)
    mask_list_pre[rela_pre>(1+t)] = 1
    mask_list_pre[rela_pre<(1-t)] = -1
    
    mask_list_gt = np.zeros_like(rela_pre)
    mask_list_gt[rela_gt>(1+t)] = 1
    mask_list_gt[rela_gt<(1-t)] = -1

    diff_mask = mask_list_gt - mask_list_pre
    wrong_points = np.count_nonzero(diff_mask)

    return wrong_points/len(rela_gt)

def compute_errors(gt, pred):

    rand_pairs = gen_1d_rand_pairs(gt, point_num=50000)
    oe = ranking_1d_eval(pred, gt, rand_pairs) * 100

    pre_tensor = torch.from_numpy(pred).unsqueeze(0).unsqueeze(0).unsqueeze(0)
    gt_tensor = torch.from_numpy(gt).unsqueeze(0).unsqueeze(0).unsqueeze(0)
    ssim = compute_ssim(pre_tensor, gt_tensor)

    ms = (gt - pred) ** 2
    psnr = 20 * np.log10(255 / np.sqrt(ms.mean()))

    mae = np.mean(np.abs(gt - pred))

    igt = 1 / gt
    ipred = 1 / pred

    iabsrel = np.mean(np.abs(igt - ipred) / igt)
    irmse = np.sqrt(np.mean((igt - ipred) ** 2))
    imae = np.mean(np.abs(igt - ipred))

    thresh = np.maximum((gt / pred), (pred / gt))
    d1 = (thresh < 1.25).mean()
    d2 = (thresh < 1.25 ** 2).mean()
    d3 = (thresh < 1.25 ** 3).mean()

    rmse = (gt - pred) ** 2
    rmse = np.sqrt(rmse.mean())

    logrmse = (np.log(gt) - np.log(pred)) ** 2
    logrmse = np.sqrt(logrmse.mean())

    absrel = np.mean(np.abs(gt - pred) / gt)
    sqrel = np.mean(((gt - pred) ** 2) / gt)

    err = np.log(pred) - np.log(gt)
    silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100

    err = np.abs(np.log10(pred) - np.log10(gt))
    log10 = np.mean(err)

    # return np.array([silog, absrel, log10, rmse, sqrel, logrmse, d1, d2, d3])
    return np.array([oe, psnr, ssim, absrel, rmse, mae, iabsrel, irmse, imae, d1*100, d2*100, d3*100])

def main():

    # method = 'mde_newcrfs'
    # method = 'mde_zoedepth'
    method = 'mde_zoedepth_eval'

    gts_path = Path('nyu_scenes_split/test')
    measures_sum = np.zeros(12)
    cnt = 0
    for gt_path in gts_path.rglob('*.png'):
        gt_dir = str(gt_path)

        mde_dir = gt_dir.replace('test', method)
        mde_dir = mde_dir.replace('sync_depth', 'mde')
        # mde_dir = mde_dir.replace('png', 'npy')

        mde_array = np.array(Image.open(mde_dir), dtype=np.float32)/1000
        # mde_array = np.load(mde_dir)
        gt_array = np.array(Image.open(gt_dir), dtype=np.float32)/1000

        # a = gt_array.shape
        # import cv2
        # mde_array = cv2.resize(mde_array, (gt_array.shape[1], gt_array.shape[0]), interpolation=cv2.INTER_CUBIC)

        mde_array[mde_array < 1e-3] = 1e-3
        mde_array[mde_array > 10] = 10
        mde_array[np.isinf(mde_array)] = 10
        mde_array[np.isnan(mde_array)] = 1e-3

        valid_mask = np.logical_and(gt_array > 1e-3, gt_array < 10)
        eval_mask = np.zeros(valid_mask.shape)
        eval_mask[45:471, 41:601] = 1
        valid_mask = np.logical_and(valid_mask, eval_mask)
        
        measures_sum += compute_errors(gt_array[valid_mask], mde_array[valid_mask])
        cnt += 1

    measures_sum /= cnt
    print('oe psnr ssim absrel rmse mae iabsrel irmse imae d1 d2 d3')
    print(f'{measures_sum[0]:.3f} {measures_sum[1]:.3f} {measures_sum[2]:.3f} {measures_sum[3]:.3f} {measures_sum[4]:.3f} {measures_sum[5]:.3f} {measures_sum[6]:.3f} {measures_sum[7]:.3f} {measures_sum[8]:.3f} {measures_sum[9]:.1f} {measures_sum[10]:.1f} {measures_sum[11]:.1f}')
    
        

if __name__ == '__main__':
    tm_begin = datetime.datetime.now()
    print('tm_begin: ', tm_begin)
    main()
    tm_end = datetime.datetime.now()
    print('tm_begin: ', tm_begin)
    print('tm_end: ', tm_end)

I compute the rmse in the same way for the method "NeWCRFs", and got the right rmse of 0.333. However, when I tried zoedepth, I got the rmse of 0.375. As you can see in my annotation, I have tried to crop the black border of RGB the same as Zoedepth, changed the mode of 'inter' into 'eval', and also tried to save the npy file to avoid round-off error, but none of them work. An interesting thing is that when I directly run the evaluate.py by the author, I got the right rmse of 0.27.

Directly utilize "mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")" is so convenient to save depth map, but what makes the metrics down?

@suzdl suzdl changed the title save imgs first and validate metrics on NYUv2, got rmse of 0.375, but rmse in paper is 0.27 save depth maps first, and then validate metrics on NYUv2, got rmse of 0.375, but rmse in paper is 0.27 Mar 28, 2024
@suzdl
Copy link
Author

suzdl commented Mar 29, 2024

solve it by the following code

#mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")*1000
mde_array = zoe.infer_pil(rgb_pil, pad_input=False, output_type="numpy")*1000

the parameter pad_input may make a confusing image with padded border in NYUv2, like this
test_before

this confusing operation not only affects the resolution of RGB input, but also affects the focal length of NYUv2.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant