In [125]:
import cv2
import torch
import numpy as np

device = 'cuda' if torch.cuda.is_available() else 'cpu'

import sys, os
depth_anything_path = "/home/harry7557558/GitHub/external/Depth-Anything-V2/"
sys.path += [depth_anything_path]
current_directory = os.getcwd()
os.chdir(depth_anything_path)

from depth_anything_v2.dpt import DepthAnythingV2

model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}

encoder = 'vitl' # or 'vits', 'vitb', 'vitg'

model = DepthAnythingV2(**model_configs[encoder])
model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location='cpu'))
model = model.to(device).eval()

print(model)

DepthAnythingV2(
  (pretrained): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-23): 24 x NestedTensorBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop): Dropout(p=0.0, inplace=Fal

In [None]:
from PIL import Image

# img = "/media/harry7557558/New Volume/sfm_test/pride_float/sun_morning_20_16500/images/00160.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/pride_float/mon_evening_5/images/00460.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/solar_clock/10/images/00100.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/utias/20/images/01660.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/coffee/12/images/00516.jpg"
img = "/media/harry7557558/New Volume/sfm_test/monument/20/images/01220.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/bouquet/10/images/00340.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/mosque/8/images/01248.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/ndravaw/25/images/02000.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/ndravaw/25/images/04900.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/ndravaw2/20/images/04680.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/pit_241203/15/images/00165.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/DinoManTest_JPG_Ori/images/000093.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/chris_yip/20/images/00720.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/2t7/10/images/02430.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/engschool/5/images/00055.jpg"
# img = "/media/harry7557558/New Volume/sfm_test/sydney/imgs/180.jpg"
# img = "/media/harry7557558/New Volume/a2rl_gate/r3w1d1_4/images/00072.jpg"
# img = "/media/harry7557558/New Volume/a2rl_gate/r3w1d3_1/images/00560.jpg"
# img = "/media/harry7557558/New Volume/a2rl_gate/r3w1d3_1/undistorted/00560.jpg"
# img = "/media/harry7557558/New Volume/a2rl_gate/r3w2d1_6/undistorted/00150.jpg"

img = Image.open(img)

from PIL import ExifTags
for orientation in ExifTags.TAGS.keys():
    if ExifTags.TAGS[orientation]=='Orientation':
        break
if hasattr(img, '_getexif'):
    exif = img._getexif()
    if exif is not None and orientation in exif:
        print("exif orientation:", exif[orientation])
        if exif[orientation] == 3:
            img=img.rotate(180, expand=True)
        elif exif[orientation] == 6:
            img=img.rotate(270, expand=True)
        elif exif[orientation] == 8:
            img=img.rotate(90, expand=True)

x = img.convert("RGB")
if False:
    scale = 1024/(x.width*x.height)**0.5
    w = int(x.width*scale/14+0.5)*14
    h = int(x.height*scale/14+0.5)*14
    x = x.resize((w, h), Image.LANCZOS)

print("Image size:", x.size)

x = np.array(x)

from time import perf_counter
with torch.no_grad():
    time0 = perf_counter()
    
    y = model.infer_image(x)

    time1 = perf_counter()
    print(1000*(time1-time0), 'ms')

x = np.transpose(x, (2, 0, 1)).astype(np.float32) / 255.0
print(x.shape, y.shape)
print(y.min().item(), y.max().item())
print(np.median(y).item(), y.mean().item())

Image size: (1080, 1920)
606.5338740008883 ms
(3, 1920, 1080) (1920, 1080)
0.0 660.4945068359375
272.32489013671875 213.77529907226562


In [179]:
from plyfile import PlyData, PlyElement

def write_ply(rgb_array, depth_array, file_path):
    h, w = depth_array.shape
    num_points = w * h

    # Flatten arrays
    depth_flat = depth_array.flatten()
    rgb_flat = rgb_array.reshape((3, -1)).T

    # Create vertex data
    vertex = np.zeros(num_points, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), 
                                         ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')])
    vertex['x'] = np.tile(np.arange(w), h) / (w*h)**0.5
    vertex['y'] = w - np.repeat(np.arange(h), w) / (w*h)**0.5
    # vertex['z'] = depth_flat / (2.5*np.std(depth_flat))
    # depth_flat = 1.0 / np.clip(depth_flat+0.005, 0.1, np.inf)
    vertex['z'] = depth_flat / (3.0*np.mean(depth_flat))
    rgb = (np.clip(rgb_flat,0,1)*255+0.5).astype(np.uint8)
    vertex['red'] = rgb[:, 0]
    vertex['green'] = rgb[:, 1]
    vertex['blue'] = rgb[:, 2]

    faces = []
    depth = vertex['z']
    print(depth.shape)
    for y in range(h - 1):
        for x in range(w - 1):
            i00 = y * w + x
            i10 = i00 + 1
            i01 = (y + 1) * w + x
            i11 = i01 + 1
            gx = depth[i01]+depth[i11]-depth[i00]-depth[i10]
            gy = depth[i10]+depth[i11]-depth[i00]-depth[i01]
            if max(depth[i00], depth[i10], depth[i01], depth[i11]) < 1e-4:
                continue
            if np.hypot(gx, gy) < 20 / (w*h)**0.5:
                faces.append(([i00, i10, i11, i01], 0))

    faces = np.array(faces,
                     dtype=[('vertex_indices', 'i4', (4,)), ('i', 'u1')])

    # Create PlyElements
    vertex_element = PlyElement.describe(vertex, 'vertex')
    face_element = PlyElement.describe(faces, 'face')

    # Write to PLY file
    ply_data = PlyData([vertex_element, face_element])
    ply_data.write(file_path)


from scipy.ndimage import zoom

scale = 512/(x.shape[1]*x.shape[2])**0.5
# x = zoom(x, (1, scale, scale))
# y = zoom(y, (scale, scale))
write_ply(x, y, "depth_mesh.ply")


(2073600,)
