In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T

from PIL import Image
import requests
import matplotlib.pyplot as plt

import h5py
import glob
from tqdm import tqdm

In [3]:
from models.backbone import Backbone, Joiner
from models.position_encoding import PositionEmbeddingSine
from models.transformer import Transformer
from models.reltr import RelTR

position_embedding = PositionEmbeddingSine(128, normalize=True)
backbone = Backbone('resnet50', False, False, False)
backbone = Joiner(backbone, position_embedding)
backbone.num_channels = 2048

transformer = Transformer(d_model=256, dropout=0.1, nhead=8,
                          dim_feedforward=2048,
                          num_encoder_layers=6,
                          num_decoder_layers=6,
                          normalize_before=False,
                          return_intermediate_dec=True)

model = RelTR(backbone, transformer, num_classes=151, num_rel_classes = 51,
              num_entities=100, num_triplets=200)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")
    

model_path = "/home3/fsml62/LLM_and_SGG_for_MDE/SGG/RelTR/pretrained/checkpoint0149.pth" 
state_dict = torch.load(model_path)

model.load_state_dict(state_dict['model'])

model.eval()

model.to(device)

Using GPU: NVIDIA A100 80GB PCIe MIG 1g.10gb


RelTR(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerDecoderLayer(
          (self_attn_entity): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features

In [4]:
# Some transformation functions
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
          (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    print(out_bbox.shape)
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b

In [5]:
import os


data_path = "/home3/fsml62/LLM_and_SGG_for_MDE/dataset/nyu_depth_v2"
save_path = "/home3/fsml62/LLM_and_SGG_for_MDE/GNN_for_MDE/results/SGG/nyu_depth_v2"

# Check if save_path exists
if not os.path.exists(save_path):
    print(f"Creating directory: {save_path}")
    os.makedirs(save_path, exist_ok=True)
else:
    print(f"Directory already exists: {save_path}")

Directory already exists: /home3/fsml62/LLM_and_SGG_for_MDE/GNN_for_MDE/results/SGG/nyu_depth_v2


In [12]:
jpg_files = glob.glob(os.path.join(data_path, '**', '*.jpg'), recursive=True)

os.makedirs(save_path, exist_ok=True)

for img_path in tqdm(jpg_files, total=len(jpg_files)):

    relative_path = os.path.relpath(img_path, data_path)
    h5_filename = os.path.splitext(relative_path)[0] + '.h5'
    h5_file_path = os.path.join(save_path, h5_filename)
    
    os.makedirs(os.path.dirname(h5_file_path), exist_ok=True)
    
    
    im = Image.open(img_path)
    img = transform(im).unsqueeze(0).to(device)
    
    model.eval()
    model.to(device)

    outputs = model(img)
    
    with h5py.File(h5_file_path, 'w') as h5_file:

        for key, value in outputs.items():
            h5_file.create_dataset(key, data=value.detach().cpu().numpy())

    
print("Processing complete.")

100%|██████████| 1449/1449 [03:26<00:00,  7.02it/s]

Processing complete.





In [3]:
with h5py.File('/home3/fsml62/LLM_and_SGG_for_MDE/GNN_for_MDE/results/SGG/nyu_depth_v2/official_splits/train/printer_room/rgb_00448.h5', 'r') as f:
    outputs = {key: torch.tensor(f[key]) for key in f.keys()}


  outputs = {key: torch.tensor(f[key]) for key in f.keys()}


In [11]:
outputs['sub_boxes'][0]

tensor([[0.2887, 0.3825, 0.2953, 0.1416],
        [0.2369, 0.2324, 0.0598, 0.1156],
        [0.6234, 0.7847, 0.4410, 0.4206],
        [0.0777, 0.3373, 0.1512, 0.1748],
        [0.6441, 0.4414, 0.3045, 0.1077],
        [0.5009, 0.4992, 0.9995, 0.9985],
        [0.0760, 0.3371, 0.1489, 0.1756],
        [0.2634, 0.4531, 0.2592, 0.0940],
        [0.6231, 0.7856, 0.4509, 0.4204],
        [0.2730, 0.4532, 0.2803, 0.1177],
        [0.6163, 0.7848, 0.4474, 0.4170],
        [0.0863, 0.5870, 0.1751, 0.2838],
        [0.6218, 0.7845, 0.4448, 0.4207],
        [0.2994, 0.3937, 0.2841, 0.1867],
        [0.6211, 0.7835, 0.4458, 0.4244],
        [0.6207, 0.7860, 0.4367, 0.4150],
        [0.9219, 0.1810, 0.0454, 0.2193],
        [0.0881, 0.5886, 0.1771, 0.2877],
        [0.2225, 0.6882, 0.1039, 0.1054],
        [0.0886, 0.5894, 0.1800, 0.2883],
        [0.2842, 0.6823, 0.3241, 0.4025],
        [0.2350, 0.2325, 0.0584, 0.1129],
        [0.0876, 0.5873, 0.1780, 0.2917],
        [0.0843, 0.5912, 0.1708, 0

In [12]:
outputs['sub_boxes'][0,:, 2]

tensor([0.2953, 0.0598, 0.4410, 0.1512, 0.3045, 0.9995, 0.1489, 0.2592, 0.4509,
        0.2803, 0.4474, 0.1751, 0.4448, 0.2841, 0.4458, 0.4367, 0.0454, 0.1771,
        0.1039, 0.1800, 0.3241, 0.0584, 0.1780, 0.1708, 0.3296, 0.1762, 0.0426,
        0.0467, 0.0629, 0.3252, 0.4435, 0.4358, 0.0605, 0.1167, 0.0615, 0.3247,
        0.3194, 0.1783, 0.0548, 0.0465, 0.0591, 0.0561, 0.0608, 0.2867, 0.3282,
        0.4329, 0.3210, 0.4234, 0.4361, 0.4011, 0.3183, 0.2877, 0.1726, 0.2790,
        0.0654, 0.0998, 0.3221, 0.2023, 0.3172, 0.3193, 0.4368, 0.3121, 0.1795,
        0.4426, 0.1462, 0.0519, 0.3254, 0.4108, 0.1132, 0.3243, 0.4408, 0.4234,
        0.1395, 0.1796, 0.4001, 0.0498, 0.3358, 0.4267, 0.2388, 0.3202, 0.2659,
        0.4437, 0.1881, 0.3681, 0.3229, 0.0573, 0.1131, 0.3237, 0.2350, 0.1811,
        0.4348, 0.4420, 0.1819, 0.0469, 0.1065, 0.4390, 0.3273, 0.4344, 0.4430,
        0.4416, 0.1505, 0.3189, 0.4219, 0.4076, 0.2724, 0.3166, 0.3247, 0.0566,
        0.2988, 0.3272, 0.3903, 0.9999, 

In [15]:

# keep only predictions with >0.3 confidence
probas = outputs['rel_logits'].softmax(-1)[0, :, :-1]
probas_sub = outputs['sub_logits'].softmax(-1)[0, :, :-1]
probas_obj = outputs['obj_logits'].softmax(-1)[0, :, :-1]
keep = torch.logical_and(probas.max(-1).values > 0.3, torch.logical_and(probas_sub.max(-1).values > 0.3,
                                                                        probas_obj.max(-1).values > 0.3))

In [17]:
# convert boxes from [0; 1] to image scales
sub_bboxes_scaled = rescale_bboxes(outputs['sub_boxes'][0, keep], im.size)
obj_bboxes_scaled = rescale_bboxes(outputs['obj_boxes'][0, keep], im.size)
rel_list = torch.argmax(outputs['rel_logits'][0, keep], dim=1)

torch.Size([2, 4])
torch.Size([2, 4])


In [47]:
probas = outputs['rel_logits'].softmax(-1)[0, :, :-1]
probas_sub = outputs['sub_logits'].softmax(-1)[0, :, :-1]
probas_obj = outputs['obj_logits'].softmax(-1)[0, :, :-1]

keep = torch.logical_and(probas.max(-1).values > 0.3,
                         torch.logical_and(probas_sub.max(-1).values > 0.1,
                                           probas_obj.max(-1).values > 0.1))


sub_bboxes_scaled = rescale_bboxes(outputs['sub_boxes'][0, keep], im.size)
obj_bboxes_scaled = rescale_bboxes(outputs['obj_boxes'][0, keep], im.size)
rel_list = torch.argmax(outputs['rel_logits'][0, keep], dim=1)


# Store the filtered outputs in the dictionary
filtered_data = {
    'probas': probas[keep],
    'probas_sub': probas_sub[keep],
    'probas_obj': probas_obj[keep]
}

torch.Size([6, 4])
torch.Size([6, 4])


In [45]:
filtered_data['probas'].shape

torch.Size([6, 51])

In [48]:
obj_bboxes_scaled

tensor([[ 79.8119, 216.0722, 287.6272, 426.2114],
        [ 77.8434, 216.6798, 286.8960, 424.1441],
        [ 66.4892, 193.3925, 235.2505, 243.1502],
        [ 72.4065, 184.8528, 285.2302, 269.1777],
        [ 77.5479, 223.8046, 287.3661, 431.0623],
        [ 72.3060, 188.4806, 287.1606, 270.6227]])

In [51]:
outputs['rel_logits'][0, keep].shape

torch.Size([6, 52])