In [None]:
import torch
import torch.nn as nn

# import from guided-diffusion folder
from model import GuidedDiffusionNetwork
from ddpm_scheduler import DDPMScheduler
from scenes_dataset import ScenesDataset, DatasetConstants

In [None]:
import json

# Load the train and validation splits. Each JSON file keeps its list of
# scenes under the top-level 'scenes' key; the paths are relative to the
# directory the notebook is run from.
with open('datasets/data/train.json', 'r') as file:
    train_data = json.load(file)['scenes']

with open('datasets/data/val.json', 'r') as file:
    val_data = json.load(file)['scenes']

# The test split is not available yet; re-enable once test.json exists.
# with open('datasets/data/test.json', 'r') as file:
#     test_data = json.load(file)['scenes']

In [None]:
# Batch size used by the DataLoaders built in this notebook.
# NOTE(review): hparams['batch_size'] below is 32, but the loaders use B (128) — confirm this is intended.
B = 128 # num of scenes in batch

# Scene hyperparameters
N = 20 # num of objects in scene
D = 15 # dim of objects from the scene (columns 0:3 centroid, 3:12 axes, 12:15 sizes in the helpers below)

# Time hyperparameters
# NOTE(review): T is not read by the model setup in this notebook and is rebound by a later cell — confirm it is still needed.
T = 14

# Condition hyperparameters
C = 300 # dim of node features
R = 23+1 # num of relations (presumably the 23 named relationships plus one extra class — TODO confirm)

# Hyperparameters of the checkpoint loaded below. The first row holds fixed
# choices, the second row the values found by the hyperparameter search.
# NOTE(review): 'rgc_hidden_dims' is the string '()', not a tuple — presumably parsed inside the model; verify.
hparams = {
    # constants
    'epochs': 2000, 'scheduler_loss': 'l2', 'rgc_activation': 'tanh',
    # from hparam search
    'batch_size': 32, 'time_dim': 44, 'rgc_hidden_dims': '()', 'rgc_num_bases': 4, 'rgc_aggr': 'mean', 'rgc_dp_rate': 0.14463856683812687, 'rgc_bias': False, 'attention_self_head_dims': 30, 'attention_num_heads': 1, 'attention_cross_head_dims': 30, 'scheduler_timesteps': 1000, 'scheduler_beta_schedule': 'linear', 'cfg_cond_drop_prob': 0.16303181894889107, 'optimizer_lr': 0.000571096217369203, 'optimizer_weight_decay': 0.00010261093147577781, 'lr_scheduler_factor': 0.813888153675873, 'lr_scheduler_patience': 60, 'lr_scheduler_minlr': 0.00036368282361166394
}

In [None]:
# Sizes shared across the network: object count, conditioning width and the
# two layer widths (the second one is widened by the time-embedding size).
general_params = dict(
    num_obj=N,
    obj_cond_dim=C,
    layer_1_dim=D,
    layer_2_dim=D + hparams['time_dim'],
    time_dim=hparams['time_dim'],
)

# Attention settings. hparams stores the head sizes under plural keys
# ('..._dims'); they are handed on under the singular names used here.
attention_params = dict(
    attention_self_head_dim=hparams['attention_self_head_dims'],
    attention_num_heads=hparams['attention_num_heads'],
    attention_cross_head_dim=hparams['attention_cross_head_dims'],
)

# RGC settings: the relation count comes from R, everything else is copied
# from hparams under the same key.
rgc_params = dict(
    rgc_hidden_dims=hparams['rgc_hidden_dims'],
    rgc_num_relations=R,
    **{
        name: hparams[name]
        for name in ('rgc_num_bases', 'rgc_aggr', 'rgc_activation', 'rgc_dp_rate', 'rgc_bias')
    },
)

In [None]:
# Pick the compute device: CUDA when available, otherwise CPU.
# MPS is deliberately not offered: not all operations support it yet.
# elif torch.has_mps:
#     device = torch.device('mps')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# --- Range matrix handed to the scheduler, placed on the compute device
range_matrix = DatasetConstants.get_range_matrix().to(device)

# --- Instantiate the model
model = GuidedDiffusionNetwork(
    general_params=general_params,
    attention_params=attention_params,
    rgc_params=rgc_params,
    cond_drop_prob=hparams['cfg_cond_drop_prob']
)

# Load the best checkpoint. map_location remaps the stored tensors onto the
# selected device, so a checkpoint saved on a GPU machine also loads on the
# CPU fallback (without it torch.load fails when CUDA is unavailable).
model.load_state_dict(
    torch.load('models/val-model_0146_l2_all+CFG.pt', map_location=device)
)

print(f"Model:\n{model}")

# --- Wrap the model in the DDPM scheduler used for sampling
scheduler = DDPMScheduler(
    model=model,
    N=N,
    D=D,
    range_matrix = range_matrix,
    timesteps=hparams['scheduler_timesteps'],
    sampling_timesteps=None,
    loss_type=hparams['scheduler_loss'],
    objective='pred_noise',
    beta_schedule=hparams['scheduler_beta_schedule'],
    ddim_sampling_eta=1.0,
    min_snr_loss_weight=False,
    min_snr_gamma=5
)

print(f"DDPM Scheduler:\n{scheduler}")

# Move to device
model = model.to(device)
scheduler = scheduler.to(device)

# Inference only: switch both modules to eval mode
model.eval()
scheduler.eval()

In [None]:
from torch_geometric.loader import DataLoader

# Wrap the raw scene lists in the project's dataset class and batch them with
# the torch_geometric DataLoader. shuffle=False keeps the iteration order
# fixed; B scenes are requested per batch.
train_dataset = ScenesDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=B, shuffle=False)

val_dataset = ScenesDataset(val_data)
val_dataloader = DataLoader(val_dataset, batch_size=B, shuffle=False)

In [None]:
def generate_semseg_file(sampled_scene, labels=None, scan_id=0):
    """Export one scene to ``datasets/data/gen/{scan_id}_semseg.v2.json``.

    Every row of ``sampled_scene`` describes one object and is split into
    columns 0:3 (written as ``centroid``), 3:12 (``normalizedAxes``) and
    12:15 (``axesLengths``). The objects are stored under ``segGroups``
    next to the ``scan_id``. The output folder is created when missing.

    Args:
        sampled_scene: a single scene (not a batch) as a 2-D torch tensor or
            numpy array with at least 15 columns.
        labels: one label per object row. Required despite the ``None``
            default, which is kept for signature compatibility.
        scan_id: identifier stored in the file and used in its name; must be
            JSON-serializable.

    Raises:
        ValueError: if ``labels`` is ``None``.
    """
    # Local import: `os` is only imported by a later notebook cell.
    import os

    if labels is None:
        raise ValueError("labels must be provided: one label per object in the scene")

    # Export as many objects as have both a row and a label, instead of
    # assuming exactly 20 objects per scene.
    num_objects = min(len(labels), len(sampled_scene))

    objs = []
    for i in range(num_objects):
        row = sampled_scene[i]
        objs.append({
            'obb': {
                'centroid': row[0:3].tolist(),
                'normalizedAxes': row[3:12].tolist(),
                'axesLengths': row[12:15].tolist()
            },
            'label': labels[i],
            'dominantNormal': [0, 0, 0], # not used for now
        })

    # Store the sampled scene to visualize using DVIS
    encoded_scene = {
        'scan_id': scan_id,
        'segGroups': objs,
    }

    # Save the scene to a JSON file, creating the folder if it doesn't exist
    out_path = f'datasets/data/gen/{scan_id}_semseg.v2.json'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as file:
        json.dump(encoded_scene, file, indent=2)

## DVIS Visualizer

In [None]:
from dvis import dvis
from mathutils import Matrix
import numpy as np
from scipy.spatial.transform import Rotation
import json
import os

In [None]:
def encode_rotation(normalized_axes, rotation_angle, rotation_axis):
    """Right-multiply ``normalized_axes`` by an elementary rotation matrix.

    Args:
        normalized_axes: array-like that ``np.dot`` can multiply with a 3x3
            matrix on its right.
        rotation_angle: rotation angle in degrees.
        rotation_axis: one of ``'x'``, ``'y'`` or ``'z'``.

    Returns:
        ``np.dot(normalized_axes, R)`` where ``R`` is the rotation matrix
        about the requested axis.

    Raises:
        ValueError: if ``rotation_axis`` is not ``'x'``, ``'y'`` or ``'z'``.
    """
    theta = np.deg2rad(rotation_angle)
    cos_t, sin_t = np.cos(theta), np.sin(theta)

    # One candidate matrix per supported axis, checked in x, y, z order
    candidates = (
        ('x', [[1, 0, 0],
               [0, cos_t, -sin_t],
               [0, sin_t, cos_t]]),
        ('y', [[cos_t, 0, sin_t],
               [0, 1, 0],
               [-sin_t, 0, cos_t]]),
        ('z', [[cos_t, -sin_t, 0],
               [sin_t, cos_t, 0],
               [0, 0, 1]]),
    )

    for axis_name, rows in candidates:
        if rotation_axis == axis_name:
            return np.dot(normalized_axes, np.array(rows))

    raise ValueError("Invalid rotation axis. Supported values are 'x', 'y', and 'z'.")

def translate_corners(corners, translation):
    """Return ``corners`` shifted by ``translation`` (plain ``corners + translation``)."""
    return corners + translation

# Corners of the unit cube, one row per corner (y varies slowest, z fastest)
unit_cube_corners = np.array(
    [[x, y, z] for y in (0, 1) for x in (0, 1) for z in (0, 1)]
)

# Re-center the cube on the origin
centroid = unit_cube_corners.mean(axis=0)
unit_cube_corners = unit_cube_corners - centroid

# Identity orientation: the cube's axes start aligned with the world axes
normalized_axes = np.eye(3, dtype=int)

# Rotation to bake into the template cube (angle in degrees, about `axis`)
rotation_angle = 0
axis = 'y'
encoded_normalized_axes = encode_rotation(normalized_axes, rotation_angle, axis)

# Rotate the corners, then apply the (currently zero) translation
unit_cube_corners = unit_cube_corners.dot(encoded_normalized_axes)

translation = np.zeros(3, dtype=int)
unit_cube_corners = translate_corners(unit_cube_corners, translation)

# dvis(unit_cube_corners, 'corners', c=-1)

In [None]:
# Root folder of the dataset. The visualization helpers below resolve the
# split JSON files and the per-scene folders relative to it.
dataset_path = 'datasets/data'

def generate_corners(obj):
    """Compute the 8 corners of an object's oriented bounding box.

    Reads ``axesLengths``, ``centroid`` and ``normalizedAxes`` (9 values,
    reshaped to 3x3) from ``obj['obb']``. The y and z entries of the lengths
    and of the centroid are swapped; the axes matrix is used as given. Each
    corner of the module-level ``unit_cube_corners`` is scaled by the
    lengths, multiplied by the axes matrix and shifted by the centroid.

    Returns:
        An ``(8, 3)`` float array with one corner per row.
    """
    box = obj['obb']
    extents = np.array(box['axesLengths'])
    center = np.array(box['centroid'])
    rotation = np.array(np.reshape(box['normalizedAxes'], (3, 3)))

    # Swap y and z axes
    # rotation[[1, 2]] = rotation[[2, 1]] # TODO: rotation is off
    extents[[1, 2]] = extents[[2, 1]]
    center[[1, 2]] = center[[2, 1]]

    corners = np.zeros((8, 3))
    for idx in range(8):
        scaled = unit_cube_corners[idx] * extents
        corners[idx] = np.dot(rotation, scaled) + center

    return corners


def visualize_gt_dataset(split='val', t_max=44, T=None):
    """Visualize the ground truth scenes after filtering (same json file as the one used for training/val).

    Reads ``{dataset_path}/{split}.json`` and sends the bounding-box corners
    of every object whose label is not ``'none'`` to DVIS, using the scene's
    index in the file as the DVIS timestamp ``t``.

    Args:
        split: name of the split file (without ``.json``) inside ``dataset_path``.
        t_max: last scene index to consider; iteration stops once it is exceeded.
        T: when given, only the scene with this index is visualized.

    Raises:
        FileNotFoundError: if the split file does not exist.
    """
    semseg_file = os.path.join(dataset_path, f'{split}.json')

    # Raise instead of calling exit(1): exiting terminates the notebook
    # session and gives no hint about which file was missing.
    if not os.path.isfile(semseg_file):
        raise FileNotFoundError(f"Dataset split file not found: {semseg_file}")

    with open(semseg_file, 'r') as file:
        semseg_data = json.load(file)['scenes']

    for t, scene in enumerate(semseg_data):
        if t > t_max:
            break

        if T is not None and t != T:
            continue

        scene_matrix = scene['scene_matrix']
        labels = scene['labels']

        # Objects sharing a label share a color: the first occurrence of a
        # label fixes its color index. col_index advances on every drawn
        # object, so the assigned indices are unique but not consecutive.
        colors_labels_map = {}
        col_index = 0

        for i, row in enumerate(scene_matrix):
            label = labels[i]
            if label == 'none':
                continue

            obj = {
                'obb': {
                    'centroid': row[0:3],
                    'normalizedAxes': row[3:12],
                    'axesLengths': row[12:15]
                }
            }
            corners = generate_corners(obj)

            colors_labels_map[label] = colors_labels_map.get(label, col_index)
            col_index += 1
            # Pass the corners to the visualizer
            dvis(corners, "corners", name=label, c=colors_labels_map[label], t=t)
        


def visualize_scene(scene_id, t=0):
    """Visualize a single generated scene from a semseg file.

    Loads ``{dataset_path}/{scene_id}/{t}_semseg.v2.json`` and sends the
    bounding-box corners of every entry in ``segGroups`` whose label is not
    ``'none'`` to DVIS at timestamp ``t``.

    Args:
        scene_id: folder (relative to ``dataset_path``) containing the file.
        t: selects the file ``{t}_semseg.v2.json`` and is also used as the
            DVIS timestamp.

    Raises:
        FileNotFoundError: if the semseg file does not exist.
    """
    scan_folder_path = os.path.join(dataset_path, scene_id)

    # Raise instead of calling exit(1): exiting terminates the notebook
    # session and gives no hint about which file was missing.
    semseg_file = os.path.join(scan_folder_path, f'{t}_semseg.v2.json')
    if not os.path.isfile(semseg_file):
        raise FileNotFoundError(f"Semseg file not found: {semseg_file}")

    # Read and parse the semseg.v2.json file
    with open(semseg_file, 'r') as file:
        semseg_data = json.load(file)

    seg_groups = semseg_data['segGroups']

    # Objects sharing a label share a color: the first occurrence of a label
    # fixes its color index. col_index advances on every drawn object, so the
    # assigned indices are unique but not consecutive.
    colors_labels_map = {}
    col_index = 0

    for obj in seg_groups:
        label = obj['label']
        if label == 'none':
            continue

        corners = generate_corners(obj)

        colors_labels_map[label] = colors_labels_map.get(label, col_index)
        col_index += 1
        # Pass the corners to the visualizer
        dvis(corners, "corners", name=label, c=colors_labels_map[label], t=t)

# Ground Truth visualization

In [None]:
# Show only the validation scene at index 44 (T=44); t_max=45 ends the scan
# over the file right after that index.
visualize_gt_dataset('val', t_max=45, T=44)

In [None]:
from ddpm_scheduler import DDPMUtils

# Forward-diffusion demo: take one validation scene and show selected objects
# being progressively noised, one DVIS timestamp per diffusion step.

# Index of the validation scene to diffuse
scene_id = 44

semseg_file = os.path.join(dataset_path, 'val.json')

# Raise instead of calling exit(1): exiting terminates the notebook session
# and gives no hint about which file was missing.
if not os.path.isfile(semseg_file):
    raise FileNotFoundError(f"Dataset split file not found: {semseg_file}")

with open(semseg_file, 'r') as file:
    semseg_data = json.load(file)['scenes']

scene = semseg_data[scene_id]

scene_matrix = scene['scene_matrix']
labels = scene['labels']

colors_labels_map = {}
col_index = 0

# Per-label occurrence counters used to thin out the scene
# (counter_counter is tallied but not used by the filter below)
shelf_counter = 0
chair_counter = 0
counter_counter = 0

# Noise schedule for the demo.
# NOTE: this rebinds the notebook-level name T that an earlier cell defined.
T = 200
betas = DDPMUtils.linear_beta_schedule(T)
alphas = 1. - betas
alphas_cumprod = torch.cumprod(alphas, dim=0) # alpha_hat_t for every timestep


for i, row in enumerate(scene_matrix):
    if labels[i] == 'shelf':
        shelf_counter += 1
    if labels[i] == 'chair':
        chair_counter += 1
    if labels[i] == 'counter':
        counter_counter += 1
    # Keep the view uncluttered: skip padding and sinks, keep only the first
    # shelf and the first two chairs
    if labels[i] in ['none', 'sink'] or (labels[i] == 'shelf' and shelf_counter > 1) or (labels[i] == 'chair' and chair_counter > 2):
        continue

    # Objects sharing a label share a color; the first occurrence fixes it
    colors_labels_map[labels[i]] = colors_labels_map.get(labels[i], col_index)
    col_index += 1

    # Clean object parameters x_0, converted once. The closed-form formula
    # below must always start from x_0: feeding the previous noisy sample
    # back in would compound the noise across steps.
    x0 = torch.tensor(row, dtype=torch.float32)

    for t in range(50):
        # q(x_t | x_0) = sqrt(alpha_hat_t) * x_0 + sqrt(1 - alpha_hat_t) * eps
        noisy_row = torch.sqrt(alphas_cumprod[t]) * x0 + torch.sqrt(1. - alphas_cumprod[t]) * torch.randn_like(x0)

        obj = {
            'obb': {
                'centroid': noisy_row[0:3],
                'normalizedAxes': noisy_row[3:12],
                'axesLengths': noisy_row[12:15]
            }
        }
        corners = generate_corners(obj)

        # Pass the corners to the visualizer
        dvis(corners, "corners", name=labels[i], c=colors_labels_map[labels[i]], t=t)

# Diffusion process inference

Visualize how a scene gets diffused over time.

In [None]:
# Sample the first validation batch only (the loop exits after one pass) and
# export scene 0 of that batch at every step returned by the scheduler.
for batch in val_dataloader:
    # Conditioning inputs on the compute device.
    # batch.cond is read as [B*N, C] and needs to be reshaped to [B, N, C]
    obj_cond_batch = batch.cond.to(device).view(batch.num_graphs, N, C)
    edge_cond_batch = batch.edge_index.to(device)
    relation_cond_batch = batch.edge_attr.to(device)

    labels_batch = batch.labels

    # Run inference
    with torch.no_grad():
        # (!) NOTICE: return_all_samples=True makes the scheduler return all
        # the samples; each entry unpacks as (t, scenes)
        sampled_scenes = scheduler.sample(
            obj_cond_batch,
            edge_cond_batch,
            relation_cond_batch,
            cond_scale=3.,
            return_all_samples=True,
        )
        for t, t_sampled_scenes in sampled_scenes:
            # only export the first scene of the batch
            sampled_scene, labels = t_sampled_scenes[0], labels_batch[0]
            generate_semseg_file(sampled_scene, labels=labels, scan_id=t)

    # do one batch only
    break

In [None]:
#4, 12

# Position inside the sampled batch of the scene to export
i = 17

# Re-export that scene at every step kept in sampled_scenes; the files written
# by the previous cell are overwritten because the scan_id values repeat
for t, t_sampled_scenes in sampled_scenes:
    sampled_scene, labels = t_sampled_scenes[i], labels_batch[i]
    generate_semseg_file(sampled_scene, labels=labels, scan_id=t)

In [None]:
# Sub-folder of dataset_path that holds the exported `{t}_semseg.v2.json` files
scene_id = 'gen'

# Single scene visualization
# visualize_scene(scene_id)

# Visualize every 10th exported step (t = 0, 10, ..., 390), each at its own DVIS timestamp.
# NOTE(review): described as "reverse order (clean to noisy)"; whether t=0 is the
# clean sample depends on how the scheduler numbers the returned steps — confirm.
for t in range(0, 400, 10):
    visualize_scene(scene_id, t=t)

# Benchmarking alignment

Generate scenes from all conditions from val_dataset, visualize them, and manually compute alignment score (see paper).

In [None]:
# Running index over every generated scene; used as the scan_id of its file
counter = 0

for batch in val_dataloader:
    # Conditioning inputs on the compute device.
    # batch.cond is read as [B*N, C] and needs to be reshaped to [B, N, C]
    obj_cond_batch = batch.cond.to(device).view(batch.num_graphs, N, C)
    edge_cond_batch = batch.edge_index.to(device)
    relation_cond_batch = batch.edge_attr.to(device)

    labels_batch = batch.labels

    # Run inference
    with torch.no_grad():
        # return_all_samples=False: only the final samples are returned,
        # indexed along dim 0 by position in the batch
        sampled_scenes = scheduler.sample(
            obj_cond_batch,
            edge_cond_batch,
            relation_cond_batch,
            cond_scale=3.,
            return_all_samples=False,
        )
        for i in range(sampled_scenes.shape[0]):
            generate_semseg_file(sampled_scenes[i], labels=labels_batch[i], scan_id=counter)
            counter += 1

In [None]:
# Sub-folder of dataset_path that is scanned for scene folders
val_path = 'val'

# Scan all folders inside the dataset/val folder.
# NOTE(review): because of the `t == 0` check only the first entry returned by
# os.listdir is visualized (reading its `0_semseg.v2.json`), and os.listdir
# yields entries in arbitrary order — confirm this is the intended scene.
for t, folder in enumerate(os.listdir(os.path.join(dataset_path, val_path))):
    if t == 0:
        visualize_scene(os.path.join(val_path, folder), t=t)

In [None]:
# Index of the validation scene to describe
t = 0

# Relationship names, looked up by the integer ids stored in the scene graph.
# NOTE(review): ids are used as 0-based indices into this 23-entry list, while
# the model is configured with R = 23+1 relations — confirm no offset is needed.
all_relationships = [
    'left', 'right', 'close by', 'behind', 'front', 'attached to',
    'standing on', 'lower than', 'higher than', 'lying on', 'smaller than',
    'bigger than', 'hanging on', 'supported by', 'standing in',
    'leaning against', 'build in', 'lying in', 'connected to', 'belonging to',
    'cover', 'part of', 'hanging in',
]

# Get the t-th scene from (!) val_data and pull out its labels and graph
scene = val_data[t]
labels, edges, relationships = scene['labels'], scene['graph_edges'], scene['graph_relationships']

print(f"Labels: {labels}")
print(f"Edges: {edges}")
print(f"Relationships: {relationships}")

# edges[0] / edges[1] hold the first / second object index of every edge and
# relationships[i] the relation id of edge i; print one sentence per edge
for i, src in enumerate(edges[0]):
    obj1 = labels[src]
    obj2 = labels[edges[1][i]]
    relationship = all_relationships[relationships[i]]
    print(f"{obj1} is {relationship} {obj2}")

print(f"Total number of relationships: {len(relationships)} for scene id: {t}")

## Alignment score

In [None]:
import numpy as np

# Manually collected results, keyed by scene id. Each value is a tuple
# (flags, num_relations): one 0/1 flag per relationship that was checked and
# the number of relationships in the scene.
gt_results = {
    # example
    # '-1': ([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1], 12)
    '0': ([], )
}

def compute_alignment(results):
    """Average, over scenes, the fraction of relationships marked correct.

    Args:
        results: dict mapping a scene key to ``(flags, num_relations)``, where
            ``flags`` is a sequence of 0/1 values. ``num_relations`` may be
            omitted (1-tuple); it then defaults to ``len(flags)``.

    Returns:
        The mean of ``sum(flags) / num_relations`` over all scenes that have
        at least one relation, as a float. Scenes with zero relations are
        skipped because they have nothing to align. Returns ``0.0`` when no
        scene can be scored (empty dict or only empty placeholders).
    """
    per_scene_scores = []
    for entry in results.values():
        flags = entry[0]
        # Tolerate placeholder entries that omit the relation count
        num_relations = entry[1] if len(entry) > 1 else len(flags)
        if num_relations == 0:
            continue
        per_scene_scores.append(np.sum(flags) / num_relations)

    if not per_scene_scores:
        return 0.0
    return float(sum(per_scene_scores) / len(per_scene_scores))

print(compute_alignment(gt_results))