In [4]:
import torch
import torch.nn.functional as F
import numpy as np
import pickle
import enum
import os
# Create the output directory up front so later cells can write into it;
# exist_ok=True makes re-running this cell harmless.
os.makedirs('./processed_dataset', exist_ok=True)

# Banner plus environment information, recorded in the cell output for provenance.
print("Starting data preprocessing pipeline...")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Set random seeds for reproducibility (PyTorch and NumPy global generators).
torch.manual_seed(42)
np.random.seed(42)


Starting data preprocessing pipeline...
PyTorch version: 2.0.1
CUDA available: True


In [5]:
# Define utility functions and enums
class NodeType(enum.IntEnum):
    """Integer codes for the node categories used by this preprocessing notebook.

    ``SIZE`` is not a category of its own: it equals the number of real
    categories and is passed as the class count when node types are one-hot
    encoded in ``preparation``.
    """
    NORMAL = 0  # presumably ordinary (non-boundary) mesh nodes — TODO confirm against the dataset spec
    WALL_BOUNDARY = 1  # raw node-type code 6 is remapped to this value by preprocess_node_types
    SIZE = 2  # count of the categories above; used as num_classes for F.one_hot

def triangles_to_edges(faces):
    """
    Build the unique edge set of a triangle mesh.

    Each triangle contributes its three sides. Every side is stored with the
    larger node index first, sides shared by neighbouring triangles are
    de-duplicated, and the result is returned both one-directional and
    mirrored (each edge followed by its reverse).

    Args:
        faces (torch.Tensor): Integer node indices of each triangle, shape [F, 3].

    Returns:
        tuple: ``(two_way_connectivity, min_to_max_connectivity)``.
            ``min_to_max_connectivity`` is an int64 tensor of shape [2, E]
            whose row 0 holds the larger and row 1 the smaller endpoint index
            of every unique edge. ``two_way_connectivity`` is an int64 tensor
            of shape [2, 2E]: the same edges followed by their reversed copies.
    """
    # The three sides of every triangle: (v0, v1), (v1, v2), (v2, v0).
    corner_pairs = ((0, 1), (1, 2), (2, 0))
    all_sides = torch.cat([faces[:, [a, b]] for a, b in corner_pairs], dim=0)

    # Canonical orientation (larger index first) so shared sides compare equal.
    larger = all_sides.max(dim=1).values
    smaller = all_sides.min(dim=1).values
    unique_pairs = torch.unique(torch.stack((larger, smaller), dim=1), dim=0)

    senders = unique_pairs[:, 0].to(torch.int64)
    receivers = unique_pairs[:, 1].to(torch.int64)

    min_to_max_connectivity = torch.stack((senders, receivers), dim=0)
    # Appending the row-swapped copy yields every edge in both directions.
    two_way_connectivity = torch.cat(
        (min_to_max_connectivity, min_to_max_connectivity.flip(0)), dim=1
    )

    return two_way_connectivity, min_to_max_connectivity

def preprocess_node_types(data_list, raw_wall_code=6):
    """
    Remap the raw wall-boundary node code to ``NodeType.WALL_BOUNDARY``.

    The input is not modified: every time-step dict is shallow-copied and its
    'node_type' entry is replaced by a fresh NumPy array in which all entries
    equal to ``raw_wall_code`` are set to ``NodeType.WALL_BOUNDARY``. All other
    entries of the dict are shared with the input (shallow copy).

    Args:
        data_list (list): Trajectories; each trajectory is a sequence of
            per-time-step dicts that contain a 'node_type' array.
        raw_wall_code (int): Node-type code in the raw data that marks wall
            boundary nodes. Defaults to 6, the value previously hard-coded.

    Returns:
        list: Same nesting as ``data_list`` with standardized 'node_type'
        arrays (always ``numpy.ndarray``; dtype of an ndarray input is kept).
    """
    processed_data = []

    for trace in data_list:
        processed_trace = []
        for time_step in trace:
            processed_step = time_step.copy()

            # np.array copies by default, so the caller's array stays untouched.
            node_types = np.array(processed_step['node_type'])
            # Vectorized remap: one mask assignment instead of a Python loop
            # over every node; also works for arrays of any rank.
            node_types[node_types == raw_wall_code] = NodeType.WALL_BOUNDARY

            processed_step['node_type'] = node_types
            processed_trace.append(processed_step)

        processed_data.append(processed_trace)

    return processed_data

print("Utility functions defined successfully.")


Utility functions defined successfully.


In [6]:
# Load raw data
print("Loading raw data...")

# Define data paths
dataset_path = "./raw_dataset/train.pickle"
test_path = "./raw_dataset/test.pickle"
# NOTE: processed_dir is read as a global by preparation() and get_stats()
# when they build their output paths, so this cell must run before them.
processed_dir = "./processed_dataset/"

# Load training and test data
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open(dataset_path, 'rb') as file:
    data_train = pickle.load(file)
    print(f"Loaded training data: {len(data_train)} trajectories")

with open(test_path, 'rb') as file:
    data_test = pickle.load(file)
    print(f"Loaded test data: {len(data_test)} trajectories")

# Show data structure: data[i][t] is the dict for time step t of trajectory i.
print(f"Sample data keys: {data_train[0][0].keys()}")
print(f"Test data keys: {data_test[0][0].keys()}")

# Optional subset selection for quick experiments (currently disabled, so the
# "Selected ..." messages below simply report the full dataset sizes).
# # Select subsets for processing
# data_train = data_train[0:10]
# data_test = data_test[4:6]

print(f"Selected {len(data_train)} training trajectories")
print(f"Selected {len(data_test)} test trajectories")

# Preprocess node types: remap raw boundary code 6 to NodeType.WALL_BOUNDARY.
# The names are rebound to the processed copies; the raw data stays on disk.
data_train = preprocess_node_types(data_train)
data_test = preprocess_node_types(data_test)

print("Node type preprocessing completed.")


Loading raw data...
Loaded training data: 30 trajectories
Loaded test data: 6 trajectories
Sample data keys: dict_keys(['cells', 'mesh_pos', 'node_type', 'world_pos', 'prev|world_pos', 'target|world_pos', 'velocity', 'dt', 'youngs'])
Test data keys: dict_keys(['cells', 'mesh_pos', 'node_type', 'world_pos', 'prev|world_pos', 'target|world_pos', 'velocity', 'dt', 'youngs'])
Selected 30 training trajectories
Selected 6 test trajectories
Node type preprocessing completed.


In [7]:
def preparation(save_name, data, max_time_steps=400, dt=0.1):
    """
    Convert raw trajectories into stacked graph tensors and save them to disk.

    For every processed time step the function builds the bidirectional edge
    list of the triangle mesh, relative edge features in world and mesh space
    (difference vector plus its Euclidean norm), a finite-difference velocity,
    a per-node Young's modulus feature and a one-hot node type. The per-step
    tensors of each trajectory are stacked along a new leading time dimension,
    and the list of per-trajectory dicts is written with ``torch.save`` to
    ``<processed_dir>/<save_name>.pt`` (``processed_dir`` is a notebook-level
    global that must be defined before this function is called).

    Args:
        save_name (str): Base name (without extension) of the output file.
        data (list): Trajectories; each is a sequence of per-time-step dicts
            with the keys 'world_pos', 'prev|world_pos', 'target|world_pos',
            'cells', 'youngs', 'mesh_pos' and 'node_type'.
        max_time_steps (int): Only the first ``max_time_steps`` steps of each
            trajectory are processed. Defaults to 400, the previously
            hard-coded limit.
        dt (float): Time-step length used for the finite-difference velocity.
            Defaults to 0.1, the previously hard-coded value.
            NOTE(review): the raw records also carry a 'dt' entry that is not
            read here — confirm it matches this default.

    Returns:
        None: Saves processed data to file.

    Raises:
        RuntimeError: From ``torch.stack`` if a trajectory yields no time
            steps, or if tensor shapes differ between time steps.
    """
    feature_keys = (
        "edge_index",
        "edge_attr",
        "mesh_edge_attr",
        "cells",
        "world_pos",
        "prev_world_pos",
        "target_world_pos",
        "velocity",
        "youngs",
        "one_hot_node_type",
        "node_type",
    )
    data_to_graph = []

    for i, trajectory in enumerate(data):
        print(f"Processing trajectory: {i}")
        tra = {key: [] for key in feature_keys}

        # Limit time steps
        for ts in range(min(len(trajectory), max_time_steps)):
            inputs = trajectory[ts]

            # Extract and convert data (squeeze drops singleton dimensions).
            world_pos = torch.squeeze(torch.tensor(inputs['world_pos']))
            prev_world_pos = torch.squeeze(torch.tensor(inputs['prev|world_pos']))
            target_world_pos = torch.squeeze(torch.tensor(inputs['target|world_pos']))
            cells = torch.squeeze(torch.tensor(inputs['cells']))
            youngs = torch.squeeze(torch.tensor(inputs['youngs']))
            mesh_pos = torch.squeeze(torch.tensor(inputs['mesh_pos']))
            node_type = torch.tensor(inputs['node_type'])

            # Calculate velocity by backward finite difference.
            velocity = (world_pos - prev_world_pos) / dt

            # Create one-hot encoding for node types; column 0 of node_type
            # holds the integer code.
            one_hot_node_type = F.one_hot(node_type[:, 0].to(torch.int64), NodeType.SIZE)

            # Prepare material properties: broadcast the single Young's modulus
            # value to every node and rescale by 1e6.
            # NOTE(review): presumably a unit conversion (e.g. Pa -> MPa) — confirm.
            node_number = world_pos.shape[0]
            youngs = youngs.reshape(1, 1).repeat(node_number, 1) / 1e6

            # Generate edge connectivity (each edge in both directions).
            edge_index, _ = triangles_to_edges(cells)

            # Calculate edge features: endpoint differences in mesh space and
            # world space, each followed by its L2 norm.
            m_ij = mesh_pos[edge_index[0]] - mesh_pos[edge_index[1]]
            u_ij = world_pos[edge_index[0]] - world_pos[edge_index[1]]

            m_ij_norm = torch.norm(m_ij, p=2, dim=1, keepdim=True)
            u_ij_norm = torch.norm(u_ij, p=2, dim=1, keepdim=True)

            step_features = {
                "edge_index": edge_index.long(),
                "edge_attr": torch.cat((u_ij, u_ij_norm), dim=-1).float(),
                "mesh_edge_attr": torch.cat((m_ij, m_ij_norm), dim=-1).float(),
                "cells": cells,
                "world_pos": world_pos,
                "prev_world_pos": prev_world_pos,
                "target_world_pos": target_world_pos,
                "velocity": velocity.float(),
                "youngs": youngs.float(),
                "one_hot_node_type": one_hot_node_type.float(),
                "node_type": node_type.float(),
            }

            # Store processed data
            for key in feature_keys:
                tra[key].append(step_features[key])

        # Stack temporal dimension
        data_to_graph.append(
            {key: torch.stack(tra[key], dim=0) for key in feature_keys}
        )

    # Guarded so that an empty `data` list does not raise IndexError.
    if data_to_graph:
        print(f"Sample processed data keys: {data_to_graph[0].keys()}")
    torch.save(data_to_graph, os.path.join(processed_dir, f'{save_name}.pt'))
    print(f"Saved processed data to {save_name}.pt")

print("Data preparation function defined.")


Data preparation function defined.


In [8]:
def normalize(to_normalize, mean_vec, std_vec):
    """Standardize ``to_normalize``: subtract ``mean_vec``, then divide by ``std_vec``."""
    centered = to_normalize - mean_vec
    return centered / std_vec

def unnormalize(to_unnormalize, mean_vec, std_vec):
    """Invert ``normalize``: multiply by ``std_vec``, then add ``mean_vec``."""
    rescaled = to_unnormalize * std_vec
    return rescaled + mean_vec

def get_stats(data_list, save_name):
    """
    Calculate normalization statistics for the dataset and save them.

    Velocity, Young's modulus, world-space edge features and mesh-space edge
    features are each flattened over all trajectories, time steps and
    nodes/edges into ``[rows, feature_dim]``. For every feature the
    per-column mean and (unbiased) standard deviation are computed; standard
    deviations are clamped to at least 1e-8 so later division is safe. The
    node and edge input feature widths are stored alongside. The result dict
    is written with ``torch.save`` to ``<processed_dir>/<save_name>.pt``
    (``processed_dir`` is a notebook-level global).

    Args:
        data_list (list): Processed trajectories; each is a dict with tensor
            entries 'velocity', 'youngs', 'edge_attr', 'mesh_edge_attr' and
            'one_hot_node_type' whose last dimension is the feature dimension.
        save_name (str): Base name (without extension) of the output file.

    Returns:
        None: Saves statistics to file.
    """
    eps = 1e-8

    def _flatten(key):
        """Concatenate feature `key` over all trajectories as [rows, feature_dim]."""
        # The feature width comes from the tensor itself (previously velocity
        # was hard-coded to 3 columns, unlike the other features).
        return torch.cat(
            [traj[key].reshape(-1, traj[key].shape[-1]) for traj in data_list],
            dim=0,
        )

    # Insertion order here fixes the key order of the saved dictionary.
    features = {
        "vel": _flatten("velocity"),
        "youngs": _flatten("youngs"),
        "edge": _flatten("edge_attr"),
        "mesh_edge": _flatten("mesh_edge_attr"),
    }

    # Calculate sizes: edge input = world + mesh edge features,
    # node input = Young's modulus feature + one-hot node type.
    edge_size = features["edge"].size(1) + features["mesh_edge"].size(1)
    node_size = features["youngs"].size(1) + data_list[0]["one_hot_node_type"][0].size(1)

    print(f"Node feature size: {node_size}")
    print(f"Edge feature size: {edge_size}")

    # Calculate statistics
    mean_std = {}
    for name, values in features.items():
        mean_std[f"{name}_mean"] = values.mean(dim=0)
        # Clamp standard deviations to avoid division by zero.
        mean_std[f"{name}_std"] = torch.clamp(torch.sqrt(values.var(dim=0)), min=eps)

    mean_std["node_size"] = torch.tensor([node_size])
    mean_std["edge_size"] = torch.tensor([edge_size])

    torch.save(mean_std, os.path.join(processed_dir, f'{save_name}.pt'))
    print(f"Saved statistics to {save_name}.pt")

print("Statistics calculation function defined.")


Statistics calculation function defined.


In [9]:
# Execute data preprocessing pipeline
# Requires the earlier cells to have run: data_train / data_test (node types
# already standardized), processed_dir, preparation() and get_stats().
print("=" * 50)
print("EXECUTING DATA PREPROCESSING PIPELINE")
print("=" * 50)

# Process training data (writes <processed_dir>/train.pt)
print("\nProcessing training data...")
preparation('train', data_train)

# Process test data (writes <processed_dir>/test.pt)
print("\nProcessing test data...")
preparation('test', data_test)

# Load processed data for statistics calculation
# preparation() returns None, so the tensors are read back from the files it
# just wrote. dataset_test is loaded but not used further in this cell.
print("\nLoading processed data for statistics calculation...")
# dataset_train = torch.load('train.pt')
# dataset_test = torch.load('test.pt')
dataset_train = torch.load(os.path.join(processed_dir, 'train.pt'))
dataset_test = torch.load(os.path.join(processed_dir, 'test.pt'))

# Calculate and save statistics
# Statistics come from the training set only (writes <processed_dir>/stats_train.pt).
print("\nCalculating normalization statistics...")
get_stats(dataset_train, "stats_train")

print("\n" + "=" * 50)
print("DATA PREPROCESSING COMPLETED SUCCESSFULLY")
print("=" * 50)
print("Generated files:")
print("- train.pt: Processed training data")
print("- test.pt: Processed test data") 
print("- stats_train.pt: Normalization statistics")
print("\nData is ready for training!")


EXECUTING DATA PREPROCESSING PIPELINE

Processing training data...
Processing trajectory: 0
Processing trajectory: 1
Processing trajectory: 2
Processing trajectory: 3
Processing trajectory: 4
Processing trajectory: 5
Processing trajectory: 6
Processing trajectory: 7
Processing trajectory: 8
Processing trajectory: 9
Processing trajectory: 10
Processing trajectory: 11
Processing trajectory: 12
Processing trajectory: 13
Processing trajectory: 14
Processing trajectory: 15
Processing trajectory: 16
Processing trajectory: 17
Processing trajectory: 18
Processing trajectory: 19
Processing trajectory: 20
Processing trajectory: 21
Processing trajectory: 22
Processing trajectory: 23
Processing trajectory: 24
Processing trajectory: 25
Processing trajectory: 26
Processing trajectory: 27
Processing trajectory: 28
Processing trajectory: 29
Sample processed data keys: dict_keys(['edge_index', 'edge_attr', 'mesh_edge_attr', 'cells', 'world_pos', 'prev_world_pos', 'target_world_pos', 'velocity', 'youngs