In [62]:
import matplotlib.pyplot as plt
import numpy as np

# Plotting only enabled for elems_per_side <= 4
elems_per_side = 8
num_ranks = 4
elems_per_plane = elems_per_side * elems_per_side
elems_per_domain = elems_per_plane * elems_per_side
planes_per_rank = elems_per_side // num_ranks

# Create 3d array with numbers
numbers_3d = np.arange(elems_per_domain).reshape(elems_per_side, elems_per_side, elems_per_side)
print("Original 3D array shape:", numbers_3d.shape)
print("Total elements:", numbers_3d.size)
print("Value range:", numbers_3d.min(), "to", numbers_3d.max())

def plot_3d_array_subplot(arr, ax, title="3D Array Visualization"):
    """Plot a 3d array in 3D space with grid lines on given axis"""
    # Create coordinate grids
    y, z, x = np.meshgrid(range(arr.shape[0]), range(arr.shape[1]), range(arr.shape[2]), indexing='xy')
    
    # Flatten everything for plotting
    x_flat = x.flatten()
    y_flat = y.flatten()
    z_flat = z.flatten()
    numbers_flat = arr.flatten()
    
    # Use 16 colors cycling
    colors = numbers_flat // elems_per_side
    
    # Plot the scattered points
    scatter = ax.scatter(x_flat, y_flat, z_flat, c=colors, cmap='tab20', s=80, alpha=0.9)
    
    # Add grid lines
    # Lines along X direction (keeping Y,Z constant)
    for j in range(arr.shape[1]):  # Y direction
        for k in range(arr.shape[2]):  # Z direction
            x_line = range(arr.shape[0])
            y_line = [j] * arr.shape[0]
            z_line = [k] * arr.shape[0]
            ax.plot(x_line, y_line, z_line, 'k-', alpha=0.3, linewidth=0.5)
    
    # Lines along Y direction (keeping X,Z constant)
    for i in range(arr.shape[0]):  # X direction
        for k in range(arr.shape[2]):  # Z direction
            x_line = [i] * arr.shape[1]
            y_line = range(arr.shape[1])
            z_line = [k] * arr.shape[1]
            ax.plot(x_line, y_line, z_line, 'k-', alpha=0.3, linewidth=0.5)
    
    # Lines along Z direction (keeping X,Y constant)
    for i in range(arr.shape[0]):  # X direction
        for j in range(arr.shape[1]):  # Y direction
            x_line = [i] * arr.shape[2]
            y_line = [j] * arr.shape[2]
            z_line = range(arr.shape[2])
            ax.plot(x_line, y_line, z_line, 'k-', alpha=0.3, linewidth=0.5)
    
    # Add number labels
    for xi, yi, zi, num in zip(x_flat, y_flat, z_flat, numbers_flat):
        ax.text(xi, yi, zi, str(num), fontsize=7)
    
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title(title)
    
    # Set axis limits to show full range
    ax.set_xlim(0, 3)
    ax.set_ylim(0, 3)
    ax.set_zlim(0, 3)
    
    return scatter


print("CYCLIC TRANSITIONS - X → Z → Y → X:")
print("="*50)

if elems_per_side <= 4:
    # Create figure with 2x2 subplot layout
    fig = plt.figure(figsize=(12, 8))

# Step 1: Original (X continuous)
print("Step 1: Original array (X continuous)")

if elems_per_side <= 4:
    ax1 = fig.add_subplot(221, projection='3d')
    scatter1 = plot_3d_array_subplot(numbers_3d, ax1, "Step 1: Original (X continuous)")

# Step 2: X → Z (make Z continuous)
# This means: (X,Y,Z) → (Z,X,Y)
step2 = np.transpose(numbers_3d, (1, 2, 0))
print("\nStep 2: X→Y transition (Y continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
    
if elems_per_side <= 4:
    ax2 = fig.add_subplot(222, projection='3d')
    scatter2 = plot_3d_array_subplot(step2, ax2, "Step 2: X→Z (Z continuous)")

# Step 3: Z → Y (make Y continuous) 
# Starting from (Z,X,Y), we want Z continuous: (Z,X,Y) → (Y,Z,X)
step3 = np.transpose(step2, (1, 2, 0))
print("\nStep 3: Z→Y transition (Y continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
    
if elems_per_side <= 4:
    ax3 = fig.add_subplot(223, projection='3d')
    scatter3 = plot_3d_array_subplot(step3, ax3, "Step 3: Z→Y (Y continuous)")

# Step 4: Y → X (back to X continuous, should match original)
# Starting from (Y,Z,X), we want X continuous: (Y,Z,X) → (X,Y,Z)
step4 = np.transpose(step3, (1, 2, 0))
print("\nStep 4: Y→X transition (back to X continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
    
if elems_per_side <= 4:
    ax4 = fig.add_subplot(224, projection='3d')
    scatter4 = plot_3d_array_subplot(step4, ax4, "Step 4: Y→X (back to X continuous)")

# Verify we're back to the original
print(f"\nVerification: Arrays are identical: {np.array_equal(numbers_3d, step4)}")

if elems_per_side <= 4:
    plt.tight_layout()
    plt.show()

Original 3D array shape: (8, 8, 8)
Total elements: 512
Value range: 0 to 511
CYCLIC TRANSITIONS - X → Z → Y → X:
Step 1: Original array (X continuous)

Step 2: X→Y transition (Y continuous)
Transpose: (0,1,2) → (1,2,0)

Step 3: Z→Y transition (Y continuous)
Transpose: (0,1,2) → (1,2,0)

Step 4: Y→X transition (back to X continuous)
Transpose: (0,1,2) → (1,2,0)

Verification: Arrays are identical: True


In [63]:
# Verify split and concatenation results in same array
split_arrays = []
for i in range(num_ranks):
    slice_array = numbers_3d[i*planes_per_rank:i*planes_per_rank+planes_per_rank, :, :]
    split_arrays.append(slice_array)

# Unite them back using concatenate
united_arrays = np.concatenate(split_arrays, axis=0)

print(f"\nVerification: Arrays are identical: {np.array_equal(numbers_3d, united_arrays)}")


Verification: Arrays are identical: True


In [64]:
# Use split arrays to simulate data distribution across ranks

# Transpose (1,2,0)
def transpose(idx_x, idx_y, glob_idx_z):
    t_idx_x = glob_idx_z
    t_idx_y = idx_x 
    glob_t_idx_z = idx_y 
    
    return t_idx_x, t_idx_y, glob_t_idx_z


def get_dst_rank_and_indices(rank, idx_x, idx_y, idx_z):
    glob_idx_z = idx_z + rank * planes_per_rank
    
    t_idx_x, t_idx_y, glob_t_idx_z = transpose(idx_x, idx_y, glob_idx_z)
    
    t_idx_z = glob_t_idx_z % planes_per_rank
    dst_rank = glob_t_idx_z // planes_per_rank
    
    return dst_rank, t_idx_x, t_idx_y, t_idx_z
    

# Simulate transposition kernel logic with cross memory access
def dist_transpose(spl_arr):
    work_spl_arr = np.zeros_like(spl_arr)
    
    for rank in range(len(spl_arr)):
        for idx_z in range(planes_per_rank):
            for idx_y in range(elems_per_side):
                for idx_x in range(elems_per_side):
                    dst_rank, t_idx_x, t_idx_y, t_idx_z = get_dst_rank_and_indices(rank, idx_x, idx_y, idx_z)
                    work_spl_arr[dst_rank][t_idx_z, t_idx_y, t_idx_x] = spl_arr[rank][idx_z, idx_y, idx_x]
            
    return work_spl_arr


# Step 1 distributed: Original (X continuous)
united_arrays = np.concatenate(split_arrays, axis=0)
print("\nStep 1 distributed: Original array (X continuous)")
print(f"Verification Step 1 distributed: Arrays are identical: {np.array_equal(numbers_3d, united_arrays)}")

# Step 2: X → Z (make Z continuous)
# This means: (X,Y,Z) → (Z,X,Y)
split_step2 = dist_transpose(split_arrays)
step2_dist = np.concatenate(split_step2, axis=0)
print("\nStep 2 distributed: X→Z transition (Z continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
print(f"Verification Step 2 distributed: Arrays are identical: {np.array_equal(step2, step2_dist)}")

# Step 3: Z → Y (make Y continuous) 
# Starting from (Z,X,Y), we want Z continuous: (Z,X,Y) → (Y,Z,X)
split_step3 = dist_transpose(split_step2)
step3_dist = np.concatenate(split_step3, axis=0)
print("\nStep 3 distributed: Z→Y transition (Y continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
print(f"Verification Step 3 distributed: Arrays are identical: {np.array_equal(step3, step3_dist)}")

# Step 4: Y → X (back to X continuous, should match original)
# Starting from (Y,Z,X), we want X continuous: (Y,Z,X) → (X,Y,Z)
split_step4 = dist_transpose(split_step3)
step4_dist = np.concatenate(split_step4, axis=0)
print("\nStep 4 distributed: Y→X transition (back to X continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
print(f"Verification Step 4 distributed: Arrays are identical: {np.array_equal(step4, step4_dist)}")


Step 1 distributed: Original array (X continuous)
Verification Step 1 distributed: Arrays are identical: True

Step 2 distributed: X→Z transition (Z continuous)
Transpose: (0,1,2) → (1,2,0)
Verification Step 2 distributed: Arrays are identical: True

Step 3 distributed: Z→Y transition (Y continuous)
Transpose: (0,1,2) → (1,2,0)
Verification Step 3 distributed: Arrays are identical: True

Step 4 distributed: Y→X transition (back to X continuous)
Transpose: (0,1,2) → (1,2,0)
Verification Step 4 distributed: Arrays are identical: True


In [65]:
# Simulate real transposition across ranks without cross memory access

# Prepare send buffer
shape = split_arrays[0].shape
print(f"Initial shape of split array: {shape}")
print(f"Initial rank[0] array: \n{split_arrays[0]}")

send_buffer = np.zeros(elems_per_domain).reshape(num_ranks, elems_per_side, elems_per_side, planes_per_rank)
    
shape = send_buffer[0].shape
print(f"\nShape of rank[0] send buffer: {shape}")

# Fill send buffer with z-continuous pre communication data
# In z-direction only [planes_per_rank] elements can be continuous because the other values are on a different rank.
# This organization enables sending in batches
for rank in range(num_ranks):
    for idx_z in range(planes_per_rank):
        for idx_y in range(elems_per_side):
            for idx_x in range(elems_per_side):
                send_buffer[rank][idx_y, idx_x, idx_z] = split_arrays[rank][idx_z, idx_y, idx_x]
                
print(f"Rank[0] send buffer: \n{send_buffer[0]}")

Initial shape of split array: (2, 8, 8)
Initial rank[0] array: 
[[[  0   1   2   3   4   5   6   7]
  [  8   9  10  11  12  13  14  15]
  [ 16  17  18  19  20  21  22  23]
  [ 24  25  26  27  28  29  30  31]
  [ 32  33  34  35  36  37  38  39]
  [ 40  41  42  43  44  45  46  47]
  [ 48  49  50  51  52  53  54  55]
  [ 56  57  58  59  60  61  62  63]]

 [[ 64  65  66  67  68  69  70  71]
  [ 72  73  74  75  76  77  78  79]
  [ 80  81  82  83  84  85  86  87]
  [ 88  89  90  91  92  93  94  95]
  [ 96  97  98  99 100 101 102 103]
  [104 105 106 107 108 109 110 111]
  [112 113 114 115 116 117 118 119]
  [120 121 122 123 124 125 126 127]]]

Shape of rank[0] send buffer: (8, 8, 2)
Rank[0] send buffer: 
[[[  0.  64.]
  [  1.  65.]
  [  2.  66.]
  [  3.  67.]
  [  4.  68.]
  [  5.  69.]
  [  6.  70.]
  [  7.  71.]]

 [[  8.  72.]
  [  9.  73.]
  [ 10.  74.]
  [ 11.  75.]
  [ 12.  76.]
  [ 13.  77.]
  [ 14.  78.]
  [ 15.  79.]]

 [[ 16.  80.]
  [ 17.  81.]
  [ 18.  82.]
  [ 19.  83.]
  [ 20.  

In [66]:
# simulate all to all communication
recv_buffer = np.zeros_like(send_buffer)

for src_rank in range(num_ranks):
    for dst_rank in range(num_ranks):
        src_slice = slice(dst_rank * planes_per_rank, (dst_rank + 1) * planes_per_rank)
        dst_slice = slice(src_rank * planes_per_rank, (src_rank + 1) * planes_per_rank)
        recv_buffer[dst_rank][dst_slice] = send_buffer[src_rank][src_slice]
        
print(f"Rank[0] receive buffer: \n{recv_buffer[0]}")

Rank[0] receive buffer: 
[[[  0.  64.]
  [  1.  65.]
  [  2.  66.]
  [  3.  67.]
  [  4.  68.]
  [  5.  69.]
  [  6.  70.]
  [  7.  71.]]

 [[  8.  72.]
  [  9.  73.]
  [ 10.  74.]
  [ 11.  75.]
  [ 12.  76.]
  [ 13.  77.]
  [ 14.  78.]
  [ 15.  79.]]

 [[128. 192.]
  [129. 193.]
  [130. 194.]
  [131. 195.]
  [132. 196.]
  [133. 197.]
  [134. 198.]
  [135. 199.]]

 [[136. 200.]
  [137. 201.]
  [138. 202.]
  [139. 203.]
  [140. 204.]
  [141. 205.]
  [142. 206.]
  [143. 207.]]

 [[256. 320.]
  [257. 321.]
  [258. 322.]
  [259. 323.]
  [260. 324.]
  [261. 325.]
  [262. 326.]
  [263. 327.]]

 [[264. 328.]
  [265. 329.]
  [266. 330.]
  [267. 331.]
  [268. 332.]
  [269. 333.]
  [270. 334.]
  [271. 335.]]

 [[384. 448.]
  [385. 449.]
  [386. 450.]
  [387. 451.]
  [388. 452.]
  [389. 453.]
  [390. 454.]
  [391. 455.]]

 [[392. 456.]
  [393. 457.]
  [394. 458.]
  [395. 459.]
  [396. 460.]
  [397. 461.]
  [398. 462.]
  [399. 463.]]]


In [67]:
# Reorganize receive buffer
shape = recv_buffer[0].shape
print(f"Initial shape of recveive buffer: {shape}")

final_buffer = np.zeros(elems_per_domain).reshape(num_ranks, planes_per_rank, elems_per_side, elems_per_side)
    
shape = final_buffer[0].shape
print(f"\nShape of rank[0] final buffer: {shape}")

# Reorganize data into a continuous layout
for rank in range(num_ranks):
    for idx_z in range(elems_per_side):
        for idx_y in range(elems_per_side):
            for idx_x in range(planes_per_rank):
                dst_idx_x = idx_x + (idx_z // planes_per_rank) * planes_per_rank
                dst_idx_z = idx_z % planes_per_rank
                final_buffer[rank][dst_idx_z, idx_y, dst_idx_x] = recv_buffer[rank][idx_z, idx_y, idx_x]
                
print(f"Rank[0] final buffer: \n{final_buffer[0]}")
print("Transposition complete")

Initial shape of recveive buffer: (8, 8, 2)

Shape of rank[0] final buffer: (2, 8, 8)
Rank[0] final buffer: 
[[[  0.  64. 128. 192. 256. 320. 384. 448.]
  [  1.  65. 129. 193. 257. 321. 385. 449.]
  [  2.  66. 130. 194. 258. 322. 386. 450.]
  [  3.  67. 131. 195. 259. 323. 387. 451.]
  [  4.  68. 132. 196. 260. 324. 388. 452.]
  [  5.  69. 133. 197. 261. 325. 389. 453.]
  [  6.  70. 134. 198. 262. 326. 390. 454.]
  [  7.  71. 135. 199. 263. 327. 391. 455.]]

 [[  8.  72. 136. 200. 264. 328. 392. 456.]
  [  9.  73. 137. 201. 265. 329. 393. 457.]
  [ 10.  74. 138. 202. 266. 330. 394. 458.]
  [ 11.  75. 139. 203. 267. 331. 395. 459.]
  [ 12.  76. 140. 204. 268. 332. 396. 460.]
  [ 13.  77. 141. 205. 269. 333. 397. 461.]
  [ 14.  78. 142. 206. 270. 334. 398. 462.]
  [ 15.  79. 143. 207. 271. 335. 399. 463.]]]
Transposition complete


In [68]:
# Now as functions

def prepare_send_buffer(input_buffer):
    send_buffer = np.zeros(elems_per_domain).reshape(num_ranks, elems_per_side, elems_per_side, planes_per_rank)
    
    for rank in range(num_ranks):
        for idx_z in range(planes_per_rank):
            for idx_y in range(elems_per_side):
                for idx_x in range(elems_per_side):
                    send_buffer[rank][idx_y, idx_x, idx_z] = input_buffer[rank][idx_z, idx_y, idx_x]
                
    return send_buffer


def communicate_all_to_all(send_buffer):
    recv_buffer = np.zeros_like(send_buffer)

    for src_rank in range(num_ranks):
        for dst_rank in range(num_ranks):
            src_slice = slice(dst_rank * planes_per_rank, (dst_rank + 1) * planes_per_rank)
            dst_slice = slice(src_rank * planes_per_rank, (src_rank + 1) * planes_per_rank)
            recv_buffer[dst_rank][dst_slice] = send_buffer[src_rank][src_slice]
        
    return recv_buffer


def reorganize_recv_buffer(recv_buffer):
    final_buffer = np.zeros(elems_per_domain).reshape(num_ranks, planes_per_rank, elems_per_side, elems_per_side)
        
    for rank in range(num_ranks):
        for idx_z in range(elems_per_side):
            for idx_y in range(elems_per_side):
                for idx_x in range(planes_per_rank):
                    dst_idx_x = idx_x + (idx_z // planes_per_rank) * planes_per_rank
                    dst_idx_z = idx_z % planes_per_rank
                    final_buffer[rank][dst_idx_z, idx_y, dst_idx_x] = recv_buffer[rank][idx_z, idx_y, idx_x]
                
    return final_buffer


def real_dist_transposition(input_buffer):
    send_buffer = prepare_send_buffer(input_buffer)
    recv_buffer = communicate_all_to_all(send_buffer)
    final_buffer = reorganize_recv_buffer(recv_buffer)
    
    return final_buffer

In [69]:
# Verify real distributed transposition

# Step 1 real distributed: Original (X continuous)
united_arrays = np.concatenate(split_arrays, axis=0)
print("\nStep 1 real distributed: Original array (X continuous)")
print(f"Verification Step 1 real distributed: Arrays are identical: {np.array_equal(numbers_3d, united_arrays)}")
print(split_arrays[0])

# Step 2: X → Z (make Z continuous)
# This means: (X,Y,Z) → (Z,X,Y)
split_step2 = real_dist_transposition(split_arrays)
step2_dist = np.concatenate(split_step2, axis=0)
print("\nStep 2 real distributed: X→Z transition (Z continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
print(f"Verification Step 2 real distributed: Arrays are identical: {np.array_equal(step2, step2_dist)}")
print(split_step2[0])

# Step 3: Z → Y (make Y continuous) 
# Starting from (Z,X,Y), we want Z continuous: (Z,X,Y) → (Y,Z,X)
split_step3 = real_dist_transposition(split_step2)
step3_dist = np.concatenate(split_step3, axis=0)
print("\nStep 3 real distributed: Z→Y transition (Y continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
print(f"Verification Step 3 real distributed: Arrays are identical: {np.array_equal(step3, step3_dist)}")
print(split_step3[0])

# Step 4: Y → X (back to X continuous, should match original)
# Starting from (Y,Z,X), we want X continuous: (Y,Z,X) → (X,Y,Z)
split_step4 = real_dist_transposition(split_step3)
step4_dist = np.concatenate(split_step4, axis=0)
print("\nStep 4 real distributed: Y→X transition (back to X continuous)")
print(f"Transpose: (0,1,2) → (1,2,0)")
print(f"Verification Step 4 real distributed: Arrays are identical: {np.array_equal(step4, step4_dist)}")
print(split_step4[0])


Step 1 real distributed: Original array (X continuous)
Verification Step 1 real distributed: Arrays are identical: True
[[[  0   1   2   3   4   5   6   7]
  [  8   9  10  11  12  13  14  15]
  [ 16  17  18  19  20  21  22  23]
  [ 24  25  26  27  28  29  30  31]
  [ 32  33  34  35  36  37  38  39]
  [ 40  41  42  43  44  45  46  47]
  [ 48  49  50  51  52  53  54  55]
  [ 56  57  58  59  60  61  62  63]]

 [[ 64  65  66  67  68  69  70  71]
  [ 72  73  74  75  76  77  78  79]
  [ 80  81  82  83  84  85  86  87]
  [ 88  89  90  91  92  93  94  95]
  [ 96  97  98  99 100 101 102 103]
  [104 105 106 107 108 109 110 111]
  [112 113 114 115 116 117 118 119]
  [120 121 122 123 124 125 126 127]]]

Step 2 real distributed: X→Z transition (Z continuous)
Transpose: (0,1,2) → (1,2,0)
Verification Step 2 real distributed: Arrays are identical: True
[[[  0.  64. 128. 192. 256. 320. 384. 448.]
  [  1.  65. 129. 193. 257. 321. 385. 449.]
  [  2.  66. 130. 194. 258. 322. 386. 450.]
  [  3.  67. 131.