In [1]:
import ttnn 
import torch
import time 

dim = 2048
num_iters = 100
device = ttnn.open_device(device_id=0)

2025-01-31 15:46:30.203 | DEBUG    | ttnn:<module>:82 - Initial ttnn.CONFIG:
Config{cache_path=/home/bach/.cache/ttnn,model_cache_path=/home/bach/.cache/ttnn/models,tmp_dir=/tmp/ttnn,enable_model_cache=false,enable_fast_runtime_mode=true,throw_exception_on_fallback=false,enable_logging=false,enable_graph_report=false,enable_detailed_buffer_report=false,enable_detailed_tensor_report=false,enable_comparison_mode=false,comparison_mode_pcc=0.9999,root_report_path=generated/ttnn/reports,report_name=std::nullopt,std::nullopt}


[38;2;000;128;000m                 Device[0m | [1m[38;2;100;149;237mINFO    [0m | Opening user mode device driver

[32m2025-01-31 15:46:31.195[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Opened PCI device 0; KMD version: 1.30.0, IOMMU: disabled
[32m2025-01-31 15:46:31.196[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Detected PCI devices: [0]
[32m2025-01-31 15:46:31.196[0m | [1m[38;2;100;149;237mINFO    [0m | [36mSiliconDriver  [0m - Using local chip ids: {0} and remote chip ids {}
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0. Program cache is NOT enabled
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | AI CLK for device 0 is:   1000 MHz
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Profiler started on device 0


In [21]:
dim = 512
a = torch.randn((dim, dim)).bfloat16()
a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
a_t = ttnn.to_device(a_t, device=device)
start = time.time()
a_t = ttnn.tilize(a_t)
print(time.time() - start)

0.002329587936401367


### from_torch(ROW_MAJOR) -> to_device() = from_torch(ROW_MAJOR, device) 

In [20]:
tot_from_torch_to_dev = 0
tot_from_torch_dev = 0

for i in range(num_iters):

    # from_torch(ROW_MAJOR) -> to_device()
    a = torch.randn((dim, dim)).bfloat16()
    start = time.time()
    a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT)
    a_t = ttnn.to_device(a_t, device=device)
    tot_from_torch_to_dev += time.time() - start
    
    # from_torch(ROW_MAJOR, device)
    a = torch.randn((dim, dim)).bfloat16()
    start = time.time()
    a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
    tot_from_torch_dev += time.time() - start
    
tot_from_torch_to_dev /= num_iters
tot_from_torch_dev /= num_iters

print(tot_from_torch_to_dev, tot_from_torch_dev)

0.0016105008125305176 0.0015828204154968262


### tilize(DRAM) = tilize(L1)

In [18]:
tot_tilize_dram = 0
tot_tilize_l1 = 0

for i in range(num_iters):
    
    # tilize(DRAM)
    a = torch.randn((dim, dim)).bfloat16()
    a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
    a_t = ttnn.to_device(a_t, device=device)
    start = time.time()
    a_t = ttnn.tilize(a_t)
    tot_tilize_dram += time.time() - start
    
    # tilize(L1)
    a = torch.randn((dim, dim)).bfloat16()
    a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG)
    a_t = ttnn.to_device(a, device=device)
    start = time.time()
    a_t = ttnn.tilize(a_t)
    tot_tilize_l1 += time.time() - start
    
tot_tilize_dram /= num_iters
tot_tilize_l1 /= num_iters

print(tot_tilize_dram, tot_tilize_l1)

0.00423729658126831 0.004347784519195557


### matmul (DRAM) > matmul (L1)

In [5]:
tot_matmul_dram = 0
tot_matmul_l1 = 0

for i in range(num_iters):
    
    # matmul(DRAM)
    a = torch.randn((dim, dim)).bfloat16()
    a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
    a_t = ttnn.to_device(a_t, device=device)
    a_t = ttnn.tilize(a_t)

    b = torch.randn((dim, dim)).bfloat16()
    b_t = ttnn.from_torch(b, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
    b_t = ttnn.to_device(b_t, device=device)
    b_t = ttnn.tilize(b_t)

    start = time.time()
    c_t = ttnn.matmul(a_t, b_t, memory_config=ttnn.DRAM_MEMORY_CONFIG)
    tot_matmul_dram += time.time() - start
    
    # matmul(L1)
    a = torch.randn((dim, dim)).bfloat16()
    a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG)
    a_t = ttnn.to_device(a_t, device=device)
    a_t = ttnn.tilize(a_t)
    
    b = torch.randn((dim, dim)).bfloat16()
    b_t = ttnn.from_torch(b, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG)
    b_t = ttnn.to_device(b_t, device=device)
    b_t = ttnn.tilize(b_t)

    start = time.time()
    c_t = ttnn.matmul(a_t, b_t, memory_config=ttnn.L1_MEMORY_CONFIG)
    tot_matmul_l1 += time.time() - start
    
tot_matmul_dram /= num_iters
tot_matmul_l1 /= num_iters

print(tot_matmul_dram, tot_matmul_l1)

0.006408035755157471 0.005533232688903809


### Tilize on device <<< on host, but dosn't work with sharded memory configs

In [None]:
a = torch.randn((dim, dim)).bfloat16()

in0_memory_config = ttnn.create_sharded_memory_config(
    (1, 1, dim, dim),
    core_grid=ttnn.CoreGrid(y=8, x=8),
    strategy=ttnn.ShardStrategy.BLOCK,
    orientation=ttnn.ShardOrientation.ROW_MAJOR,
)
dtype=ttnn.DataType.BFLOAT16

c= ttnn.from_torch(
    a,
    tile=ttnn.Tile((32, 32)),
    # dtype=dtype,
    layout=ttnn.ROW_MAJOR_LAYOUT,
    device=device,
    # memory_config=in0_memory_config
)


c = ttnn.to_layout(c, layout=ttnn.TILE_LAYOUT)#, memory_config = in0_memory_config)
# d = ttnn.tilize(c)
# c

In [21]:
tot_tilize_host = 0
tot_tilize = 0
tot_move = 0
for i in range(num_iters):
    a = torch.randn((dim, dim)).bfloat16()

    start = time.time()
    c = ttnn.from_torch(a, layout=ttnn.TILE_LAYOUT, device=device)
    tot_tilize_host = time.time() - start
    
    a = torch.randn((dim, dim)).bfloat16()

    start = time.time()
    c = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
    tot_move = time.time() - start
    
    start = time.time()
    # d = ttnn.tilize(c)
    d = ttnn.to_layout(c, device=device, memory_config=in0_memory_config)
    tot_tilize = time.time() - start

tot_tilize_host /= num_iters
tot_tilize /= num_iters
tot_move /= num_iters

TypeError: __call__(): incompatible function arguments. The following argument types are supported:
    1. (self: ttnn._ttnn.operations.core.to_layout_t, tensor: ttnn._ttnn.tensor.Tensor, layout: ttnn._ttnn.tensor.Layout, dtype: Optional[ttnn._ttnn.tensor.DataType] = None, memory_config: Optional[ttnn._ttnn.tensor.MemoryConfig] = None, device: ttnn._ttnn.device.Device = None) -> ttnn._ttnn.tensor.Tensor
    2. (self: ttnn._ttnn.operations.core.to_layout_t, tensor: ttnn._ttnn.tensor.Tensor, layout: ttnn._ttnn.tensor.Layout, dtype: Optional[ttnn._ttnn.tensor.DataType] = None, memory_config: Optional[ttnn._ttnn.tensor.MemoryConfig] = None, device: ttnn._ttnn.multi_device.MeshDevice = None) -> ttnn._ttnn.tensor.Tensor

Invoked with: <ttnn._ttnn.operations.core.to_layout_t object at 0x7fcf98890ef0>, ttnn.Tensor([[ 0.27930, -0.09619,  ..., -0.24805,  0.15820],
             [-0.38867,  0.91797,  ...,  2.59375, -0.43359],
             ...,
             [-0.20508,  0.45312,  ...,  1.17969,  1.07031],
             [-1.24219,  1.66406,  ..., -0.05640,  0.49609]], shape=Shape([2048, 2048]), dtype=DataType::BFLOAT16, layout=Layout::ROW_MAJOR); kwargs: device=<ttnn._ttnn.device.Device object at 0x7fcf7e739730>, memory_config=MemoryConfig(memory_layout=TensorMemoryLayout::BLOCK_SHARDED,buffer_type=BufferType::L1,shard_spec=ShardSpec(grid={[(x=0,y=0) - (x=7,y=7)]},shape={128, 128},orientation=ShardOrientation::ROW_MAJOR,halo=0,mode=ShardMode::PHYSICAL,physical_shard_shape=std::nullopt))

Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,
<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic
conversions are optional and require extra headers to be included
when compiling your pybind11 module.

In [18]:
tot_tilize + tot_move

0.003919928073883057

In [19]:
tot_tilize + tot_move

0.003919928073883057

In [10]:
tot_tilize_host

0.003970417976379394

In [8]:
a = dict.fromkeys([1])
a

{1: None}

In [42]:
a = torch.randn((dim, dim)).bfloat16()
b = torch.randn((dim, dim)).bfloat16()


start = time.time()
a_t = ttnn.from_torch(a, layout=ttnn.TILE_LAYOUT, device=device)
b_t = ttnn.from_torch(b, layout=ttnn.TILE_LAYOUT, device=device)
c_1 = ttnn.matmul(a_t, b_t)
tot_tilize_host = time.time() - start

start = time.time()
a_t = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
a_t = ttnn.tilize(a_t)
b_t = ttnn.from_torch(b, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
b_t = ttnn.tilize(b_t)
c_2 = ttnn.matmul(a_t, b_t) 
tot_move = time.time() - start


ttnn.Tensor([[73.50000, -39.50000,  ..., -4.96875, 79.00000],
             [-57.25000, 22.62500,  ..., 36.50000, -46.25000],
             ...,
             [-61.75000, 26.12500,  ..., -77.00000, -30.62500],
             [-32.25000, -51.75000,  ..., -27.00000, 115.00000]], shape=Shape([2048, 2048]), dtype=DataType::BFLOAT16, layout=Layout::TILE)
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Closing device 0
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Disabling and clearing program cache on device 0
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Initializing device 0. Program cache is NOT enabled
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | AI CLK for device 0 is:   1000 MHz
[38;2;000;128;000m                  Metal[0m | [1m[38;2;100;149;237mINFO    [0m | Profiler started on device 0


In [45]:
tot_tilize_host, tot_move

(0.791964054107666, 0.011814594268798828)

In [None]:
a = torch.randn((dim, dim)).bfloat16()
num_iters = 100
start = time.time()
for i in range(num_iters):
    c = ttnn.to_device(ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT), device=device)
t_avg = (time.time() - start)/num_iters
t_avg

0.0024967384338378906

In [None]:
(dim*dim*16) / (t_avg * 1e9)

27.01391284629646

In [None]:
start = time.time()
for i in range(num_iters):
    # a = torch.randn((dim, dim)).bfloat16()
    c = ttnn.from_torch(a, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
t_avg = (time.time() - start)/num_iters
t_avg

0.0025472283363342286