Skip to content

[NPU]: optimize tvd implementation#1039

Merged
Tcc0403 merged 1 commit into linkedin:main from
TianHao324:TVD
Jan 22, 2026
Merged

[NPU]: optimize tvd implementation#1039
Tcc0403 merged 1 commit into linkedin:main from
TianHao324:TVD

Conversation

@TianHao324
Copy link
Contributor

@TianHao324 TianHao324 commented Jan 22, 2026

Summary

  1. Use a smaller grid, kept as close to npu_core_count as possible, so that each kernel instance processes as many rows as possible, thereby improving performance.
  2. The grads are initialized as float32 to temporarily address the calculation errors of the NPU on the bf16 platform.
  3. The precision test that previously failed now passes (see #998, "[NPU]: Add NPU support for the tvd operator").
image
  • Hardware Type: Ascend NPU 910B4
  • run make test to ensure correctness
  • run make checkstyle to ensure code style
  • run make test-convergence to ensure convergence

@TianHao324
Copy link
Contributor Author

Benchmark:

**************************************
     BENCHMARKING MEMORY for TVD
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "tvd",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      1536.06640625,
      3072.06640625,
      6144.06640625,
      12288.06640625
    ],
    "y_values_20": [
      1536.06640625,
      3072.06640625,
      6144.06640625,
      12288.06640625
    ],
    "y_values_80": [
      1536.06640625,
      3072.06640625,
      6144.06640625,
      12288.06640625
    ],
    "timestamp": "2026-01-22 04:25:41",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "tvd",
    "kernel_provider": "torch",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      2048.0048828125,
      4096.0048828125,
      8192.0048828125,
      16384.00390625
    ],
    "y_values_20": [
      2048.0048828125,
      4096.0048828125,
      8192.0048828125,
      16384.00390625
    ],
    "y_values_80": [
      2048.0048828125,
      4096.0048828125,
      8192.0048828125,
      16384.00390625
    ],
    "timestamp": "2026-01-22 04:25:41",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  }
]
**************************************
     BENCHMARKING SPEED for TVD
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "tvd",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      2.2209901809692383,
      4.357100009918213,
      8.623979568481445,
      17.134260177612305
    ],
    "y_values_20": [
      2.219007968902588,
      4.351079940795898,
      8.614139556884766,
      17.12923240661621
    ],
    "y_values_80": [
      2.224436044692993,
      4.360680103302002,
      8.627639770507812,
      17.16514778137207
    ],
    "timestamp": "2026-01-22 04:25:43",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "tvd",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      3.534359931945801,
      6.87477970123291,
      13.494179725646973,
      26.710479736328125
    ],
    "y_values_20": [
      3.5331361293792725,
      6.8681440353393555,
      13.489888191223145,
      26.700424194335938
    ],
    "y_values_80": [
      3.5370359420776367,
      6.885335922241211,
      13.496811866760254,
      26.71310806274414
    ],
    "timestamp": "2026-01-22 04:25:44",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "tvd",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      5.083230018615723,
      9.829299926757812,
      19.312719345092773,
      39.10369110107422
    ],
    "y_values_20": [
      5.079416275024414,
      9.822976112365723,
      19.29350471496582,
      39.05944061279297
    ],
    "y_values_80": [
      5.088088035583496,
      9.833479881286621,
      19.314844131469727,
      39.14794158935547
    ],
    "timestamp": "2026-01-22 04:25:45",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "tvd",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      9.448189735412598,
      18.739500045776367,
      37.219078063964844,
      74.85005950927734
    ],
    "y_values_20": [
      9.44322395324707,
      18.738704681396484,
      37.21904373168945,
      74.85005950927734
    ],
    "y_values_80": [
      9.451315879821777,
      18.743871688842773,
      37.2191162109375,
      74.85005950927734
    ],
    "timestamp": "2026-01-22 04:25:47",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "tvd",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      2.9736199378967285,
      5.583799839019775,
      10.848340034484863,
      22.094820022583008
    ],
    "y_values_20": [
      2.9664199352264404,
      5.579659938812256,
      10.837471961975098,
      22.053462982177734
    ],
    "y_values_80": [
      2.99534010887146,
      5.599912166595459,
      10.86776065826416,
      22.104991912841797
    ],
    "timestamp": "2026-01-22 04:25:48",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "tvd",
    "kernel_provider": "torch",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      6.006710052490234,
      11.966720581054688,
      23.85382080078125,
      48.260040283203125
    ],
    "y_values_20": [
      6.003660202026367,
      11.965044021606445,
      23.849403381347656,
      48.24405288696289
    ],
    "y_values_80": [
      6.0122199058532715,
      11.972151756286621,
      23.866653442382812,
      48.276023864746094
    ],
    "timestamp": "2026-01-22 04:25:49",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  }
]

Copy link
Collaborator

@Tcc0403 Tcc0403 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lgtm

@Tcc0403 Tcc0403 merged commit 7b51e56 into linkedin:main Jan 22, 2026
3 of 7 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants