Skip to content

Conversation

@noemotiovon
Copy link
Contributor

@noemotiovon noemotiovon commented Jan 8, 2026

Rewrite the group norm forward kernel to use explicit channel offsets instead of mutating the X/Y base pointers inside loops.

This gives the Triton compiler more optimization opportunities, makes memory access patterns more predictable, and avoids loop-carried pointer dependencies.

  • Hardware Type: NVIDIA A100-SXM4-80GB
  • run `make test` to ensure correctness
  • run `make checkstyle` to ensure code style
  • run `make test-convergence` to ensure convergence

Rewrite group norm forward kernel to use explicit channel offsets
instead of mutating X/Y base pointers inside loops.

This improves Triton compiler optimization opportunities, enables
more predictable memory access patterns, and avoids loop-carried
pointer dependencies.
@noemotiovon
Copy link
Contributor Author

Test Result:
image

@noemotiovon noemotiovon marked this pull request as ready for review January 8, 2026 03:27
@noemotiovon
Copy link
Contributor Author

noemotiovon commented Jan 8, 2026

Benchmark script:

python benchmark/scripts/benchmark_group_norm.py

Log:

[
  {
    "kernel_name": "group_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.034304000437259674,
      0.047200001776218414,
      0.07638400048017502,
      0.12940800189971924,
      0.23535999655723572,
      0.44678398966789246,
      0.8600959777832031
    ],
    "y_values_20": [
      0.029920000582933426,
      0.04645119979977608,
      0.07526399940252304,
      0.12825599312782288,
      0.2335360050201416,
      0.4446144104003906,
      0.8572480082511902
    ],
    "y_values_80": [
      0.034591998904943466,
      0.047968000173568726,
      0.07756800204515457,
      0.13068799674510956,
      0.23705600202083588,
      0.4490239918231964,
      0.8628671765327454
    ],
    "timestamp": "2026-01-08 04:31:10",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.04073600098490715,
      0.0666240006685257,
      0.11763200163841248,
      0.21542400121688843,
      0.41040000319480896,
      0.7994239926338196,
      1.5798399448394775
    ],
    "y_values_20": [
      0.04054399952292442,
      0.06643199920654297,
      0.11740799993276596,
      0.21516799926757812,
      0.4100799858570099,
      0.7990400195121765,
      1.5793343782424927
    ],
    "y_values_80": [
      0.04095999896526337,
      0.066880002617836,
      0.11791999638080597,
      0.21568000316619873,
      0.41071999073028564,
      0.7998080253601074,
      1.5803455114364624
    ],
    "timestamp": "2026-01-08 04:31:14",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      1.0972800254821777,
      1.148576021194458,
      1.1502399444580078,
      1.163040041923523,
      1.1836159229278564,
      1.1771199703216553,
      2.314527988433838
    ],
    "y_values_20": [
      0.7845888137817383,
      0.768127977848053,
      1.1428031921386719,
      1.1429439783096313,
      1.176633596420288,
      1.1740543842315674,
      2.310227155685425
    ],
    "y_values_80": [
      1.1573760509490967,
      1.16867196559906,
      1.1709247827529907,
      1.1788288354873657,
      1.1968960762023926,
      1.180012822151184,
      2.319014549255371
    ],
    "timestamp": "2026-01-08 04:31:18",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.2964479923248291,
      0.19699199497699738,
      0.32156801223754883,
      0.5366719961166382,
      0.9813119769096375,
      1.8728959560394287,
      3.696928024291992
    ],
    "y_values_20": [
      0.1151999980211258,
      0.19577600061893463,
      0.31749120354652405,
      0.5358719825744629,
      0.9804800152778625,
      1.8710848093032837,
      3.693772792816162
    ],
    "y_values_80": [
      0.4941439926624298,
      0.5419008135795593,
      0.39460480213165283,
      0.537337601184845,
      0.982099175453186,
      1.8745791912078857,
      3.7002689838409424
    ],
    "timestamp": "2026-01-08 04:31:21",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.5736960172653198,
      0.5688639879226685,
      0.5622720122337341,
      0.3649600148200989,
      0.484112024307251,
      0.7421119809150696,
      1.456544041633606
    ],
    "y_values_20": [
      0.5664063692092896,
      0.5544576048851013,
      0.5504512190818787,
      0.21260160207748413,
      0.4127807915210724,
      0.7399359941482544,
      1.4538367986679077
    ],
    "y_values_80": [
      0.5882751941680908,
      0.576479971408844,
      0.5737088322639465,
      0.548359751701355,
      0.5075200200080872,
      0.7443839907646179,
      1.4595520496368408
    ],
    "timestamp": "2026-01-08 04:31:24",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.30851200222969055,
      0.16145598888397217,
      0.3112320005893707,
      0.33456000685691833,
      0.575007975101471,
      1.0777920484542847,
      2.123231887817383
    ],
    "y_values_20": [
      0.30537599325180054,
      0.15086719393730164,
      0.30508801341056824,
      0.33001598715782166,
      0.5742847919464111,
      1.0761791467666626,
      2.119744062423706
    ],
    "y_values_80": [
      0.31397759914398193,
      0.3075968027114868,
      0.3327679932117462,
      0.34519681334495544,
      0.5759360194206238,
      1.0794175863265991,
      2.126131296157837
    ],
    "timestamp": "2026-01-08 04:31:28",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  }
]
**************************************
     BENCHMARKING MEMORY for GROUP_NORM
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "group_norm",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "y_values_20": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "y_values_80": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "timestamp": "2026-01-08 04:31:28",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "huggingface",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "y_values_20": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "y_values_80": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "timestamp": "2026-01-08 04:31:28",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "y_values_20": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "y_values_80": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "timestamp": "2026-01-08 04:31:28",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "huggingface",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "y_values_20": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "y_values_80": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "timestamp": "2026-01-08 04:31:28",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "y_values_20": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "y_values_80": [
      40.01171875,
      80.01953125,
      160.03515625,
      320.0703125,
      640.140625,
      1280.28125,
      2560.5625
    ],
    "timestamp": "2026-01-08 04:31:29",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "group_norm",
    "kernel_provider": "huggingface",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "C",
    "x_label": "num_channels",
    "x_values": [
      32,
      64,
      128,
      256,
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "y_values_20": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "y_values_80": [
      40.06640625,
      80.12890625,
      160.25390625,
      320.5078125,
      641.015625,
      1282.03125,
      2564.0625
    ],
    "timestamp": "2026-01-08 04:31:29",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 128, \"H\": 512, \"channels_per_group\": 4, \"dtype\": \"torch.float32\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  }
]

@noemotiovon
Copy link
Contributor Author

Hi @Tcc0403, could you please review this PR?

Copy link
Collaborator

@Tcc0403 Tcc0403 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@Tcc0403 Tcc0403 merged commit 9b80613 into linkedin:main Jan 12, 2026
3 of 7 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants