# Effective flops under q4_0

In [7]:
n = 1024
# 1. estimate the longest time to issue all instructions
num_calculation_blocks = (n * n) * (n / 64)
clocks_for_each_block = 6
total_clocks = num_calculation_blocks * clocks_for_each_block

total_flops = 2 * n ** 3
frequency = 5.1 * 10 ** 9
total_time = total_clocks / frequency
print(f"total time: {total_time:.5f} seconds")
effective_flops = total_flops / total_time / 1e9
print(f"effective flops: {effective_flops:.2f}")

total time: 0.01974 seconds
effective flops: 108.80


# Theorectical Limits of Aq80Wq10

In [8]:
def calculate_theoretical_peak_flops(m, n, k):
    total_flops = 2 * m * n * k # to compute the effective flops
    frequency = 5.1 * 10 ** 9
    peak_flops = {}
    # collect the all total uops 
    uops_dict = {}

    # stage 1: The minimum data preparation operations without repetition
    # A: int8
    q_blk_size = 32
    num_q_blocks_a = m * k / q_blk_size
    uops_load_sa = num_q_blocks_a # vmovss                                          # p2/p3/p11
    uops_load_int8_a = num_q_blocks_a # vmovdqu                                     # p2/p3/p11
    uops_sign_a = num_q_blocks_a # get absolute value                               # p0 / p1
    uops_dict['uops_load_sa'] = uops_load_sa
    uops_dict['uops_load_int8_a'] = uops_load_int8_a
    uops_dict['uops_sign_a'] = uops_sign_a

    # weight: int4
    num_q_blocks_b = n * k / q_blk_size
    uops_load_sw = num_q_blocks_b # vmovss                                          # p2/p3/p11
    uops_load_int4_w = num_q_blocks_b / 2 # vmovdqu                                 # p2/p3/p11
    uops_shift = num_q_blocks_b / 2 * 1 # vpsrlw                                    # p0 / p1
    uops_mask = num_q_blocks_b / 2 * 2 # vpand                                      # p0 / p1 / p5
    uops_sub = num_q_blocks_b / 2 * 2 # vpaddb                                      # p0 / p1 / p5
    uops_dict['uops_load_sw'] = uops_load_sw
    uops_dict['uops_load_int4_w'] = uops_load_int4_w
    uops_dict['uops_shift'] = uops_shift
    uops_dict['uops_mask'] = uops_mask
    uops_dict['uops_sub'] = uops_sub
    total_data_preparation_uops = uops_load_sa + uops_load_int8_a + uops_load_sw + uops_load_int4_w + uops_shift + uops_sign_a + uops_mask + uops_sub

    
    # stage 2: indispensible calculation 
    unit_size = 64
    num_unit = (m * n) * ( k / unit_size )
    uops_fused_s_per_unit = 2 # vmulss
    uops_broadcast_fused_s_per_unit = uops_fused_s_per_unit # vbroadcastss
    uops_int8_mul_sum_per_unit = 2  # vpdpbusd
    uops_cvt_int32_to_float_per_unit = 2 # vcvtps2dq
    uops_fma_per_unit = 2 # vfmadd231ps
    uops_sign_a_b_per_unit = 2 # vpsignb
    
    total_uops_fused_s = num_unit * uops_fused_s_per_unit                           # p0 / p1
    total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit       # p5
    total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit                 # p0 / p1
    total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit     # p0 / p1
    total_uops_fma = num_unit * uops_fma_per_unit                                   # p0 / p1
    total_uops_sign_a_b = num_unit * uops_sign_a_b_per_unit                         # p0 / p1
    uops_dict['total_uops_fused_s'] = total_uops_fused_s
    uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
    uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
    uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
    uops_dict['total_uops_fma'] = total_uops_fma
    uops_dict['total_uops_sign_a_b'] = total_uops_sign_a_b
    
    total_computation_uops = (total_uops_fused_s + total_uops_broadcast_fused_s + total_uops_int8_mul_sum + total_uops_cvt_int32_to_float + total_uops_fma + total_uops_sign_a_b)
    
    
    # 1. dispatch all uops
    total_uops = total_data_preparation_uops + total_computation_uops
    dispatch_width = 6
    total_clocks = total_uops / dispatch_width
    total_time = total_clocks / frequency
    # print(f"total time: {total_time:.5f} seconds")
    effective_flops = total_flops / total_time / 1e9
    # print(f"effective flops: {effective_flops:.2f}")
    peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    # 2. p0,p1 with minimum stress. vpand and vpaddb all go to p5
    total_uops_p0_p1 = uops_sign_a + uops_shift + total_uops_fused_s + total_uops_int8_mul_sum + total_uops_cvt_int32_to_float + total_uops_sign_a_b + total_uops_fma
    p0_p1_width = 2
    total_clocks_p0_p1 = total_uops_p0_p1 / p0_p1_width
    total_time_p0_p1 = total_clocks_p0_p1 / frequency
    effective_flops = total_flops / total_time_p0_p1 / 1e9
    peak_flops['p0_p1_based_peak_flops'] = effective_flops
    
    # 3. p0, p1, p5
    total_uops_p0_p1_p5 = total_uops_p0_p1 + total_uops_broadcast_fused_s + uops_mask + uops_sub
    p0_p1_p5_width = 3
    total_clocks_p0_p1_p5 = total_uops_p0_p1_p5 / p0_p1_p5_width
    total_time_p0_p1_p5 = total_clocks_p0_p1_p5 / frequency
    effective_flops = total_flops / total_time_p0_p1_p5 / 1e9
    peak_flops['p0_p1_p5_based_peak_flops'] = effective_flops
    
    # 4. memory load operation: p2/p3/p11
    total_uops_p2_p3_p11 = uops_load_sa + uops_load_int8_a + uops_load_sw + uops_load_int4_w
    p2_p3_p11_width = 3
    total_clocks_p2_p3_p11 = total_uops_p2_p3_p11 / p2_p3_p11_width
    total_time_p2_p3_p11 = total_clocks_p2_p3_p11 / frequency
    effective_flops = total_flops / total_time_p2_p3_p11 / 1e9
    peak_flops['p2_p3_p11_based_peak_flops (memory load)'] = effective_flops
    
    # 5. memory bandwidth peak flops
    mem_frequency = 3600 * 10 ** 6
    bit_per_transfer = 64 * 2
    B = 8
    KB = 1024 * B
    MB = 1024 * KB
    GB = 1024 * MB
    memory_bandwidth = (mem_frequency * bit_per_transfer) / (GB)
    print(f"Memory bandwidth: {memory_bandwidth:.2f} GB/s")
    a_bits = m * k * 8
    sa_bit = m * k * 32 / q_blk_size
    w_bits = n * k * 4
    sw_bit = n * k * 32 / q_blk_size
    c_bits = m * n * 32 # write
    data_transfer_time = (a_bits + sa_bit + w_bits + sw_bit + c_bits) / (memory_bandwidth * GB)
    flops_limits = total_flops / data_transfer_time / 1e9
    peak_flops['memory_bandwidth_peak_flops'] = flops_limits

    # print dictionary by pandas
    import pandas as pd
    peak_flops_df = pd.DataFrame(peak_flops, index=[0])
    peak_flops_df.index = ['peak_flops']
    peak_flops_df.columns = peak_flops.keys()
    # transpose
    peak_flops_df = peak_flops_df.transpose()
    
    uops_df = pd.DataFrame(uops_dict, index=[0])
    uops_df.index = ['uops']
    uops_df.columns = uops_dict.keys()
    uops_df = uops_df.transpose()
    

    return peak_flops_df, uops_df





In [9]:
from IPython.display import display
peak_flops, uops_df = calculate_theoretical_peak_flops(1024, 1024, 1024)
display(peak_flops)
print('='*100)
display(uops_df)

Memory bandwidth: 53.64 GB/s


Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,326.028548
p0_p1_based_peak_flops,130.521761
p0_p1_p5_based_peak_flops,163.107084
p2_p3_p11_based_peak_flops (memory load),286485.942857
memory_bandwidth_peak_flops,20515.617391




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_sign_a,32768.0
uops_load_sw,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
uops_sub,32768.0
total_uops_fused_s,33554432.0
total_uops_broadcast_fused_s,33554432.0


In [10]:
peak_flops, uops_df = calculate_theoretical_peak_flops(1, 1024, 1024)
display(peak_flops)
print('='*100)
display(uops_df)

Memory bandwidth: 53.64 GB/s


Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.782642
p0_p1_based_peak_flops,118.669838
p0_p1_p5_based_peak_flops,115.186766
p2_p3_p11_based_peak_flops (memory load),651.951105
memory_bandwidth_peak_flops,182.855726




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_sign_a,32.0
uops_load_sw,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
uops_sub,32768.0
total_uops_fused_s,32768.0
total_uops_broadcast_fused_s,32768.0


In [None]:
peak_flops, uops_df = calculate_theoretical_peak_flops(1, 4096, 4096)
display(peak_flops)
print(f'{" uops statistics":=^80}')
display(uops_df)

Memory bandwidth: 53.64 GB/s


Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.825657
p0_p1_based_peak_flops,118.685641
p0_p1_p5_based_peak_flops,115.196691
p2_p3_p11_based_peak_flops (memory load),652.587569
memory_bandwidth_peak_flops,183.951737




Unnamed: 0,uops
uops_load_sa,128.0
uops_load_int8_a,128.0
uops_sign_a,128.0
uops_load_sw,524288.0
uops_load_int4_w,262144.0
uops_shift,262144.0
uops_mask,524288.0
uops_sub,524288.0
total_uops_fused_s,524288.0
total_uops_broadcast_fused_s,524288.0


In [12]:
total_uops_per_calculation_block = 26
issue_width_for_po_p1_p5 = 6
clocks_for_each_calculation_block = total_uops_for_p0_p1_p5_per_calculation_block / issue_width_for_po_p1_p5
flops_1 = calculate_theoretical_peak_flops(clocks_for_each_calculation_block)

NameError: name 'total_uops_for_p0_p1_p5_per_calculation_block' is not defined

## p0, p1, p5

In [None]:
total_uops_for_p0_p1_per_calculation_block = 25
issue_width_for_po_p1_p5 = 3
clocks_for_each_calculation_block = total_uops_for_p0_p1_per_calculation_block / issue_width_for_po_p1_p5
flops_1 = calculate_theoretical_peak_flops(clocks_for_each_calculation_block)

## p0, p1

In [None]:
min_total_uops = 13
issue_width = 2
clocks_for_each_calculation_block =  min_total_uops / issue_width
flops = calculate_theoretical_peak_flops(clocks_for_each_calculation_block)

## float-point

In [None]:
n = 1024
total_flops = 2 * n ** 3
flops_per_fmd_instruction = 8 * 2
total_fma_operations = total_flops / flops_per_fmd_instruction
fma_cpi = 0.5 # the cycle per instruction for FMA
total_clocks = total_fma_operations * fma_cpi
frequency = 5.1 * 10 ** 9
total_time = total_clocks / frequency
print(f"Total time: {total_time:.5f} seconds")
effective_flops = total_flops / total_time / 1e9
print(f"Effective FLOPS: {effective_flops:.2f}")

## Ideal upper bound

In [None]:
n = 1024
# 1. estimate the longest time to issue all instructions
num_calculation_blocks = (n * n) * (n / 64)
total_uops_per_block = 30
decode_width = 6
clocks_for_each_block = total_uops_per_block / decode_width
total_clocks = num_calculation_blocks * clocks_for_each_block

total_flops = 2 * n ** 3
frequency = 5.1 * 10 ** 9
total_time = total_clocks / frequency
print(f"Total time: {total_time:.5f} seconds")
effective_flops = total_flops / total_time / 1e9
print(f"Effective FLOPS: {effective_flops:.2f}")


# Memory bound case

In [None]:
frequency = 3600 * 10 ** 6
bit_per_transfer = 64 * 2
B = 8
KB = 1024 * B
MB = 1024 * KB
GB = 1024 * MB
memory_bandwidth = (frequency * bit_per_transfer) / (GB)
print(f"Memory bandwidth: {memory_bandwidth:.2f} GB/s")

## Aq80_Wq4_0

In [None]:
# matrix A: 1 x 1000; B: 1000 * 1000
M = 1; N = 1000; K = 1000
# M = 1; N = 4096; K = 4096
Q_BLK_SIZE = 32

read_bits_A = M * K * 8 # int8
read_bits_sA = M * K * 32 / Q_BLK_SIZE# float
read_bits_B = N * K * 4 # int4
read_bits_sB = N * K * 32 / Q_BLK_SIZE # int4
write_bits_C = M * N * 32 # float
data_transfer_time = (read_bits_A + read_bits_B + write_bits_C + read_bits_sA + read_bits_sB) / (memory_bandwidth * GB)
print(f"Data transfer time: {data_transfer_time:.5f} seconds")

total_flops = 2 * M * N * K
flops_limits = total_flops / data_transfer_time / 1e9
print(f"FLOPS limits: {flops_limits:.2f} GFLOPS")

## A_fp32 W_fp32

In [None]:
# matrix A: 1 x 1000; B: 1000 * 1000
M = 1; N = 1000; K = 1000
# M = 1; N = 4096; K = 4096
Q_BLK_SIZE = 32

read_bits_A = M * K * 32 # float
read_bits_B = N * K * 32 # float
write_bits_C = M * N * 32 # float
data_transfer_time = (read_bits_A + read_bits_B + write_bits_C) / (memory_bandwidth * GB)
print(f"Data transfer time: {data_transfer_time:.5f} seconds")

total_flops = 2 * M * N * K
flops_limits = total_flops / data_transfer_time / 1e9
print(f"FLOPS limits: {flops_limits:.2f} GFLOPS")

In [None]:
# matrix A: 1 x 1000; B: 1000 * 1000
M = 1000; N = 1000; K = 1000
# M = 1; N = 4096; K = 4096
Q_BLK_SIZE = 32

read_bits_A = M * K * 32 # float
read_bits_B = N * K * 32 # float
write_bits_C = M * N * 32 # float
data_transfer_time = (read_bits_A + read_bits_B + write_bits_C) / ( 19 * GB)
print(f"Data transfer time: {data_transfer_time:.5f} seconds")

total_flops = 2 * M * N * K
flops_limits = total_flops / data_transfer_time / 1e9
print(f"FLOPS limits: {flops_limits:.2f} GFLOPS")

# Theorectical Limits of Aq81Wq41

In [None]:
def calculate_theoretical_peak_flops(m, n, k):
    """
    A: m x k ; Activation
    W: k x n ; Weights
    """
    total_flops = 2 * m * n * k # to compute the effective flops
    # cpu frequency
    frequency = 5.1 * 10 ** 9
    # cpu dispatch width
    dispatch_width = 6

    # collect results
    peak_flops = {}
    uops_dict = {}

    """ 
    Part 1: Data Preparation.
    The minimum necessary data preparation operations without repetition to fetch data from memory.
    We assume that data could be shared as if there are limitless registers
    """
    # A: int8
    q_blk_size = 32
    num_q_blocks_a = m * k / q_blk_size
    uops_load_sa = num_q_blocks_a # vmovss                                          # p2/p3/p11
    uops_load_scaled_sum_a = num_q_blocks_a # vmovss                                          # p2/p3/p11
    uops_load_int8_a = num_q_blocks_a # vmovdqu                                     # p2/p3/p11
    # NOTE: No need to get absolute value for Aq81Wq41
    # uops_sign_a = num_q_blocks_a # get absolute value                             
    uops_dict['uops_load_sa'] = uops_load_sa
    uops_dict['uops_load_int8_a'] = uops_load_int8_a
    uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
    # uops_dict['uops_sign_a'] = uops_sign_a

    # weight: int4
    num_q_blocks_b = n * k / q_blk_size
    uops_load_sw = num_q_blocks_b # vmovss                                          # p2/p3/p11
    uops_load_min_b = num_q_blocks_b # vmovss                                       # p2/p3/p11
    # each ymm accommodates 64 int4 (2 q blocks)
    uops_load_int4_w = num_q_blocks_b / 2 # vmovdqu                                 # p2/p3/p11
    # Necessary operations to unpack 2-int4 into 2-int8
    uops_shift = num_q_blocks_b / 2 * 1 # vpsrlw                                    # p0 / p1
    uops_mask = num_q_blocks_b / 2 * 2 # vpand                                      # p0 / p1 / p5
    # NOTE: No need to subtract min_b for Aq81Wq41
    # uops_sub = num_q_blocks_b / 2 * 2 # vpaddb                                      
    uops_dict['uops_load_sw'] = uops_load_sw
    uops_dict['uops_load_min_b'] = uops_load_min_b
    uops_dict['uops_load_int4_w'] = uops_load_int4_w
    uops_dict['uops_shift'] = uops_shift
    uops_dict['uops_mask'] = uops_mask
    # uops_dict['uops_sub'] = uops_sub
    total_minimum_data_preparation_uops = \
        uops_load_sa + \
        uops_load_int8_a + \
        uops_load_scaled_sum_a + \
        uops_load_sw + \
        uops_load_min_b + \
        uops_load_int4_w + \
        uops_shift + \
        uops_mask

    
    """ 
    Part 2: Calculation.
    Among all calculation operations, some of them could be shared among iterations, e.g. data-unpack operations when loop unrolling is performed, while some of them could not be shared or optimized out,
    as they gaurantee the correctness of the algorithm, e.g. multiplications in gemm.
    Those indispensible calculations are targeted in this part.
    """
    # stage 2: indispensible calculation operations to ensure the correctness of the result
    unit_size = 64
    num_unit = (m * n) * ( k / unit_size )
    uops_fused_s_per_unit = 2 # vmulss
    uops_broadcast_fused_s_per_unit = uops_fused_s_per_unit # vbroadcastss
    uops_int8_mul_sum_per_unit = 2  # vpdpbusd
    uops_cvt_int32_to_float_per_unit = 2 # vcvtps2dq
    # NOTE: No need to sign_a_b for Aq81Wq41
    # uops_sign_a_b_per_unit = 2 # vpsignb
    # NOTE: we need two extra float multiplication and add
    uops_fma_per_unit = 2 + 2 # 2 * vfmadd231ps + 2 * vfmadd231ss
    
    total_uops_fused_s = num_unit * uops_fused_s_per_unit                           # p0 / p1
    total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit       # p5
    total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit                 # p0 / p1
    total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit     # p0 / p1
    total_uops_fma = num_unit * uops_fma_per_unit                                   # p0 / p1
    # total_uops_sign_a_b = num_unit * uops_sign_a_b_per_unit                       
    uops_dict['total_uops_fused_s'] = total_uops_fused_s
    uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
    uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
    uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
    uops_dict['total_uops_fma'] = total_uops_fma
    # uops_dict['total_uops_sign_a_b'] = total_uops_sign_a_b
    
    total_computation_uops = (
        total_uops_fused_s + 
        total_uops_broadcast_fused_s + 
        total_uops_int8_mul_sum + 
        total_uops_cvt_int32_to_float + 
        total_uops_fma 
    )
    
    """
    Once the minimum number data and computation operations are calculated, mininum total number of uops could be derived.
    Considering the limit of dispatch width, the first peak flops, when the program is bounded by dispatch width, could be calculated.
    """
    total_uops = total_minimum_data_preparation_uops + total_computation_uops
    total_clocks = total_uops / dispatch_width
    total_time = total_clocks / frequency
    # print(f"total time: {total_time:.5f} seconds")
    effective_flops = total_flops / total_time / 1e9
    # print(f"effective flops: {effective_flops:.2f}")
    peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    # 2. p0,p1 with minimum stress. vpand and vpaddb all go to p5
    total_uops_p0_p1 = (
        uops_shift + total_uops_fused_s + 
        total_uops_int8_mul_sum + total_uops_cvt_int32_to_float +
        total_uops_fma
    )
    # NOTE: p0 and p1 together could issue 2 uops per clock
    p0_p1_width = 2
    total_clocks_p0_p1 = total_uops_p0_p1 / p0_p1_width
    total_time_p0_p1 = total_clocks_p0_p1 / frequency
    effective_flops = total_flops / total_time_p0_p1 / 1e9
    peak_flops['p0_p1_based_peak_flops'] = effective_flops
    
    # 3. p0, p1, p5
    total_uops_p0_p1_p5 = total_uops_p0_p1 + total_uops_broadcast_fused_s + uops_mask
    p0_p1_p5_width = 3
    total_clocks_p0_p1_p5 = total_uops_p0_p1_p5 / p0_p1_p5_width
    total_time_p0_p1_p5 = total_clocks_p0_p1_p5 / frequency
    effective_flops = total_flops / total_time_p0_p1_p5 / 1e9
    peak_flops['p0_p1_p5_based_peak_flops'] = effective_flops
    
    # 4. memory load operation: p2/p3/p11
    total_uops_p2_p3_p11 = (
        uops_load_sa + uops_load_scaled_sum_a +  uops_load_int8_a + 
        uops_load_sw + uops_load_min_b + uops_load_int4_w
    )
    p2_p3_p11_width = 3
    total_clocks_p2_p3_p11 = total_uops_p2_p3_p11 / p2_p3_p11_width
    total_time_p2_p3_p11 = total_clocks_p2_p3_p11 / frequency
    effective_flops = total_flops / total_time_p2_p3_p11 / 1e9
    peak_flops['p2_p3_p11_based_peak_flops (memory load)'] = effective_flops
    
    # 5. memory bandwidth peak flops
    mem_frequency = 3600 * 10 ** 6
    bit_per_transfer = 64 * 2
    B = 8
    KB = 1024 * B
    MB = 1024 * KB
    GB = 1024 * MB
    memory_bandwidth = (mem_frequency * bit_per_transfer) / (GB)
    print(f"Memory bandwidth: {memory_bandwidth:.2f} GB/s")
    a_bits = m * k * 8
    sa_bit = m * k * 32 / q_blk_size
    scaled_sum_a_bit = m * k * 32 / q_blk_size
    w_bits = n * k * 4
    sw_bit = n * k * 32 / q_blk_size
    min_b_bit = n * k * 32 / q_blk_size
    c_bits = 2 * m * n * 32 # read and write
    data_transfer_time = (a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits) / (memory_bandwidth * GB)
    flops_limits = total_flops / data_transfer_time / 1e9
    peak_flops['memory_bandwidth_peak_flops'] = flops_limits
    """
    In practice, the elements of A and B must be loaded from (main memory or cache) with repetition. Specifically, elements of A must be re-loaded
    to calculate new elements of C for gemv. Our control lies in designing a memory access pattern so that they are mostly reloaded from
    cache, ideally from the lowest level, as much as possible.
    
    For GEMV, the elements of A are reloaded, while elements of B are loaded only once.
    What happens if all of them are loaded from main memory in the GEMV operation?
    """
    if m == 1:
        real_memory_bandwidth = memory_bandwidth / 2
        total_bits_A = n * (a_bits + sa_bit + scaled_sum_a_bit)
        total_bits_B = 1 * (w_bits + sw_bit + min_b_bit)
        total_bits_C = 1 * (c_bits)
        data_transfer_time = (total_bits_A + total_bits_B + total_bits_C) / (real_memory_bandwidth * GB)
        flops_limits = total_flops / data_transfer_time / 1e9
        peak_flops['Worst memory_bandwidth_peak_flops (GEMV) '] = flops_limits

    
    
    
    """
    Experiment 6: 
    what happens if we replace all memory load operations with pseudo value?
    At first glance, this replacement only affects the memory load operations, e.g. _mm256_loadu_si256,
    However, it is revealed that mask, shift operations, fuse scaling factors, min*scaled_sum operations are also optimized out.
    
    We compute the peak flops of this case just to prove the validity of the theoretical analysis.
    
    However in practice, we find it is hard to disentangle the memory load operations while leaving the other operations intact.

    """
    uops_fma_per_unit_exp_6 = 2 # 2 * vfmadd231ps
    total_uops_fma_exp_6 = num_unit * uops_fma_per_unit_exp_6                                   # p0 / p1
    total_uops_p0_p1_exp_6 = (
        # uops_shift + 
        # total_uops_fused_s + 
        total_uops_int8_mul_sum + 
        total_uops_cvt_int32_to_float +
        total_uops_fma_exp_6
    )
    # NOTE: p0 and p1 together could issue 2 uops per clock
    p0_p1_width = 2
    total_clocks_p0_p1_exp_6 = total_uops_p0_p1_exp_6 / p0_p1_width
    total_time_p0_p1_exp_6 = total_clocks_p0_p1_exp_6 / frequency
    effective_flops = total_flops / total_time_p0_p1_exp_6 / 1e9
    peak_flops['p0_p1_based_peak_flops_exp_6'] = effective_flops
    
    """
    Experiment 7:
    Add an extra `sign` operation.
    """
    extra_uops_sign_a_b_per_unit = 2 # vpsignb
    extra_total_uops_sign_a_b = num_unit * extra_uops_sign_a_b_per_unit                         # p0 / p1
    total_uops_p0_p1_exp_7 = total_uops_p0_p1 + extra_total_uops_sign_a_b
    p0_p1_width = 2
    total_clocks_p0_p1_exp_7 = total_uops_p0_p1_exp_7 / p0_p1_width
    total_time_p0_p1_exp_7 = total_clocks_p0_p1_exp_7 / frequency
    effective_flops = total_flops / total_time_p0_p1_exp_7 / 1e9
    peak_flops['p0_p1_based_peak_flops_exp_7'] = effective_flops


    # print dictionary by pandas
    import pandas as pd
    peak_flops_df = pd.DataFrame(peak_flops, index=[0])
    peak_flops_df.index = ['peak_flops']
    peak_flops_df.columns = peak_flops.keys()
    # transpose
    peak_flops_df = peak_flops_df.transpose()
    
    uops_df = pd.DataFrame(uops_dict, index=[0])
    uops_df.index = ['uops']
    uops_df.columns = uops_dict.keys()
    uops_df = uops_df.transpose()
    

    return peak_flops_df, uops_df





Examine the theorectical limits for gemm

In [None]:
peak_flops, uops_df = calculate_theoretical_peak_flops(1024, 1024, 1024)
display(peak_flops)
print(f'{"=":^50}')
display(uops_df)


NameError: name 'calculate_theoretical_peak_flops' is not defined

Examine the theorectical limits for gemv

In [None]:
peak_flops, uops_df = calculate_theoretical_peak_flops(1, 1024, 1024)
display(peak_flops)
print('='*100)
display(uops_df)


Memory bandwidth: 53.64 GB/s


Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.782642
p0_p1_based_peak_flops,118.690909
p0_p1_p5_based_peak_flops,130.56
p2_p3_p11_based_peak_flops (memory load),391.221537
memory_bandwidth_peak_flops,151.772017
Worst memory_bandwidth_peak_flops (GEMV),28.687938
p0_p1_based_peak_flops_exp_6,217.6
p0_p1_based_peak_flops_exp_7,100.430769




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,32.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
total_uops_fused_s,32768.0
total_uops_broadcast_fused_s,32768.0


In [None]:
peak_flops, uops_df = calculate_theoretical_peak_flops(1, 10240, 10240)
display(peak_flops)
print('='*100)
display(uops_df)

In [29]:
import pandas as pd

class Aq81Wq41:
    def __init__(self, m, n, k):
        self.m = m
        self.n = n
        self.k = k
        self.cpu_name = 'intel-13600kf'
        self.q_blk_size = 32
        self.frequency = 5.1e9  # 5.1 GHz
        self.dispatch_width = 6
        self.unit_size = 64  # each ymm holds 32 int_8 elements, corresponding to 64 int_4 weights
        self.total_flops = 2 * m * n * k
        self.peak_flops = {}
        self.uops_dict = {}
        self.total_minimum_data_preparation_uops = 0
        self.total_necessary_computation_uops = 0

        self.mem_frequency = 3600e6  # 3600 MHz
        bit_per_transfer = 64 * 2
        self.B = 8
        self.GB = 1024 ** 3 * self.B  # 1 GB in bits
        self.memory_bandwidth = (self.mem_frequency * bit_per_transfer) / self.GB

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = num_q_blocks_b
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask
    
    def _calculate_data_prep_o_uops(self) -> None:
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation is not vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = self.m * self.n # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        num_unit = (self.m * self.n) * (self.k / self.unit_size)
        # to fuse scaling factor: s_a * s_w
        uops_fused_s_per_unit = 2
        # broadcast single fused scaling factor among ymm
        uops_broadcast_fused_s_per_unit = 2
        # perform 32 pairs of int8 multiplication and addition, resulting in 4 pair of int32 partial sum
        uops_int8_mul_sum_per_unit = 2
        # convert int32 partial sum to float
        uops_cvt_int32_to_float_per_unit = 2
        # There are 4 pairs of FMA operations, a feature of the Aq81Wq41 algorithm.
        uops_fma_per_unit = 4

        total_uops_fused_s = num_unit * uops_fused_s_per_unit
        total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit
        total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit
        total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit
        total_uops_fma = num_unit * uops_fma_per_unit

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        total_uops_p0_p1_p5 = (
            total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask']
        )
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w']
        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = (self.m * self.k * 32) / self.q_blk_size #float
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = (self.n * self.k * 32) / self.q_blk_size # float
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits

    def calculate_peak_flops(self) -> None:

        self._compute_dispatch_width_based_peak_flops()
        self._compute_p0_p1_based_peak_flops()
        self._compute_p0_p1_p5_based_peak_flops()
        self._compute_memory_port_based_peak_flops()
        self._compute_memory_bandwidth_peak_flops()

    def get_peak_flops_df(self):
        df = pd.DataFrame(self.peak_flops, index=[0]).transpose()
        df.columns = ['peak_flops']
        return df

    def get_uops_df(self):
        df = pd.DataFrame(self.uops_dict, index=[0]).transpose()
        df.columns = ['uops']
        return df
    
    def report(self) -> None:
        print(f'{" Peak Effective FLOPS ":=^80}')
        display(self.get_peak_flops_df())
        print(f'{" uops statistics ":=^80}')
        display(self.get_uops_df())


In [30]:
# Example usage:
aq = Aq81Wq41(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report() 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.782642
p0_p1 bound peak flops,118.690909
p0_p1_p5 bound peak flops,130.56
memory ports bound peak flops,391.221537
memory bandwidth bound peak flops,152.557129




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,32.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
total_uops_fused_s,32768.0
total_uops_broadcast_fused_s,32768.0


In [31]:
# Example usage:
aq = Aq81Wq41(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report() 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,326.028548
p0_p1 bound peak flops,130.547251
p0_p1_p5 bound peak flops,163.160166
memory ports bound peak flops,182309.236364
memory bandwidth bound peak flops,19660.8




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,32768.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
total_uops_fused_s,33554432.0
total_uops_broadcast_fused_s,33554432.0


In [42]:
from abc import ABC, abstractmethod

class BaseCPUModel:
    def __init__(self, m, n, k):
        self.m = m
        self.n = n
        self.k = k
        self.input_meta = f'[{m}x{n}x{k}]'
        self.cpu_name = 'intel-13600kf'
        self.q_blk_size = 32
        self.frequency = 5.1e9  # 5.1 GHz
        self.dispatch_width = 6
        self.unit_size = 64  # each ymm holds 32 int_8 elements, corresponding to 64 int_4 weights
        self.total_flops = 2 * m * n * k
        self.peak_flops = {}
        self.uops_dict = {}
        self.total_minimum_data_preparation_uops = 0
        self.total_necessary_computation_uops = 0

        self.mem_frequency = 3600e6  # 3600 MHz
        bit_per_transfer = 64 * 2
        self.B = 8
        self.GB = 1024 ** 3 * self.B  # 1 GB in bits
        self.memory_bandwidth = (self.mem_frequency * bit_per_transfer) / self.GB


    @abstractmethod
    def _calculate_data_prep_a_uops(self):
        pass

    @abstractmethod
    def _calculate_data_prep_w_uops(self):
        pass
    
    @abstractmethod
    def _calculate_data_store_o_uops(self):
        pass


    @abstractmethod
    def _calculate_necessary_data_preparation_uops(self) -> None:
        pass

    @abstractmethod
    def _calculate_necessary_computation_uops(self) -> None:
        pass

    @abstractmethod
    def _compute_dispatch_width_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_p0_p1_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_p0_p1_p5_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_memory_port_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_memory_bandwidth_peak_flops(self):
        pass

    def calculate_peak_flops(self) -> None:
        self._compute_dispatch_width_based_peak_flops()
        self._compute_p0_p1_based_peak_flops()
        self._compute_p0_p1_p5_based_peak_flops()
        self._compute_memory_port_based_peak_flops()
        self._compute_memory_bandwidth_peak_flops()

    def get_peak_flops_df(self):
        df = pd.DataFrame(self.peak_flops, index=[0]).transpose()
        df.columns = ['peak_flops']
        return df

    def get_uops_df(self):
        df = pd.DataFrame(self.uops_dict, index=[0]).transpose()
        df.columns = ['uops']
        return df
    
    def report(self, prefix) -> None:
        mark = f" {prefix}-{self.input_meta}: Peak Effective FLOPS "
        print(f'{mark:=^80}')
        display(self.get_peak_flops_df())
        mark = f" {prefix}-{self.input_meta}: Uops Statistics "
        print(f'{mark:=^80}')
        display(self.get_uops_df())


In [43]:
import pandas as pd

class Aq81Wq41(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq81Wq41, self).__init__(m, n, k)

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = num_q_blocks_b
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation is not vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = self.m * self.n # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        num_unit = (self.m * self.n) * (self.k / self.unit_size)
        # to fuse scaling factor: s_a * s_w
        uops_fused_s_per_unit = 2
        # broadcast single fused scaling factor among ymm
        uops_broadcast_fused_s_per_unit = 2
        # perform 32 pairs of int8 multiplication and addition, resulting in 4 pair of int32 partial sum
        uops_int8_mul_sum_per_unit = 2
        # convert int32 partial sum to float
        uops_cvt_int32_to_float_per_unit = 2
        # There are 4 pairs of FMA operations, a feature of the Aq81Wq41 algorithm.
        uops_fma_per_unit = 4

        total_uops_fused_s = num_unit * uops_fused_s_per_unit
        total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit
        total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit
        total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit
        total_uops_fma = num_unit * uops_fma_per_unit

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        total_uops_p0_p1_p5 = (
            total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask']
        )
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w']
        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = (self.m * self.k * 32) / self.q_blk_size #float
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = (self.n * self.k * 32) / self.q_blk_size # float
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [44]:
# Example usage:
aq = Aq81Wq41(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.172905
p0_p1 bound peak flops,118.690909
p0_p1_p5 bound peak flops,130.56
memory ports bound peak flops,391.221537
memory bandwidth bound peak flops,152.557129




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,32.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
uops_store_result,1024.0
total_uops_fused_s,32768.0


In [37]:
# Example usage:
aq = Aq81Wq41(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report() 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,324.341194
p0_p1 bound peak flops,130.547251
p0_p1_p5 bound peak flops,163.160166
memory ports bound peak flops,182309.236364
memory bandwidth bound peak flops,19660.8




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,32768.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0
uops_store_result,1048576.0
total_uops_fused_s,33554432.0
