# Introduction
This notebook analyzes the theoretical peak FLOPS of SGEMM/SGEMV and of quantized GEMM/GEMV kernels.

Consider the following general formula:

$$
O = A W
$$
where $A$ ($m \times k$) stands for the activation, $W$ ($k \times n$) the weights, and $O$ ($m \times n$) the resulting output. Computing $O$ takes $2mnk$ floating-point operations.

# Kernels

In [16]:
import pandas as pd
from abc import ABC, abstractmethod

class BaseCPUModel(ABC):
    """Analytical upper-bound throughput model for an ``m x n x k`` GEMM/GEMV kernel.

    The model counts the micro-operations (uops) a kernel cannot avoid and
    converts those counts into several independent upper bounds on the
    achievable GFLOPS: overall dispatch width, ports 0/1, ports 0/1/5, the
    memory ports, and (in subclasses) the DRAM bandwidth.

    Subclasses describe one data-type combination. They must fill
    ``uops_dict``, ``total_minimum_data_preparation_uops`` and
    ``total_necessary_computation_uops`` through the abstract hooks below, and
    implement ``_compute_memory_bandwidth_peak_flops``. Deriving from ``ABC``
    makes Python enforce that every hook is implemented.
    """

    # Every uop category tracked by the model, in reporting order.
    _UOP_KEYS = (
        'uops_load_sa',
        'uops_load_int8_a',
        'uops_load_scaled_sum_a',
        'uops_get_abs_a',
        'uops_load_f32_a',
        'uops_load_sw',
        'uops_load_min_b',
        'uops_load_int4_w',
        'uops_shift',
        'uops_mask',
        'uops_sub',
        'uops_load_fp32_w',
        'uops_store_result',
        'uops_fused_s',
        'uops_broadcast_fused_s',
        'uops_int8_mul_sum',
        'uops_cvt_int32_to_float',
        'uops_fma',
        'uops_sign_a_to_b',
    )

    def __init__(self, m, n, k):
        """Store the problem size and the hardware constants of the modelled CPU.

        Args:
            m, n, k: problem dimensions; the kernel performs ``2 * m * n * k`` FLOPs.
        """
        self.m = m
        self.n = n
        self.k = k
        self.input_meta = f'[{m}x{n}x{k}]'
        self.cpu_name = 'intel-13600kf'
        self.q_blk_size = 32
        self.frequency = 5.1e9  # 5.1 GHz
        self.dispatch_width = 6
        self.unit_size = 64  # each ymm holds 32 int_8 elements, corresponding to 64 int_4 weights
        self.total_flops = 2 * m * n * k
        self.peak_flops = {}
        self.total_minimum_data_preparation_uops = 0
        self.total_necessary_computation_uops = 0
        # 0 doubles as "not computed yet"; initialised here so that calling the
        # p0/p1/p5 bound out of order raises the intended ValueError rather
        # than an AttributeError.
        self.total_uops_p0_p1 = 0
        self.total_uops_p0_p1_p5 = 0

        self.mem_frequency = 3600e6  # 3600 MT/S
        bit_per_transfer = 64 * 2    # 64 bits per channel, dual-channel
        self.B = 8
        self.GB = 1024 ** 3 * self.B  # 1 GB in bits
        # Expressed in GB/s; multiply by self.GB to get bits per second.
        self.memory_bandwidth = (self.mem_frequency * bit_per_transfer) / self.GB

        # init uops of interest
        self.uops_dict = dict.fromkeys(self._UOP_KEYS, 0)

    @abstractmethod
    def _calculate_data_prep_a_uops(self):
        """Record and return the uops needed to prepare the activation."""

    @abstractmethod
    def _calculate_data_prep_w_uops(self):
        """Record and return the uops needed to prepare the weights."""

    @abstractmethod
    def _calculate_data_store_o_uops(self):
        """Record and return the uops needed to store the output."""

    @abstractmethod
    def _calculate_necessary_data_preparation_uops(self) -> None:
        """Set ``total_minimum_data_preparation_uops``."""

    @abstractmethod
    def _calculate_necessary_computation_uops(self) -> None:
        """Set ``total_necessary_computation_uops``."""

    def _record_uop_bound(self, label, total_uops, width) -> None:
        """Store under ``label`` the GFLOPS bound for ``total_uops`` issued ``width`` per clock.

        Raises:
            ZeroDivisionError: if ``total_uops`` is 0, since the modelled run time is then 0.
        """
        total_clocks = total_uops / width
        total_time = total_clocks / self.frequency
        self.peak_flops[label] = self.total_flops / total_time / 1e9

    def _compute_dispatch_width_based_peak_flops(self) -> None:
        """
        Estimate the peak FLOPS allowed by the dispatch width.

        Assumes that only the minimal number of uops is executed and that the
        full dispatch width is used on every clock, which gives a lower bound
        on the clock cycles needed to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        self._record_uop_bound('dispatch_width_based_peak_flops', total_uops, self.dispatch_width)

    def _compute_p0_p1_based_peak_flops(self) -> None:
        """
        Estimate the peak FLOPS when the program is bottlenecked by ports 0 and 1.

        The two ports are grouped because the arithmetic uops of this program
        can run on either of them. Uops that can also run on port 5 are left
        out: assigning as few uops as possible to p0/p1 maximizes the bound

            min_clocks = min_uops / max_uops_per_clock
            effective_peak_flops = total_flops / (min_clocks / frequency)

        Also stores the uop total in ``self.total_uops_p0_p1``.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['uops_fused_s'] +
            self.uops_dict['uops_int8_mul_sum'] +
            self.uops_dict['uops_cvt_int32_to_float'] +
            self.uops_dict['uops_fma'] +
            self.uops_dict['uops_sign_a_to_b'] +
            self.uops_dict['uops_get_abs_a']
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issues 1 uop per cycle, so two ports give a width of 2
        self._record_uop_bound('p0_p1 bound peak flops', total_uops_p0_p1, 2)

    def _compute_p0_p1_p5_based_peak_flops(self) -> None:
        """
        Estimate the peak FLOPS when the program is bottlenecked by ports 0, 1 and 5.

        Adds the uops that may run on any of the three ports to the p0/p1
        total, so ``_compute_p0_p1_based_peak_flops`` has to run first.

        Raises:
            ValueError: if ``total_uops_p0_p1`` is still 0.
        """
        if self.total_uops_p0_p1 == 0:
            raise ValueError(
                "_compute_p0_p1_based_peak_flops() must be called first "
                "(total_uops_p0_p1 is 0)"
            )
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask'] +
            self.uops_dict['uops_sub']
        )
        self.total_uops_p0_p1_p5 = total_uops_p0_p1_p5
        self._record_uop_bound('p0_p1_p5 bound peak flops', total_uops_p0_p1_p5, 3)

    def _compute_memory_port_based_peak_flops(self) -> None:
        """
        Estimate the peak FLOPS bound by the number of memory ports.
        """
        total_uops_memory_ports = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_f32_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_load_fp32_w'] +
            self.uops_dict['uops_store_result']
        )
        # ports 2,3,4,7,8,9,11
        self._record_uop_bound('memory ports bound peak flops', total_uops_memory_ports, 7)

    @abstractmethod
    def _compute_memory_bandwidth_peak_flops(self):
        """Store the DRAM-bandwidth bound in ``peak_flops``."""

    def calculate_peak_flops(self) -> None:
        """Compute every bound and store the results in ``self.peak_flops``."""
        self._compute_dispatch_width_based_peak_flops()
        self._compute_p0_p1_based_peak_flops()
        self._compute_p0_p1_p5_based_peak_flops()
        self._compute_memory_port_based_peak_flops()
        self._compute_memory_bandwidth_peak_flops()

    def get_peak_flops_df(self):
        """Return the bounds as a one-column DataFrame (``peak_flops``), one row per bound."""
        df = pd.DataFrame(self.peak_flops, index=[0]).transpose()
        df.columns = ['peak_flops']
        return df

    def get_uops_df(self):
        """Return the uop counts as a one-column DataFrame (``uops``), one row per category."""
        df = pd.DataFrame(self.uops_dict, index=[0]).transpose()
        df.columns = ['uops']
        return df

    def report(self, prefix) -> None:
        """Print a banner and show both tables.

        Relies on ``display`` being present in the namespace, as it is inside
        an IPython/Jupyter session.
        """
        mark = f" {prefix}-{self.input_meta}: Peak Effective FLOPS "
        print(f'{mark:=^80}')
        display(self.get_peak_flops_df())
        mark = f" {prefix}-{self.input_meta}: Uops Statistics "
        print(f'{mark:=^80}')
        display(self.get_uops_df())


## Af32Wf32
Baseline. Both activations and weights are of type float32.

In [17]:
class Af32Wf32(BaseCPUModel):
    """fp32 baseline: activations and weights are both float32.

    All quantization-related uop categories are recorded as zero; only fp32
    loads, result stores and FMAs contribute.
    """

    def __init__(self, m, n, k):
        super().__init__(m, n, k)

        # Count the uops right away so the bounds can be computed afterwards.
        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Record the activation load uops, assuming each element is loaded once.

        Returns the sum of all activation-side uops.
        """
        activation_uops = {
            'uops_load_sa': 0,
            'uops_load_int8_a': 0,
            'uops_load_scaled_sum_a': 0,
            'uops_get_abs_a': 0,
            # 8 elements are covered by one load uop
            'uops_load_f32_a': self.m * self.k / 8,
        }
        self.uops_dict.update(activation_uops)
        return sum(activation_uops.values())

    def _calculate_data_prep_w_uops(self):
        """
        Record the weight load uops, assuming each element is loaded once.

        Returns the sum of all weight-side uops.
        """
        weight_uops = {
            # No quantization blocks exist for fp32 weights, so the block
            # related counters stay at zero.
            'uops_load_sw': 0,
            'uops_load_min_b': 0,
            'uops_load_int4_w': 0.0,
            'uops_shift': 0.0,
            'uops_mask': 0.0,
            'uops_sub': 0,
            # 8 elements are covered by one load uop
            'uops_load_fp32_w': self.k * self.n / 8,
        }
        self.uops_dict.update(weight_uops)
        return sum(weight_uops.values())

    def _calculate_data_store_o_uops(self):
        """
        Record the uops that store the result to output memory.

        NOTE: ideally the output memory never has to be read.
        """
        # _mm256_storeu_ps could store 8 float
        stores = self.m * self.n / 8
        self.uops_dict['uops_store_result'] = stores
        return stores

    def _calculate_necessary_data_preparation_uops(self) -> None:
        """Add up activation, weight and output uops into the data-preparation total."""
        activation_total = self._calculate_data_prep_a_uops()
        weight_total = self._calculate_data_prep_w_uops()
        output_total = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = activation_total + weight_total + output_total

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Record the indispensable computation uops.

        NOTE: only computations in the innermost loop are considered; for the
        fp32 baseline those are FMAs only.
        """
        flop_count = self.m * self.n * self.k * 2
        compute_uops = {
            'uops_fused_s': 0,
            'uops_broadcast_fused_s': 0,
            'uops_int8_mul_sum': 0,
            'uops_cvt_int32_to_float': 0,
            # one fma in ymm handles 16 flops
            'uops_fma': flop_count / 16,
            'uops_sign_a_to_b': 0,
        }
        self.uops_dict.update(compute_uops)
        self.total_necessary_computation_uops = sum(compute_uops.values())

    def _compute_memory_bandwidth_peak_flops(self):
        """Store the bound obtained when every operand crosses the memory bus exactly once."""
        bits_per_fp32 = 32
        activation_bits = self.m * self.k * bits_per_fp32
        weight_bits = self.n * self.k * bits_per_fp32
        # the output is written once, ideally
        output_bits = 1 * self.m * self.n * bits_per_fp32
        traffic_bits = activation_bits + weight_bits + output_bits
        bits_per_second = self.memory_bandwidth * self.GB
        seconds = traffic_bits / bits_per_second
        self.peak_flops['memory bandwidth bound peak flops'] = self.total_flops / seconds / 1e9


In [18]:
# Example usage: fp32 baseline on a square GEMM (m = n = k = 1024).
# NOTE(review): despite its name, this variable holds an Af32Wf32 model.
pa_aq80wq80_gemv = Af32Wf32(m=1024, n=1024, k=1024)
pa_aq80wq80_gemv.calculate_peak_flops()
pa_aq80wq80_gemv.report("Af32Wf32") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,488.169815
p0_p1 bound peak flops,163.2
p0_p1_p5 bound peak flops,244.8
memory ports bound peak flops,194969.6
memory bandwidth bound peak flops,9830.4




Unnamed: 0,uops
uops_load_sa,0.0
uops_load_int8_a,0.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,0.0
uops_load_f32_a,131072.0
uops_load_sw,0.0
uops_load_min_b,0.0
uops_load_int4_w,0.0
uops_shift,0.0
uops_mask,0.0


In [19]:
# Example usage: fp32 baseline on a GEMV (m = 1, n = k = 1024).
# NOTE(review): despite its name, this variable holds an Af32Wf32 model.
pa_aq80wq80_gemv = Af32Wf32(m=1, n=1024, k=1024)
pa_aq80wq80_gemv.calculate_peak_flops()
pa_aq80wq80_gemv.report("Af32Wf32") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,244.561171
p0_p1 bound peak flops,163.2
p0_p1_p5 bound peak flops,244.8
memory ports bound peak flops,570.08655
memory bandwidth bound peak flops,28.74386




Unnamed: 0,uops
uops_load_sa,0.0
uops_load_int8_a,0.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,0.0
uops_load_f32_a,128.0
uops_load_sw,0.0
uops_load_min_b,0.0
uops_load_int4_w,0.0
uops_shift,0.0
uops_mask,0.0


## Aq80Wq40
Activations are quantized into `int8` using `q80` quantization scheme, while weights are quantized into `int4` using `q40` quantization scheme.

In [20]:
from typing import Dict

class Aq80Wq40(BaseCPUModel):
    """Model for int8 (`q80`) activations multiplied by int4 (`q40`) weights."""

    def __init__(self, m, n, k):
        super().__init__(m, n, k)

        self.total_uops_p0_p1 = 0
        self.total_uops_p0_p1_p5 = 0

        # Count the uops right away so the bounds can be computed afterwards.
        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def get_necessary_ops_per_q_blk(self) -> Dict[str, int]:
        """Return the operation counts for one pair of quantization blocks of size (1x32)."""
        block_len = 32
        # Subtraction is not listed here: it is categorized as data
        # preparation, since it could be shared among multiple pairs if the
        # loop is unrolled.
        return {
            # int8 multiplication and addition ops
            "iops": block_len * 2,
            # convert result to float
            "cvt_ops": 1,
            # compute fused scaling factor
            "fused_s_ops": 1,
            # multiply converted sum with fused scaling factor, add back to accumulator
            "fma_ops": 1,
            # Due to the nature of Aq80Wq40 and the limit of
            # _mm256_dpbusd_avx_epi32, a is replaced by its absolute value and
            # its sign is transferred to b. Getting the absolute value could be
            # shared among some q blocks, but the sign migration is inevitable.
            "sign_a_to_b_ops": block_len,
        }

    def get_uops_capability(self) -> Dict[str, int]:
        """Return how many operations of each kind a single instruction handles."""
        return {
            # _mm256_dpbusd_avx_epi32
            "iops_per_ins": 8 * (4 + 4),
            # _mm256_cvtepi32_ps
            "cvt_ops_per_ins": 8,
            # _mm256_mul_ps
            "fused_s_ops_per_ins": 8,
            # one _mm256_fmadd_ps deals with 8 pairs of float32 fma operations
            "fma_ops_per_ins": 8,
            # migrate sign of a to b: _mm256_sign_epi8
            "sign_a_to_b_ops_per_ins": 32,
        }

    def get_ins_per_q_blk(self) -> Dict[str, int]:
        """Return the instructions needed for one quantization block pair, plus their total."""
        workload = self.get_necessary_ops_per_q_blk()
        capability = self.get_uops_capability()
        per_block = {
            kind: amount / capability[kind + "_per_ins"]
            for kind, amount in workload.items()
        }
        per_block['total_ins'] = sum(per_block.values())
        return per_block

    def get_ins_all(self) -> Dict[str, int]:
        """Return the instructions needed for all quantization block pairs."""
        per_block = self.get_ins_per_q_blk()
        pair_count = (self.m * self.n * self.k) / 32
        scaled = {}
        for kind, amount in per_block.items():
            scaled[kind] = amount * pair_count
        return scaled

    def _calculate_data_prep_a_uops(self):
        """
        Record the activation-side uops, assuming each block is loaded once.

        Returns the sum of all activation-side uops.
        """
        blocks = self.m * self.k / self.q_blk_size
        # one scale load, one int8 load and one abs per block; no scaled sums
        load_scale, load_values, load_scaled_sum, take_abs = blocks, blocks, 0, blocks
        self.uops_dict.update({
            'uops_load_sa': load_scale,
            'uops_load_int8_a': load_values,
            'uops_load_scaled_sum_a': load_scaled_sum,
            'uops_get_abs_a': take_abs,
        })
        return load_scale + load_values + load_scaled_sum + take_abs

    def _calculate_data_prep_w_uops(self):
        """
        Record the weight-side uops, assuming each block is loaded once.

        Returns the sum of all weight-side uops.
        """
        blocks = self.n * self.k / self.q_blk_size
        # 2 q_block could be loaded in one uops
        packed_loads = blocks / 2
        load_scale = blocks
        load_min = 0
        # Unpacking a group of packed int4 into two groups of packed int8
        # takes one shift, two masks and two subtractions per packed load.
        shifts = packed_loads * 1
        masks = packed_loads * 2
        subs = packed_loads * 2
        self.uops_dict.update({
            'uops_load_sw': load_scale,
            'uops_load_min_b': load_min,
            'uops_load_int4_w': packed_loads,
            'uops_shift': shifts,
            'uops_mask': masks,
            'uops_sub': subs,
        })
        return load_scale + load_min + packed_loads + shifts + masks + subs

    def _calculate_data_store_o_uops(self):
        """
        Record the uops that store the result to output memory.

        NOTE: ideally the output memory never has to be read.
        """
        # _mm256_storeu_ps could store 8 float
        stores = self.m * self.n / 8
        self.uops_dict['uops_store_result'] = stores
        return stores

    def _calculate_necessary_data_preparation_uops(self) -> None:
        """Add up activation, weight and output uops into the data-preparation total."""
        activation_total = self._calculate_data_prep_a_uops()
        weight_total = self._calculate_data_prep_w_uops()
        output_total = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = activation_total + weight_total + output_total

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Record the indispensable computation uops.

        NOTE:
        1. the operations to unpack int4 weights are counted in `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        instructions = self.get_ins_all()

        fused_scale = instructions['fused_s_ops']
        int8_dot = instructions['iops']
        to_float = instructions['cvt_ops']
        fma = instructions['fma_ops']
        sign_move = instructions['sign_a_to_b_ops']
        # ignore extra operations that are dependent on algorithm design as we are calculating the upper bound
        broadcast = 0

        self.uops_dict.update({
            'uops_fused_s': fused_scale,
            'uops_broadcast_fused_s': broadcast,
            'uops_int8_mul_sum': int8_dot,
            'uops_cvt_int32_to_float': to_float,
            'uops_fma': fma,
            'uops_sign_a_to_b': sign_move,
        })

        self.total_necessary_computation_uops = (
            fused_scale + broadcast + int8_dot + to_float + fma + sign_move
        )

    def _compute_memory_bandwidth_peak_flops(self):
        """Store the bound obtained when every operand crosses the memory bus exactly once."""
        activation_bits = self.m * self.k * 8  # int8
        activation_scale_bits = (self.m * self.k * 32) / self.q_blk_size  # float
        weight_bits = self.n * self.k * 4  # int4
        weight_scale_bits = (self.n * self.k * 32) / self.q_blk_size  # float
        output_bits = 1 * self.m * self.n * 32  # float; write once, ideally
        traffic_bits = (
            activation_bits + activation_scale_bits + weight_bits + weight_scale_bits + output_bits
        )
        seconds = traffic_bits / (self.memory_bandwidth * self.GB)
        self.peak_flops['memory bandwidth bound peak flops'] = self.total_flops / seconds / 1e9

    def calculate_peak_flops(self) -> None:
        """Compute every bound, in the order the p0/p1/p5 bound depends on."""
        steps = (
            self._compute_dispatch_width_based_peak_flops,
            self._compute_p0_p1_based_peak_flops,
            self._compute_p0_p1_p5_based_peak_flops,
            self._compute_memory_port_based_peak_flops,
            self._compute_memory_bandwidth_peak_flops,
        )
        for step in steps:
            step()

In [21]:
# Example usage: Aq80Wq40 on a square GEMM (m = n = k = 1024).
# NOTE(review): despite its name, this variable holds an Aq80Wq40 model.
pa_aq80wq80_gemv = Aq80Wq40(m=1024, n=1024, k=1024)

pa_aq80wq80_gemv.calculate_peak_flops()
pa_aq80wq80_gemv.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,820.876627
p0_p1 bound peak flops,274.693733
p0_p1_p5 bound peak flops,411.702238
memory ports bound peak flops,311951.36
memory bandwidth bound peak flops,20515.617391




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32768.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [22]:
# Example usage: Aq80Wq40 on a GEMV (m = 1, n = k = 1024).
# NOTE(review): despite its name, this variable holds an Aq80Wq40 model.
pa_aq80wq80_gemv = Aq80Wq40(m=1, n=1024, k=1024)

pa_aq80wq80_gemv.calculate_peak_flops()
pa_aq80wq80_gemv.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,306.870941
p0_p1 bound peak flops,226.983769
p0_p1_p5 bound peak flops,200.82131
memory ports bound peak flops,1517.273152
memory bandwidth bound peak flops,182.855726




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


## Aq81Wq41
Activations are quantized into `int8` using the `q81` quantization scheme, while weights are quantized into `int4` using the `q41` quantization scheme.

In [23]:
class Aq81Wq41(BaseCPUModel):
    """Model for int8 (`q81`) activations multiplied by int4 (`q41`) weights.

    Compared with Aq80Wq40 this variant loads a scaled sum per activation
    block and a minimum per weight block, needs a second FMA per block pair,
    and has no sign-migration or subtraction uops.
    """

    def __init__(self, m, n, k):
        super().__init__(m, n, k)

        # Count the uops right away so the bounds can be computed afterwards.
        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def get_necessary_ops_per_q_blk(self) -> Dict[str, int]:
        """Return the operation counts for one pair of quantization blocks of size (1x32)."""
        block_len = 32
        return {
            # int8 multiplication and addition ops
            "iops": block_len * 2,
            # convert int result to float
            "cvt_ops": 1,
            # compute fused scaling factor
            "fused_s_ops": 1,
            # 1st fma: multiply converted sum with fused scaling factor, add back to accumulator
            # 2nd fma: multiply min with scaled sum, add back to accumulator
            "fma_ops": 2,
            # no need to migrate sign of a to b in Aq81Wq41
            "sign_a_to_b_ops": 0,
        }

    def get_uops_capability(self) -> Dict[str, int]:
        """Return how many operations of each kind a single instruction handles."""
        return {
            # _mm256_dpbusd_avx_epi32
            "iops_per_ins": 8 * (4 + 4),
            # _mm256_cvtepi32_ps
            "cvt_ops_per_ins": 8,
            # _mm256_mul_ps
            "fused_s_ops_per_ins": 8,
            # one _mm256_fmadd_ps deals with 8 pairs of float32 fma operations
            "fma_ops_per_ins": 8,
            # migrate sign of a to b: _mm256_sign_epi8
            "sign_a_to_b_ops_per_ins": 32,
        }

    def get_ins_per_q_blk(self) -> Dict[str, int]:
        """Return the instructions needed for one quantization block pair, plus their total."""
        workload = self.get_necessary_ops_per_q_blk()
        capability = self.get_uops_capability()
        per_block = {
            kind: amount / capability[kind + "_per_ins"]
            for kind, amount in workload.items()
        }
        per_block['total_ins'] = sum(per_block.values())
        return per_block

    def get_ins_all(self) -> Dict[str, int]:
        """Return the instructions needed for all quantization block pairs."""
        per_block = self.get_ins_per_q_blk()
        pair_count = (self.m * self.n * self.k) / 32
        scaled = {}
        for kind, amount in per_block.items():
            scaled[kind] = amount * pair_count
        return scaled

    def _calculate_data_prep_a_uops(self):
        """
        Record the activation-side uops, assuming each block is loaded once.

        Returns the sum of all activation-side uops.
        """
        blocks = self.m * self.k / self.q_blk_size
        # one scale load, one int8 load and one scaled-sum load per block; no abs
        load_scale, load_values, load_scaled_sum, take_abs = blocks, blocks, blocks, 0
        self.uops_dict.update({
            'uops_load_sa': load_scale,
            'uops_load_int8_a': load_values,
            'uops_load_scaled_sum_a': load_scaled_sum,
            'uops_get_abs_a': take_abs,
        })
        return load_scale + load_values + load_scaled_sum + take_abs

    def _calculate_data_prep_w_uops(self):
        """
        Record the weight-side uops, assuming each block is loaded once.

        Returns the sum of all weight-side uops.
        """
        blocks = self.n * self.k / self.q_blk_size
        # 2 q_block could be loaded in one uops
        packed_loads = blocks / 2
        load_scale = blocks
        load_min = blocks
        # Unpacking a group of packed int4 into two groups of packed int8
        # takes one shift and two masks per packed load; no subtraction here.
        shifts = packed_loads * 1
        masks = packed_loads * 2
        subs = 0
        self.uops_dict.update({
            'uops_load_sw': load_scale,
            'uops_load_min_b': load_min,
            'uops_load_int4_w': packed_loads,
            'uops_shift': shifts,
            'uops_mask': masks,
            'uops_sub': subs,
        })
        return load_scale + load_min + packed_loads + shifts + masks + subs

    def _calculate_data_store_o_uops(self):
        """
        Record the uops that store the result to output memory.

        NOTE: ideally the output memory never has to be read.
        """
        # _mm256_storeu_ps could store 8 float
        stores = self.m * self.n / 8
        self.uops_dict['uops_store_result'] = stores
        return stores

    def _calculate_necessary_data_preparation_uops(self) -> None:
        """Add up activation, weight and output uops into the data-preparation total."""
        activation_total = self._calculate_data_prep_a_uops()
        weight_total = self._calculate_data_prep_w_uops()
        output_total = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = activation_total + weight_total + output_total

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Record the indispensable computation uops.

        NOTE:
        1. the operations to unpack int4 weights are counted in `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        instructions = self.get_ins_all()

        fused_scale = instructions['fused_s_ops']
        int8_dot = instructions['iops']
        to_float = instructions['cvt_ops']
        fma = instructions['fma_ops']
        sign_move = instructions['sign_a_to_b_ops']
        broadcast = 0

        self.uops_dict.update({
            'uops_fused_s': fused_scale,
            'uops_broadcast_fused_s': broadcast,
            'uops_int8_mul_sum': int8_dot,
            'uops_cvt_int32_to_float': to_float,
            'uops_fma': fma,
            'uops_sign_a_to_b': sign_move,
        })

        self.total_necessary_computation_uops = (
            fused_scale + broadcast + int8_dot + to_float + fma + sign_move
        )

    def _compute_memory_bandwidth_peak_flops(self):
        """Store the bound obtained when every operand crosses the memory bus exactly once."""
        activation_bits = self.m * self.k * 8  # int8
        activation_scale_bits = (self.m * self.k * 32) / self.q_blk_size  # float
        activation_sum_bits = (self.m * self.k * 32) / self.q_blk_size  # float
        weight_bits = self.n * self.k * 4  # int4
        weight_scale_bits = (self.n * self.k * 32) / self.q_blk_size  # float
        weight_min_bits = (self.n * self.k * 32) / self.q_blk_size  # float
        output_bits = 1 * self.m * self.n * 32  # float; write once, ideally
        traffic_bits = (
            activation_bits + activation_scale_bits + activation_sum_bits +
            weight_bits + weight_scale_bits + weight_min_bits + output_bits
        )
        seconds = traffic_bits / (self.memory_bandwidth * self.GB)
        self.peak_flops['memory bandwidth bound peak flops'] = self.total_flops / seconds / 1e9

In [24]:
# Example usage: Aq81Wq41 on a square GEMM (m = n = k = 1024).
# NOTE(review): despite its name, this variable holds an Aq81Wq41 model.
pa_aq80wq80_gemv = Aq81Wq41(m=1024, n=1024, k=1024)
pa_aq80wq80_gemv.calculate_peak_flops()
pa_aq80wq80_gemv.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,1296.316484
p0_p1 bound peak flops,435.058379
p0_p1_p5 bound peak flops,652.163122
memory ports bound peak flops,246277.389474
memory bandwidth bound peak flops,19660.8




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,32768.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [25]:
# Example usage: Aq81Wq41 on a GEMV (m = 1, n = k = 1024).
# NOTE(review): despite its name, this variable holds an Aq81Wq41 model.
pa_aq80wq80_gemv = Aq81Wq41(m=1, n=1024, k=1024)
pa_aq80wq80_gemv.calculate_peak_flops()
pa_aq80wq80_gemv.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,355.630715
p0_p1 bound peak flops,326.4
p0_p1_p5 bound peak flops,326.4
memory ports bound peak flops,911.427815
memory bandwidth bound peak flops,152.557129




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,32.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


# Overall Comparison

In [26]:
import pandas as pd

# Assuming the classes Af32Wf32, Aq81Wq41, Aq80Wq40, and BaseCPUModel are already defined in the notebook

def get_peak_flops_and_uops(model, name):
    """Evaluate ``model`` and return its (peak-FLOPS, uops) tables labelled with ``name``.

    Each table gets two-level columns whose top level is ``name``, so that
    tables of several models can be concatenated side by side.
    """
    model.calculate_peak_flops()
    tables = (model.get_peak_flops_df(), model.get_uops_df())
    for table in tables:
        table.columns = pd.MultiIndex.from_product([[name], table.columns])
    return tables

def show_comparison(m, n, k):
    """Display side-by-side peak-FLOPS and uop tables of all three models for one problem size."""
    # Instantiate every model first; the list order fixes the column order.
    variants = [
        ("Af32Wf32", Af32Wf32(m, n, k)),
        ("Aq81Wq41", Aq81Wq41(m, n, k)),
        ("Aq80Wq40", Aq80Wq40(m, n, k)),
    ]

    # Collect one pair of tables per model.
    flops_tables = []
    uops_tables = []
    for label, model in variants:
        flops_table, uops_table = get_peak_flops_and_uops(model, label)
        flops_tables.append(flops_table)
        uops_tables.append(uops_table)

    # Combine into comparison tables
    peak_flops_comparison = pd.concat(flops_tables, axis=1)
    uops_comparison = pd.concat(uops_tables, axis=1)

    print("=== Peak Performance Comparison (GFLOPS) ===")
    display(peak_flops_comparison.round(2))

    # Execution port group of every uop category, built group by group.
    ports_to_uops = {
        'Memory': (
            'uops_load_sa', 'uops_load_int8_a', 'uops_load_scaled_sum_a',
            'uops_load_f32_a', 'uops_load_sw', 'uops_load_min_b',
            'uops_load_int4_w', 'uops_load_fp32_w', 'uops_store_result',
        ),
        'ALU (p0/p1)': (
            'uops_get_abs_a', 'uops_shift', 'uops_fused_s', 'uops_int8_mul_sum',
            'uops_cvt_int32_to_float', 'uops_fma', 'uops_sign_a_to_b',
        ),
        'ALU (p0/p1/p5)': ('uops_mask', 'uops_sub'),
        'ALU (p5)': ('uops_broadcast_fused_s',),
    }
    port_map = {}
    for port, uop_names in ports_to_uops.items():
        for uop_name in uop_names:
            port_map[uop_name] = port

    def port_of(uop_name):
        # categories missing from the map are labelled 'Other'
        return port_map.get(uop_name, 'Other')

    # Work on a copy so the port column does not leak into uops_comparison.
    uops_with_port = uops_comparison.copy()
    uops_with_port.insert(0, 'port', uops_with_port.index.map(port_of))
    print("\n=== Uops Statistics Comparison ===")
    display(uops_with_port.round(2))

## GEMM

In [27]:

# Set matrix sizes: square GEMM, every dimension is 1024.
m = n = k = 1024
show_comparison(m, n, k)

=== Peak Performance Comparison (GFLOPS) ===


Unnamed: 0_level_0,Af32Wf32,Aq81Wq41,Aq80Wq40
Unnamed: 0_level_1,peak_flops,peak_flops,peak_flops
dispatch_width_based_peak_flops,488.17,1296.32,820.88
p0_p1 bound peak flops,163.2,435.06,274.69
p0_p1_p5 bound peak flops,244.8,652.16,411.7
memory ports bound peak flops,194969.6,246277.39,311951.36
memory bandwidth bound peak flops,9830.4,19660.8,20515.62



=== Uops Statistics Comparison ===


Unnamed: 0_level_0,port,Af32Wf32,Aq81Wq41,Aq80Wq40
Unnamed: 0_level_1,Unnamed: 1_level_1,uops,uops,uops
uops_load_sa,Memory,0.0,32768.0,32768.0
uops_load_int8_a,Memory,0.0,32768.0,32768.0
uops_load_scaled_sum_a,Memory,0.0,32768.0,0.0
uops_get_abs_a,ALU (p0/p1),0.0,0.0,32768.0
uops_load_f32_a,Memory,131072.0,0.0,0.0
uops_load_sw,Memory,0.0,32768.0,32768.0
uops_load_min_b,Memory,0.0,32768.0,0.0
uops_load_int4_w,Memory,0.0,16384.0,16384.0
uops_shift,ALU (p0/p1),0.0,16384.0,16384.0
uops_mask,ALU (p0/p1/p5),0.0,32768.0,32768.0


## GEMV

In [28]:

# Set matrix sizes: GEMV, i.e. m = 1 while n and k stay at 1024.
m = 1
n = k = 1024
show_comparison(m, n, k)

=== Peak Performance Comparison (GFLOPS) ===


Unnamed: 0_level_0,Af32Wf32,Aq81Wq41,Aq80Wq40
Unnamed: 0_level_1,peak_flops,peak_flops,peak_flops
dispatch_width_based_peak_flops,244.56,355.63,306.87
p0_p1 bound peak flops,163.2,326.4,226.98
p0_p1_p5 bound peak flops,244.8,326.4,200.82
memory ports bound peak flops,570.09,911.43,1517.27
memory bandwidth bound peak flops,28.74,152.56,182.86



=== Uops Statistics Comparison ===


Unnamed: 0_level_0,port,Af32Wf32,Aq81Wq41,Aq80Wq40
Unnamed: 0_level_1,Unnamed: 1_level_1,uops,uops,uops
uops_load_sa,Memory,0.0,32.0,32.0
uops_load_int8_a,Memory,0.0,32.0,32.0
uops_load_scaled_sum_a,Memory,0.0,32.0,0.0
uops_get_abs_a,ALU (p0/p1),0.0,0.0,32.0
uops_load_f32_a,Memory,128.0,0.0,0.0
uops_load_sw,Memory,0.0,32768.0,32768.0
uops_load_min_b,Memory,0.0,32768.0,0.0
uops_load_int4_w,Memory,0.0,16384.0,16384.0
uops_shift,ALU (p0/p1),0.0,16384.0,16384.0
uops_mask,ALU (p0/p1/p5),0.0,32768.0,32768.0
