In [29]:
import pandas as pd
from abc import ABC, abstractmethod

class BaseCPUModel:
    def __init__(self, m, n, k):
        self.m = m
        self.n = n
        self.k = k
        self.input_meta = f'[{m}x{n}x{k}]'
        self.cpu_name = 'intel-13600kf'
        self.q_blk_size = 32
        self.frequency = 5.1e9  # 5.1 GHz
        self.dispatch_width = 6
        self.unit_size = 64  # each ymm holds 32 int_8 elements, corresponding to 64 int_4 weights
        self.total_flops = 2 * m * n * k
        self.peak_flops = {}
        self.uops_dict = {}
        self.total_minimum_data_preparation_uops = 0
        self.total_necessary_computation_uops = 0

        self.mem_frequency = 3600e6  # 3600 MT/S
        bit_per_transfer = 64 * 2    # 64 bits per channel, dual-channel
        self.B = 8
        self.GB = 1024 ** 3 * self.B  # 1 GB in bits
        self.memory_bandwidth = (self.mem_frequency * bit_per_transfer) / self.GB

        # init uops of interest
        self.uops_dict['uops_load_sa'] = 0
        self.uops_dict['uops_load_int8_a'] = 0
        self.uops_dict['uops_load_scaled_sum_a'] = 0
        self.uops_dict['uops_get_abs_a'] = 0
        self.uops_dict['uops_load_f32_a'] = 0
        self.uops_dict['uops_load_sw'] = 0
        self.uops_dict['uops_load_min_b'] = 0
        self.uops_dict['uops_load_int4_w'] = 0
        self.uops_dict['uops_shift'] = 0
        self.uops_dict['uops_mask'] = 0
        self.uops_dict['uops_sub'] = 0
        self.uops_dict['uops_load_fp32_w'] = 0
        self.uops_dict['uops_store_result'] = 0
        self.uops_dict['total_uops_fused_s'] = 0
        self.uops_dict['total_uops_broadcast_fused_s'] = 0
        self.uops_dict['total_uops_int8_mul_sum'] = 0
        self.uops_dict['total_uops_cvt_int32_to_float'] = 0
        self.uops_dict['total_uops_fma'] = 0
        self.uops_dict['total_uops_sign_a_to_b'] = 0


    @abstractmethod
    def _calculate_data_prep_a_uops(self):
        pass

    @abstractmethod
    def _calculate_data_prep_w_uops(self):
        pass
    
    @abstractmethod
    def _calculate_data_store_o_uops(self):
        pass


    @abstractmethod
    def _calculate_necessary_data_preparation_uops(self) -> None:
        pass

    @abstractmethod
    def _calculate_necessary_computation_uops(self) -> None:
        pass

    @abstractmethod
    def _compute_dispatch_width_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_p0_p1_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_p0_p1_p5_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_memory_port_based_peak_flops(self):
        pass

    @abstractmethod
    def _compute_memory_bandwidth_peak_flops(self):
        pass

    def calculate_peak_flops(self) -> None:
        self._compute_dispatch_width_based_peak_flops()
        self._compute_p0_p1_based_peak_flops()
        self._compute_p0_p1_p5_based_peak_flops()
        self._compute_memory_port_based_peak_flops()
        self._compute_memory_bandwidth_peak_flops()

    def get_peak_flops_df(self):
        df = pd.DataFrame(self.peak_flops, index=[0]).transpose()
        df.columns = ['peak_flops']
        return df

    def get_uops_df(self):
        df = pd.DataFrame(self.uops_dict, index=[0]).transpose()
        df.columns = ['uops']
        return df
    
    def report(self, prefix) -> None:
        mark = f" {prefix}-{self.input_meta}: Peak Effective FLOPS "
        print(f'{mark:=^80}')
        display(self.get_peak_flops_df())
        mark = f" {prefix}-{self.input_meta}: Uops Statistics "
        print(f'{mark:=^80}')
        display(self.get_uops_df())


# Aq81Wq41

In [30]:

class Aq81Wq41(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq81Wq41, self).__init__(m, n, k)

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = num_q_blocks_b
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation is not vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = self.m * self.n # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        num_unit = (self.m * self.n) * (self.k / self.unit_size)
        # to fuse scaling factor: s_a * s_w
        uops_fused_s_per_unit = 2
        # broadcast single fused scaling factor among ymm
        uops_broadcast_fused_s_per_unit = 2
        # perform 32 pairs of int8 multiplication and addition, resulting in 4 pair of int32 partial sum
        uops_int8_mul_sum_per_unit = 2
        # convert int32 partial sum to float
        uops_cvt_int32_to_float_per_unit = 2
        # There are 4 pairs of FMA operations, a feature of the Aq81Wq41 algorithm.
        uops_fma_per_unit = 4

        total_uops_fused_s = num_unit * uops_fused_s_per_unit
        total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit
        total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit
        total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit
        total_uops_fma = num_unit * uops_fma_per_unit

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        total_uops_p0_p1_p5 = (
            total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask']
        )
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']
        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = (self.m * self.k * 32) / self.q_blk_size #float
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = (self.n * self.k * 32) / self.q_blk_size # float
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [31]:
# Example usage:
aq = Aq81Wq41(m=1, n=10240, k=10240)
aq.calculate_peak_flops()
aq.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.773085
p0_p1 bound peak flops,118.690909
p0_p1_p5 bound peak flops,130.56
memory ports bound peak flops,391.145231
memory bandwidth bound peak flops,153.495072




Unnamed: 0,uops
uops_load_sa,320.0
uops_load_int8_a,320.0
uops_load_scaled_sum_a,320.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,3276800.0
uops_load_min_b,3276800.0
uops_load_int4_w,1638400.0
uops_shift,1638400.0
uops_mask,3276800.0


In [32]:
# Example usage:
aq = Aq81Wq41(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,324.341194
p0_p1 bound peak flops,130.547251
p0_p1_p5 bound peak flops,163.160166
memory ports bound peak flops,26738.688
memory bandwidth bound peak flops,19660.8




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,32768.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


# Aq80Wq40

In [33]:

class Aq80Wq40(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq80Wq40, self).__init__(m, n, k)

        self.total_uops_p0_p1 = 0
        self.total_uops_p0_p1_p5 = 0

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = 0
        uops_get_abs_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        self.uops_dict['uops_get_abs_a'] = uops_get_abs_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a + uops_get_abs_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = 0
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        uops_sub = num_q_blocks_b / 2 * 2
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_sub'] = uops_sub
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_sub
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation is not vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = self.m * self.n # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        num_unit = (self.m * self.n) * (self.k / self.unit_size)
        # to fuse scaling factor: s_a * s_w
        uops_fused_s_per_unit = 2
        # broadcast single fused scaling factor among ymm
        uops_broadcast_fused_s_per_unit = 2
        # perform 32 pairs of int8 multiplication and addition, resulting in 4 pair of int32 partial sum
        uops_int8_mul_sum_per_unit = 2
        # convert int32 partial sum to float
        uops_cvt_int32_to_float_per_unit = 2
        # There are 4 pairs of FMA operations, a feature of the Aq81Wq41 algorithm.
        uops_fma_per_unit = 2
        # transfer the sign of a to b. This is related to performing `mul_sum_i8_pairs_float`
        uops_sign_a_to_b_per_unit = 2

        total_uops_fused_s = num_unit * uops_fused_s_per_unit
        total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit
        total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit
        total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit
        total_uops_fma = num_unit * uops_fma_per_unit
        total_uops_sign_a_to_b = num_unit * uops_sign_a_to_b_per_unit

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma
        self.uops_dict['total_uops_sign_a_to_b'] = total_uops_sign_a_to_b

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma +
            total_uops_sign_a_to_b
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma'] +
            self.uops_dict['total_uops_sign_a_to_b'] + 
            self.uops_dict['uops_get_abs_a']
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        if (self.total_uops_p0_p1 == 0):
            raise ValueError("total_uops_p0_p1 should be called first")
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask'] +
            self.uops_dict['uops_sub']
        )
        self.total_uops_p0_p1_p5 = total_uops_p0_p1_p5
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']

        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = 0
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = 0
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [34]:
# Example usage:
aq = Aq80Wq40(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,195.172905
p0_p1 bound peak flops,118.669838
p0_p1_p5 bound peak flops,115.186766
memory ports bound peak flops,638.66293
memory bandwidth bound peak flops,182.855726




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [35]:
# Example usage:
aq = Aq80Wq40(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,324.341194
p0_p1 bound peak flops,130.521761
p0_p1_p5 bound peak flops,163.107084
memory ports bound peak flops,28245.092958
memory bandwidth bound peak flops,20515.617391




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32768.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


# Afp32Wfp32

In [36]:

class Afp32Wfp32(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Afp32Wfp32, self).__init__(m, n, k)

        self.total_uops_p0_p1 = 0
        self.total_uops_p0_p1_p5 = 0
        self.total_flops = self.m * self.n * self.k * 2

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = 0
        uops_load_sa = 0
        uops_load_int8_a = 0
        uops_load_scaled_sum_a = 0
        uops_get_abs_a = 0
        
        # a ymm register holds 8 fp32; 1 vmovss consumes 8 fp32
        uops_load_fp32_a = (self.m * self.k ) / 8
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        self.uops_dict['uops_get_abs_a'] = uops_get_abs_a
        self.uops_dict['uops_load_fp32_a'] = uops_load_fp32_a

        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a + uops_get_abs_a + uops_load_fp32_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = 0
        uops_load_sw = 0
        uops_load_min_b = 0
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = 0
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = 0
        uops_mask = 0
        uops_sub = 0
        uops_load_fp32_w = (self.k * self.n) / 8

        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_sub'] = uops_sub
        self.uops_dict['uops_load_fp32_w'] = uops_load_fp32_w
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_sub + uops_load_fp32_w
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation could be vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = (self.m * self.n) / 8 # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. There are only fma for arithemetic operation in the innermost loop.
        2. Only computations in the innermost loop are considered.
        """

        total_uops_fused_s = 0
        total_uops_broadcast_fused_s = 0
        total_uops_int8_mul_sum = 0
        total_uops_cvt_int32_to_float = 0
        total_uops_sign_a_to_b = 0
        
        # 1. compute the total flops needed
        # 2. how many flops does one fma uops could consume?
        flops_per_fma = (8 * 2) # 8 mul and 8 add
        total_uops_fma = self.total_flops / flops_per_fma 

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma
        self.uops_dict['total_uops_sign_a_to_b'] = total_uops_sign_a_to_b

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma +
            total_uops_sign_a_to_b
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma'] +
            self.uops_dict['total_uops_sign_a_to_b'] + 
            self.uops_dict['uops_get_abs_a']
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        if (self.total_uops_p0_p1 == 0):
            raise ValueError("total_uops_p0_p1 should be called first")
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask'] +
            self.uops_dict['uops_sub']
        )
        self.total_uops_p0_p1_p5 = total_uops_p0_p1_p5
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']

        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 32 # float
        sa_bit = 0
        scaled_sum_a_bit = 0
        w_bits = self.n * self.k * 32 # float
        sw_bit = 0
        min_b_bit = 0
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [37]:
# Example usage:
aq = Afp32Wfp32(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Afp32Wfp32") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,244.561171
p0_p1 bound peak flops,163.2
p0_p1_p5 bound peak flops,244.8
memory ports bound peak flops,250675.2
memory bandwidth bound peak flops,28.74386




Unnamed: 0,uops
uops_load_sa,0.0
uops_load_int8_a,0.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,0.0
uops_load_min_b,0.0
uops_load_int4_w,0.0
uops_shift,0.0
uops_mask,0.0


In [38]:
# Example usage:
aq = Afp32Wfp32(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Afp32Wfp32") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,488.169815
p0_p1 bound peak flops,163.2
p0_p1_p5 bound peak flops,244.8
memory ports bound peak flops,250675.2
memory bandwidth bound peak flops,9830.4




Unnamed: 0,uops
uops_load_sa,0.0
uops_load_int8_a,0.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,0.0
uops_load_min_b,0.0
uops_load_int4_w,0.0
uops_shift,0.0
uops_mask,0.0


# Aq81W141 (lamma.cpp)

In [39]:

class Aq81Wq41Llama(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq81Wq41Llama, self).__init__(m, n, k)

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = num_q_blocks_b
        """
        NOTE: This is where the disctinction of unpacking exist between Llama and AWQ.
        For Llama, each quantization block needs:
        1. **1** vmovdqu to load int4 weights
        2. **1** vpsrlw
        3. **1** extra vinserti128
        4. 1 mask  operation
        
        By contrast, for AWQ, each **two** quantization blocks needs:
        1. 1 vmovdqu to load int4 weights. (Why the same ? Since llama uses xmm while awq uses ymm)
        2. 1 vpsrlw
        3. 2 mask operation
        """
        uops_load_int4_w = num_q_blocks_b
        # uops_shift = num_q_blocks_b / 2 * 1
        uops_shift = num_q_blocks_b 
        uops_mask = num_q_blocks_b
        uops_insert = num_q_blocks_b

        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_insert'] = uops_insert
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_insert
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation is not vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = self.m * self.n # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        self.unit_size = 32 # process 1 quantization block
        num_unit = (self.m * self.n) * (self.k / self.unit_size)
        # to fuse scaling factor: s_a * s_w
        uops_fused_s_per_unit = 1
        # broadcast single fused scaling factor among ymm
        uops_broadcast_fused_s_per_unit = 1
        # perform 32 pairs of int8 multiplication and addition, resulting in 4 pair of int32 partial sum
        uops_int8_mul_sum_per_unit = 1
        # convert int32 partial sum to float
        uops_cvt_int32_to_float_per_unit = 1
        # There are 4 pairs of FMA operations, a feature of the Aq81Wq41 algorithm.
        uops_fma_per_unit = 2

        total_uops_fused_s = num_unit * uops_fused_s_per_unit
        total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit
        total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit
        total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit
        total_uops_fma = num_unit * uops_fma_per_unit

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        total_uops_p0_p1_p5 = (
            total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask'] +
            # NOTE: there are extra `insert` operations
            self.uops_dict['uops_insert']
        )
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']
        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = (self.m * self.k * 32) / self.q_blk_size #float
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = (self.n * self.k * 32) / self.q_blk_size # float
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [40]:
# Example usage:
aq = Aq81Wq41Llama(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq81Wq41Llama") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,324.236314
p0_p1 bound peak flops,130.534505
p0_p1_p5 bound peak flops,163.120351
memory ports bound peak flops,26386.863158
memory bandwidth bound peak flops,19660.8




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,32768.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,32768.0
uops_shift,32768.0
uops_mask,32768.0


In [41]:
# Example usage:
aq = Aq81Wq41Llama(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq81Wq41Llama") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,162.736477
p0_p1 bound peak flops,108.8
p0_p1_p5 bound peak flops,108.8
memory ports bound peak flops,322.723141
memory bandwidth bound peak flops,152.557129




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,32.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,32768.0
uops_shift,32768.0
uops_mask,32768.0


# Revisit the correcness of current theoretical analysis

In [42]:
class Aq80Wq40(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq80Wq40, self).__init__(m, n, k)

        self.total_uops_p0_p1 = 0
        self.total_uops_p0_p1_p5 = 0

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = 0
        uops_get_abs_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        self.uops_dict['uops_get_abs_a'] = uops_get_abs_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a + uops_get_abs_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = 0
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        uops_sub = num_q_blocks_b / 2 * 2
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_sub'] = uops_sub
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_sub
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Store operation is not vectorized.
        2. Ideally, there is no need to read from output memory.
        """
        uops_store_result = self.m * self.n # vmovss
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """

        sub_m = 16
        sub_n = 8
        q_block_size = 32
        num_computation_block = (self.m / sub_m) * (self.n / sub_n)
        num_innermost_loop = (self.k) / q_block_size
        num_unit = num_computation_block * num_innermost_loop
        # to fuse scaling factor: s_a * s_w
        uops_fused_s_per_unit = 4 * 4
        # broadcast single fused scaling factor among ymm
        # NOTE: not sure
        uops_broadcast_fused_s_per_unit = 0 #
        # perform 32 pairs of int8 multiplication and addition, resulting in 4 pair of int32 partial sum
        uops_int8_mul_sum_per_unit = 4 * 32
        # convert int32 partial sum to float
        uops_cvt_int32_to_float_per_unit = 4 * 4
        # There are 4 pairs of FMA operations, a feature of the Aq81Wq41 algorithm.
        uops_fma_per_unit = 4 * 4
        # transfer the sign of a to b. This is related to performing `mul_sum_i8_pairs_float`
        uops_sign_a_to_b_per_unit = 4 * 32

        total_uops_fused_s = num_unit * uops_fused_s_per_unit
        total_uops_broadcast_fused_s = num_unit * uops_broadcast_fused_s_per_unit
        total_uops_int8_mul_sum = num_unit * uops_int8_mul_sum_per_unit
        total_uops_cvt_int32_to_float = num_unit * uops_cvt_int32_to_float_per_unit
        total_uops_fma = num_unit * uops_fma_per_unit
        total_uops_sign_a_to_b = num_unit * uops_sign_a_to_b_per_unit

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma
        self.uops_dict['total_uops_sign_a_to_b'] = total_uops_sign_a_to_b

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma +
            total_uops_sign_a_to_b
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma'] +
            self.uops_dict['total_uops_sign_a_to_b'] + 
            self.uops_dict['uops_get_abs_a']
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops


        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        if (self.total_uops_p0_p1 == 0):
            raise ValueError("total_uops_p0_p1 should be called first")
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask'] +
            self.uops_dict['uops_sub']
        )
        self.total_uops_p0_p1_p5 = total_uops_p0_p1_p5
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']

        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = 0
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = 0
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits

    def calculate_peak_flops(self) -> None:
        self._compute_dispatch_width_based_peak_flops()
        self._compute_p0_p1_based_peak_flops()
        self._compute_p0_p1_p5_based_peak_flops()
        self._compute_memory_port_based_peak_flops()
        self._compute_memory_bandwidth_peak_flops()

In [43]:
# Example usage:
aq = Aq80Wq40(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,305.561725
p0_p1 bound peak flops,226.983769
p0_p1_p5 bound peak flops,200.82131
memory ports bound peak flops,638.66293
memory bandwidth bound peak flops,182.855726




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [44]:
# Example usage:
aq = Aq80Wq40(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,811.574909
p0_p1 bound peak flops,274.693733
p0_p1_p5 bound peak flops,411.702238
memory ports bound peak flops,28245.092958
memory bandwidth bound peak flops,20515.617391




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32768.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [45]:
m = 1024
n = 1024
k = 1024
sub_m = 16
sub_n = 8
q_block_size = 32
num_computation_block = (m / sub_m) * (n / sub_n)
fma_per_block = (k / q_block_size) * 16
total_fma = num_computation_block * fma_per_block
total_fma, total_fma * 8

(4194304.0, 33554432.0)

In [46]:
total_iops = 2 * m * n * k
iops_per_mulSumi8 = 8 * 8
total_mulSumi8 = total_iops / iops_per_mulSumi8
print(f"total_fma: {total_fma}, total_mulSumi8: {total_mulSumi8}")

total_fma: 4194304.0, total_mulSumi8: 33554432.0


In [47]:
from typing import Dict
# consider one pair of quantization block of size (1x32)
# perform int8 multiplication and addition
def get_necessary_ops_per_q_blk() -> Dict[str, int]:
    q_block_size = 32
    iops = 1 * 1 * q_block_size * 2
    # convert result to float
    cvt_ops = 1
    # compute fused scaling factor
    fused_s_ops = 1
    # multiply converted sum with fused scaling factor, add back to accumulator
    fma_ops = 1
    statistics = {
        "iops": iops,
        "cvt_ops": cvt_ops,
        "fused_s_ops": fused_s_ops,
        "fma_ops": fma_ops
    }
    return statistics

# the capability of instructions
def get_uops_capability() -> Dict[str, int]:
    # _mm256_dpbusd_avx_epi32
    iops_per_ins = 8 * (4 + 4)
    # _mm256_cvtepi32_ps
    cvt_ops_per_ins = 8
    # _mm256_mul_ps
    fused_s_ops_per_ins = 8
    # one _mm256_fmadd_ps deals with 8 pairs of float32 fma operations
    fma_ops_per_ins = 8
    statistics = {
        "iops_per_ins": iops_per_ins,
        "cvt_ops_per_ins": cvt_ops_per_ins,
        "fused_s_ops_per_ins": fused_s_ops_per_ins,
        "fma_ops_per_ins": fma_ops_per_ins
    }
    return statistics

# get instructions needed for one quantization block
def get_ins_per_q_blk() -> Dict[str, int]:
    ops_per_q_blk = get_necessary_ops_per_q_blk()
    ins_per_q_blk = {}
    ops_per_ins = get_uops_capability()
    for k, v in ops_per_q_blk.items():
        ins_per_q_blk[k] = v / ops_per_ins[k + "_per_ins"]
    ins_per_q_blk['total_ins'] = sum(ins_per_q_blk.values())
    return ins_per_q_blk

# get instructions needed for all quantization blocks
def get_ins_all(m: int, n: int, k: int) -> Dict[str, int]:
    ins_per_q_blk = get_ins_per_q_blk()
    num_q_blocks = (m * n * k) / 32
    ins_per_q_blk_all = {k: v * num_q_blocks for k, v in ins_per_q_blk.items()}
    return ins_per_q_blk_all

m = 1024
n = 1024
k = 1024
ins_total = get_ins_all(m, n, k)
print("Instructions needed for all quantization blocks:")
print(ins_total)

Instructions needed for all quantization blocks:
{'iops': 33554432.0, 'cvt_ops': 4194304.0, 'fused_s_ops': 4194304.0, 'fma_ops': 4194304.0, 'total_ins': 46137344.0}


## Aq80Wq40

In [48]:
from typing import Dict

class Aq80Wq40(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq80Wq40, self).__init__(m, n, k)

        self.total_uops_p0_p1 = 0
        self.total_uops_p0_p1_p5 = 0

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    # consider one pair of quantization block of size (1x32)
    def get_necessary_ops_per_q_blk(self) -> Dict[str, int]:
        q_block_size = 32
        """
        due to the nature of Aq80Wq40 and limit of _mm256_dpbusd_avx_epi32, 
        a should get absolute value and its sign should be transferred to b. 
        Getting absolute value of a could be shared among some q blocks, but sign migration is inevitable. 
        """
        sign_a_to_b = q_block_size
        
        """
        Subtraction is categorized into data preparation part, since it could be shared among multiple pairs, if we unroll the loop.
        """
        # subtraction = q_block_size
        
        # int8 multiplication and addition ops
        iops = q_block_size * 2
        # convert result to float
        cvt_ops = 1
        # compute fused scaling factor
        fused_s_ops = 1
        # multiply converted sum with fused scaling factor, add back to accumulator
        fma_ops = 1
        statistics = {
            "iops": iops,
            "cvt_ops": cvt_ops,
            "fused_s_ops": fused_s_ops,
            "fma_ops": fma_ops,
            "sign_a_to_b_ops": sign_a_to_b,
            # "subtraction": subtraction
        }
        return statistics

    # the capability of instructions
    def get_uops_capability(self) -> Dict[str, int]:
        # _mm256_dpbusd_avx_epi32
        iops_per_ins = 8 * (4 + 4)
        # _mm256_cvtepi32_ps
        cvt_ops_per_ins = 8
        # _mm256_mul_ps
        fused_s_ops_per_ins = 8
        # one _mm256_fmadd_ps deals with 8 pairs of float32 fma operations
        fma_ops_per_ins = 8
        # migrate sign of a to b: _mm256_sign_epi8
        sign_a_to_b_ops_per_ins = 32
        # _mm256_sub_epi8
        # sub_per_ins = 32
        
        statistics = {
            "iops_per_ins": iops_per_ins,
            "cvt_ops_per_ins": cvt_ops_per_ins,
            "fused_s_ops_per_ins": fused_s_ops_per_ins,
            "fma_ops_per_ins": fma_ops_per_ins,
            "sign_a_to_b_ops_per_ins": sign_a_to_b_ops_per_ins,
            # "subtraction_per_ins": sub_per_ins
        }
        return statistics

    # get instructions needed for one quantization block
    def get_ins_per_q_blk(self) -> Dict[str, int]:
        ops_per_q_blk = self.get_necessary_ops_per_q_blk()
        ops_per_ins = self.get_uops_capability()
        ins_per_q_blk = {}
        for k, v in ops_per_q_blk.items():
            ins_per_q_blk[k] = v / ops_per_ins[k + "_per_ins"]
        ins_per_q_blk['total_ins'] = sum(ins_per_q_blk.values())
        return ins_per_q_blk

    # get instructions needed for all quantization blocks
    def get_ins_all(self) -> Dict[str, int]:
        ins_per_q_blk = self.get_ins_per_q_blk()
        num_q_pairs = (self.m * self.n * self.k) / 32
        ins_per_q_blk_all = {k: v * num_q_pairs for k, v in ins_per_q_blk.items()}
        return ins_per_q_blk_all

    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = 0
        uops_get_abs_a = num_q_blocks_a
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        self.uops_dict['uops_get_abs_a'] = uops_get_abs_a
        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a + uops_get_abs_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = 0
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        uops_sub = num_q_blocks_b / 2 * 2
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_sub'] = uops_sub
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_sub
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Ideally, there is no need to read from output memory.
        """
        # _mm256_storeu_ps could store 8 float 
        uops_store_result = self.m * self.n / 8
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        uops_dict = self.get_ins_all()

        total_uops_fused_s = uops_dict['fused_s_ops']
        total_uops_int8_mul_sum = uops_dict['iops']
        total_uops_cvt_int32_to_float = uops_dict['cvt_ops']
        total_uops_fma = uops_dict['fma_ops']
        total_uops_sign_a_to_b = uops_dict['sign_a_to_b_ops']

        # ignore extra operations that are dependent on algorithm design as we are calculating the upper bound
        total_uops_broadcast_fused_s = 0

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma
        self.uops_dict['total_uops_sign_a_to_b'] = total_uops_sign_a_to_b

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma +
            total_uops_sign_a_to_b
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma'] +
            self.uops_dict['total_uops_sign_a_to_b'] + 
            self.uops_dict['uops_get_abs_a']
            
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops


    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        if (self.total_uops_p0_p1 == 0):
            raise ValueError("total_uops_p0_p1 should be called first")
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask'] +
            self.uops_dict['uops_sub']
        )
        self.total_uops_p0_p1_p5 = total_uops_p0_p1_p5
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']

        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = 0
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = 0
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits

    def calculate_peak_flops(self) -> None:
        self._compute_dispatch_width_based_peak_flops()
        self._compute_p0_p1_based_peak_flops()
        self._compute_p0_p1_p5_based_peak_flops()
        self._compute_memory_port_based_peak_flops()
        self._compute_memory_bandwidth_peak_flops()

In [49]:
# Example usage:
aq = Aq80Wq40(m=1024, n=1024, k=1024)

aq.calculate_peak_flops()
aq.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,820.876627
p0_p1 bound peak flops,274.693733
p0_p1_p5 bound peak flops,411.702238
memory ports bound peak flops,133693.44
memory bandwidth bound peak flops,20515.617391




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32768.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [50]:
# Example usage:
aq = Aq80Wq40(m=1, n=1024, k=1024)

aq.calculate_peak_flops()
aq.report("Aq80Wq40") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,306.870941
p0_p1 bound peak flops,226.983769
p0_p1_p5 bound peak flops,200.82131
memory ports bound peak flops,650.259922
memory bandwidth bound peak flops,182.855726




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,32.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,0.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


## Aq81Wq41

In [51]:
class Aq81Wq41(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Aq81Wq41, self).__init__(m, n, k)

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()

    # consider one pair of quantization block of size (1x32)
    def get_necessary_ops_per_q_blk(self) -> Dict[str, int]:
        q_block_size = 32
        # no need to migrate sign of a to b in Aq81Wq41
        sign_a_to_b = 0
        # int8 multiplication and addition ops
        iops = q_block_size * 2
        # convert int result to float
        cvt_ops = 1
        # compute fused scaling factor
        fused_s_ops = 1
        # 1st fma: multiply converted sum with fused scaling factor, add back to accumulator
        # 2nd fma: multiply min with scaled sum, add back to accumulator
        fma_ops = 2
        statistics = {
            "iops": iops,
            "cvt_ops": cvt_ops,
            "fused_s_ops": fused_s_ops,
            "fma_ops": fma_ops,
            "sign_a_to_b_ops": sign_a_to_b
        }
        return statistics

    # the capability of instructions
    def get_uops_capability(self) -> Dict[str, int]:
        # _mm256_dpbusd_avx_epi32
        iops_per_ins = 8 * (4 + 4)
        # _mm256_cvtepi32_ps
        cvt_ops_per_ins = 8
        # _mm256_mul_ps
        fused_s_ops_per_ins = 8
        # one _mm256_fmadd_ps deals with 8 pairs of float32 fma operations
        fma_ops_per_ins = 8
        # migrate sign of a to b: _mm256_sign_epi8
        sign_a_to_b_ops_per_ins = 32
        statistics = {
            "iops_per_ins": iops_per_ins,
            "cvt_ops_per_ins": cvt_ops_per_ins,
            "fused_s_ops_per_ins": fused_s_ops_per_ins,
            "fma_ops_per_ins": fma_ops_per_ins,
            "sign_a_to_b_ops_per_ins": sign_a_to_b_ops_per_ins
        }
        return statistics

    # get instructions needed for one quantization block
    def get_ins_per_q_blk(self) -> Dict[str, int]:
        ops_per_q_blk = self.get_necessary_ops_per_q_blk()
        ops_per_ins = self.get_uops_capability()
        ins_per_q_blk = {}
        for k, v in ops_per_q_blk.items():
            ins_per_q_blk[k] = v / ops_per_ins[k + "_per_ins"]
        ins_per_q_blk['total_ins'] = sum(ins_per_q_blk.values())
        return ins_per_q_blk

    # get instructions needed for all quantization blocks
    def get_ins_all(self) -> Dict[str, int]:
        ins_per_q_blk = self.get_ins_per_q_blk()
        num_q_pairs = (self.m * self.n * self.k) / 32
        ins_per_q_blk_all = {k: v * num_q_pairs for k, v in ins_per_q_blk.items()}
        return ins_per_q_blk_all
    
    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        num_q_blocks_a = self.m * self.k / self.q_blk_size
        uops_load_sa = num_q_blocks_a
        uops_load_int8_a = num_q_blocks_a
        uops_load_scaled_sum_a = num_q_blocks_a
        uops_get_abs_a = 0
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        self.uops_dict['uops_get_abs_a'] = uops_get_abs_a
        return uops_load_sa + uops_load_int8_a +uops_load_scaled_sum_a + uops_get_abs_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = self.n * self.k / self.q_blk_size
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = num_q_blocks_b
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        uops_sub = 0
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_sub'] = uops_sub
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_sub
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Ideally, there is no need to read from output memory.
        """
        # _mm256_storeu_ps could store 8 float 
        uops_store_result = self.m * self.n / 8
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        uops_dict = self.get_ins_all()

        total_uops_fused_s = uops_dict['fused_s_ops']
        total_uops_int8_mul_sum = uops_dict['iops']
        total_uops_cvt_int32_to_float = uops_dict['cvt_ops']
        total_uops_fma = uops_dict['fma_ops']
        total_uops_sign_a_to_b = uops_dict['sign_a_to_b_ops']
        total_uops_broadcast_fused_s = 0

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma
        self.uops_dict['total_uops_sign_a_to_b'] = total_uops_sign_a_to_b

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma +
            total_uops_sign_a_to_b
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        if (self.total_uops_p0_p1 == 0):
            raise ValueError("total_uops_p0_p1 should be called first")
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask']
        )
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']
        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 8 # int8 
        sa_bit = (self.m * self.k * 32) / self.q_blk_size # float
        scaled_sum_a_bit = (self.m * self.k * 32) / self.q_blk_size #float
        w_bits = self.n * self.k * 4 # int4
        sw_bit = (self.n * self.k * 32) / self.q_blk_size # float
        min_b_bit = (self.n * self.k * 32) / self.q_blk_size # float
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [52]:
# Example usage:
aq = Aq81Wq41(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,1296.316484
p0_p1 bound peak flops,435.058379
p0_p1_p5 bound peak flops,652.163122
memory ports bound peak flops,105547.452632
memory bandwidth bound peak flops,19660.8




Unnamed: 0,uops
uops_load_sa,32768.0
uops_load_int8_a,32768.0
uops_load_scaled_sum_a,32768.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


In [53]:
# Example usage:
aq = Aq81Wq41(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Aq81Wq41") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,355.630715
p0_p1 bound peak flops,326.4
p0_p1_p5 bound peak flops,326.4
memory ports bound peak flops,390.611921
memory bandwidth bound peak flops,152.557129




Unnamed: 0,uops
uops_load_sa,32.0
uops_load_int8_a,32.0
uops_load_scaled_sum_a,32.0
uops_get_abs_a,0.0
uops_load_f32_a,0.0
uops_load_sw,32768.0
uops_load_min_b,32768.0
uops_load_int4_w,16384.0
uops_shift,16384.0
uops_mask,32768.0


## Af32Wfp32

In [54]:
class Af32Wf32(BaseCPUModel):
    def __init__(self, m, n, k):
        super(Af32Wf32, self).__init__(m, n, k)

        self._calculate_necessary_data_preparation_uops()
        self._calculate_necessary_computation_uops()
    
    def _calculate_data_prep_a_uops(self):
        """
        Necessary data load uops for activation, assuming that they only need to be loaded once
        """
        uops_load_sa = 0
        uops_load_int8_a = 0
        uops_load_fp32_a = self.m * self.k / 8
        uops_load_scaled_sum_a = 0
        uops_get_abs_a = 0
        self.uops_dict['uops_load_sa'] = uops_load_sa
        self.uops_dict['uops_load_int8_a'] = uops_load_int8_a
        self.uops_dict['uops_load_scaled_sum_a'] = uops_load_scaled_sum_a
        self.uops_dict['uops_get_abs_a'] = uops_get_abs_a
        self.uops_dict['uops_load_f32_a'] = uops_load_fp32_a 

        return uops_load_sa + uops_load_int8_a + uops_load_scaled_sum_a + uops_get_abs_a + uops_load_fp32_a

    def _calculate_data_prep_w_uops(self):
        """
        Necessary data load uops for weight, assuming that they only need to be loaded once
        """
        num_q_blocks_b = 0
        uops_load_sw = num_q_blocks_b
        uops_load_min_b = num_q_blocks_b
        # 2 q_block could be loaded in one uops
        uops_load_int4_w = num_q_blocks_b / 2
        # These are operations to unpack a goup of packed int4 into two group of packed int8
        uops_shift = num_q_blocks_b / 2 * 1
        uops_mask = num_q_blocks_b / 2 * 2
        uops_sub = 0
        uops_load_fp32_w = self.k * self.n / 8 
        self.uops_dict['uops_load_sw'] = uops_load_sw
        self.uops_dict['uops_load_min_b'] = uops_load_min_b
        self.uops_dict['uops_load_int4_w'] = uops_load_int4_w
        self.uops_dict['uops_shift'] = uops_shift
        self.uops_dict['uops_mask'] = uops_mask
        self.uops_dict['uops_sub'] = uops_sub
        self.uops_dict['uops_load_fp32_w'] = uops_load_fp32_w
        return uops_load_sw + uops_load_min_b + uops_load_int4_w + uops_shift + uops_mask + uops_sub + uops_load_fp32_w
    
    def _calculate_data_store_o_uops(self):
        """
        Necessary data store uops to store results to output memory.
        NOTE:
        1. Ideally, there is no need to read from output memory.
        """
        # _mm256_storeu_ps could store 8 float 
        uops_store_result = self.m * self.n / 8
        self.uops_dict['uops_store_result'] = uops_store_result
        return uops_store_result


    def _calculate_necessary_data_preparation_uops(self) -> None:
        a_sum = self._calculate_data_prep_a_uops()
        w_sum = self._calculate_data_prep_w_uops()
        o_sum = self._calculate_data_store_o_uops()
        self.total_minimum_data_preparation_uops = a_sum + w_sum + o_sum

    def _calculate_necessary_computation_uops(self) -> None:
        """
        Indispensible computation uops.
        NOTE: 
        1. the operations to unpack int4 weights are involved in the `_calculate_data_prep_w_uops`.
        2. Only computations in the innermost loop are considered.
        """
        total_uops_fused_s = 0
        total_uops_int8_mul_sum = 0
        total_uops_cvt_int32_to_float = 0
        total_uops_sign_a_to_b = 0
        total_uops_broadcast_fused_s = 0
        total_flops = self.m * self.n * self.k * 2
        # one fma in ymm handles 16 flops
        total_uops_fma = total_flops / 16

        self.uops_dict['total_uops_fused_s'] = total_uops_fused_s
        self.uops_dict['total_uops_broadcast_fused_s'] = total_uops_broadcast_fused_s
        self.uops_dict['total_uops_int8_mul_sum'] = total_uops_int8_mul_sum
        self.uops_dict['total_uops_cvt_int32_to_float'] = total_uops_cvt_int32_to_float
        self.uops_dict['total_uops_fma'] = total_uops_fma
        self.uops_dict['total_uops_sign_a_to_b'] = total_uops_sign_a_to_b

        self.total_necessary_computation_uops = (
            total_uops_fused_s +
            total_uops_broadcast_fused_s +
            total_uops_int8_mul_sum +
            total_uops_cvt_int32_to_float +
            total_uops_fma +
            total_uops_sign_a_to_b
        )

    def _compute_dispatch_width_based_peak_flops(self):
        """
        This method estimates the effective peak FLOPS based on dispatch width, assuming the minimal number of total uops is performed and full dispatch width is achieved during the execution of program. This provides a lower bound on the number of clock cycles required to complete the matrix multiplication.
        """
        total_uops = self.total_minimum_data_preparation_uops + self.total_necessary_computation_uops
        total_clocks = total_uops / self.dispatch_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['dispatch_width_based_peak_flops'] = effective_flops

    def _compute_p0_p1_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        They are grouped together because they can be used interchangeably for arithmetic uops involved in this program.
        
        For uops that could be executed by p0, p1 and p5, we exclude them, because:
        $$
        min_clocks = (min_uops) / (max_uops_per_clock)
        effective_peak_flops = (total flops) / (min_clocks / frequency)
        $$
        
        We try to assign as less uops to p0 and p1 as possible to maximize the effective peak flops bound by them.
        """
        total_uops_p0_p1 = (
            self.uops_dict['uops_shift'] +
            self.uops_dict['total_uops_fused_s'] +
            self.uops_dict['total_uops_int8_mul_sum'] +
            self.uops_dict['total_uops_cvt_int32_to_float'] +
            self.uops_dict['total_uops_fma']
        )
        self.total_uops_p0_p1 = total_uops_p0_p1
        # each port issue 1 uops per cycle
        p0_p1_width = 2
        total_clocks = total_uops_p0_p1 / p0_p1_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1 bound peak flops'] = effective_flops

    def _compute_p0_p1_p5_based_peak_flops(self):
        """
        This method estimates the effective peak flops when program is bottlenecked by port 0 and 1.
        """
        if (self.total_uops_p0_p1 == 0):
            raise ValueError("total_uops_p0_p1 should be called first")
        total_uops_p0_p1_p5 = (
            self.total_uops_p0_p1 +
            self.uops_dict['total_uops_broadcast_fused_s'] +
            self.uops_dict['uops_mask']
        )
        p0_p1_p5_width = 3
        total_clocks = total_uops_p0_p1_p5 / p0_p1_p5_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['p0_p1_p5 bound peak flops'] = effective_flops

    def _compute_memory_port_based_peak_flops(self):
        """
        This method estimates the peak flops bound by the number of memory ports.
        """
        total_uops_p2_p3_p11 = (
            self.uops_dict['uops_load_sa'] +
            self.uops_dict['uops_load_scaled_sum_a'] +
            self.uops_dict['uops_load_int8_a'] +
            self.uops_dict['uops_load_sw'] +
            self.uops_dict['uops_load_min_b'] +
            self.uops_dict['uops_load_int4_w'] +
            self.uops_dict['uops_store_result']
        )
        p2_p3_p11_width = 3
        total_clocks = total_uops_p2_p3_p11 / p2_p3_p11_width
        total_time = total_clocks / self.frequency
        effective_flops = self.total_flops / total_time / 1e9
        self.peak_flops['memory ports bound peak flops'] = effective_flops

    def _compute_memory_bandwidth_peak_flops(self):
        a_bits = self.m * self.k * 32 # fp32
        sa_bit = 0
        scaled_sum_a_bit = 0
        w_bits = self.n * self.k * 32 # fp32
        sw_bit = 0
        min_b_bit = 0
        c_bits = 1 * self.m * self.n * 32 # float; write once, ideally
        total_bits = a_bits + sa_bit + scaled_sum_a_bit + w_bits + sw_bit + min_b_bit + c_bits
        data_transfer_time = total_bits / (self.memory_bandwidth * self.GB)
        flops_limits = self.total_flops / data_transfer_time / 1e9
        self.peak_flops['memory bandwidth bound peak flops'] = flops_limits


In [55]:
# Example usage:
aq = Af32Wf32(m=1024, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Af32Wf32") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,488.169815
p0_p1 bound peak flops,163.2
p0_p1_p5 bound peak flops,244.8
memory ports bound peak flops,250675.2
memory bandwidth bound peak flops,9830.4




Unnamed: 0,uops
uops_load_sa,0.0
uops_load_int8_a,0.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,0.0
uops_load_f32_a,131072.0
uops_load_sw,0.0
uops_load_min_b,0.0
uops_load_int4_w,0.0
uops_shift,0.0
uops_mask,0.0


In [56]:
# Example usage:
aq = Af32Wf32(m=1, n=1024, k=1024)
aq.calculate_peak_flops()
aq.report("Af32Wf32") 



Unnamed: 0,peak_flops
dispatch_width_based_peak_flops,244.561171
p0_p1 bound peak flops,163.2
p0_p1_p5 bound peak flops,244.8
memory ports bound peak flops,250675.2
memory bandwidth bound peak flops,28.74386




Unnamed: 0,uops
uops_load_sa,0.0
uops_load_int8_a,0.0
uops_load_scaled_sum_a,0.0
uops_get_abs_a,0.0
uops_load_f32_a,128.0
uops_load_sw,0.0
uops_load_min_b,0.0
uops_load_int4_w,0.0
uops_shift,0.0
uops_mask,0.0
