<a href="https://colab.research.google.com/github/jinsunghub/HPC-System-Optimization/blob/main/Matrix_Multiplication_AVX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!ls

hello	   matmul      matmul_avx.cpp  matmul_opt      sample_data
hello.cpp  matmul_avx  matmul.cpp      matmul_opt.cpp


In [None]:
!pwd

/content


In [None]:
!lscpu

Architecture:                x86_64
  CPU op-mode(s):            32-bit, 64-bit
  Address sizes:             48 bits physical, 48 bits virtual
  Byte Order:                Little Endian
CPU(s):                      2
  On-line CPU(s) list:       0,1
Vendor ID:                   AuthenticAMD
  Model name:                AMD EPYC 7B12
    CPU family:              23
    Model:                   49
    Thread(s) per core:      2
    Core(s) per socket:      1
    Socket(s):               1
    Stepping:                0
    BogoMIPS:                4499.99
    Flags:                   fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pg
                             e mca cmov pat pse36 clflush mmx fxsr sse sse2 ht s
                             yscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constan
                             t_tsc rep_good nopl nonstop_tsc cpuid extd_apicid t
                             sc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 s
                             se4_2 

In [None]:
%%writefile hello.cpp
#include <iostream>
using namespace std;
// Minimal program: writes one greeting line to stdout and exits with status 0.
int main() {
    std::cout << "Hello, Colab!" << std::endl;
    return 0;
}

Overwriting hello.cpp


In [None]:
ls

[0m[01;32mhello[0m*     [01;32mmatmul[0m*      matmul_avx.cpp  [01;32mmatmul_opt[0m*     [01;34msample_data[0m/
hello.cpp  [01;32mmatmul_avx[0m*  matmul.cpp      matmul_opt.cpp


In [None]:
!g++ -O2 -std=c++11 hello.cpp -o hello

In [None]:
!./hello

Hello, Colab!


In [None]:
%%writefile matmul.cpp
#include <iostream>
#include <chrono>
#include <cstdlib>
#include <ctime>
using namespace std;
// Straightforward triple-loop matrix product C = A * B.
//
// A, B and C are n x n matrices stored row-major in flat arrays of n*n floats.
// Every element of C is overwritten with the dot product of one row of A and
// one column of B; the innermost loop therefore reads B with a stride of n.
void matmul(float* A, float* B, float* C, int n) {
    for (int row = 0; row < n; ++row) {
        const int row_base = row * n;  // offset of the current row in A and C
        for (int col = 0; col < n; ++col) {
            float dot = 0;
            int k = 0;
            while (k < n) {
                dot += A[row_base + k] * B[k * n + col];
                ++k;
            }
            C[row_base + col] = dot;
        }
    }
}
// Fills the first n*m entries of A.
//   zero == true  : every entry is set to 0 (rand() is not called).
//   zero == false : every entry is set to rand() / RAND_MAX, one rand() call
//                   per entry, in index order.
void init(float* A, int n, int m, bool zero) {
    const int count = n * m;
    if (zero) {
        for (int idx = 0; idx < count; ++idx) {
            A[idx] = 0;
        }
        return;
    }
    for (int idx = 0; idx < count; ++idx) {
        A[idx] = rand() / float(RAND_MAX);
    }
}
// Benchmark driver for matmul(): allocates three 1024 x 1024 float matrices,
// fills A and B with random values and C with zeros, times a single matmul()
// call, then prints the elapsed seconds and C[0][0].
int main() {
    const int n = 1024;            // matrix dimension
    const int elems = n * n;       // floats per matrix

    float* A = new float[elems];
    float* B = new float[elems];
    float* C = new float[elems];

    init(A, n, n, false);          // random inputs
    init(B, n, n, false);
    init(C, n, n, true);           // zeroed output

    const auto t_begin = std::chrono::high_resolution_clock::now();
    matmul(A, B, C, n);
    const auto t_end = std::chrono::high_resolution_clock::now();

    const std::chrono::duration<double> elapsed = t_end - t_begin;
    std::cout << "Execution time: " << elapsed.count() << " s" << std::endl;
    std::cout << "C[0][0]: " << C[0] << std::endl;

    delete[] A;
    delete[] B;
    delete[] C;
}

Overwriting matmul.cpp


In [None]:
!g++ -O2 -std=c++11 matmul.cpp -o matmul

In [None]:
!./matmul

Execution time: 5.09369 s
C[0][0]: 264.159


In [None]:
%%writefile matmul_opt.cpp
#include <iostream>
#include <chrono>
#include <cstdlib>
#include <ctime>
using namespace std;

// Computes C = A * B for square n x n row-major matrices.
//
// B is first copied into a transposed scratch buffer so that the inner
// dot-product loop reads both operands at consecutive addresses.
//
//   A, B : input matrices, n*n floats each (only read)
//   C    : output matrix, n*n floats (every element overwritten)
//   n    : matrix dimension; n <= 0 returns immediately without touching C
//
// The scratch buffer is released before returning.
void matmul_optimized(float* A, float* B, float* C, int n) {
    if (n <= 0) {
        return;
    }

    // Index arithmetic is done in size_t so n * n is not computed in int.
    const std::size_t dim = static_cast<std::size_t>(n);
    float* Bt = new float[dim * dim];

    // Bt[j][k] = B[k][j]
    for (std::size_t k = 0; k < dim; k++) {
        for (std::size_t j = 0; j < dim; j++) {
            Bt[j * dim + k] = B[k * dim + j];
        }
    }

    for (std::size_t i = 0; i < dim; i++) {
        const float* a_row = A + i * dim;
        for (std::size_t j = 0; j < dim; j++) {
            const float* bt_row = Bt + j * dim;
            float accum = 0;
            for (std::size_t k = 0; k < dim; k++) {
                accum += a_row[k] * bt_row[k];
            }
            C[i * dim + j] = accum;
        }
    }

    delete[] Bt;  // allocated with new[] above, so it must be freed with delete[]
}

// Writes n*m floats into A: zeros when `zero` is set, otherwise one
// rand() / RAND_MAX value per element, generated in index order.
void init(float* A, int n, int m, bool zero) {
    const int total = n * m;
    int pos = 0;
    while (pos < total) {
        float value = 0;
        if (!zero) {
            value = rand() / (float)RAND_MAX;
        }
        A[pos] = value;
        ++pos;
    }
}

// Benchmark driver for matmul_optimized(): sets up 1024 x 1024 matrices
// (A and B random, C zeroed), times one multiplication and reports the
// elapsed seconds together with C[0][0].
int main() {
    const int n = 1024;
    const int cell_count = n * n;

    float* A = new float[cell_count];
    float* B = new float[cell_count];
    float* C = new float[cell_count];

    init(A, n, n, false);
    init(B, n, n, false);
    init(C, n, n, true);

    const auto tic = std::chrono::high_resolution_clock::now();
    matmul_optimized(A, B, C, n);
    const auto toc = std::chrono::high_resolution_clock::now();

    const std::chrono::duration<double> seconds = toc - tic;
    std::cout << "Execution time: " << seconds.count() << " s" << std::endl;
    std::cout << "C[0][0]: " << C[0] << std::endl;

    delete[] A;
    delete[] B;
    delete[] C;
}

Overwriting matmul_opt.cpp


In [None]:
!g++ -O2 -std=c++11 matmul_opt.cpp -o matmul_opt

In [None]:
!./matmul_opt

Execution time: 0.993136 s
C[0][0]: 264.159


In [None]:
%%writefile matmul_avx.cpp
#include <iostream>
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <immintrin.h>

using namespace std;

float hsum_avx(__m256 X) {
    __m128 lo = _mm256_castps256_ps128(X); // 하위 128비트
    __m128 hi = _mm256_extractf128_ps(X, 1); //  상위 128비트
    __m128 sum4 = _mm_add_ps(lo, hi);
    sum4 = _mm_hadd_ps(sum4, sum4);
    sum4 = _mm_hadd_ps(sum4, sum4);
    return _mm_cvtss_f32(sum4);
}

// Computes C = A * B for square n x n row-major matrices using 8-wide AVX
// fused multiply-add.
//
// B is copied into a transposed scratch buffer so each dot product reads two
// contiguous rows. Each row pair is processed 8 floats at a time with
// unaligned loads; when n is not a multiple of 8 the remaining (n % 8)
// elements are accumulated with a scalar loop, so no load reads past the end
// of a row.
//
//   A, B : input matrices, n*n floats each (only read)
//   C    : output matrix, n*n floats (every element overwritten)
//   n    : matrix dimension; n <= 0 returns immediately without touching C
//
// The scratch buffer is released before returning.
void matmul_avx(float* A, float* B, float* C, int n) {
    if (n <= 0) {
        return;
    }

    // Index arithmetic is done in size_t so n * n is not computed in int.
    const std::size_t dim = static_cast<std::size_t>(n);
    float* Bt = new float[dim * dim];

    // Bt[j][k] = B[k][j]
    for (std::size_t k = 0; k < dim; k++) {
        for (std::size_t j = 0; j < dim; j++) {
            Bt[j * dim + k] = B[k * dim + j];
        }
    }

    // Largest multiple of 8 that fits in a row; the vector loop stops here.
    const int vec_end = n - (n % 8);

    for (std::size_t i = 0; i < dim; i++) {
        const float* a_row = A + i * dim;
        for (std::size_t j = 0; j < dim; j++) {
            const float* bt_row = Bt + j * dim;

            __m256 X = _mm256_setzero_ps();
            for (int k = 0; k < vec_end; k += 8) {
                const __m256 AV = _mm256_loadu_ps(a_row + k);
                const __m256 BV = _mm256_loadu_ps(bt_row + k);
                X = _mm256_fmadd_ps(AV, BV, X);  // X += AV * BV, lane-wise
            }

            float dot = hsum_avx(X);
            // Scalar tail for the last n % 8 elements (empty when n % 8 == 0).
            for (int k = vec_end; k < n; k++) {
                dot += a_row[k] * bt_row[k];
            }
            C[i * dim + j] = dot;
        }
    }

    delete[] Bt;  // allocated with new[] above, so it must be freed with delete[]
}

// Initializes the first n*m floats of M. With `zero` set every element
// becomes 0; otherwise each element receives rand() / RAND_MAX, drawing one
// rand() value per element in index order.
void init(float* M, int n, int m, bool zero) {
    const int cells = n * m;
    for (int idx = 0; idx < cells; ++idx) {
        if (zero) {
            M[idx] = 0.0f;
            continue;
        }
        M[idx] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
    }
}

// Benchmark driver for matmul_avx(): prepares 1024 x 1024 matrices (A and B
// random, C zeroed), times one multiplication, prints the elapsed seconds and
// C[0][0], then frees the matrices.
int main() {
    const int n = 1024;
    const int matrix_len = n * n;

    float* A = new float[matrix_len];
    float* B = new float[matrix_len];
    float* C = new float[matrix_len];

    init(A, n, n, false);
    init(B, n, n, false);
    init(C, n, n, true);

    const auto started = std::chrono::high_resolution_clock::now();
    matmul_avx(A, B, C, n);
    const auto finished = std::chrono::high_resolution_clock::now();

    const std::chrono::duration<double> took = finished - started;
    std::cout << "Execution time: " << took.count() << " s" << std::endl;
    std::cout << "C[0][0]: " << C[0] << std::endl;

    delete[] A;
    delete[] B;
    delete[] C;

    return 0;
}

Overwriting matmul_avx.cpp


In [None]:
!g++ -O2 -std=c++11 -mavx2 -mfma -march=native matmul_avx.cpp -o matmul_avx

In [None]:
!./matmul_avx

Execution time: 0.214204 s
C[0][0]: 264.159


In [None]:
print("Hello World")

Hello World


This is a text cell.