한국전산유체공학회 제 14차 CFD 단기강좌 (2024.05.29.-2024.05.30.)

파이선 병렬프로그래밍: 수치 해석 예제 실습
===================================================


### 한국과학기술정보연구원 강지훈

***

### 필요 패키지

  - mpi4py
  - numpy
  - random
  - scikit-learn
  - matplotlib

***


# 1. 벡터와 행렬 연산

## 1.1. 행렬/벡터 만들기

In [84]:
import numpy as np

np.set_printoptions(linewidth=np.inf)

n = 10

A = np.random.rand(n, n)
B = np.random.rand(n, n)
v = np.random.rand(n)
w = np.random.rand(n)

np.save("A", A)
np.save("B", B)
np.save("v", v)
np.save("w", w)



## 1.2. 벡터 내적

1. 순차코드
   
   <img src = "images/image01.png">

2. 병렬코드 - 등분할

  <img src = "images/image02.png">

In [85]:
%%writefile v.py
import numpy as np
from mpi4py import MPI

np.set_printoptions(linewidth=np.inf,precision=3)

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

if rank == 0:
    v = np.load("v.npy")
    w = np.load("w.npy")
    n = v.size
else :
    v = None
    w = None
    n = 0

n = comm.bcast(n, root = 0)


##### n_row 크기 정하기 
n_row = int(n / size)# FIX ME

v_row = np.empty(n_row, dtype = np.float64)
w_row = np.empty(n_row, dtype = np.float64)

##### Scatter 함수 호출
comm.Scatter(v, v_row, root = 0) # FIX ME
comm.Scatter(w, w_row, root = 0) # FIX ME

##### 프로세스별 Local sum 
s = np.dot(v_row,w_row) # FIX ME

##### reduce를 이용한 Global sum
s_all = comm.allreduce(s, MPI.SUM) # FIX ME

#if rank == 1:
print(rank, s_all)


Writing v.py


In [86]:
! mpirun -np 2 python v.py

1 2.752616003351552
0 2.752616003351552


3. 병렬코드 - 비등분할

    <img src = "images/image03.png">

In [87]:
%%writefile v_var.py
import numpy as np
from mpi4py import MPI

def para_range(n, size, rank) :
    iwork = divmod(n, size) 
    ista = rank * iwork[0] + min(rank, iwork[1])
    iend = ista + iwork[0] - 1
    if iwork[1] > rank :
        iend = iend + 1
    return ista, iend

comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()

if rank == 0:
    v = np.load("v.npy")
    w = np.load("w.npy")
    n = v.size
else :
    v = None
    w = None
    n = 0

n = comm.bcast(n, root = 0)

##### 프로세스별 범위 할당
ista, iend = para_range(n, size, rank) # FIX ME
n_row =  (iend - ista + 1) # FIX ME

n_rows = comm.gather(n_row, root = 0)

v_row = np.empty(n_row, dtype = np.float64)
w_row = np.empty(n_row, dtype = np.float64)

##### Scatter
comm.Scatterv((v, n_rows), v_row, root = 0) #FIX ME
comm.Scatterv((w, n_rows), w_row, root = 0) #FIX ME

s = np.dot(v_row,w_row)

##### reduce를 이용한 Global sum
s_all = comm.reduce(s, MPI.SUM, root = 0) #FIX ME

if rank == 0:
    print(n_rows)
    print(s_all)


Writing v_var.py


In [88]:
! mpirun -np 3 python v_var.py

[4, 3, 3]
2.752616003351552


4. para_range 저장

In [89]:
%%writefile tools.py

def para_range(n, size, rank) :
    iwork = divmod(n, size) 
    ista = rank * iwork[0] + min(rank, iwork[1])
    iend = ista + iwork[0] - 1
    if iwork[1] > rank :
        iend = iend + 1
    return ista, iend


Writing tools.py


## 1.3. 행렬-벡터곱

1. 순차코드
   
    <img src = "images/image04.png">

In [90]:
A = np.load("A.npy")
v = np.load("v.npy")

b = np.matmul(A,v)
print (b)


[3.163 2.114 2.295 2.789 2.391 3.18  2.149 1.994 2.602 2.257]


2. 행렬의 행 등분할

    <img src = "images/image05.png">

In [91]:
%%writefile Av.py

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD

rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0 :
    A = np.load("A.npy")
    v = np.load("v.npy")
    n = v.size
    n = comm.bcast(n, root = 0)
else :
    A = None
    n = 0
    n = comm.bcast(n, root = 0)
    v = np.empty(n, dtype = np.float64)

n_row = int(n / size)

A_row = np.empty((n_row, n), dtype = np.float64)

##### 행렬의 행 분할
comm.Scatter(A, A_row, root = 0) #FIX ME

comm.Bcast(v, root = 0)

##### 분할된 행렬과의 연산
b = np.matmul(A_row,v) #FIX ME

print(b, rank)

Writing Av.py


In [92]:
! mpirun -np 2 python Av.py

[3.16303659 2.11393077 2.29522827 2.78901388 2.39100735] 0
[3.18024415 2.14935869 1.99365074 2.60156784 2.2574437 ] 1


3. 행렬의 행 비등분할

    <img src = "images/image06.png">

In [93]:
%%writefile Avar.py

# Matrix A의 Row decomposition

import numpy as np
from mpi4py import MPI
from tools import para_range

comm = MPI.COMM_WORLD

rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0 :
    A = np.load("A.npy")
    v = np.load("v.npy")
    n = v.size
    n = comm.bcast(n, root = 0)

else :
    A = None
    n = 0
    n = comm.bcast(n, root = 0)
    v = np.empty(n, dtype = np.float64)

ista, iend = para_range(n, size, rank)

n_row = (iend - ista + 1)

A_row = np.empty((n_row, n), dtype = np.float64)

##### 행렬의 행 분할 및 각 프로세스별 크기 지정
n_rows = comm.gather(n_row * n, root = 0) #FIX ME

comm.Scatterv((A, n_rows), A_row, root = 0) #FIX ME
comm.Bcast(v, root = 0)

b = np.matmul(A_row,v)

print(b, rank)

Writing Avar.py


In [94]:
! mpirun -np 3 python Avar.py

[2.39100735 3.18024415 2.14935869] 1
[3.16303659 2.11393077 2.29522827 2.78901388] 0
[1.99365074 2.60156784 2.2574437 ] 2


4. 행렬/벡터의 행 비등분할

    <img src = "images/image07.png">

In [95]:
%%writefile Av_var.py

# Matrix A의 Row decomposition

from tools import para_range
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD

rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0 :
    A = np.load("A.npy")
    v = np.load("v.npy")
    n = v.size
    n = comm.bcast(n, root = 0)

else :
    A = None
    v = None
    n = 0
    n = comm.bcast(n, root = 0)

ista, iend = para_range(n, size, rank)

n_row = (iend - ista + 1)

A_row = np.empty((n_row, n), dtype = np.float64)
v_row = np.empty(n_row, dtype = np.float64)

##### 행렬의 행 분할 및 각 프로세스별 크기 지정
n_chunks = comm.gather(n_row * n, root = 0) #FIX ME
n_rows = comm.allgather(n_row) #FIX ME

comm.Scatterv([A, n_chunks], A_row, root = 0) #FIX ME
comm.Scatterv([v, n_rows], v_row, root = 0) #FIX ME

##### 분할된 벡터 곱 범위 지정
vsta_list = []
vend_list = []

for i in range(size) :
    vsta_list.append(sum(n_rows[:i])) #FIX ME
    vend_list.append(sum(n_rows[:i])+n_rows[i]) #FIX ME

##### Local MV (최초 자신의 벡터부분)
b = np.matmul(A_row[:,vsta_list[rank]:vend_list[rank]], v_row) #FIX ME

##### 송수신 프로세스 지정
inext = rank + 1 if rank < size - 1 else 0 #FIX ME
iprev = rank - 1 if rank > 0 else size - 1 #FIX ME

for i in range(size - 1) :
    iloc = iprev - i if iprev >= i else iprev - i + size
    v_recv = np.empty(n_rows[iloc], dtype = np.float64)
    ##### 통신
    comm.Sendrecv(v_row, inext, 1, v_recv, iprev, 1) #FIX ME
    v_row = np.copy(v_recv)
    b += np.matmul(A_row[:,vsta_list[iloc]:vend_list[iloc]], v_row)

print(b, rank)

Writing Av_var.py


In [96]:
! mpirun -np 3 python Av_var.py

[2.39100735 3.18024415 2.14935869] 1
[1.99365074 2.60156784 2.2574437 ] 2
[3.16303659 2.11393077 2.29522827 2.78901388] 0


## 3. 행렬-행렬 곱

1. 순차코드

    <img src = "images/image08.png">

In [97]:
A = np.load("A.npy")
B = np.load("B.npy")

C = np.matmul(A, B)
print (C)


[[4.606 2.986 2.47  3.747 3.817 3.884 3.262 3.314 2.911 3.112]
 [2.906 1.931 1.928 2.505 2.193 2.146 2.294 2.449 2.152 1.8  ]
 [2.621 2.255 1.576 2.159 2.662 2.03  2.188 1.734 2.15  1.372]
 [3.758 2.632 2.56  3.372 2.765 2.783 3.183 3.1   2.906 2.747]
 [2.715 2.4   1.902 2.623 2.501 2.035 2.772 2.462 2.119 1.819]
 [3.714 3.2   2.32  3.101 3.399 3.148 3.44  2.985 2.812 2.863]
 [2.82  1.923 1.559 1.984 2.331 2.551 2.012 1.937 2.102 2.155]
 [3.037 1.728 1.935 2.514 2.065 2.22  2.155 2.316 2.431 2.155]
 [3.42  2.499 2.143 2.849 2.727 2.739 2.8   2.723 2.766 2.638]
 [3.076 2.95  1.674 2.4   2.972 2.097 2.506 2.238 2.422 1.881]]


2. 행렬A의 행분할

    <img src = "images/image09.png">

In [81]:
%%writefile AB.py

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD

rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0 :
    A = np.load("A.npy")
    B = np.load("B.npy")
    n = A[0].size
    n = comm.bcast(n, root = 0)

else :
    n = 0
    n = comm.bcast(n, root = 0)
    A = None
    B = np.empty((n, n), dtype = np.float64)

n_row = int(n / size)

A_row = np.empty((n_row, n), dtype = np.float64)

##### 행렬 A의 분할
comm.Scatter(A, A_row, root = 0) #FIX ME
comm.Bcast(B, root = 0)

C = np.matmul(A_row,B)

print(C, rank)

Overwriting AB.py


In [82]:
! mpirun -np 2 python AB.py

[[2.69916159 2.47204502 3.27975552 2.56940371 3.19767608 1.70176397
  2.61989855 1.59647467 1.97800051 1.74942573]
 [2.09867137 2.84023611 3.39471367 3.1088171  3.07289424 2.40916921
  2.5614235  1.50640979 2.64935056 2.24321662]
 [2.08583177 2.63874904 3.05078496 2.76644929 3.13745238 1.75830709
  2.50632991 1.4356154  2.4349085  1.75615955]
 [2.25681734 1.99273886 2.96551011 2.2719347  2.68383425 1.60809973
  2.26971785 1.31398728 1.82431383 1.33905337]
 [1.70690039 1.42280778 1.58714457 1.585664   1.38737796 1.30630367
  1.39203559 0.82911637 1.56560635 0.8729322 ]] 1
[[1.76896579 1.95673003 2.91428565 2.40766831 2.79088505 1.46788058
  2.07759027 1.51737634 2.08489361 1.29823131]
 [1.70970921 2.04227922 2.44610518 2.95880146 2.4742027  2.01586441
  1.65587624 1.46345338 2.4303007  1.45882619]
 [2.5496413  3.24900312 3.5233571  3.37971823 3.45834488 2.52138544
  2.80416212 1.7998071  3.1094257  2.15363409]
 [1.68613117 1.70284265 2.23353992 2.12886037 2.11851263 1.35107712
  1.68462

3. 행렬A의 행분할 (비등분할)

    <img src = "images/image10.png">

In [83]:
%%writefile AvarB.py

# Matrix A의 Row decomposition

import numpy as np
from mpi4py import MPI
from tools import para_range

comm = MPI.COMM_WORLD

rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0 :
    A = np.load("A.npy")
    B = np.load("B.npy")
    n = A[0].size
    n = comm.bcast(n, root = 0)
else :
    n = 0
    n = comm.bcast(n, root = 0)
    A = None
    B = np.empty((n, n), dtype = np.float64)

##### 행렬 A의 분할 범위
ista, iend = para_range(n, size, rank) #FIX ME

n_row = (iend - ista + 1) #FIX ME

A_row = np.empty((n_row, n), dtype = np.float64)
n_list = comm.gather(n_row * n, root = 0)

##### 행렬 A의 분할
comm.Scatterv([A, n_list], A_row, root = 0) #FIX ME
comm.Bcast(B, root = 0)

C_rows = np.matmul(A_row,B)

print(C_rows, rank)

Overwriting AvarB.py


In [18]:
! mpirun -np 3 python AvarB.py

[[1.44037115 3.31827533 2.92517102 1.86479362 2.53008495 1.53059247
  3.34352489 2.98392088 2.37076482 3.23990657]
 [1.66554917 3.06949679 2.84532682 1.53823713 2.30325062 1.47269568
  2.87928876 3.19997697 2.27305808 2.64284028]
 [1.85688976 2.63312505 2.34679526 1.19698907 2.3514161  1.39533282
  2.78151044 2.63373584 2.43650295 3.09731581]] 2
[[2.03449936 3.04199541 2.42564749 1.04003725 2.14287679 1.96038686
  2.46942191 2.76228889 2.91445576 3.29665576]
 [1.53074496 2.4508694  2.96165655 1.73039022 1.92381604 1.80916168
  2.9346203  2.83875436 2.99581836 3.1974691 ]
 [1.08387847 1.65746033 1.79090881 0.69602538 0.89073309 0.98233926
  1.18544942 1.61258337 0.91170825 1.55920602]] 1
[[1.34300868 2.31298613 2.72341006 1.28282659 1.51185798 1.96444364
  2.56129285 2.47346425 2.53125842 3.06850103]
 [1.3827853  2.55306804 2.13075155 1.29772738 1.87852316 1.22732442
  2.16746477 2.33213402 2.18971814 1.96498295]
 [2.02768215 4.27635844 3.69756696 2.32621091 2.94124852 2.40003098
  4.11

4. 행렬A의 행분할, 행렬 B의 열분할

    <img src = "images/image11.png">

In [100]:
%%writefile ABvar.py

import numpy as np
from mpi4py import MPI
from tools import para_range

comm = MPI.COMM_WORLD

rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0 :
    A = np.load("A.npy")
    B = np.load("B.npy")
    BT = np.transpose(B).copy()
    n = A[0].size
    n = comm.bcast(n, root = 0)

else :
    n = 0
    n = comm.bcast(n, root = 0)
    A = None
    BT = None

ista, iend = para_range(n, size, rank)

n_row = (iend - ista + 1)

A_row = np.empty((n_row, n), dtype = np.float64)
BT_row = np.empty((n_row, n), dtype = np.float64)
n_rows = comm.allgather(n_row * n)

##### 행렬 A, B의 분할
comm.Scatterv([A, n_rows], A_row, root = 0) #FIX ME
comm.Scatterv([BT, n_rows], BT_row, root = 0) #FIX ME
B_col = np.transpose(BT_row)

inext = rank + 1 if rank < size - 1 else 0
iprev = rank - 1 if rank > 0 else size - 1

C_unordered_rows = np.matmul(A_row, B_col)

for i in range(size - 1) :
    iloc = iprev - i if iprev >= i else iprev - i + size
    B_recv = np.empty(n_rows[iloc], dtype = np.float64)

    ##### 분할된 행렬 B를 송수신하고 A의 분할된 부분과 곱하여 C에 저장
    comm.Sendrecv(B_col, inext, 1, B_recv, iprev, 1) #FIX ME
    B_col = np.copy(B_recv)
    B_col = np.reshape(B_col, (n, int(n_rows[iloc]/n))) #FIX ME
    C_block = np.matmul(A_row, B_col) #FIX ME
    C_unordered_rows = np.append(C_unordered_rows, C_block, axis = 1) #FIX ME

print(C_unordered_rows, rank)

Overwriting ABvar.py


In [99]:
! mpirun -np 3 python ABvar.py

[[2.316 2.431 2.155 1.85  2.716 1.819 2.878 2.191 2.16  2.21 ]
 [2.723 2.766 2.638 2.441 3.442 2.363 3.332 2.363 2.479 2.712]
 [2.238 2.422 1.881 2.099 3.228 2.37  2.824 2.516 2.665 2.257]] 2
[[2.501 2.035 2.772 2.931 2.322 2.309 2.256 2.091 1.802 2.404]
 [3.399 3.148 3.44  3.403 2.677 3.03  2.954 2.49  2.385 3.387]
 [2.331 2.551 2.012 2.337 1.644 2.039 2.136 1.717 1.542 2.65 ]] 1
[[4.606 2.986 2.47  3.747 3.017 2.759 3.892 3.344 4.666 3.247]
 [2.906 1.931 1.928 2.505 2.107 1.742 2.535 1.867 2.861 1.961]
 [2.621 2.255 1.576 2.159 2.299 1.762 2.167 2.136 2.876 2.298]
 [3.758 2.632 2.56  3.372 2.627 2.536 3.404 2.094 3.578 2.199]] 0


5. 직접 해보기
    - 올바른 크기의 C를 미리 선언하고 적절한 위치에 C_block 을 배치
    - 3의 C_rows와 같은 결과를 얻음
