# Loop tiling

---
**Requirements:**

- [Get started](./Get_started.ipynb)
- [Data Management](./Data_management.ipynb)
---

Nested loops often reuse the same data across their iterations and keeping the working set inside the caches can improve performance.
Tiling partitions the iteration space of the loops into blocks (tiles). It reorders the iterations so that the data touched by each block stays in the cache and is reused before being evicted.
A first usage restriction will thus be on the loops' nature itself: not all loops can benefit from tiling, only the ones that will reuse data while showing a poor data locality, thus leading to frequent cache misses.

<img alt="Tiles example" src="../../pictures/tiles.png" style="float:none"/>

OpenACC makes it possible to improve data locality inside loops with the dedicated _tile_ clause.
It instructs the compiler to split each loop in the nest into 2 loops, with an outer set of tile loops and an inner set of element loops.

## Syntax

The tile clause may appear with the _loop_ directive for nested loops.
For _N_ nested loops, the tile clause can take _N_ arguments. The first argument is the tile size for the innermost loop of the nest, and the last argument is the tile size for the outermost loop.

```fortran
!$acc loop tile(32,32)
do i = 1, size_i
    do j = 1, size_j
        ! A Fabulous calculation
    enddo
enddo
```

## Restrictions

- the tile size (corresponding to the product of the arguments of the tile clause) can be up to 1024
- for better performance the size for the inner loop should be a power of 2 (best with 32 to fit a CUDA warp)
- if the vector clause is specified, it is then applied to the element loop
- if the gang clause is specified, it is then applied to the tile loop
- the worker clause is applied to the element loop only if the vector clause is not specified

## Example

In the following example, tiling is used to compute a matrix multiplication followed by an addition. Let us take a look at the performance of the naïve algorithm and the manual tiling on CPU.

Example stored in: `../../examples/Fortran/Loop_tiling_example_cpu.f90`

In [None]:
%%idrrun
program tiles
    ! CPU benchmark of d = c + a*b computed three ways on the same operands:
    !   1. the matmul intrinsic,
    !   2. a naive triple loop,
    !   3. a manually tiled loop nest, once per tile size in chunk_sizes.
    ! Every variant is repeated ntimes; its CPU time and sum(d) are printed so
    ! that both speed and results can be compared between variants.
    use ISO_Fortran_env, only : INT32, REAL64
    implicit none
    integer(kind=INT32), parameter :: ni=4280, nj=4024, nk=1960, ntimes=30
    ! Tile edge lengths benchmarked by the manually tiled variant
    integer(kind=INT32), parameter :: chunk_sizes(2) = [256_INT32, 512_INT32]
    real(kind=REAL64)              :: a(ni,nk), b(nk,nj), c(ni,nj), d(ni,nj)
    real(kind=REAL64)              :: summation, t1, t2
    integer(kind=INT32)            :: nt, i, j, k
    integer(kind=INT32)            :: ichunk, ic, l, jj, kk
    character(len=32)              :: label

    ! Operands: a uniform in [-2,2), b uniform in [-4,4), c constant
    call random_number(a)
    call random_number(b)

    a = 4.0_real64*a - 2.0_real64
    b = 8.0_real64*b - 4.0_real64
    c = 2.0_real64
    d = 0.0_real64

    print *, "Start calculation"

    ! --- Variant 1: matmul intrinsic ---------------------------------------
    call cpu_time(t1)
    do nt = 1, ntimes
      d = c + matmul(a,b)
    end do
    call cpu_time(t2)
    print *, "CPU matmul"
    print *, "elapsed",t2-t1

    print *,sum(d)
    d = 0.0_real64
    print *, " "

    ! --- Variant 2: naive triple loop --------------------------------------
    ! The dot product over k is accumulated in a scalar before being stored.
    call cpu_time(t1)
    do nt = 1, ntimes
      do j=1,nj
         do i =1,ni
           summation = 0.0_real64
           do k=1,nk
              summation = summation + a(i,k) * b(k,j)
           enddo
           d(i,j) = summation + c(i,j)
         enddo
      enddo
    enddo
    call cpu_time(t2)
    print *, "CPU naive loop"
    print *, "elapsed",t2-t1
    print *,sum(d)
    d = 0.0_real64
    print *, " "

    ! --- Variant 3: manual tiling, one run per tile size -------------------
    ! l is the contraction length (second extent of a, first extent of b).
    l = size(a,dim=2)
    do ic = 1, size(chunk_sizes)
      ichunk = chunk_sizes(ic)

      call cpu_time(t1)
      do nt = 1, ntimes
        ! d is used as the accumulator, so it must start from zero
        d = 0.0_real64
        ! jj/kk step over ichunk-wide tiles of the (j,k) index plane; the min()
        ! bounds clip the last tile when the extent is not a multiple of ichunk.
        do jj = 1, nj, ichunk
          do kk = 1, l, ichunk
            do j = jj, min(jj+ichunk-1,nj)
              do k = kk, min(kk+ichunk-1,l)
                ! innermost loop runs over the first (contiguous) index
                do i = 1, ni
                  d(i,j) = d(i,j) + a(i,k) * b(k,j)
                enddo
              enddo
            enddo
          enddo
        enddo
        ! add c once the product is fully accumulated
        do j = 1, nj
          do i = 1, ni
            d(i,j) = d(i,j) + c(i,j)
          enddo
        enddo
      enddo
      call cpu_time(t2)

      ! Build the label first so the list-directed output keeps the exact
      ! "CPU manual loop tiling <size>" text.
      write(label, '(a,i0)') "CPU manual loop tiling ", ichunk
      print *, trim(label)
      print *, "elapsed",t2-t1
      print *,sum(d)
      d = 0.0_real64
      ! blank separator between runs, none after the last one
      if (ic < size(chunk_sizes)) print *, " "
    enddo

end program tiles


And now its GPU implementation.

Example stored in: `../../examples/Fortran/Loop_tiling_example_gpu.f90`

In [None]:
%%idrrun -a
program tiles
    ! GPU (OpenACC) benchmark of d = c + a*b computed three ways:
    !   1. a manually tiled loop nest, once per tile size in chunk_sizes,
    !   2. a naive parallel loop,
    !   3. the same naive loop with the OpenACC tile(32,32) clause.
    ! Every variant is repeated ntimes; its CPU time and sum(d) are printed.
    ! a, b, c and d live on the device for the whole run (unstructured data
    ! region opened by "enter data" and closed by "exit data").
    use ISO_Fortran_env, only : INT32, REAL64
    implicit none
    integer(kind=INT32), parameter :: ni=4280, nj=4024, nk=1960, ntimes=30
    ! Tile edge lengths benchmarked by the manually tiled variant
    integer(kind=INT32), parameter :: chunk_sizes(3) = [256_INT32, 512_INT32, 16_INT32]
    real(kind=REAL64)              :: a(ni,nk), b(nk,nj), c(ni,nj), d(ni,nj)
    real(kind=REAL64)              :: summation, t1, t2
    integer(kind=INT32)            :: nt, i, j, k
    integer(kind=INT32)            :: ichunk, ic, l, jj, kk
    character(len=32)              :: label

    ! Operands: a uniform in [-2,2), b uniform in [-4,4), c constant
    call random_number(a)
    call random_number(b)

    a = 4.0_real64*a - 2.0_real64
    b = 8.0_real64*b - 4.0_real64
    c = 2.0_real64
    d = 0.0_real64

    print *, "Start calculation"

    ! Inputs are copied to the device once; d is only allocated there because
    ! every variant initialises or overwrites it on the device.
    !$acc enter data copyin(a,b,c) create(d)

    ! --- Variant 1: manual tiling, one run per tile size -------------------
    ! The same directives are used for every tile size so that the timings
    ! differ only by ichunk.
    ! l is the contraction length (second extent of a, first extent of b).
    l = size(a,dim=2)
    do ic = 1, size(chunk_sizes)
      ichunk = chunk_sizes(ic)

      call cpu_time(t1)
      do nt = 1, ntimes
        ! d is used as the accumulator, so it must start from zero on the device
        !$acc parallel loop default(present)
        do j = 1, nj
          !$acc loop
          do i = 1, ni
            d(i,j) = 0.0_real64
          enddo
        enddo

        ! Gangs take the jj tiles (disjoint column ranges of d); the kk and j
        ! loops are forced sequential and the contiguous i loop is vectorised.
        ! The min() bounds clip the last tile.
        !$acc parallel loop gang default(present)
        do jj = 1, nj, ichunk
          !$acc loop seq
          do kk = 1, l, ichunk
            !$acc loop seq
            do j = jj, min(jj+ichunk-1,nj)
              do k = kk, min(kk+ichunk-1,l)
                !$acc loop vector
                do i = 1, ni
                  d(i,j) = d(i,j) + a(i,k) * b(k,j)
                enddo
              enddo
            enddo
          enddo
        enddo

        ! add c once the product is fully accumulated
        !$acc parallel loop default(present)
        do j = 1, nj
          !$acc loop
          do i = 1, ni
            d(i,j) = d(i,j) + c(i,j)
          enddo
        enddo
      enddo
      call cpu_time(t2)

      ! Build the label first so the list-directed output keeps the exact
      ! "GPU manual loop tiling <size>" text.
      write(label, '(a,i0)') "GPU manual loop tiling ", ichunk
      print *, trim(label)
      print *, "elapsed",t2-t1
      ! bring the result back to the host only to print its checksum
      !$acc update self(d(:,:))
      print *,sum(d)
      ! reset the device copy before the next variant
      !$acc kernels
      d(:,:) = 0.0_real64
      !$acc end kernels
      print *, " "
    enddo

    ! --- Variant 2: naive parallel loop ------------------------------------
    call cpu_time(t1)
    do nt = 1, ntimes
      !$acc parallel loop default(present)
      do j=1,nj
         !$acc loop
         do i =1,ni
           summation = 0.0_real64
           !$acc loop seq
           do k=1,nk
              summation = summation + a(i,k) * b(k,j)
           enddo
           d(i,j) = summation + c(i,j)
         enddo
      enddo
    enddo

    call cpu_time(t2)
    print *, "GPU naive parallel loop"
    print *, "elapsed",t2-t1

    !$acc update self(d(:,:))
    print *,sum(d)
    print *, " "
    ! Reset d on the device (not just on the host) so that the next variant
    ! cannot silently reuse this variant's results.
    !$acc kernels
    d(:,:) = 0.0_real64
    !$acc end kernels

    ! --- Variant 3: naive loop with the tile clause ------------------------
    ! tile(32,32) applies to the two tightly nested loops j and i.
    call cpu_time(t1)
    do nt = 1, ntimes
      !$acc parallel loop tile(32,32) default(present)
      do j=1,nj
         do i=1,ni
           summation = 0.0_real64
           !$acc loop seq
           do k=1,nk
              summation = summation + a(i,k) * b(k,j)
           enddo
           d(i,j) = summation + c(i,j)
         enddo
      enddo
    enddo

    call cpu_time(t2)
    print *, "GPU naive parallel loop tiled"
    print *, "elapsed",t2-t1

    ! Close the data region: fetch the final d and release the inputs.
    !$acc exit data copyout(d(:,:)) delete(a,b,c)

    print *,sum(d)
end program tiles


## Exercise

In this exercise, you will try to accelerate the numerical resolution of the 2D Laplace's equation with tiles. You will see that the tile sizes should be chosen wisely in order not to deteriorate performance.

Example stored in: `../../examples/Fortran/Loop_tiling_exercise.f90`

In [None]:
%%idrrun -a
program laplace2d
! Exercise skeleton: iterative relaxation of a 2D field on an nx-by-ny grid.
! Each sweep replaces every interior point by the average of its four
! neighbours, then copies the new values back. The "add acc directive"
! markers show where OpenACC directives are to be inserted.
use ISO_FORTRAN_ENV, only : real64, int32
!use openacc
implicit none

! T holds the current iterate, T_new receives the next one
real   (kind=real64), dimension(:,:), allocatable :: T, T_new
! Number of grid points along each dimension
integer(kind=int32 )                              :: nx, ny
! Grid indices (ix: first dimension, iy: second dimension) and sweep counter
integer(kind=int32 )                              :: ix, iy, iter
! Largest absolute change between two iterates within one sweep
real   (kind=real64)                              :: erreur

nx = 20000 !30000
ny = 10000 !30000

allocate(T(nx,ny), T_new(nx,ny))

! Initial conditions: zero in the interior of both arrays
T    (2:nx-1, 2:ny-1) = 0.0_real64
T_new(2:nx-1, 2:ny-1) = 0.0_real64

! Boundary values of T: 100 along the edge iy = 1, zero along iy = ny
T(:, 1 ) = 100.0_real64
T(:, ny) = 0.0_real64

! Zero along the edges ix = 1 and ix = nx (this also zeroes the corners)
T(1 , :) = 0.0_real64
T(nx, :) = 0.0_real64

! add acc directive
do iter = 1, 10000
  erreur = 0.0_real64

  !add acc directive
  do iy = 2, ny-1
    do ix = 2, nx-1
      ! four-point average of the neighbours
      T_new(ix,iy) = 0.25_real64*(T(ix+1,iy)+T(ix-1,iy) + &
                                  T(ix,iy+1)+T(ix,iy-1))
      erreur = max(erreur, abs(T_new(ix,iy) - T(ix,iy)))
    end do
  end do

  ! report progress every 100 sweeps
  if (mod(iter,100) == 0) then
    print *, "iteration: ",iter," erreur: ",erreur
  end if

  !add acc directive
  do iy = 2, ny-1
    do ix = 2, nx-1
      T(ix,iy) = T_new(ix,iy)
    end do
  end do
end do


deallocate(T, T_new)

end program laplace2d

## Solution

Example stored in: `../../examples/Fortran/Loop_tiling_solution.f90`

In [None]:
%%idrrun -a
program laplace2d
! Solution: iterative relaxation of a 2D field on an nx-by-ny grid, offloaded
! with OpenACC. Each sweep replaces every interior point by the average of
! its four neighbours, then copies the new values back; both loop nests use
! the tile(32,32) clause.
use ISO_FORTRAN_ENV, only : real64, int32
!use openacc
implicit none

! T holds the current iterate, T_new receives the next one
real   (kind=real64), dimension(:,:), allocatable :: T, T_new
! Number of grid points along each dimension
integer(kind=int32 )                              :: nx, ny
! Grid indices (ix: first dimension, iy: second dimension) and sweep counter
integer(kind=int32 )                              :: ix, iy, iter
! Largest absolute change between two iterates within one sweep
real   (kind=real64)                              :: erreur

nx = 20000 !30000
ny = 10000 !30000

allocate(T(nx,ny), T_new(nx,ny))

! Initial conditions: zero in the interior of both arrays
T    (2:nx-1, 2:ny-1) = 0.0_real64
T_new(2:nx-1, 2:ny-1) = 0.0_real64

! Boundary values of T: 100 along the edge iy = 1, zero along iy = ny
T(:, 1 ) = 100.0_real64
T(:, ny) = 0.0_real64

! Zero along the edges ix = 1 and ix = nx (this also zeroes the corners)
T(1 , :) = 0.0_real64
T(nx, :) = 0.0_real64

! T is copied to the device on entry and back on exit; T_new is only
! allocated on the device.
!$acc data copy(T) create(T_new)
do iter = 1, 10000
  erreur = 0.0_real64

  ! max-reduction gathers the largest point-wise change of the sweep
  !$acc parallel loop tile(32,32) reduction(max:erreur)
  do iy = 2, ny-1
    do ix = 2, nx-1
      ! four-point average of the neighbours
      T_new(ix,iy) = 0.25_real64*(T(ix+1,iy)+T(ix-1,iy) + &
                                  T(ix,iy+1)+T(ix,iy-1))
      erreur = max(erreur, abs(T_new(ix,iy) - T(ix,iy)))
    end do
  end do

  ! report progress every 100 sweeps
  if (mod(iter,100) == 0) then
    print *, "iteration: ",iter," erreur: ",erreur
  end if

  ! copy the new iterate back into T for the next sweep
  !$acc parallel loop tile(32,32)
  do iy = 2, ny-1
    do ix = 2, nx-1
      T(ix,iy) = T_new(ix,iy)
    end do
  end do
end do
!$acc end data

deallocate(T, T_new)

end program laplace2d