# Deep copy

---
**Requirements:**

- [Get Started](./Get_started.ipynb)
- [Data Management](./Data_management.ipynb)
- [Atomic Operations](./Atomic_operations.ipynb)

---

Complex data structures, including structs and classes in C/C++ or derived datatypes with pointer and allocatable components in Fortran, are frequent. Ways to manage them include:

- using CUDA unified memory with the compilation flag `-gpu=managed`, but the cost of memory allocation will be higher and it will apply to all allocatable variables
- flattening the derived datatypes by using temporary variables and then performing data transfers on the temporary variables
- using deep copy.

Two ways are possible to manage deep copy:

- top-down deep copy with an implicit attach behavior
- bottom-up deep copy with an explicit attach behavior

## Top-down deep copy

In order to implement the top-down deep copy, we should copy to the device the base structure first and then the children structures.
For each child transfer, the compiler's implementation will check whether the pointers to the children (they are transferred with the parent structure) are present. If they are, an implicit attach is performed and the parent on the device will point to the children that have just been placed on the device.

Please note that it is not mandatory to transfer all the children structures, only the ones required by the calculations on the device.

### Syntax

```fortran
! Derived type whose allocatable components are the "children" of the deep copy
type velocity
    real, dimension(:), allocatable :: vx, vy, vz
end type

...
type(velocity) :: U

allocate(U%vx(sizeX), U%vy(sizeY), U%vz(sizeZ))
...

! 1) Parent structure first. `enter data` accepts copyin/create, not copy.
!$acc enter data copyin(U)
! 2) Then the children: U is already present, so they are attached to it implicitly
!$acc enter data copyin(U%vx(1:sizeX), U%vy(1:sizeY), U%vz(1:sizeZ))

! A humongous calculation

```

### Example

In this example we store 2 arrays in a structure/derived type and use a deep copy to make them available on the GPU.

Example stored in: `../../examples/Fortran/Deep_copy_example.f90`

In [None]:
%%idrrun -a
program vector_addition
    ! Top-down deep copy demonstration.
    ! Two arrays stored in a derived type are created on the device (parent
    ! first, then children), filled with sin^2 and cos^2 values, combined into
    ! array_sum on the device, and one element of the result is printed.
    use iso_fortran_env, only : INT32, REAL64
    use openacc
    implicit none

    ! Parent structure holding the two allocatable child arrays
    type :: vectors
        real(kind=REAL64), dimension(:), allocatable :: s, c
    end type vectors

    integer(kind=INT32), parameter :: system_size = 100000

    type(vectors)       :: vec
    real(kind=REAL64)   :: array_sum(system_size)
    real(kind=REAL64)   :: fortran_pi, angle
    integer(kind=INT32) :: i

    fortran_pi = acos(-1.0_real64)
    allocate(vec%s(system_size))
    allocate(vec%c(system_size))

    ! Parent (and the plain array) first, then the children of the parent
    !$acc enter data create(vec, array_sum(:))
    !$acc enter data create(vec%s(1:system_size), vec%c(1:system_size))

    ! Fill the children directly on the device
    !$acc parallel loop private(angle)
    do i = 1, system_size
        angle = i*fortran_pi/system_size
        vec%s(i) = sin(angle) * sin(angle)
        vec%c(i) = cos(angle) * cos(angle)
    end do

    ! The last element is skipped because vec%c(system_size - i) would hit index 0
    !$acc parallel loop
    do i = 1, system_size - 1
        array_sum(i) = vec%s(i) + vec%c(system_size - i)
    end do

    ! Release the children before the parent; only array_sum comes back to the host
    !$acc exit data delete(vec%s, vec%c)
    !$acc exit data delete(vec) copyout(array_sum(:))

    write(0,"(a10,f10.8)") "sum(42) = ",array_sum(42)
end program vector_addition

### Exercise

In this exercise, we determine the radial distribution function (RDF) for an ensemble of particles that is read from a file. The positions of the particles can be used to demonstrate the implementation of the deep copy.
You can run this example at the end of the next exercise to check the structure of the box at the end of the simulation.

First you need to copy some files:

In [None]:
%%bash
# Bring the two input files of the RDF exercise into the working directory
cp ../../examples/Fortran/{OUTPUT,CONFIG} .

You need to add `--cliopts "0.5 15.5 0"` to idrrun.

Example stored in: `../../examples/Fortran/Deep_copy_exercise.f90`

In [None]:
%%idrrun -a --cliopts "0.5 15.5"
module utils_rdf
    ! Shared state and helper routines for the radial distribution function
    ! (RDF) program: particle positions, histogram/RDF arrays, run parameters,
    ! the configuration reader and the usage message.
    use ISO_FORTRAN_ENV, only : REAL64, INT32, ERROR_UNIT
    implicit none

    ! Particle coordinates, one allocatable array per Cartesian component
    type :: Location
        real(kind=REAL64), dimension(:), allocatable :: x, y, z
    end type

    type(Location) :: particle
    integer(kind= INT32), dimension(:), allocatable :: hist
    real   (kind=REAL64), dimension(:), allocatable :: gr
    real   (kind=REAL64) :: Rij, deltaR, rcut, nideal, rho, Lx, Ly, Lz
    integer(kind= INT32) :: d, read_restart, Natoms

    contains
    subroutine read_config()
        ! Read the number of atoms, the box lengths (Lx, Ly, Lz) and the atom
        ! positions into the module variables. The file OUTPUT is used when it
        ! exists, CONFIG otherwise. The module variable read_restart must be
        ! set before the call. Any I/O failure aborts with a message.
        integer(kind= INT32)              :: i, ierr, config_unit
        real   (kind=REAL64)              :: yy
        logical                           :: res
        character(len=:), allocatable     :: filename
        character(len=256)                :: msg

        inquire(file="OUTPUT", exist=res)
        if (res) then
            filename = 'OUTPUT'
        else
            filename = 'CONFIG'
        endif

        open(newunit=config_unit, file=filename, status='old', action='read', iostat=ierr, iomsg=msg)
        if (ierr /= 0) then
            write(ERROR_UNIT,*) "Cannot open "//filename//": "//trim(msg)
            error stop 1
        endif
        write(ERROR_UNIT,*) "Reading from "//filename//" file"

        ! Header: the first value is read and discarded
        read(config_unit,*, iostat=ierr) yy, Natoms, Lx, Ly, Lz
        if (ierr /= 0 .or. Natoms < 1) then
            write(ERROR_UNIT,*) "Invalid header in "//filename
            error stop 1
        endif

        allocate(particle%x(Natoms), particle%y(Natoms), particle%z(Natoms))

        do i = 1, Natoms
            read(config_unit,*, iostat=ierr) particle%x(i), particle%y(i), particle%z(i)
            ! One extra record is skipped per atom for restart files.
            ! NOTE(review): read_restart = 2 skips a single record, exactly like
            ! read_restart = 1 - confirm against the actual file layout.
            if (ierr == 0 .and. (read_restart .eq. 1 .or. read_restart .eq. 2)) then
                read(config_unit,*, iostat=ierr) yy, yy, yy
            endif
            if (ierr /= 0) then
                write(ERROR_UNIT,*) "Error while reading atom ", i, " in "//filename
                error stop 1
            endif
        enddo
        close(config_unit)
        ! Add OpenACC directives here
    end subroutine read_config

    subroutine usage
    ! Only prints how to run the program, then terminates it
    print *, "You should provide three arguments to run rdf : "
    print *, "./rdf deltaR rcut read_restart"
    print *, "- deltaR is the length of each bin, represented by a real*8 "
    print *, "- rcut is the total length on which you determine the rdf ( rcut < box_length / 2 )"
    print *, "- read_restart defined if the file contains :"
    print *, "   - positions and velocities         (read_restart = 1)"
    print *, "   - positions, velocities and forces (read_restart = 2)"
    print *, "   - position only (input anything that is not 1 or 2)"                         
    print *, "example : ./rdf 0.5 15.5 0"
    stop
    end subroutine usage

end module utils_rdf

program rdf
    ! Compute the radial distribution function g(r) of the particles loaded by
    ! read_config and write it to the file RDF.
    ! Command line: deltaR (bin width), rcut (largest distance), read_restart.
    use utils_rdf
    implicit none
    integer(kind= INT32)          :: i, j, max_bin, ierr, rdf_unit
    real   (kind=REAL64)          :: xij, yij, zij
    character(len=:), allocatable :: arg

    if (command_argument_count() .ne. 3) call usage()

    ! List-directed reads accept "15" as well as "15.0"; an f20.8 edit would
    ! silently scale an argument without decimal point by 1e-8.
    arg = argument_text(1)
    read(arg, *, iostat=ierr) deltaR
    if (ierr .ne. 0) call usage()
    arg = argument_text(2)
    read(arg, *, iostat=ierr) rcut
    if (ierr .ne. 0) call usage()
    arg = argument_text(3)
    read(arg, *, iostat=ierr) read_restart
    if (ierr .ne. 0) call usage()

    ! deltaR is used as a divisor below
    if (deltaR .le. 0.0_real64 .or. rcut .le. 0.0_real64) call usage()

    call read_config()
    max_bin = int( rcut/deltaR ) + 1
    allocate(hist(max_bin), gr(max_bin))

    ! Add OpenACC directives here

    ! Add OpenACC directives here
    do i = 1, max_bin
        hist(i) = 0
    enddo
     
    ! Histogram of pair distances, using the minimum-image convention per axis
    ! Add OpenACC directives here
    do j = 1, Natoms
        ! Add OpenACC directives here
        do i = 1, Natoms
            if (j .ne. i) then
                xij = particle%x(j)-particle%x(i)
                xij = xij - anint(xij/Lx) * Lx
                yij = particle%y(j)-particle%y(i)
                yij = yij - anint(yij/Ly) * Ly
                zij = particle%z(j)-particle%z(i)
                zij = zij - anint(zij/Lz) * Lz

                ! Rij holds the squared distance; the root is taken for the bin index
                Rij = xij*xij + yij*yij + zij*zij
                d = int( sqrt(Rij)/deltaR ) + 1

                if (d .le. max_bin) then
                    ! Add OpenACC directives here
                    hist(d) = hist(d) + 1
                endif
            endif
        enddo
    enddo

    ! rho becomes 4/3 * pi * number density, the prefactor of the shell volume
    rho = real(Natoms, kind=REAL64) / (Lx * Ly * Lz)
    rho =  4.0_real64 / 3.0_real64 * acos(-1.0_real64) * rho
    
    ! Add OpenACC directives here
    do i = 1, max_bin
        nideal  = rho * ( (i*deltaR)**3 - ((i-1)*deltaR)**3)
        gr(i)   = real(hist(i), kind=REAL64) / (nideal * real(Natoms, kind=REAL64))
    enddo


    ! Add OpenACC directives here

    open(newunit=rdf_unit, file='RDF', status='replace', action='write')
        do i=1,max_bin
            write(rdf_unit,'(2f20.8)') (i-1)*deltaR, gr(i)
        enddo
    close(rdf_unit)

    deallocate(particle%x, particle%y, particle%z)
    deallocate(hist, gr)

contains

    ! Return command-line argument number `position` as an exactly sized string
    function argument_text(position) result(text)
        integer, intent(in)           :: position
        character(len=:), allocatable :: text
        integer                       :: length

        call get_command_argument(number=position, length=length)
        allocate(character(len=length) :: text)
        call get_command_argument(number=position, value=text)
    end function argument_text

end program rdf

In [None]:
%%bash
# Display the computed RDF; the quotes keep paths containing spaces intact
cat "$PWD/RDF"

### Solution

Example stored in: `../../examples/Fortran/Deep_copy_solution.f90`

In [None]:
%%idrrun -a --cliopts "0.5 15.5 0"
module utils_rdf
    ! Shared state and helper routines for the radial distribution function
    ! (RDF) program: particle positions, histogram/RDF arrays, run parameters,
    ! the configuration reader (which also places the positions on the device)
    ! and the usage message.
    use ISO_FORTRAN_ENV, only : REAL64, INT32, ERROR_UNIT
    implicit none

    ! Particle coordinates, one allocatable array per Cartesian component
    type :: Location
        real(kind=REAL64), dimension(:), allocatable :: x, y, z
    end type

    type(Location) :: particle
    integer(kind= INT32), dimension(:), allocatable :: hist
    real   (kind=REAL64), dimension(:), allocatable :: gr
    real   (kind=REAL64) :: Rij, deltaR, rcut, nideal, rho, Lx, Ly, Lz
    integer(kind= INT32) :: d, read_restart, Natoms

    contains
    subroutine read_config()
        ! Read the number of atoms, the box lengths (Lx, Ly, Lz) and the atom
        ! positions into the module variables, then deep-copy `particle` to
        ! the device. The file OUTPUT is used when it exists, CONFIG otherwise.
        ! The module variable read_restart must be set before the call.
        ! Any I/O failure aborts with a message.
        integer(kind= INT32)              :: i, ierr, config_unit
        real   (kind=REAL64)              :: yy
        logical                           :: res
        character(len=:), allocatable     :: filename
        character(len=256)                :: msg

        inquire(file="OUTPUT", exist=res)
        if (res) then
            filename = 'OUTPUT'
        else
            filename = 'CONFIG'
        endif

        open(newunit=config_unit, file=filename, status='old', action='read', iostat=ierr, iomsg=msg)
        if (ierr /= 0) then
            write(ERROR_UNIT,*) "Cannot open "//filename//": "//trim(msg)
            error stop 1
        endif
        write(ERROR_UNIT,*) "Reading from "//filename//" file"

        ! Header: the first value is read and discarded
        read(config_unit,*, iostat=ierr) yy, Natoms, Lx, Ly, Lz
        if (ierr /= 0 .or. Natoms < 1) then
            write(ERROR_UNIT,*) "Invalid header in "//filename
            error stop 1
        endif

        allocate(particle%x(Natoms), particle%y(Natoms), particle%z(Natoms))

        do i = 1, Natoms
            read(config_unit,*, iostat=ierr) particle%x(i), particle%y(i), particle%z(i)
            ! One extra record is skipped per atom for restart files.
            ! NOTE(review): read_restart = 2 skips a single record, exactly like
            ! read_restart = 1 - confirm against the actual file layout.
            if (ierr == 0 .and. (read_restart .eq. 1 .or. read_restart .eq. 2)) then
                read(config_unit,*, iostat=ierr) yy, yy, yy
            endif
            if (ierr /= 0) then
                write(ERROR_UNIT,*) "Error while reading atom ", i, " in "//filename
                error stop 1
            endif
        enddo
        close(config_unit)

        ! Top-down deep copy: parent structure first, then its children,
        ! which are attached implicitly because the parent is already present
        !$acc enter data copyin(particle)
        !$acc enter data copyin(particle%x(1:Natoms), particle%y(1:Natoms), particle%z(1:Natoms))
    end subroutine read_config

    subroutine usage
    ! Only prints how to run the program, then terminates it
    print *, "You should provide three arguments to run rdf : "
    print *, "./rdf deltaR rcut read_restart"
    print *, "- deltaR is the length of each bin, represented by a real*8 "
    print *, "- rcut is the total length on which you determine the rdf ( rcut < box_length / 2 )"
    print *, "- read_restart defined if the file contains :"
    print *, "   - positions and velocities         (read_restart = 1)"
    print *, "   - positions, velocities and forces (read_restart = 2)"
    print *, "   - position only (input anything that is not 1 or 2)"                         
    print *, "example : ./rdf 0.5 15.5 0"
    stop
    end subroutine usage

end module utils_rdf

program rdf
    ! Compute the radial distribution function g(r) of the particles loaded by
    ! read_config (which also puts them on the device) and write it to the
    ! file RDF.
    ! Command line: deltaR (bin width), rcut (largest distance), read_restart.
    use utils_rdf
    implicit none
    integer(kind= INT32)          :: i, j, max_bin, ierr, rdf_unit
    real   (kind=REAL64)          :: xij, yij, zij
    character(len=:), allocatable :: arg

    if (command_argument_count() .ne. 3) call usage()

    ! List-directed reads accept "15" as well as "15.0"; an f20.8 edit would
    ! silently scale an argument without decimal point by 1e-8.
    arg = argument_text(1)
    read(arg, *, iostat=ierr) deltaR
    if (ierr .ne. 0) call usage()
    arg = argument_text(2)
    read(arg, *, iostat=ierr) rcut
    if (ierr .ne. 0) call usage()
    arg = argument_text(3)
    read(arg, *, iostat=ierr) read_restart
    if (ierr .ne. 0) call usage()

    ! deltaR is used as a divisor below
    if (deltaR .le. 0.0_real64 .or. rcut .le. 0.0_real64) call usage()

    call read_config()
    max_bin = int( rcut/deltaR ) + 1
    allocate(hist(max_bin), gr(max_bin))

    ! The particle structure and its children were placed on the device by read_config
    !$acc data copy(hist(:)) copyout(gr(:)) present(particle, particle%x(1:Natoms), particle%y(1:Natoms), particle%z(1:Natoms))

    !$acc parallel loop
    do i = 1, max_bin
        hist(i) = 0
    enddo
     
    ! Histogram of pair distances, using the minimum-image convention per axis.
    ! The work scalars (two of them are module variables) are privatised
    ! explicitly so that every iteration owns its copy.
    !$acc parallel loop
    do j = 1, Natoms
        !$acc loop private(xij, yij, zij, Rij, d)
        do i = 1, Natoms
            if (j .ne. i) then
                xij = particle%x(j)-particle%x(i)
                xij = xij - anint(xij/Lx) * Lx
                yij = particle%y(j)-particle%y(i)
                yij = yij - anint(yij/Ly) * Ly
                zij = particle%z(j)-particle%z(i)
                zij = zij - anint(zij/Lz) * Lz

                ! Rij holds the squared distance; the root is taken for the bin index
                Rij = xij*xij + yij*yij + zij*zij
                d = int( sqrt(Rij)/deltaR ) + 1

                if (d .le. max_bin) then
                    ! Several iterations may hit the same bin
                    !$acc atomic update
                    hist(d) = hist(d) + 1
                endif
            endif
        enddo
    enddo

    ! rho becomes 4/3 * pi * number density, the prefactor of the shell volume
    rho = real(Natoms, kind=REAL64) / (Lx * Ly * Lz)
    rho =  4.0_real64 / 3.0_real64 * acos(-1.0_real64) * rho
    
    !$acc parallel loop private(nideal)
    do i = 1, max_bin
        nideal  = rho * ( (i*deltaR)**3 - ((i-1)*deltaR)**3)
        gr(i)   = real(hist(i), kind=REAL64) / (nideal * real(Natoms, kind=REAL64))
    enddo

    !$acc end data
    ! Release the children before their parent
    !$acc exit data delete(particle%x, particle%y, particle%z)
    !$acc exit data delete(particle)

    open(newunit=rdf_unit, file='RDF', status='replace', action='write')
        do i=1,max_bin
            write(rdf_unit,'(2f20.8)') (i-1)*deltaR, gr(i)
        enddo
    close(rdf_unit)

    deallocate(particle%x, particle%y, particle%z)
    deallocate(hist, gr)

contains

    ! Return command-line argument number `position` as an exactly sized string
    function argument_text(position) result(text)
        integer, intent(in)           :: position
        character(len=:), allocatable :: text
        integer                       :: length

        call get_command_argument(number=position, length=length)
        allocate(character(len=length) :: text)
        call get_command_argument(number=position, value=text)
    end function argument_text

end program rdf

In [None]:
%%bash
# Display the computed RDF; the quotes keep paths containing spaces intact
cat "$PWD/RDF"

## Deep copy with manual attachment

It is also possible to perform a bottom-up deep copy, in which you first copy the sub-objects (the children) to the accelerator and then attach them to their parent structure.
With this procedure you have to attach the pointers to the children explicitly. This is most easily understood from the following code.

```fortran
! Parent type with two allocatable children and one pointer child
type vect2D
 real, dimension(:), allocatable :: x, y
 real, pointer                   :: coordinate(:)
end type

type(vect2D) :: v0

! Bottom-up order: the children are copied to the device first ...
!$acc enter data copyin(v0%x, v0%y, v0%coordinate)
! ... then the parent. No attach is requested here.
!$acc enter data copyin(v0)
```

Here the first copyin will pass the arrays to the device memory and the second will provide the complex datatype. But the pointers of the complex datatype (such as `v0%x`) will still reference the host structure.
We should thus provide to the compiler the information of the datatype as it should be on the device. This is done by adding an `attach` directive.

```fortran
! Parent type with two allocatable children and one pointer child
type vect2D
 real, dimension(:), allocatable :: x, y
 real, pointer                   :: coordinate(:)
end type

type(vect2D) :: v0

! Bottom-up order: the children are copied to the device first ...
!$acc enter data copyin(v0%x, v0%y, v0%coordinate)
! ... then the parent, whose device pointers are explicitly attached to the children
!$acc enter data copyin(v0) attach(v0%x, v0%y, v0%coordinate)
```

Here, the pointer and its target are present on the device. The directive will inform the compiler to replace the host pointer with the corresponding device pointer in device memory.

The `detach` clause can be used when freeing the structure memory, but it is not mandatory.

```fortran
! Optional: detach the children from the parent
!$acc exit data detach(v0%x, v0%y, v0%coordinate)    ! not required
! Copy the children back to the host, then the parent
!$acc exit data copyout(v0%x, v0%y, v0%coordinate)
!$acc exit data copyout(v0)
```

### Exercise

In the following exercise we will solve the Lotka–Volterra predator–prey equations

for the prey population:

$\frac{dx}{dt}=\mathrm{birth}_x\,x-\mathrm{death}_x\,x\,y$

and for the predator population:

$\frac{dy}{dt}=\mathrm{birth}_y\,x\,y-\mathrm{death}_y\,y.$

Example stored in: `../../examples/Fortran/Deep_copy_attach_detach_exercise.f90`

In [None]:
%%idrrun -a
module utils_lotka
    ! Right-hand side of the Lotka-Volterra equations and one classical
    ! fourth-order Runge-Kutta step for a pair of populations.
    use ISO_FORTRAN_ENV, only : REAL64, INT32
    implicit none

    ! state(1) is the population count, state(2) its birth rate and
    ! state(3) its death rate, as read by derivee and rk4 below.
    type :: population
        real(kind=REAL64),dimension(:),allocatable :: state
    end type population

contains

    ! Evaluate the time derivatives dx of the two counts x.
    ! Index 1 is the prey, index 2 the predator; only the rates of pop are read.
    subroutine derivee(x, dx, pop)
        real   (kind=REAL64), dimension(2), intent(in)  :: x
        real   (kind=REAL64), dimension(2), intent(out) :: dx
        type   (population ), dimension(2), intent(in)  :: pop
        ! Add openacc directives
        dx(1) =  pop(1)%state(2)*x(1) - pop(1)%state(3)*x(1)*x(2)
        dx(2) = -pop(2)%state(3)*x(2) + pop(2)%state(2)*x(1)*x(2)
    end subroutine derivee

    ! Advance the counts pop(:)%state(1) by one time step dt (RK4).
    subroutine rk4(pop, dt)
        type   (population ), dimension(2), intent(inout) :: pop
        real   (kind=REAL64), intent(in)                  :: dt

        real   (kind=REAL64), dimension(2)                :: stage, k1, k2, k3, k4
        real   (kind=REAL64)                              :: half_step, weight
        integer(kind= INT32)                              :: ipop

        half_step = dt/2
        weight    = dt/6.0
        ! Add openacc directives

        ! Slope at the beginning of the step
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1)
        end do

        call derivee(stage, k1, pop)
        ! Add openacc directives
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1) + k1(ipop)*half_step
        end do

        ! First midpoint slope
        call derivee(stage, k2, pop)
        ! Add openacc directives
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1) + k2(ipop)*half_step
        end do

        ! Second midpoint slope
        call derivee(stage, k3, pop)
        ! Add openacc directives
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1) + k3(ipop)*dt
        end do

        ! Slope at the end of the step, then the weighted update
        call derivee(stage, k4, pop)
        ! Add openacc directives
        do ipop = 1, 2
            pop(ipop)%state(1) = pop(ipop)%state(1) + weight*(k1(ipop) + 2.0*k2(ipop) + 2.0*k3(ipop) + k4(ipop))
        end do
    end subroutine rk4

end module utils_lotka

program lotka_volterra
    ! Integrate the Lotka-Volterra predator-prey system with RK4 and write
    ! "time;prey;predator" records to the file "output".
    use utils_lotka
    use openacc
    implicit none

    type   (population ), dimension(2)         :: pred_prey
    real   (kind=REAL64)                       :: tf, dt, tmax
    integer                                    :: i, step, n_steps, out_unit
    
    ! REAL64 literals: a default-real 0.05 would not be the REAL64 value 0.05
    dt   =   0.05_REAL64
    tmax = 100.00_REAL64
    ! Integer step count: the number of steps no longer depends on the
    ! round-off of an accumulated real time
    n_steps = nint(tmax/dt)
    
    allocate(pred_prey(2)%state(3), pred_prey(1)%state(3)) 
    
    pred_prey(2)%state(1) = 15.00_REAL64  ! predator count
    pred_prey(2)%state(2) = 0.01_REAL64   ! predator birth rate
    pred_prey(2)%state(3) = 1.0_REAL64    ! predator death rate

    pred_prey(1)%state(1) = 100.00_REAL64 ! prey count
    pred_prey(1)%state(2) = 2.0_REAL64    ! prey birth rate 
    pred_prey(1)%state(3) = 0.02_REAL64   ! prey death rate

    do i=1,2
    ! Add openacc directives to pass child structure to GPU first
    enddo
    ! Add openacc directives for parent structure and connection to child

    open(newunit=out_unit, file="output", status="replace", action="write")
    do step = 1, n_steps
        tf = step*dt
        call rk4(pred_prey, dt)
        do i = 1, 2
            ! Add openacc directives
        enddo
        write(out_unit,'(f20.8,a1,f20.8,a1,f20.8)') tf,";", pred_prey(1)%state(1),";", pred_prey(2)%state(1)
    end do    
    close(out_unit)

    do i=1,2
    ! Add openacc directives 
    enddo
    ! Add openacc directives

    deallocate(pred_prey(1)%state, pred_prey(2)%state)
end program lotka_volterra    

In [None]:
from matplotlib import pyplot as plt
import numpy as np

# Load the "time;prey;predator" records written by the Fortran program
data = np.genfromtxt("output", delimiter=';')
time, preys, predators = (data[:, col] for col in range(3))

# Prey in blue, predators in red, both against time
for series, colour in ((preys, 'blue'), (predators, 'red')):
    plt.plot(time, series, color=colour)

### Solution

Example stored in: `../../examples/Fortran/Deep_copy_attach_detach_solution.f90`

In [None]:
%%idrrun -a
module utils_lotka
    ! Right-hand side of the Lotka-Volterra equations and one classical
    ! fourth-order Runge-Kutta step for a pair of populations, both running
    ! on data that the caller has already placed on the device.
    use ISO_FORTRAN_ENV, only : REAL64, INT32
    implicit none

    ! state(1) is the population count, state(2) its birth rate and
    ! state(3) its death rate, as read by derivee and rk4 below.
    type :: population
        real(kind=REAL64),dimension(:),allocatable :: state
    end type population

contains

    ! Evaluate the time derivatives dx of the two counts x.
    ! Index 1 is the prey, index 2 the predator; only the rates of pop are read.
    ! x, dx and pop (with its children) must be present on the device.
    subroutine derivee(x, dx, pop)
        real   (kind=REAL64), dimension(2), intent(in)  :: x
        real   (kind=REAL64), dimension(2), intent(out) :: dx
        type   (population ), dimension(2), intent(in)  :: pop
        !$acc data present(x(:), dx(:), pop(:), pop(1)%state(:), pop(2)%state(:))
        !$acc serial
        dx(1) =  pop(1)%state(2)*x(1) - pop(1)%state(3)*x(1)*x(2)
        dx(2) = -pop(2)%state(3)*x(2) + pop(2)%state(2)*x(1)*x(2)
        !$acc end serial
        !$acc end data
    end subroutine derivee

    ! Advance the counts pop(:)%state(1) by one time step dt (RK4).
    ! The work arrays live only on the device for the duration of the call.
    subroutine rk4(pop, dt)
        type   (population ), dimension(2), intent(inout) :: pop
        real   (kind=REAL64), intent(in)                  :: dt

        real   (kind=REAL64), dimension(2)                :: stage, k1, k2, k3, k4
        real   (kind=REAL64)                              :: half_step, weight
        integer(kind= INT32)                              :: ipop

        half_step = dt/2
        weight    = dt/6.0
        !$acc data create(k1(:), k2(:), k3(:), k4(:), stage(:)) present(pop(:), pop(1)%state(:), pop(2)%state(:))

        ! Slope at the beginning of the step
        !$acc parallel loop
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1)
        end do

        call derivee(stage, k1, pop)
        !$acc parallel loop
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1) + k1(ipop)*half_step
        end do

        ! First midpoint slope
        call derivee(stage, k2, pop)
        !$acc parallel loop
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1) + k2(ipop)*half_step
        end do

        ! Second midpoint slope
        call derivee(stage, k3, pop)
        !$acc parallel loop
        do ipop = 1, 2
            stage(ipop) = pop(ipop)%state(1) + k3(ipop)*dt
        end do

        ! Slope at the end of the step, then the weighted update
        call derivee(stage, k4, pop)
        !$acc parallel loop
        do ipop = 1, 2
            pop(ipop)%state(1) = pop(ipop)%state(1) + weight*(k1(ipop) + 2.0*k2(ipop) + 2.0*k3(ipop) + k4(ipop))
        end do
        !$acc end data
    end subroutine rk4

end module utils_lotka

program lotka_volterra
    ! Integrate the Lotka-Volterra predator-prey system with RK4 on the device
    ! (bottom-up deep copy with explicit attach) and write
    ! "time;prey;predator" records to the file "output_solution".
    use utils_lotka
    use openacc
    implicit none

    type   (population ), dimension(2)         :: pred_prey
    real   (kind=REAL64)                       :: tf, dt, tmax
    integer                                    :: i, step, n_steps, out_unit
    
    ! REAL64 literals: a default-real 0.05 would not be the REAL64 value 0.05
    dt   =   0.05_REAL64
    tmax = 100.00_REAL64
    ! Integer step count: the number of steps no longer depends on the
    ! round-off of an accumulated real time
    n_steps = nint(tmax/dt)
    
    allocate(pred_prey(2)%state(3), pred_prey(1)%state(3)) 
    
    pred_prey(2)%state(1) = 15.00_REAL64  ! predator count
    pred_prey(2)%state(2) = 0.01_REAL64   ! predator birth rate
    pred_prey(2)%state(3) = 1.0_REAL64    ! predator death rate

    pred_prey(1)%state(1) = 100.00_REAL64 ! prey count
    pred_prey(1)%state(2) = 2.0_REAL64    ! prey birth rate 
    pred_prey(1)%state(3) = 0.02_REAL64   ! prey death rate

    ! Bottom-up deep copy: children first ...
    do i=1,2
       !$acc enter data copyin(pred_prey(i)%state(:))
    enddo
    ! ... then the parent array, attaching each child explicitly
    !$acc enter data copyin(pred_prey(1:2)) attach(pred_prey(1)%state, pred_prey(2)%state)

    open(newunit=out_unit, file="output_solution", status="replace", action="write")
    do step = 1, n_steps
        tf = step*dt
        call rk4(pred_prey, dt)
        ! Only the counts are needed on the host for the output
        do i = 1, 2
            !$acc update self(pred_prey(i)%state(1))
        enddo
        write(out_unit,'(f20.8,a1,f20.8,a1,f20.8)') tf,";", pred_prey(1)%state(1),";", pred_prey(2)%state(1)
    end do    
    close(out_unit)

    ! Detach and release the children, then the parent
    do i=1,2
        !$acc exit data detach(pred_prey(i)%state)
        !$acc exit data delete(pred_prey(i)%state)
    enddo
    !$acc exit data delete(pred_prey(:))

    deallocate(pred_prey(1)%state, pred_prey(2)%state)
end program lotka_volterra    

In [None]:
from matplotlib import pyplot as plt
import numpy as np

# Load the "time;prey;predator" records written by the Fortran solution
data = np.genfromtxt("output_solution", delimiter=';')
time, preys, predators = (data[:, col] for col in range(3))

# Prey in blue, predators in red, both against time
for series, colour in ((preys, 'blue'), (predators, 'red')):
    plt.plot(time, series, color=colour)