#### 1- Multi-QPU (nvidia-mqpu)

The `nvidia-mqpu` target is useful for distributing separate quantum circuits to individual GPUs on a single host machine. 

![img](./circuit-mqpu.png)

#### Example: QML

#### Example with `sample` algorithmic primitives
![img](./RBM.png)

In [1]:
import cudaq

cudaq.set_target("nvidia-mqpu")

target = cudaq.get_target()
qpu_count = target.num_qpus()
print("Number of QPUs:", qpu_count)

@cudaq.kernel
def qrbm(v_nodes:int, h_nodes:int, ancilla:int, theta: list[float], coupling: list[float]):

    qubits_num=v_nodes+h_nodes+ancilla
    qubits=cudaq.qvector(qubits_num)

    for i in range(v_nodes+h_nodes):
        ry(theta[i],qubits[i])

    a_target=v_nodes+h_nodes
    count=0
    for v in range(v_nodes):
        for h in range(v_nodes,v_nodes+h_nodes):
            ry.ctrl(coupling[count],qubits[v],qubits[h],qubits[a_target])
            x(qubits[v])
            ry.ctrl(coupling[count+1],qubits[v],qubits[h],qubits[a_target])
            x(qubits[v])
            x(qubits[h])
            ry.ctrl(coupling[count+1],qubits[v],qubits[h],qubits[a_target])
            x(qubits[v])
            ry.ctrl(coupling[count],qubits[v],qubits[h],qubits[a_target])
            x(qubits[v])
            x(qubits[h])

            count+=2
            a_target+=1

    mz(qubits)    
    
v_nodes=2
h_nodes=2
ancilla=4

# Initialize the parameters for the RBM
theta=[2.0482, 1.4329, 2.1774, 2.7122]
coupling=[1.8256, 3.1415, 1.8257, 3.1415, 3.1415, 0.4152, 3.1415, 0.9654]

count_futures = []

for qpu in range(3):
    count_futures.append(cudaq.sample_async(qrbm,v_nodes, h_nodes, ancilla, theta, coupling, shots_count=10000,qpu_id=qpu))

for counts in count_futures:
    print(counts.get())

    

Number of QPUs: 5
{ 10110111:8 00010111:19 11100111:7 00000111:2 10011111:114 10111011:8 11011111:17 10011011:64 01011101:147 11110111:554 11011101:399 00101001:13 10101111:2 10111001:175 01101011:1 10011010:225 10110011:7 01000100:2 11100110:40 01111111:829 01001010:1 00111110:38 10110110:17 11111111:987 01101110:21 11001110:1 00111111:8 00000011:1 00011110:161 01000000:1 01000001:1 01101010:13 01001001:1 11011011:9 00010110:88 11111011:565 01010101:81 10100111:5 01101111:4 11110011:327 00101111:3 10110101:161 11011001:213 10110000:321 01011111:5 10111111:12 01001101:1 10011110:434 00011111:38 10110010:18 10110100:517 01001100:2 10110001:94 11101110:58 11101111:14 10111100:955 10111010:21 10111110:30 11001100:27 11001101:6 00111100:879 00111101:226 01000101:2 10111101:235 10111000:530 10001111:49 00001111:4 10100101:60 00101101:25 01010111:3 10101101:94 }

{ 10110111:8 00010111:21 11100111:6 00000111:3 10011111:96 10111011:6 11011111:19 10011011:87 01011101:140 11110111:548 00001011:7

#### Example with `observe` algorithmic primitives

In [2]:
import cudaq
from cudaq import spin
import numpy as np
import timeit

np.random.seed(1)

cudaq.set_target("nvidia-mqpu")
target = cudaq.get_target()
qpu_count = target.num_qpus()
print("Number of QPUs:", qpu_count)

qubit_count = 10
sample_count = 500

ham = spin.z(0)

parameter_count = qubit_count

# Below we run a circuit for 500 different input parameters.
parameters = np.random.default_rng(13).uniform(low=0,high=1,size=(sample_count,parameter_count))

print('Parameter shape: ', parameters.shape)

@cudaq.kernel
def kernel_rx(theta:list[float]):
    qubits = cudaq.qvector(qubit_count)

    for i in range(qubit_count):
        rx(theta[i], qubits[i])

#single GPU
start_time = timeit.default_timer()

result = cudaq.observe(kernel_rx, ham, parameters)
energies = np.array([r.expectation() for r in result])

end_time = timeit.default_timer()
print('Elapsed time (s) for single GPU: ', end_time-start_time)

#print('Energies from single GPU')
#print(energies)


# Multi-GPU

# We split our parameters into 4 arrays since we have 4 GPUs available.
xi = np.split(parameters,4)

print('We have', parameters.shape[0],
      'parameters which we would like to execute')

print('We split this into', len(xi), 'batches of', xi[0].shape[0], ',',
      xi[1].shape[0], ',', xi[2].shape[0], ',', xi[3].shape[0])

print('Shape after splitting', xi[0].shape)
asyncresults = []

start_time = timeit.default_timer()

for i in range(len(xi)):
    for j in range(xi[i].shape[0]):
        asyncresults.append(
            cudaq.observe_async(kernel_rx, ham, xi[i][j, :], qpu_id=i))
        
end_time = timeit.default_timer()
print('Elapsed time (s) for multi-GPU: ', end_time-start_time)

#print('Energies from multi-GPUs')
for result in asyncresults:
    observe_result = result.get()
    got_expectation = observe_result.expectation()
    #print(got_expectation)


Number of QPUs: 5
Parameter shape:  (500, 10)
Elapsed time (s) for single GPU:  1.8345087748020887
We have 500 parameters which we would like to execute
We split this into 4 batches of 125 , 125 , 125 , 125
Shape after splitting (125, 10)
Elapsed time (s) for multi-GPU:  0.006069962866604328


#### 2. Multi-GPU (nvidia-mgpu)

The `nvidia-mgpu` backend is useful for running a large single quantum circuit spread across multiple GPUs.
- A $n$ qubit quantum state has $2^n$ complex amplitudes, each of which require 8 bytes of memory to store. Hence the total memory required to store a n qubit quantum state is $8$ bytes $\times 2^n$. For $n=30$ qubits, this is roughly $8$ GB but for $n=40$, this exponentially increases to $8700$ GB.

#### Example: GHZ

```python
# mpirun -np 4 python <fname> --target nvidia-mgpu

import cudaq

cudaq.mpi.initialize()

qubit_count = 33

@cudaq.kernel
def kernel(qubit_num: int):
    # Allocate our qubits.
    qvector = cudaq.qvector(qubit_num)
    # Place the first qubit in the superposition state.
    h(qvector[0])
    # Loop through the allocated qubits and apply controlled-X,
    # or CNOT, operations between them.
    for qubit in range(qubit_num - 1):
        x.ctrl(qvector[qubit], qvector[qubit + 1])
    # Measure the qubits.
    mz(qvector)

#print("Preparing GHZ state for", qubit_count, "qubits.")
counts = cudaq.sample(kernel, qubit_count)

if cudaq.mpi.rank() == 0:
    print(counts)

cudaq.mpi.finalize()
```