In [None]:
import time
import unittest
from test.test_meteorology import Test_relhum

import cupy as cp
import dask
import numpy as np
import pandas as pd
import src.geocat.comp.meteorology as geo
import xarray as xr
# CSV file the NumPy/CuPy benchmark cell appends its timings to; later
# benchmark sections rebind this name to their own output file.
csvpath = "relhum_ported_test_numpy.csv"

## Plotting and Validation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot(allData, name):
    """Plot mean runtime against array size for NumPy and CuPy and save it.

    Parameters
    ----------
    allData : pandas.DataFrame
        Timing table with 'ArraySize', 'Approach' and 'Runtime(s)' columns.
        Only rows whose 'Approach' is 'numpy' or 'cupy' are plotted.
    name : str
        Output file name, passed straight to ``plt.savefig``.

    Notes
    -----
    For every distinct array size the mean runtime is drawn with an error
    bar equal to the standard deviation of the repetitions. Both axes are
    logarithmic. The global matplotlib font size is changed as a side effect.
    """
    arraysizes = np.unique(allData['ArraySize'])
    n_sizes = len(arraysizes)
    plt.rcParams.update({'font.size': 20})

    # Collect (label, line format, means, standard deviations) per approach.
    series = []
    for approach, line_format in (('numpy', '-o'), ('cupy', '-D')):
        means = np.zeros(n_sizes)
        spreads = np.zeros(n_sizes)
        for idx, size in enumerate(arraysizes):
            selected = allData.loc[(allData['ArraySize'] == size) & (allData['Approach'] == approach)]
            means[idx] = np.mean(selected['Runtime(s)'])
            spreads[idx] = np.std(selected['Runtime(s)'])
        series.append((approach, line_format, means, spreads))

    fig, ax = plt.subplots(figsize=(9, 6))
    for approach, line_format, means, spreads in series:
        ax.errorbar(arraysizes, means, yerr=spreads, fmt=line_format, label=approach, markersize=20)
    ax.legend()
    ax.set_xlabel('ArraySize')
    ax.set_ylabel('Runtime(s)')
    ax.set_title("Test_relhum")
    ax.set_xscale('log')
    ax.set_yscale('log')
    plt.savefig(name, dpi=fig.dpi)
def test_validation(res_numpy,res_cupy):
    assert np.allclose(res_numpy,res_cupy, atol=0.0000001)

## Cluster

### Local CUDA Cluster

In [None]:
import dask.distributed as dd
#client = dd.Client()
from dask_cuda import LocalCUDACluster
# Start a LocalCUDACluster with default settings and connect a distributed
# client to it. `client` is rebound by the PBS cluster cells further down.
cluster = LocalCUDACluster()
client = dd.Client(cluster)

In [None]:
# Display the client that was just connected to the local CUDA cluster.
client

### CPU cluster on PBS cluster

In [None]:
# import dask.array as da
from dask_jobqueue import PBSCluster
from dask.distributed import Client

# CPU cluster definition for PBS: 36 processes / 36 cores and 200 GB per job,
# 'casper' queue, two-hour walltime. resource_spec repeats the same request
# in PBS select syntax.
clusterCPU = PBSCluster(memory='200 GB',
                     processes=36,
                     cores=36,
                     queue='casper',
                     walltime='02:00:00',
                     resource_spec='select=1:ncpus=36:mem=200gb')
# Print the generated job script so it can be inspected.
print(clusterCPU.job_script())
# Scale the cluster to 1 (presumably one PBS job — confirm against
# dask_jobqueue's scale() semantics).
clusterCPU.scale(1)
# Rebinds `client`: from here on the notebook talks to the CPU PBS cluster.
client = Client(clusterCPU)
#cluster.close()


In [None]:
# Display the client that was just connected to the CPU PBS cluster.
client

### CUDA cluster on PBS cluster

In [None]:
# import dask.array as da
from dask_jobqueue import PBSCluster
from dask.distributed import Client

# GPU cluster definition for PBS: 1 process / 1 core and 200 GB per job,
# 'casper' queue, two-hour walltime. resource_spec additionally requests one
# GPU (ngpus=1).
clusterCUDA = PBSCluster(memory='200 GB',
                     processes=1,
                     cores=1,
                     queue='casper',
                     walltime='02:00:00',
                     resource_spec='select=1:ncpus=1:ngpus=1:mem=200gb')
# Print the generated job script so it can be inspected.
print(clusterCUDA.job_script())
# Scale the cluster to 1 (presumably one PBS job — confirm against
# dask_jobqueue's scale() semantics).
clusterCUDA.scale(1)
# Rebinds `client`: from here on the notebook talks to the GPU PBS cluster.
client = Client(clusterCUDA)
#cluster.close()

In [None]:
# Shut down the CPU PBS cluster.
# NOTE(review): later cells call Client(clusterCPU) again. When the notebook
# is run top to bottom the cluster is already closed by then — confirm
# whether this cell should be skipped or moved to the end of the notebook.
clusterCPU.close()

In [None]:
# Display the current client (last bound to clusterCUDA in the cells above).
client

## Initializing Arrays

In [None]:
# Benchmark configuration.
# max_power: largest exponent; input arrays have 10**1 ... 10**max_power elements.
# chunksize: Dask chunk length used by the Dask benchmark cells.
max_power = 8
chunksize = 10**5

# Random p / t / q inputs for geo.relhum, one xarray.DataArray per size.
# Entry i-1 of each list holds the array with 10**i elements, which is how
# every later cell indexes these lists (p_def_arrays[i-1] with i up to 8).
# The loop previously ran over range(8, 9) only, leaving a single entry per
# list, so those lookups raised IndexError on a fresh kernel.
p_def_arrays = []
t_def_arrays = []
q_def_arrays = []
for i in range(1, max_power + 1):
    ArraySize = 10**i
    p_def_arrays.append(xr.DataArray(np.random.uniform(low=2000,high=100800,size=ArraySize)))
    t_def_arrays.append(xr.DataArray(np.random.uniform(low=194.65,high=302.45,size=ArraySize)))
    q_def_arrays.append(xr.DataArray(np.random.uniform(low=0,high=0.02038,size=ArraySize)))

# Results collected by the Xarray / Dask benchmark cells for later validation.
numpy_results = []
cupy_results = []

## Testing Numpy input and CPU

In [None]:
# Smoke test: relhum on the raw arrays (.data) via the CPU path.
xp = np
for i in range(8, 9):
    # i - 1 == 7, so each input list must hold at least eight arrays.
    p_def, q_def, t_def = (
        arrays[i - 1].data
        for arrays in (p_def_arrays, q_def_arrays, t_def_arrays)
    )
    rh_gt_2 = geo.relhum(t_def, q_def, p_def, use_gpu=False)

## Testing Numpy input and GPU

In [None]:
# Smoke test: relhum on the raw arrays (.data) via the GPU path.
for i in range(8):
    # i - 1 runs from -1 to 6: the first pass picks the LAST array of each
    # list (negative index), the remaining passes pick entries 0..6.
    p_def, q_def, t_def = (
        arrays[i - 1].data
        for arrays in (p_def_arrays, q_def_arrays, t_def_arrays)
    )
    rh_gt_2 = geo.relhum(t_def, q_def, p_def, use_gpu=True)

## Testing Xarray input and CPU

In [None]:
# Smoke test: relhum on xarray.DataArray inputs via the CPU path.
for i in range(8):
    # i - 1 runs from -1 to 6: the first pass picks the LAST array of each
    # list (negative index), the remaining passes pick entries 0..6.
    p_def, q_def, t_def = p_def_arrays[i - 1], q_def_arrays[i - 1], t_def_arrays[i - 1]
    # Re-wrap each input in a fresh DataArray before the call.
    p, t, q = (xr.DataArray(values) for values in (p_def, t_def, q_def))
    a = geo.relhum(t, q, p, use_gpu=False).compute()

## Testing Xarray input and GPU

In [None]:
# Smoke test: relhum on xarray.DataArray inputs via the GPU path.
for i in range(8):
    # i - 1 runs from -1 to 6: the first pass picks the LAST array of each
    # list (negative index), the remaining passes pick entries 0..6.
    p_def, q_def, t_def = p_def_arrays[i - 1], q_def_arrays[i - 1], t_def_arrays[i - 1]
    # Re-wrap each input in a fresh DataArray before the call.
    p, t, q = (xr.DataArray(values) for values in (p_def, t_def, q_def))
    a = geo.relhum(t, q, p, use_gpu=True).compute()

## Testing Dask input with CPU

In [None]:
# Smoke test: relhum on Dask-chunked DataArrays via the CPU path.
for i in range(8, 9):
    # The loop variable is not used: entry 0 of each list is always taken.
    p_def, q_def, t_def = p_def_arrays[0], q_def_arrays[0], t_def_arrays[0]
    # Chunks of 10 elements each.
    p, t, q = (xr.DataArray(values).chunk(10) for values in (p_def, t_def, q_def))
    a = geo.relhum(t, q, p, use_gpu=False).compute()

## Testing Dask input with GPU

In [None]:
# Smoke test: relhum on Dask-chunked DataArrays via the GPU path.
for i in range(2):
    # i - 1 is -1 then 0: the last and the first array of each list.
    p_def, q_def, t_def = p_def_arrays[i - 1], q_def_arrays[i - 1], t_def_arrays[i - 1]
    # Chunks of 10 elements each.
    p, t, q = (xr.DataArray(values).chunk(10) for values in (p_def, t_def, q_def))
    a = geo.relhum(t, q, p, use_gpu=True).compute()

## Unit tests with CPU

In [None]:
# Run the Test_relhum cases by hand with use_gpu=False.
test = Test_relhum()
test.setUpClass()

# Cases that accept the use_gpu switch, first group.
for _case_name in ("test_float_input", "test_list_input", "test_numpy_input"):
    getattr(test, _case_name)(use_gpu=False)

# These two cases are called without a use_gpu argument.
test.test_dims_error()
test.test_xarray_type_error()

# Cases that accept the use_gpu switch, second group.
for _case_name in ("test_dask_compute", "test_dask_lazy"):
    getattr(test, _case_name)(use_gpu=False)

## Unittests with GPU

In [None]:
# Run the Test_relhum cases by hand with use_gpu=True.
test = Test_relhum()
test.setUpClass()

# Cases that accept the use_gpu switch, first group.
for _case_name in ("test_float_input", "test_list_input", "test_numpy_input"):
    getattr(test, _case_name)(use_gpu=True)

# These two cases are called without a use_gpu argument.
test.test_dims_error()
test.test_xarray_type_error()

# Cases that accept the use_gpu switch, second group.
for _case_name in ("test_dask_compute", "test_dask_lazy"):
    getattr(test, _case_name)(use_gpu=True)

## Benchmark Results for different Array Sizes (NUMPY/CUPY)

In [None]:
# Benchmark geo.relhum on the raw arrays (.data of each input):
# CPU path (use_gpu=False) versus GPU path (use_gpu=True).
Routine = "Relhum"
print(Routine)
# One timing table per (array size, approach). They are concatenated once
# after the loop instead of re-copying allData with pd.concat every pass.
frames = []
for i in range(1,max_power):
    ArraySize = 10**i
    # Entry i-1 of each input list is expected to hold the 10**i-element array.
    p_def = p_def_arrays[i-1].data
    t_def = t_def_arrays[i-1].data
    q_def = q_def_arrays[i-1].data
    print("Array size: ", ArraySize)
    # xp selects the code path and labels the 'Approach' column; the inputs
    # handed to relhum are the same for both passes.
    for xp in [np,cp]:
        # Every measurement is repeated repsize times to reduce noise.
        repsize = 10
        repeat = np.zeros([repsize])
        for rep in range(0,repsize):
            if xp is cp:
                # Untimed warm-up call; synchronize so no GPU work from it
                # is still pending when the timer starts.
                res_cupy = geo.relhum(t_def, q_def, p_def,True)
                cp.cuda.runtime.deviceSynchronize()
                # perf_counter() is the monotonic, high-resolution clock
                # intended for measuring short intervals.
                time1 = time.perf_counter()
                res_cupy = geo.relhum(t_def, q_def, p_def,True)
                # Wait for the GPU to finish before stopping the timer.
                cp.cuda.runtime.deviceSynchronize()
                time2 = time.perf_counter()
                repeat[rep] = time2-time1
            else:
                time1 = time.perf_counter()
                res_numpy = geo.relhum(t_def, q_def,p_def,False)
                time2 = time.perf_counter()
                repeat[rep] = time2-time1
        # Record the repetitions for this size and approach.
        data = {'Routine': np.repeat(Routine, repsize),
                'Input':"NumPy input",
                'Approach': np.repeat(xp.__name__ , repsize),
                'ArraySize': np.repeat(ArraySize , repsize),
                'iteration' : np.arange(1,repsize+1),
                'Runtime(s)': repeat}
        frames.append(pd.DataFrame(data))
        print(xp.__name__,np.mean(repeat), "seconds")
    # CPU and GPU results for this size must agree.
    test_validation(res_numpy,res_cupy)
# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)
plot(allData,"Test_relhum_ported_numpy.jpg")

In [None]:
# Redraw and re-save the NumPy/CuPy figure from the current allData
# (same call as the last line of the benchmark cell above).
plot(allData,"Test_relhum_ported_numpy.jpg")

## Benchmark results for different array sizes: Xarray input (with NumPy/CuPy arrays inside the Xarray)

### Test Relhum on CPU

In [None]:
# Point `client` at the CPU PBS cluster (presumably so the CPU benchmark
# below runs on its workers), then display it.
# NOTE(review): an earlier cell calls clusterCPU.close(); confirm the cluster
# is still running when this cell executes.
client = Client(clusterCPU)
client

In [None]:
csvpath = "relhum_ported_test_xarray.csv"
# Benchmark geo.relhum on xarray.DataArray inputs via the CPU path
# (use_gpu=False).
Routine = "Relhum"
print(Routine)
# One timing table per array size. They are concatenated once after the loop
# instead of re-copying allData with pd.concat on every pass.
frames = []
for i in range(1,max_power):
    ArraySize = 10**i
    print("Array size: ", ArraySize)
    # Entry i-1 of each input list is expected to hold the 10**i-element array.
    p_def = p_def_arrays[i-1]
    t_def = t_def_arrays[i-1]
    q_def = q_def_arrays[i-1]
    # xp only labels the 'Approach' column in this cell.
    xp = np
    # Every measurement is repeated repsize times to reduce noise.
    repsize = 10
    repeat = np.zeros([repsize])
    for rep in range(0,repsize):
        # Untimed warm-up call before each timed call.
        numpy_res = geo.relhum(t_def, q_def, p_def,False).compute()
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring short intervals.
        time1 = time.perf_counter()
        numpy_res = geo.relhum(t_def, q_def, p_def,False).compute()
        time2 = time.perf_counter()
        repeat[rep] = time2-time1
    # Keep the last result for the validation cell. Re-running this cell
    # appends again, so numpy_results grows on every execution.
    numpy_results.append(numpy_res)
    data = {'Routine': np.repeat(Routine, repsize),
            'Input':"Xarray with NumPy input",
            'Approach': np.repeat(xp.__name__ , repsize),
            'ArraySize': np.repeat(ArraySize , repsize),
            'iteration' : np.arange(1,repsize+1),
            'Runtime(s)': repeat}
    frames.append(pd.DataFrame(data))
    print(xp.__name__,np.mean(repeat), "seconds")
# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)

### Test Relhum on GPU

In [None]:
# Point `client` at the GPU PBS cluster (presumably so the GPU benchmark
# below runs on its worker), then display it.
client = Client(clusterCUDA)
client

In [None]:
# Benchmark geo.relhum on xarray.DataArray inputs via the GPU path
# (use_gpu=True). Timings are appended to the CSV named by `csvpath`, which
# this cell does not set itself: it relies on the value left by an earlier cell.
Routine = "Relhum"
print(Routine)
# One timing table per array size. They are concatenated once after the loop
# instead of re-copying allData with pd.concat on every pass.
frames = []
for i in range(1,max_power):
    ArraySize = 10**i
    # Entry i-1 of each input list is expected to hold the 10**i-element array.
    p_def = p_def_arrays[i-1]
    t_def = t_def_arrays[i-1]
    q_def = q_def_arrays[i-1]
    print("Array size: ", ArraySize)
    # xp only labels the 'Approach' column in this cell.
    xp = cp
    # Every measurement is repeated repsize times to reduce noise.
    repsize = 10
    repeat = np.zeros([repsize])
    for rep in range(0,repsize):
        # Untimed warm-up call, then a device synchronize before the timer
        # starts.
        cupy_res = geo.relhum(t_def, q_def, p_def,True).compute()
        cp.cuda.runtime.deviceSynchronize()
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring short intervals.
        time1 = time.perf_counter()
        cupy_res = geo.relhum(t_def, q_def, p_def,True).compute()
        cp.cuda.runtime.deviceSynchronize()
        time2 = time.perf_counter()
        repeat[rep] = time2-time1
    # Keep the last result for the validation cell. Re-running this cell
    # appends again, so cupy_results grows on every execution.
    cupy_results.append(cupy_res)
    data = {'Routine': np.repeat(Routine, repsize),
            'Input':"Xarray with NumPy input",
            'Approach': np.repeat(xp.__name__ , repsize),
            'ArraySize': np.repeat(ArraySize , repsize),
            'iteration' : np.arange(1,repsize+1),
            'Runtime(s)': repeat}
    frames.append(pd.DataFrame(data))
    print(xp.__name__,np.mean(repeat), "seconds")

# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)
# plot(allData,"Relhum_xarray_np_ported.jpg")

### Validation

In [None]:
# Validation: every stored CPU result must match the GPU result at the same
# position. Note the call passes the GPU result first, the reverse of
# test_validation's parameter order (res_numpy, res_cupy).
for i, cpu_result in enumerate(numpy_results):
    test_validation(cupy_results[i].data, cpu_result.data)

In [None]:
# Inspect the first stored GPU result.
cupy_results[0]

In [None]:
# Inspect the first stored CPU result.
numpy_results[0]

## Benchmark results for different array sizes: Xarray input (with Dask arrays inside the Xarray, whose underlying array type is either NumPy or CuPy)

### Test Relhum on CPU

In [None]:
# Point `client` at the CPU PBS cluster (presumably so the CPU benchmark
# below runs on its workers), then display it.
# NOTE(review): an earlier cell calls clusterCPU.close(); confirm the cluster
# is still running when this cell executes.
client = Client(clusterCPU)
client

In [None]:
csvpath = "relhum_ported_test_dask.csv"
# Benchmark geo.relhum on Dask-chunked xarray.DataArray inputs via the CPU
# path (use_gpu=False). Each timed interval covers the relhum call plus
# .compute().
Routine = "Relhum"
print(Routine)
# One timing table per array size. They are concatenated once after the loop
# instead of re-copying allData with pd.concat on every pass.
frames = []
for i in range(1,max_power):
    ArraySize = 10**i
    print("Array size: ", ArraySize)
    # Entry i-1 of each input list is expected to hold the 10**i-element
    # array; it is split into chunks of `chunksize` elements.
    p_def = p_def_arrays[i-1].chunk(chunksize)
    t_def = t_def_arrays[i-1].chunk(chunksize)
    q_def = q_def_arrays[i-1].chunk(chunksize)
    # xp only labels the 'Approach' column in this cell.
    xp = np
    # Every measurement is repeated repsize times to reduce noise.
    repsize = 10
    repeat = np.zeros([repsize])
    for rep in range(0,repsize):
        # Untimed warm-up call before each timed call.
        numpy_res = geo.relhum(t_def, q_def, p_def,False).compute()
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring short intervals.
        time1 = time.perf_counter()
        numpy_res = geo.relhum(t_def, q_def, p_def,False).compute()
        time2 = time.perf_counter()
        repeat[rep] = time2-time1
    # Keep the last result for the validation cell. Re-running this cell
    # appends again, so numpy_results grows on every execution.
    numpy_results.append(numpy_res)
    data = {'Routine': np.repeat(Routine, repsize),
            'Input':"Xarray with Dask array input",
            'Approach': np.repeat(xp.__name__ , repsize),
            'ArraySize': np.repeat(ArraySize , repsize),
            'iteration' : np.arange(1,repsize+1),
            'Runtime(s)': repeat}
    frames.append(pd.DataFrame(data))
    print(xp.__name__,np.mean(repeat), "seconds")

# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)

### Test Relhum on GPU

In [None]:
# Point `client` at the GPU PBS cluster (presumably so the GPU benchmark
# below runs on its worker), then display it.
client = Client(clusterCUDA)
client

In [None]:
# Benchmark geo.relhum on Dask-chunked xarray.DataArray inputs via the GPU
# path (use_gpu=True). Each timed interval covers the relhum call plus
# .compute(). Timings are appended to the CSV named by `csvpath`, which this
# cell does not set itself: it relies on the value left by an earlier cell.
Routine = "Relhum"
print(Routine)
# One timing table per array size. They are concatenated once after the loop
# instead of re-copying allData with pd.concat on every pass.
frames = []
for i in range(1,max_power):
    ArraySize = 10**i
    print("Array size: ", ArraySize)
    # Entry i-1 of each input list is expected to hold the 10**i-element
    # array; it is split into chunks of `chunksize` elements.
    p_def = p_def_arrays[i-1].chunk(chunksize)
    t_def = t_def_arrays[i-1].chunk(chunksize)
    q_def = q_def_arrays[i-1].chunk(chunksize)
    # xp only labels the 'Approach' column in this cell.
    xp = cp
    # Every measurement is repeated repsize times to reduce noise.
    repsize = 10
    repeat = np.zeros([repsize])
    for rep in range(0,repsize):
        # Untimed warm-up call, then a device synchronize before the timer
        # starts.
        cupy_res = geo.relhum(t_def, q_def, p_def,True).compute()
        cp.cuda.runtime.deviceSynchronize()
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring short intervals.
        time1 = time.perf_counter()
        cupy_res = geo.relhum(t_def, q_def, p_def,True).compute()
        cp.cuda.runtime.deviceSynchronize()
        time2 = time.perf_counter()
        repeat[rep] = time2-time1
    # Keep the last result for the validation cell. Re-running this cell
    # appends again, so cupy_results grows on every execution.
    cupy_results.append(cupy_res)
    data = {'Routine': np.repeat(Routine, repsize),
            'Input':"Xarray with Dask array input",
            'Approach': np.repeat(xp.__name__ , repsize),
            'ArraySize': np.repeat(ArraySize , repsize),
            'iteration' : np.arange(1,repsize+1),
            'Runtime(s)': repeat}
    new = pd.DataFrame(data)
    frames.append(new)
    print(xp.__name__,np.mean(repeat), "seconds")
    #print(np.allclose(cupy_res.data,numpy_res.data,atol=0.0000001))
# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)

### Validation

In [None]:
# Validation: every stored CPU result must match the GPU result at the same
# position. test_validation is the helper defined in the "Plotting and
# Validation" cell near the top of the notebook; the identical second
# definition that used to sit here was removed so there is a single one.
# Note the call passes the GPU result first, the reverse of the helper's
# parameter order (res_numpy, res_cupy).
for i in range(len(numpy_results)):
    test_validation(cupy_results[i].data,numpy_results[i].data)

In [None]:
# Inspect entry 13 of the stored CPU results.
# NOTE(review): the hard-coded index needs at least 14 appended results —
# confirm it matches the number of benchmark cells run so far.
numpy_results[13]

In [None]:
# Inspect entry 13 of the stored GPU results.
# NOTE(review): the hard-coded index needs at least 14 appended results —
# confirm it matches the number of benchmark cells run so far.
cupy_results[13]

In [None]:
# Number of GPU results accumulated so far.
len(cupy_results)

### Only comparing "compute()" runtime

In [None]:
# Rebind the Dask chunk length for the "compute()"-only benchmark cells that
# follow (it was 10**5 in the initialisation cell).
chunksize = 100

#### on CPU

In [None]:
# Point `client` at the CPU PBS cluster (presumably so the CPU benchmark
# below runs on its workers), then display it.
# NOTE(review): an earlier cell calls clusterCPU.close(); confirm the cluster
# is still running when this cell executes.
client = Client(clusterCPU)
client

In [None]:
csvpath = "relhum_ported_test_dask_compute_10.csv"
# Benchmark only the .compute() step of geo.relhum on Dask-chunked inputs,
# CPU path (use_gpu=False): the relhum call itself is outside the timer.
Routine = "Relhum"
print(Routine)
# One timing table per array size. They are concatenated once after the loop
# instead of re-copying allData with pd.concat on every pass.
frames = []
for i in range(1,7):
    ArraySize = 10**i
    print("Array size: ", ArraySize)
    # Entry i-1 of each input list is expected to hold the 10**i-element
    # array; it is split into chunks of `chunksize` elements.
    p_def = p_def_arrays[i-1].chunk(chunksize)
    t_def = t_def_arrays[i-1].chunk(chunksize)
    q_def = q_def_arrays[i-1].chunk(chunksize)
    # xp only labels the 'Approach' column in this cell.
    xp = np
    # Every measurement is repeated repsize times to reduce noise.
    repsize = 10
    repeat = np.zeros([repsize])
    for rep in range(0,repsize):
        # Call relhum outside the timer, then run one untimed warm-up
        # compute whose result is discarded.
        numpy_res = geo.relhum(t_def, q_def, p_def,False)
        numpy_res.compute()
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring short intervals.
        time1 = time.perf_counter()
        numpy_res = numpy_res.compute()
        time2 = time.perf_counter()
        repeat[rep] = time2-time1
    # Keep the last result for the validation cell. Re-running this cell
    # appends again, so numpy_results grows on every execution.
    numpy_results.append(numpy_res)
    data = {'Routine': np.repeat(Routine, repsize),
            'Input':"Xarray with Dask array input compute",
            'Approach': np.repeat(xp.__name__ , repsize),
            'ArraySize': np.repeat(ArraySize , repsize),
            'iteration' : np.arange(1,repsize+1),
            'Runtime(s)': repeat}
    frames.append(pd.DataFrame(data))
    print(xp.__name__,np.mean(repeat), "seconds")

# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)

#### on GPU

In [None]:
# Point `client` at the GPU PBS cluster (presumably so the GPU benchmark
# below runs on its worker), then display it.
client = Client(clusterCUDA)
client

In [None]:
# Benchmark only the .compute() step of geo.relhum on Dask-chunked inputs,
# GPU path (use_gpu=True): the relhum call itself is outside the timer.
# Timings are appended to the CSV named by `csvpath`, which this cell does
# not set itself: it relies on the value left by an earlier cell.
Routine = "Relhum"
print(Routine)
# One timing table per array size. They are concatenated once after the loop
# instead of re-copying allData with pd.concat on every pass.
frames = []
for i in range(1,7):
    ArraySize = 10**i
    print("Array size: ", ArraySize)
    # Entry i-1 of each input list is expected to hold the 10**i-element
    # array; it is split into chunks of `chunksize` elements.
    p_def = p_def_arrays[i-1].chunk(chunksize)
    t_def = t_def_arrays[i-1].chunk(chunksize)
    q_def = q_def_arrays[i-1].chunk(chunksize)
    # xp only labels the 'Approach' column in this cell.
    xp = cp
    # Every measurement is repeated repsize times to reduce noise.
    repsize = 10
    repeat = np.zeros([repsize])
    for rep in range(0,repsize):
        # Call relhum outside the timer, run one untimed warm-up compute
        # whose result is discarded, then synchronize the device.
        cupy_res = geo.relhum(t_def, q_def, p_def,True)
        cupy_res.compute()
        cp.cuda.runtime.deviceSynchronize()
        # perf_counter() is the monotonic, high-resolution clock intended
        # for measuring short intervals.
        time1 = time.perf_counter()
        cupy_res = cupy_res.compute()
        cp.cuda.runtime.deviceSynchronize()
        time2 = time.perf_counter()
        repeat[rep] = time2-time1
    # Keep the last result for the validation cell. Re-running this cell
    # appends again, so cupy_results grows on every execution.
    cupy_results.append(cupy_res)
    data = {'Routine': np.repeat(Routine, repsize),
            'Input':"Xarray with Dask array input compute",
            'Approach': np.repeat(xp.__name__ , repsize),
            'ArraySize': np.repeat(ArraySize , repsize),
            'iteration' : np.arange(1,repsize+1),
            'Runtime(s)': repeat}
    new = pd.DataFrame(data)
    frames.append(new)
    print(xp.__name__,np.mean(repeat), "seconds")
    #print(np.allclose(cupy_res.data,numpy_res.data,atol=0.0000001))
# pd.concat rejects an empty list, so fall back to an empty frame.
allData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Append to the timings of earlier runs if the CSV already exists.
try:
    previous = pd.read_csv(csvpath)
    previous = pd.concat([previous,allData])
except FileNotFoundError:
    previous = allData
previous.to_csv(csvpath, index=False)

#### Validation

In [None]:
# Validation: every stored CPU result must match the GPU result at the same
# position. Note the call passes the GPU result first, the reverse of
# test_validation's parameter order (res_numpy, res_cupy).
for i, cpu_result in enumerate(numpy_results):
    test_validation(cupy_results[i].data, cpu_result.data)