## Latest Process code

Below is the definition of the process function that uploads ALL the lat-lon pairs of the data.
Running the process function over one file (e.g. the file used here) takes 3.5 h, which is quite a lot for just one file.
The main reason seems to be that creating each "c3.SurfaceHindcastData" object takes 0.3 s. This seems strange, as it should be just an object initialization. Why does it take so long?

It would be awesome if you could investigate this, @Darren. Maybe you have ideas on how to speed it up?

In [8]:
# Prototype for processing hindcast files
from datetime import datetime, timedelta
from itertools import islice

def chunk(gen, k):
    """Split an iterable into consecutive lists of at most ``k`` items.

    Args:
        gen: any iterable (generator, iterator, list, range, ...). It is
            wrapped with ``iter()`` exactly once, so re-iterable inputs such
            as lists are consumed progressively instead of being restarted
            for every batch (which would never terminate).
        k: maximum number of items per batch; used as the ``stop`` argument
            of ``itertools.islice``.

    Yields:
        Non-empty lists of up to ``k`` items in input order. The last batch
        may be shorter. Nothing is yielded for an empty input.
    """
    iterator = iter(gen)
    while True:
        batch = list(islice(iterator, k))
        if not batch:
            return
        yield batch

def upsertFunc(objs):
    """Upsert one batch of records through ``c3.SurfaceHindcastData.upsertBatch``.

    The result of the call is discarded; the function returns ``None``.
    """
    hindcast_type = c3.SurfaceHindcastData
    hindcast_type.upsertBatch(objs)

def process(this,chunkSize=5000,maxConcurrency=16):
    """Process a single Hindcast NetCDF file into SurfaceHindcastData records.

    For every time step between ``this.start`` and ``this.end`` (inclusive,
    spaced ``this.subsetOptions.timeStride`` hours apart) one
    ``c3.SurfaceHindcastData`` object is built per (lat index, lon index)
    pair. The objects are submitted in batches of ``chunkSize`` as
    ``upsertBatch`` actions through ``c3.AsyncAction.submit``. Afterwards the
    ``HindcastFile`` with id ``this.id`` is merged with ``processed=True``.

    Args:
        this: the HindcastFile to process; ``file.url``, ``start``, ``end``,
            ``subsetOptions.timeStride`` and ``id`` are read from it.
        chunkSize: number of records per submitted batch.
        maxConcurrency: not used by this implementation; kept so existing
            callers keep working.

    Raises:
        ValueError: if ``timeStride`` is not positive (the time loop would
            otherwise never terminate).

    NOTE(review): batches are only *submitted* here; nothing in this function
    waits for the async actions to finish before the file is flagged as
    processed. Confirm that this is intended.
    """
    url = this.file.url
    hycom_file = c3.HycomUtil.nc_open(url)
    try:
        # Lat/lon are addressed by integer index only.
        # Not yet done: determine the offsets from the subsetOptions of this
        # file. For now full coverage is assumed.
        xsz = len(hycom_file['lon'])
        ysz = len(hycom_file['lat'])
        latitudes = range(ysz)
        longitudes = range(xsz)

        def gentimes(start, end, stride):
            """Yield start, start + stride h, ... up to and including end."""
            if stride <= 0:
                raise ValueError(f"timeStride must be positive, got {stride!r}")
            t = start
            while t <= end:
                yield t
                t += timedelta(hours=stride)

        times = list(gentimes(this.start, this.end, this.subsetOptions.timeStride))

        # Look the variables up once; they do not change between time steps.
        u_var = hycom_file.variables['water_u']
        v_var = hycom_file.variables['water_v']

        for it, time in enumerate(times):
            print(f"Processing for time step: {time}")
            # Read only the 2-D slice for this time step (second-axis index 0)
            # instead of loading the whole variable on every iteration.
            # NOTE(review): assumes slicing returns an object with a `.data`
            # array, as the full `[:]` read did; confirm against nc_open.
            water_u = u_var[it, 0, :, :].data
            water_v = v_var[it, 0, :, :].data

            # Lazily instantiate the records so only one chunk is held at once.
            genRecords = (
                c3.SurfaceHindcastData(
                    **{
                        'start': time,
                        'parent': c3.SurfaceHindcastDataSeries(
                            id='HNDCST_SRFC_' + str(i) + '-' + str(j)
                        ).toJson(),
                        'name': 'water_u',
                        'water_u': water_u[i, j],
                        'water_v': water_v[i, j],
                    }
                )
                for i in latitudes
                for j in longitudes
            )

            for ic, objs in enumerate(chunk(genRecords, chunkSize), start=1):
                print (f"Submiting Async for chunk: {ic} {chunkSize}")
                c3.AsyncAction.submit({
                    'typeName': "SurfaceHindcastData",
                    'action': 'upsertBatch',
                    'args': {
                        'objs': c3.c3Make("[SurfaceHindcastData]", objs)
                    }
                })
    finally:
        # Always release the NetCDF handle, even when a step above fails.
        c3.HycomUtil.nc_close(ds=hycom_file, url=url)

    # Reached only when every batch was submitted without an exception.
    c3.HindcastFile(
        **{
            "id": this.id,
            "processed": True
        }
    ).merge()

In [6]:
# Fetch, by id, the HindcastFile record that is passed to process() below.
file = c3.HindcastFile.get(
    '55c392d8-9d83-478f-85b0-f1c0037ef4ee/GOMu0.04-expt_90.1m000-2021-2021-09-02T12:00:00Z-2021-09-03T11:00:00Z.nc'
)

In [None]:
process(file, chunkSize=5000)

Processing for time step: 2021-09-02 12:00:00
Submiting Async for chunk: 1 5000
Submiting Async for chunk: 2 5000
Submiting Async for chunk: 3 5000
Submiting Async for chunk: 4 5000
Submiting Async for chunk: 5 5000
Submiting Async for chunk: 6 5000
Submiting Async for chunk: 7 5000
Submiting Async for chunk: 8 5000
Submiting Async for chunk: 9 5000
Submiting Async for chunk: 10 5000
Submiting Async for chunk: 11 5000
Submiting Async for chunk: 12 5000
Submiting Async for chunk: 13 5000
Submiting Async for chunk: 14 5000
Submiting Async for chunk: 15 5000
Submiting Async for chunk: 16 5000
Submiting Async for chunk: 17 5000
Submiting Async for chunk: 18 5000
Submiting Async for chunk: 19 5000
Submiting Async for chunk: 20 5000
Submiting Async for chunk: 21 5000
Submiting Async for chunk: 22 5000
Submiting Async for chunk: 23 5000
Submiting Async for chunk: 24 5000
Submiting Async for chunk: 25 5000
Submiting Async for chunk: 26 5000
Submiting Async for chunk: 27 5000
Submiting Async fo

Processing for time step: 2021-09-02 18:00:00
Submiting Async for chunk: 1 5000
Submiting Async for chunk: 2 5000
Submiting Async for chunk: 3 5000
Submiting Async for chunk: 4 5000
Submiting Async for chunk: 5 5000
Submiting Async for chunk: 6 5000
Submiting Async for chunk: 7 5000
Submiting Async for chunk: 8 5000
Submiting Async for chunk: 9 5000
Submiting Async for chunk: 10 5000
Submiting Async for chunk: 11 5000
Submiting Async for chunk: 12 5000
Submiting Async for chunk: 13 5000
Submiting Async for chunk: 14 5000
Submiting Async for chunk: 15 5000
Submiting Async for chunk: 16 5000
Submiting Async for chunk: 17 5000
Submiting Async for chunk: 18 5000
Submiting Async for chunk: 19 5000
Submiting Async for chunk: 20 5000
Submiting Async for chunk: 21 5000
Submiting Async for chunk: 22 5000
Submiting Async for chunk: 23 5000
Submiting Async for chunk: 24 5000
Submiting Async for chunk: 25 5000
Submiting Async for chunk: 26 5000
Submiting Async for chunk: 27 5000
Submiting Async fo

Processing for time step: 2021-09-03 00:00:00
Submiting Async for chunk: 1 5000
Submiting Async for chunk: 2 5000
Submiting Async for chunk: 3 5000
Submiting Async for chunk: 4 5000
Submiting Async for chunk: 5 5000
Submiting Async for chunk: 6 5000
Submiting Async for chunk: 7 5000
Submiting Async for chunk: 8 5000
Submiting Async for chunk: 9 5000
Submiting Async for chunk: 10 5000
Submiting Async for chunk: 11 5000
Submiting Async for chunk: 12 5000
Submiting Async for chunk: 13 5000
Submiting Async for chunk: 14 5000
Submiting Async for chunk: 15 5000
Submiting Async for chunk: 16 5000
Submiting Async for chunk: 17 5000
Submiting Async for chunk: 18 5000
Submiting Async for chunk: 19 5000
Submiting Async for chunk: 20 5000
Submiting Async for chunk: 21 5000
Submiting Async for chunk: 22 5000
Submiting Async for chunk: 23 5000
Submiting Async for chunk: 24 5000
Submiting Async for chunk: 25 5000
Submiting Async for chunk: 26 5000
Submiting Async for chunk: 27 5000
Submiting Async fo

Processing for time step: 2021-09-03 06:00:00
Submiting Async for chunk: 1 5000
Submiting Async for chunk: 2 5000
Submiting Async for chunk: 3 5000
Submiting Async for chunk: 4 5000
Submiting Async for chunk: 5 5000
Submiting Async for chunk: 6 5000
Submiting Async for chunk: 7 5000
Submiting Async for chunk: 8 5000
Submiting Async for chunk: 9 5000
Submiting Async for chunk: 10 5000
Submiting Async for chunk: 11 5000
Submiting Async for chunk: 12 5000
Submiting Async for chunk: 13 5000
Submiting Async for chunk: 14 5000
Submiting Async for chunk: 15 5000
Submiting Async for chunk: 16 5000
Submiting Async for chunk: 17 5000
Submiting Async for chunk: 18 5000
Submiting Async for chunk: 19 5000
Submiting Async for chunk: 20 5000
Submiting Async for chunk: 21 5000
Submiting Async for chunk: 22 5000
Submiting Async for chunk: 23 5000
Submiting Async for chunk: 24 5000
Submiting Async for chunk: 25 5000
Submiting Async for chunk: 26 5000
Submiting Async for chunk: 27 5000
Submiting Async fo

In [29]:
process(file, chunkSize=2000,maxConcurrency=8)

Processing for time step: 2021-09-01 23:00:00
Loading 187186 records, chunkSize: 2000, maxConcurrency: 8


In [31]:
process(file, chunkSize=16000,maxConcurrency=8)

Processing for time step: 2021-09-02 00:00:00
Loading 187186 records, chunkSize: 16000, maxConcurrency: 8


In [33]:
process(file, chunkSize=2000,maxConcurrency=32)

Processing for time step: 2021-09-02 01:00:00
Loading 187186 records, chunkSize: 2000, maxConcurrency: 32


In [47]:
process(file, chunkSize=1000,maxConcurrency=32)

IndexError: list index out of range

## Testing Version

In [1]:
# Prototype for processing hindcast files
from datetime import datetime, timedelta
from itertools import islice

def chunk(gen, k):
    """Split an iterable into consecutive lists of at most ``k`` items.

    Args:
        gen: any iterable (generator, iterator, list, range, ...). It is
            wrapped with ``iter()`` exactly once, so re-iterable inputs such
            as lists are consumed progressively instead of being restarted
            for every batch (which would never terminate).
        k: maximum number of items per batch; used as the ``stop`` argument
            of ``itertools.islice``.

    Yields:
        Non-empty lists of up to ``k`` items in input order. The last batch
        may be shorter. Nothing is yielded for an empty input.
    """
    iterator = iter(gen)
    while True:
        batch = list(islice(iterator, k))
        if not batch:
            return
        yield batch

def createFunc(objs):
    """Create one batch of records through ``c3.SurfaceHindcastData.createBatch``.

    The result of the call is discarded; the function returns ``None``.
    """
    hindcast_type = c3.SurfaceHindcastData
    hindcast_type.createBatch(objs)

def process(this,chunkSize=5000,useDict=True,asynchronous=True,dryRun=False):
    """Testing variant: load the first time step of one Hindcast NetCDF file.

    Only the first time step of the file is handled. In the asynchronous
    path only the first chunk of that time step is submitted (the loop
    breaks after one batch).

    Args:
        this: the HindcastFile to process; ``file.url``, ``start``, ``end``
            and ``subsetOptions.timeStride`` are read from it.
        chunkSize: number of records per batch.
        useDict: when true, records are plain dicts; otherwise each record is
            a pre-instantiated ``c3.SurfaceHindcastData``.
        asynchronous: when true, submit an ``upsertBatch`` action through
            ``c3.AsyncAction.submit``; otherwise run ``createFunc`` over all
            chunks with ``c3.Client.executeConcurrently``.
        dryRun: asynchronous path only. Print the ``c3Make`` result of the
            batch instead of submitting it.

    Returns:
        The list of values returned by ``c3.AsyncAction.submit``. The list is
        empty for synchronous runs, dry runs, or an empty time range.

    Raises:
        ValueError: if ``timeStride`` is not positive (the time loop would
            otherwise never terminate).
    """
    url = this.file.url
    hycom_file = c3.HycomUtil.nc_open(url)
    actions = []
    try:
        # Lat/lon are addressed by integer index only.
        # Not yet done: determine the offsets from the subsetOptions of this
        # file. For now full coverage is assumed.
        xsz = len(hycom_file['lon'])
        ysz = len(hycom_file['lat'])
        latitudes = range(ysz)
        longitudes = range(xsz)

        def gentimes(start, end, stride):
            """Yield start, start + stride h, ... up to and including end."""
            if stride <= 0:
                raise ValueError(f"timeStride must be positive, got {stride!r}")
            t = start
            while t <= end:
                yield t
                t += timedelta(hours=stride)

        def record_fields(time, i, j, water_u, water_v):
            """Build the field dict shared by the dict and object record forms."""
            return {
                'start': time,
                'parent': c3.SurfaceHindcastDataSeries(
                    id='HNDCST_SRFC_' + str(i) + '-' + str(j)
                ).toJson(),
                'name': 'water_u',
                'water_u': water_u[i, j],
                'water_v': water_v[i, j],
            }

        times = list(gentimes(this.start, this.end, this.subsetOptions.timeStride))
        # Testing shortcut: keep the first time step only. The slice keeps an
        # empty time range from raising IndexError.
        t1 = times[:1]

        # Look the variables up once; they do not change between time steps.
        u_var = hycom_file.variables['water_u']
        v_var = hycom_file.variables['water_v']

        for it, time in enumerate(t1):
            print(f"Processing for time step: {time}")
            # Read only the 2-D slice for this time step (second-axis index 0)
            # instead of loading the whole variable.
            # NOTE(review): assumes slicing returns an object with a `.data`
            # array, as the full `[:]` read did; confirm against nc_open.
            water_u = u_var[it, 0, :, :].data
            water_v = v_var[it, 0, :, :].data

            if useDict:
                # Pass dictionaries to the batch call.
                genRecords = (
                    record_fields(time, i, j, water_u, water_v)
                    for i in latitudes
                    for j in longitudes
                )
            else:
                # Pass pre-instantiated objects to the batch call.
                genRecords = (
                    c3.SurfaceHindcastData(
                        **record_fields(time, i, j, water_u, water_v)
                    )
                    for i in latitudes
                    for j in longitudes
                )

            if asynchronous:
                # Asynchronously submit upserts.
                for ic, objs in enumerate(chunk(genRecords, chunkSize), start=1):
                    print (f"Submiting Async for chunk: {ic} {chunkSize}")
                    if dryRun:
                        print(c3.c3Make("[SurfaceHindcastData]",objs))
                    else:
                        action = c3.AsyncAction.submit({
                            'typeName': "SurfaceHindcastData",
                            'action': 'upsertBatch',
                            'args': {
                                'objs': c3.c3Make("[SurfaceHindcastData]", objs)
                            }
                        })
                        actions.append(action)
                    # Testing shortcut: stop after the first chunk.
                    break
            else:
                print(f"Loading {xsz*ysz} records with chunkSize of {chunkSize}")
                c3.Client.executeConcurrently(
                    createFunc, [(x,) for x in chunk(genRecords, chunkSize)]
                )
    finally:
        # Always release the NetCDF handle, even when a step above fails.
        c3.HycomUtil.nc_close(ds=hycom_file, url=url)

    return actions
    

In [2]:
# Fetch, by id, the HindcastFile record that is passed to process() below.
file = c3.HindcastFile.get(
    '55c392d8-9d83-478f-85b0-f1c0037ef4ee/GOMu0.04-expt_90.1m000-2021-2021-09-02T12:00:00Z-2021-09-03T11:00:00Z.nc'
)

In [3]:
actions = process(file,chunkSize=5000,useDict=False, asynchronous=True,dryRun=False)

Processing for time step: 2021-09-02 12:00:00
Submiting Async for chunk: 1 5000


In [4]:
actions[0].id

'c773f5ae-9745-4038-9b14-ed07f59c2674'

In [18]:
# Changed parent to use ...toJson()
actions = process(file,chunkSize=100,useDict=True)

Processing for time step: 2021-09-01 21:00:00
Loading 187186 records with chunkSize of 100


In [9]:
actions = process(file,chunkSize=1000,useDict=False)

Processing for time step: 2021-09-01 17:00:00
Loading 187186 records with chunkSize of 1000


In [11]:
actions = process(file,chunkSize=5000,useDict=False)

Processing for time step: 2021-09-01 18:00:00
Loading 187186 records with chunkSize of 5000


In [13]:
actions = process(file,chunkSize=1000,useDict=True)

Processing for time step: 2021-09-01 19:00:00
Loading 187186 records with chunkSize of 1000


In [16]:
# Changed parent to use ...toJson()
actions = process(file,chunkSize=1000,useDict=True)

Processing for time step: 2021-09-01 20:00:00
Loading 187186 records with chunkSize of 1000


In [122]:
actions = process(file,chunkSize=5000,asynchronous=False,useDict=False)

Processing for time step: 2021-09-01 14:00:00
(346, 541)
Loading 187186 records with chunkSize of 5000


In [124]:
actions = process(file,chunkSize=5000,useDict=False)

Processing for time step: 2021-09-01 15:00:00
Loading 187186 records with chunkSize of 5000


In [118]:
actions = process(file,chunkSize=10000,asynchronous=False,useDict=False)

Processing 24
time: 2021-09-01 13:00:00
(346, 541)


In [99]:
# Print the `completed` attribute of each action held in `actions`, one per line.
for a in actions:
    print(a.completed)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [94]:
i=range(2)
j=range(3)
ij = 

SyntaxError: invalid syntax (<ipython-input-94-6bce9e67f1d0>, line 3)

## Production Version

In [3]:
# Fetch one HindcastFile by id and call its `process` action directly.
# The captured output of this cell shows the request ending in a
# 504 Gateway Time-out.
file = c3.HindcastFile.get('9438b440-e200-4764-b6fe-ee6a278aff55/GOMu0.04-expt_90.1m000-2021-2021-09-01T00:00:00Z-2021-09-01T23:00:00Z.nc')
file.process(
    chunkSize=10000,
    maxConcurrency=16
)

Json request to /api/1/dev/tc01d/HindcastFile?action=process failed with response ServerResponse(statusCode=504, content='<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n<hr><center>Microsoft-Azure-Application-Gateway/v2</center>\r\n</body>\r\n</html>\r\n', headers={'Server': 'Microsoft-Azure-Application-Gateway/v2', 'Date': 'Thu, 04 Nov 2021 13:35:07 GMT', 'Content-Type': 'text/html', 'Content-Length': '193', 'Connection': 'keep-alive'})


RuntimeError: Json request to /api/1/dev/tc01d/HindcastFile?action=process failed with response ServerResponse(statusCode=504, content='<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n<hr><center>Microsoft-Azure-Application-Gateway/v2</center>\r\n</body>\r\n</html>\r\n', headers={'Server': 'Microsoft-Azure-Application-Gateway/v2', 'Date': 'Thu, 04 Nov 2021 13:35:07 GMT', 'Content-Type': 'text/html', 'Content-Length': '193', 'Connection': 'keep-alive'})

In [1]:
# Define the processing batch job with its options and upsert it.
job = c3.HindcastProcessJob(
    options=c3.HycomProcessJobOptions(
        batchSize=1,
        limit=-1,
        chunkSize=25400,
        maxConcurrency=8,
    ).toJson(),
    maxConcurrencyPerNode=2,
).upsert()

In [2]:
# Start the batch job, then set its per-node concurrency to 2.
# The job-wide setMaxConcurrency call is left disabled.
job.start()
#job.setMaxConcurrency(8)
job.setMaxConcurrencyPerNode(2)

In [5]:
job.status()

c3.BatchJobStatus(
 started=datetime.datetime(2021, 11, 8, 17, 15, 11, tzinfo=datetime.timezone.utc),
 startedby='dadams@illinois.edu',
 status='running',
 newBatchSubmitted=True)

In [10]:
!python --version


Python 3.7.9


In [6]:
job.id

'44f2e1f2-d2a2-4d41-8c2f-363b4e3e5437'

## Data Loading

In [1]:
import datetime
# Example: define a hindcast archive holding `water_u` and `water_v` at
# vertCoord=4 for 2021-09-01 12:00 through 2021-09-11 11:00, then upsert it.
hcast = c3.HycomHindcast.get("GOMu0.04-expt_90.1m000-2021")
archv = c3.HindcastArchive(
    hindcast=hcast,
    description="Test subset with vertCoord=4 ",
    subsetOptions=c3.HycomSubsetOptions(
        timeRange={
            "start": datetime.datetime(2021, 9, 1, 12),
            "end": datetime.datetime(2021, 9, 11, 11),
        },
        vars="water_u,water_v",
        vertCoord=4,
    ).toJson(),
    downloadOptions=c3.HycomDownloadOptions(
        externalDir='hycom-test',
        maxTimesPerFile=24,
    ).toJson(),
)
archv.upsert()
#archv

c3.HindcastArchive(
 id='55c392d8-9d83-478f-85b0-f1c0037ef4ee',
 meta=c3.Meta(
        tenantTagId=150,
        tenant='dev',
        tag='tc01d',
        created=datetime.datetime(2021, 10, 26, 13, 34, 5, tzinfo=datetime.timezone.utc),
        createdBy='dadams@illinois.edu',
        updated=datetime.datetime(2021, 10, 26, 13, 34, 5, tzinfo=datetime.timezone.utc),
        updatedBy='dadams@illinois.edu',
        timestamp=datetime.datetime(2021, 10, 26, 13, 34, 5, tzinfo=datetime.timezone.utc),
        fetchInclude='[this,{hindcast:[name,id]}]',
        fetchType='HindcastArchive'),
 version=65537,
 hindcast=c3.HycomHindcast(
            id='GOMu0.04-expt_90.1m000-2021',
            name='GOMu0.04-expt_90.1m000-2021'),
 description='Test subset with vertCoord=4 ',
 subsetOptions=c3.HycomSubsetOptions(
                 timeRange=c3.TimeRange(
                             start=datetime.datetime(2021, 9, 1, 12, 0),
                             end=datetime.datetime(2021, 9, 11, 11, 0)),

In [2]:
# Upsert the download batch job (limit=300) and start it right away.
job = c3.HindcastDownloadJob(
    options=c3.HycomDownloadJobOptions(limit=300).toJson(),
).upsert()
job.start()

c3.BatchJobStatus(
 started=datetime.datetime(2021, 10, 26, 13, 34, 55, tzinfo=datetime.timezone.utc),
 startedby='dadams@illinois.edu',
 status='running')

In [3]:
import time
from IPython.display import clear_output
# Poll the download job while it reports 'running'. On every pass re-fetch the
# archive, refresh the cell output, and print the archive size, the number of
# downloaded files out of all files of the archive, and the job status.
status = job.status()
while status.status == 'running':
    archv = c3.HindcastArchive.get("55c392d8-9d83-478f-85b0-f1c0037ef4ee")
    clear_output()
    status = job.status()
    #gom_dataset = c3.HycomDataset.fetch(spec={'include':"this,hindcastArchiveSize"}).objs[0]
    #print (gom_dataset)
    print(f"Archive Size: {round(archv.archiveSize / 2**30, 5)} GiB")
    filecount = c3.HindcastFile.fetchCount(
        spec={'filter': f"hindcastArchive.id=='{archv.id}' && status=='downloaded'"}
    )
    allcount = c3.HindcastFile.fetchCount(
        spec={'filter': f"hindcastArchive.id=='{archv.id}'"}
    )
    print(f"Download count: {filecount} of {allcount}")
    print(status)
    # Wait 20 seconds before polling again.
    time.sleep(20)

Archive Size: 0.0709 GiB
Download count: 10 of 10
c3.BatchJobStatus(
 started=datetime.datetime(2021, 10, 26, 13, 34, 55, tzinfo=datetime.timezone.utc),
 startedby='dadams@illinois.edu',
 completed=datetime.datetime(2021, 10, 26, 13, 37, 43, tzinfo=datetime.timezone.utc),
 status='completed',
 newBatchSubmitted=False)


In [6]:
job.status()

c3.BatchJobStatus(
 started=datetime.datetime(2021, 10, 26, 12, 59, 40, tzinfo=datetime.timezone.utc),
 startedby='dadams@illinois.edu',
 completed=datetime.datetime(2021, 10, 26, 13, 0, 8, tzinfo=datetime.timezone.utc),
 status='failed',
 errors=c3.Arry<JobRunErrorDetail>([c3.JobRunErrorDetail(
           failedActionId='3184.123123157',
           errorMsg='Error executing command: '
                     '/usr/local/share/c3/condaEnvs/dev/tc01d/py-hycom_1_0_0/bin/python '
                     '/tmp/pythonActionSourceCache5863133234282574632/HindcastFile_download.py\n'
                     'p_logger=main url=http://dev-dti-app-w-002:8080 '
                     'connector=null mode="thick" Action failed!\n'
                     "AttributeError: 'HycomSubsetOptions' object has no "
                     "attribute 'vertCord'\n"
                     '\n'
                     'The above exception was the direct cause of the '
                     'following exception:\n'
                  