# Helper functions

In [1]:
def dynamicMapReduceCollect(gstpFilter, nBatches=200):
    """
    Start a DynMapReduce job that collects kernel length scales from trained
    GaussianProcessRegressionPipe models.

    The job targets the GeoSurfaceTimePoint objects selected by gstpFilter.
    Its map step fetches the pipes whose targetSpec.filter refers to those
    objects; its reduce step deserializes every trained model and pairs the
    'k2__length_scale' kernel parameter with the id of the pipe it came from.

    Arguments:
        gstpFilter: filter handed to the job spec to select the
            GeoSurfaceTimePoint objects to process.
        nBatches: value forwarded to the job spec as batchSize
            (note the name: it is used as a batch size). Defaults to 200.

    Returns the job object created by c3.DynMapReduce.startFromSpec, whose
    status can be polled until the job completes.
    """

    def cassandra_mapper(batch, objs, job):
        """
        Fetch the GaussianProcessRegressionPipe objects that refer to the
        objects of this batch and return them keyed by the batch.
        """
        # One search string per object: geoSurfaceTimePoint.id == "<obj.id>"
        id_filters = ['geoSurfaceTimePoint.id == "' + obj.id + '"' for obj in objs]
        fetch_spec = {
            "filter": c3.Filter().intersects("targetSpec.filter", id_filters).value,
            "limit": -1,
            # Only the serialized trained model is requested from each pipe.
            "include": "trainedModel.model"
        }
        return {batch: c3.GaussianProcessRegressionPipe.fetch(fetch_spec).objs}

    def cassandra_reducer(key, interValues, job):
        """
        Turn the fetched pipes into a list of (length scale, pipe id) tuples.
        """
        def length_scale_of(pipe):
            # Rebuild the trained model from its serialized form and read
            # the length-scale hyperparameter off its fitted kernel.
            blob = pipe["trainedModel"]["model"]
            fitted = c3.PythonSerialization.deserialize(serialized=blob)
            return fitted.kernel_.get_params()['k2__length_scale']

        return [
            (length_scale_of(pipe), pipe["id"])
            for fetched in interValues
            for pipe in fetched
        ]

    # Both functions are shipped to the platform as Lambdas; the reducer is
    # given an explicit runtime (presumably the one providing the ML
    # libraries needed to deserialize the models -- TODO confirm).
    job_spec = c3.DynMapReduceSpec(
        targetType="GeoSurfaceTimePoint",
        filter=gstpFilter,
        mapLambda=c3.Lambda.fromPython(cassandra_mapper),
        reduceLambda=c3.Lambda.fromPython(cassandra_reducer, runtime="gordon-ML_1_0_0"),
        batchSize=nBatches
    )

    return c3.DynMapReduce.startFromSpec(job_spec)

In [2]:
def get_df_from_job_results(job):
    """
    Build a dataframe of length scales from a completed job.

    Every entry of job.results() holds a sequence of (length scale, model id)
    items. Each item becomes one dataframe row: the length-scale values fill
    the numbered columns and the model id goes into the "modelId" column.

    If the job status is not "completed", a message is printed and None is
    returned.
    """
    import pandas as pd
    import numpy as np

    # Guard clause: nothing to collect until the job has finished.
    if job.status().status != "completed":
        print("Job did not complete")
        return

    scale_rows = []
    model_ids = []
    for _batch_key, batch_items in job.results().items():
        for item in batch_items:
            # Length scales are coerced to a float array, ids to numpy strings.
            scale_rows.append(np.asarray(item[0]).astype(float))
            model_ids.append(np.asarray([item[1]]).astype(str)[0])

    frame = pd.DataFrame(scale_rows)
    frame["modelId"] = model_ids
    return frame

# Collect hyperparameters

In [117]:
# Select the GeoSurfaceTimePoint objects at a single latitude and timestamp.
lat = -0.625
time = "2017-07-01T12:20:00"
# Disabled alternative: filter on a latitude/longitude box and a time window.
#time2 = "2017-07-01T21:20:00"
#gstpFilter = c3.Filter().ge("latitude", -1).and_().le("latitude", 0).and_().ge("longitude", -1).and_().le("longitude", 0).and_().ge("time", time).and_().le("time", time2)
gstpFilter = c3.Filter().eq("latitude", lat).and_().eq("time", time)

In [118]:
# Launch the collection job; the second argument (100) is forwarded as the job's batchSize.
job = dynamicMapReduceCollect(gstpFilter, 100)

In [145]:
# Check the job status; results are only read once the status is 'completed'.
job.status()

c3.MapReduceStatus(
 started=datetime.datetime(2022, 7, 3, 13, 6, 59, tzinfo=datetime.timezone.utc),
 startedby='jcarzon@andrew.cmu.edu',
 completed=datetime.datetime(2022, 7, 3, 13, 7, 21, tzinfo=datetime.timezone.utc),
 status='completed')

In [146]:
# Gather the job results into a dataframe: length-scale columns plus a modelId column.
df = get_df_from_job_results(job)

In [147]:
# Display the collected hyperparameters.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,modelId
0,1986.448404,113.770390,5806.057474,1.0,1022.437876,6557.094305,1860.345478,6382.200560,6194.738729,118.083608,...,23.329774,18.587526,36.990009,10.631653,12.418601,2280.734775,3720.883219,2945.403255,19.890909,GSTP_-0.625_-0.938_2017-07-01T12:20:00
1,1986.448404,113.770390,5806.057474,1.0,1022.437876,6557.094305,1860.345478,6382.200560,6194.738729,118.083608,...,23.329774,18.587526,36.990009,10.631653,12.418601,2280.734775,3720.883219,2945.403255,19.890909,GSTP_-0.625_-0.938_2017-07-01T12:20:00_dust
2,80410.929209,73205.418466,131.117147,1.0,62419.720073,94668.051058,69266.465773,71263.678014,58139.757229,34657.606411,...,56.454563,28.623640,47.724723,19.941392,24.022445,44024.995907,89.947431,13.974685,27.854066,GSTP_-0.625_-10.312_2017-07-01T12:20:00
3,80410.929209,73205.418466,131.117147,1.0,62419.720073,94668.051058,69266.465773,71263.678014,58139.757229,34657.606411,...,56.454563,28.623640,47.724723,19.941392,24.022445,44024.995907,89.947431,13.974685,27.854066,GSTP_-0.625_-10.312_2017-07-01T12:20:00_dust
4,100000.000000,100000.000000,100000.000000,1.0,100000.000000,69664.658712,100000.000000,100000.000000,100000.000000,100000.000000,...,44901.582346,31497.157378,82912.451010,100000.000000,100000.000000,18124.593076,9782.389959,5731.682118,27322.010211,GSTP_-0.625_-100.312_2017-07-01T12:20:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,87300.484588,70894.226355,77208.851798,1.0,100000.000000,65548.548353,100000.000000,9488.071613,37.891938,50523.306585,...,92654.397667,90982.258592,236.273340,157.159875,693.705583,461.778526,9.427300,35.854143,22346.061217,GSTP_-0.625_94.688_2017-07-01T12:20:00_dust
380,27861.518679,28652.668557,55691.601877,1.0,18695.023804,39334.362033,84481.422972,1758.204620,64.845761,509.186221,...,21323.698646,714.547017,77.860560,189.170013,358.495231,892.934715,39.641835,42.319873,350.666637,GSTP_-0.625_96.562_2017-07-01T12:20:00
381,27861.518679,28652.668557,55691.601877,1.0,18695.023804,39334.362033,84481.422972,1758.204620,64.845761,509.186221,...,21323.698646,714.547017,77.860560,189.170013,358.495231,892.934715,39.641835,42.319873,350.666637,GSTP_-0.625_96.562_2017-07-01T12:20:00_dust
382,2131.702852,100000.000000,100000.000000,1.0,100000.000000,100000.000000,100000.000000,100000.000000,52263.948695,1068.652880,...,784.909211,279.273568,152.646708,246.656874,435.513368,100000.000000,211.116076,99.602979,960.731003,GSTP_-0.625_98.438_2017-07-01T12:20:00
