In [61]:
def dynamicMapReduceCollect(gstpFilter, targetName, nBatches=200):
    """
    Start a DynMapReduce job that collects kernel length scales from trained
    GaussianProcessRegressionPipe models.

    The job runs over the GeoSurfaceTimePoint objects selected by ``gstpFilter``.
    For every batch, the map step fetches the pipes whose id equals
    ``"GSTP_" + <object id> + "_" + targetName``. The reduce step deserializes
    each fetched model and reads the ``k2__length_scale`` kernel parameter.

    Args:
        gstpFilter: filter handed to the job to select GeoSurfaceTimePoint objects.
        targetName: target suffix of the model ids to collect.
        nBatches: forwarded as the job's ``batchSize`` (default 200).

    Returns:
        The started job instance, which can be polled until it completes.
    """

    def cassandra_mapper(batch, objs, job):
        """
        Fetch the trained pipes that belong to the objects of one batch.
        """
        prefix = "GSTP_"
        suffix = "_" + job.context.value['targetName']
        pipeIds = []
        for obj in objs:
            pipeIds.append(prefix + obj.id + suffix)

        idFilter = c3.Filter().intersects("id", pipeIds).value
        fetchSpec = {
            "filter": idFilter,
            "limit": -1,
            "include": "trainedModel.model"
        }
        fetched = c3.GaussianProcessRegressionPipe.fetch(fetchSpec)

        return {batch: fetched.objs}

    def cassandra_reducer(key, interValues, job):
        """
        Turn the fetched pipes of one key into (length scale, model id) pairs.
        """
        pairs = []
        for fetchedPipes in interValues:
            for pipe in fetchedPipes:
                serialized = pipe["trainedModel"]["model"]
                gpModel = c3.PythonSerialization.deserialize(serialized=serialized)
                lengthScale = gpModel.kernel_.get_params()['k2__length_scale']
                pairs.append((lengthScale, pipe["id"]))
        return pairs

    # Wrap both steps as platform lambdas; only the reducer pins a runtime.
    mapLambda = c3.Lambda.fromPython(cassandra_mapper)
    reduceLambda = c3.Lambda.fromPython(cassandra_reducer, runtime="gordon-ML_1_0_0")

    # Describe the job first, then start it and hand the handle back to the caller.
    spec = c3.DynMapReduceSpec(
        targetType="GeoSurfaceTimePoint",
        filter=gstpFilter,
        mapLambda=mapLambda,
        reduceLambda=reduceLambda,
        batchSize=nBatches,
        context=c3.MappObj(value={'targetName': targetName})
    )
    return c3.DynMapReduce.startFromSpec(spec)

In [176]:
# Latitude/longitude bounds and time window used to select the points.
lat1, lat2 = 0.0, 10.0
lon1, lon2 = 0.0, 10.0
time1, time2 = "2017-07-01T00:00:00", "2017-07-01T23:59:59"

# Every bound is inclusive (ge / le) and all conditions are and-ed together.
gstpFilter = (
    c3.Filter()
    .ge("latitude", lat1).and_().le("latitude", lat2)
    .and_().ge("longitude", lon1).and_().le("longitude", lon2)
    .and_().ge("time", time1).and_().le("time", time2)
)

In [177]:
# Launch the collection job for the "all" target with a batch size of 10.
job = dynamicMapReduceCollect(gstpFilter, targetName="all", nBatches=10)

In [197]:
# Check the job state; results are only collected once this reports 'completed'.
job.status().status

'completed'

## Retrieve Pandas dataframe with results from DynMapReduce

In [194]:
def get_df_from_job_results(job):
    """
    Build a pandas DataFrame from the results of a completed job.

    Each value in the job results is iterated as a sequence of
    ``(length scales, model id)`` pairs. Every pair becomes one row: the
    length scales fill the integer-labelled columns and the id is stored in
    the ``modelId`` column.

    Returns the DataFrame, or None (after printing a notice) when the job
    status is not "completed".
    """
    import numpy as np
    import pandas as pd

    # Guard clause: nothing to collect unless the job has finished.
    if job.status().status != "completed":
        print("Job did not complete")
        return

    scaleRows = []
    modelIds = []
    for _key, pairs in job.results().items():
        for pair in pairs:
            # Length scales are coerced to a float array, one array per row.
            scaleRows.append(np.array(pair[0]).astype(float))
            # The id is normalised to a numpy string before being stored.
            modelIds.append(np.array([pair[1]]).astype(str)[0])

    frame = pd.DataFrame(scaleRows)
    frame["modelId"] = modelIds
    return frame

In [195]:
# Collect the (length scales, model id) results of the finished job into a DataFrame.
df = get_df_from_job_results(job)

In [196]:
# Show the assembled frame; the rendered output elides the middle rows and columns with "...".
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,modelId
0,86965.062514,82364.593910,93570.835549,1.0,87647.908166,71765.063392,100000.000000,43220.301826,87679.541568,421.959026,...,365.408222,90.737847,161.813907,118.362800,452.965953,90348.291546,145.008846,262.297148,131.842433,GSTP_0.625_0.938_2017-07-01T00:20:00_all
1,20148.062451,21390.181484,33110.469864,1.0,16514.097452,18403.523531,21638.837970,24280.577374,11709.153551,326.336973,...,12442.091243,78.863824,146.731013,101.020884,10385.044203,21779.767266,122.310576,317.237318,106.375170,GSTP_0.625_0.938_2017-07-01T03:20:00_all
2,86711.002920,100000.000000,21143.169496,1.0,13826.938492,23284.339204,100000.000000,8995.704460,25419.624993,54524.966233,...,81094.928193,84.888557,165.892745,106.095167,1345.782804,52381.999155,190.184875,134.469591,108.383955,GSTP_0.625_0.938_2017-07-01T06:20:00_all
3,57197.892450,77794.561115,84280.966714,1.0,13306.099699,46129.307313,100000.000000,41265.057060,47233.053562,56888.657043,...,495.114501,115.525042,203.960994,136.609669,176.978584,57679.912583,194.687649,197.831082,93.714415,GSTP_0.625_0.938_2017-07-01T09:20:00_all
4,15035.038696,20084.646017,100000.000000,1.0,17631.957413,57816.506064,22538.376472,86478.186803,62279.868180,57425.759727,...,391.357240,149.205911,309.503770,191.855867,191.231050,70920.247912,371.692519,156.657200,109.644275,GSTP_0.625_0.938_2017-07-01T12:20:00_all
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,70178.358401,100000.000000,49719.879207,1.0,5109.909952,29464.183981,92051.082722,289.831707,236.804453,37692.979522,...,37544.555012,111.963347,33700.283190,68.816705,100000.000000,33336.111513,270.889738,222.656466,138.252816,GSTP_3.125_0.938_2017-07-01T15:20:00_all
316,72828.330100,54205.403481,70612.163588,1.0,37259.960983,41840.371245,40972.865470,11723.992622,194.129472,60237.040642,...,167.216689,145.292048,39199.042316,98.965912,95427.514183,59991.875216,292.010121,142.321729,91.585410,GSTP_3.125_0.938_2017-07-01T18:20:00_all
317,29818.439461,46650.711283,20094.456927,1.0,10573.776049,14773.582480,27204.444274,34534.696975,22217.219169,23567.466070,...,50992.463055,116.782142,29672.720773,113.952018,78372.150937,74240.411785,1745.809535,160.349517,63.717832,GSTP_3.125_0.938_2017-07-01T21:20:00_all
318,75458.537067,71305.702020,8495.000998,1.0,33469.566755,60383.282633,84247.930719,80534.066831,25318.967357,129.568066,...,146.989911,84137.483453,6198.460093,16292.532691,61130.955589,74479.171755,89544.727272,143.180055,244.602303,GSTP_3.125_2.812_2017-07-01T00:20:00_all


In [175]:
# Split the first model id on "_" to expose its parts: the "GSTP" prefix, the
# source object's id fields, and the target name last.
# NOTE(review): the two numeric fields look like the point's coordinates — confirm.
df.modelId[0].split('_')

['GSTP', '0.625', '0.938', '2017-07-01T00:20:00', 'all']