# Preamble

Code we'll use to set up an example.

## Manual fit

Borrowed from https://tc02d-dev.c3dti.ai/jupyter/notebooks/research_teams/gordon/methods_c3/examples/manual-model-fitting.ipynb.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Get target

In [3]:
def get_response(gstp):
    """Return the summed AOD components for one GeoSurfaceTimePoint.

    Fetches every ``Simulation3HourlyAODOutput`` object whose
    ``geoSurfaceTimePoint`` equals ``gstp`` and, for each fetched object, adds
    up the columns from ``'dust'`` through ``'insolubleAitkenMode'`` (a label
    slice, so both end columns are included).

    Parameters
    ----------
    gstp
        Value compared against the ``geoSurfaceTimePoint`` field.

    Returns
    -------
    numpy.ndarray
        One total per fetched object (presumably one per ensemble member —
        TODO confirm).
    """
    # keep only the outputs attached to the requested surface/time point
    point_filter = c3.Filter().eq("geoSurfaceTimePoint", gstp)
    fetched = c3.Simulation3HourlyAODOutput.fetch({"filter": point_filter})

    # JSON records -> dataframe, then a row-wise sum over the component columns
    frame = pd.DataFrame(fetched.objs.toJson())
    components = frame.loc[:, 'dust':'insolubleAitkenMode']
    return np.array(components.sum(axis=1))

In [38]:
def get_average_response(
    time1="2017-07-01T00:00:00",
    time2="2017-07-01T02:59:59",
    lat1 = -1.0,
    lat2 = 0.0,
    lon1 = -22.0,
    lon2 = -20.0,
    n_members=221
):
    """Average the AOD response over all GeoSurfaceTimePoints in a box.

    Selects the ``GeoSurfaceTimePoint`` objects whose latitude, longitude and
    time fall inside the given bounds (all bounds inclusive), obtains the
    response vector of each via ``get_response``, discards any point whose
    response does not contain exactly ``n_members`` entries, and averages the
    remaining vectors element-wise.

    Parameters
    ----------
    time1, time2 : str
        Lower / upper time bound.
    lat1, lat2 : float
        Lower / upper latitude bound.
    lon1, lon2 : float
        Lower / upper longitude bound.
    n_members : int, default 221
        Required length of a response vector for the point to be kept
        (previously hard-coded as 221).

    Returns
    -------
    pandas.Series
        Element-wise mean across the retained points.
    """
    gstpFilter = (
        c3.Filter()
        .ge("latitude", lat1)
        .and_().le("latitude", lat2)
        .and_().ge("longitude", lon1)
        .and_().le("longitude", lon2)
        .and_().ge("time", time1)
        .and_().le("time", time2)
    )
    gstps = c3.GeoSurfaceTimePoint.fetch({"filter": gstpFilter})

    # One column per point id; incomplete responses are dropped so that every
    # column has the same length before averaging.
    complete_responses = {}
    for gstp in gstps.objs:
        response = get_response(gstp)
        if len(response) == n_members:
            complete_responses[gstp.id] = response

    y_train = pd.DataFrame(complete_responses).mean(axis=1)

    return y_train

### Get inputs

In [9]:
# Names of the parameter columns used as regression inputs: get_inputs selects
# these columns by default, and the kernel cells create one length scale per entry.
myFeaturesNames = [
    "bparam",
    "acure_autoconv_exp_nd",
    "a_ent_1_rp",
    "acure_dry_dep_acc",
    "acure_dry_dep_ait",
    "acure_dry_dep_so2",
    "acure_bvoc_soa",
    "acure_dms",
    "acure_bl_nuc",
    "acure_ait_width",
    "acure_cloud_ph",
    "dbsdtbs_turb_0",
    "acure_bc_ri",
    "acure_sea_spray",
    "acure_carb_bb_diam",
    "acure_anth_so2_r",
    "acure_prim_so4_diam"
]

In [12]:
def get_inputs(
    featuresNames=myFeaturesNames
):
    """Fetch the simulation parameter table, optionally restricted to some columns.

    Parameters
    ----------
    featuresNames : sequence of str or None
        Columns to keep. Defaults to ``myFeaturesNames``. Pass ``None`` to get
        every column of the fetched table.

    Returns
    -------
    pandas.DataFrame
        One row per fetched ``SimulationModelParameters`` object.
    """
    # Fetch every SimulationModelParameters object as JSON records
    records = c3.SimulationModelParameters.fetch().objs.toJson()

    # Tabulate the records (originally noted as 221 ensemble members x 64 parameters)
    dfparams = pd.DataFrame(records)

    if featuresNames is None:
        return dfparams

    # Honour the caller's selection: the previous version indexed with the
    # global myFeaturesNames here, so a custom featuresNames was ignored.
    # list() keeps a tuple from being interpreted as a single multi-level key.
    X_train = dfparams.loc[:, list(featuresNames)]
    return X_train

### Fit model

In [7]:
def fit_model(
    X_train,
    y_train,
    nu=0.5
):
    """Train a Gaussian-process regression pipe on the given data.

    Builds and upserts a Matern kernel (unit coefficient, one unit length
    scale per input column), upserts a ``GaussianProcessRegressionTechnique``
    that uses it (``randomState=42``, ``centerTarget=True``), and trains a
    ``GaussianProcessRegressionPipe`` with that technique.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Input features, one column per feature.
    y_train
        Target values; wrapped in a ``pandas.DataFrame`` before training.
    nu : float, default 0.5
        ``nu`` argument passed to the Matern kernel.

    Returns
    -------
    The object returned by ``GaussianProcessRegressionPipe.train``.
    """
    # Size the length-scale vector from the data actually passed in, not from
    # the global feature list, so any feature subset yields a matching kernel.
    n_features = X_train.shape[1]

    # create kernel
    GPR_kernel = c3.SklearnGPRKernelMatern(
        lengthScale=[1.0] * n_features,
        nu=nu,
        coefficient=1.0
    ).build().kernel.upsert()

    # define technique
    GPR_technique = c3.GaussianProcessRegressionTechnique(
        randomState=42,
        kernel=GPR_kernel,
        centerTarget=True
    ).upsert()

    # create pipe
    GPR_pipe = c3.GaussianProcessRegressionPipe(
        technique=GPR_technique
    )

    GPR_trained = GPR_pipe.train(
        input=c3.Dataset.fromPython(X_train),
        targetOutput=c3.Dataset.fromPython(pd.DataFrame(y_train))
    )

    return GPR_trained

## C3 fit

Borrowed from https://tc02d-dev.c3dti.ai/jupyter/notebooks/research_teams/gordon/applying-methods/GPRforAOD.ipynb

In [238]:
# Bounding box and time window for the C3-side fit (all bounds inclusive).
# NOTE(review): lat1 is -2.0 here, while get_average_response defaults to
# lat1=-1.0 — confirm the manual and C3 fits are meant to cover different regions.
lat1, lat2 = -2.0, 0.0
lon1, lon2 = -22.0, -20.0
time1, time2 = "2017-07-01T00:00:00", "2017-07-01T02:59:59"

gstpFilter = (
    c3.Filter()
    .ge("latitude", lat1)
    .and_().le("latitude", lat2)
    .and_().ge("longitude", lon1)
    .and_().le("longitude", lon2)
    .and_().ge("time", time1)
    .and_().le("time", time2)
)

In [249]:
# Exclude every parameter-table column that is neither a selected feature nor
# one of 'version', 'type', 'id', 'ensemble', 'meta'. The result is sorted so
# the list order is reproducible across kernel restarts (iteration order of a
# set of strings is not stable between interpreter runs).
excludeFeats = sorted(
    set(get_inputs(None).columns)
    - set(myFeaturesNames + ['version', 'type', 'id', 'ensemble', 'meta'])
)

In [250]:
# One length scale per selected feature.
kernelLen = len(myFeaturesNames)

# Matern kernel with unit coefficient and unit initial length scales, upserted
# so the technique below can reference it.
GPR_kernel = (
    c3.SklearnGPRKernelMatern(
        coefficient=1.0,
        nu=0.5,
        lengthScale=[1.0]*kernelLen,
    )
    .build()
    .kernel
    .upsert()
)

# GPR technique shared by the training job and the parameter-extraction job.
GPR_technique = c3.GaussianProcessRegressionTechnique(
    centerTarget=True,
    kernel=GPR_kernel,
    randomState=42,
).upsert()

In [251]:
# Options for the platform training job: same GSTP filter and technique as
# defined above, target "all", everything in excludeFeats left out.
jobOptions = c3.AODGaussianMLTrainingJobOptions(
    excludeFeatures=excludeFeats,
    gprTechnique=GPR_technique,
    targetName="all",
    gstpFilter=gstpFilter,
    batchSize=1,
)

# Upsert the job definition; it is started in a separate cell.
job = c3.AODGaussianMLTrainingJob(options=jobOptions).upsert()

In [252]:
# Start the training job; the returned status object is shown as the cell output.
job.start()

c3.BatchJobStatus(
 started=datetime.datetime(2022, 10, 4, 6, 39, 15, tzinfo=datetime.timezone.utc),
 startedby='jcarzon@andrew.cmu.edu',
 status='running')

In [263]:
# Check the job's current status (re-run until it reports a final state).
job.status()

c3.BatchJobStatus(
 started=datetime.datetime(2022, 10, 4, 6, 39, 15, tzinfo=datetime.timezone.utc),
 startedby='jcarzon@andrew.cmu.edu',
 completed=datetime.datetime(2022, 10, 4, 6, 39, 27, tzinfo=datetime.timezone.utc),
 status='completed',
 newBatchSubmitted=False)

In [126]:
# Print the message of the first error recorded on the job status.
# NOTE(review): `errors[0]` assumes at least one error was reported; the status
# displayed above shows no `errors` field, and this cell's execution count (126)
# predates the job start (252) — confirm this output belongs to the current job.
print(job.status().errors[0].errorMsg)

TypeError: 'NoneType' object is not subscriptable

# Example

## Manual results

In [40]:
# Build the manual training set using the helpers' default arguments.
# NOTE(review): get_average_response defaults to lat1=-1.0, whereas the C3 fit
# filter uses lat1=-2.0 — confirm the two fits are meant to use different regions.
y_train = get_average_response()
X_train = get_inputs()

In [41]:
# Train the GPR pipe on the manual dataset (fit_model's default nu=0.5).
GPR_trained = fit_model(X_train, y_train)

In [44]:
# Pair each feature name with the corresponding entry of the fitted kernel's
# 'k2__length_scale' parameter, read from the deserialized trained model.
# NOTE(review): 'k2' presumably refers to the Matern factor of a
# coefficient * Matern product kernel — confirm against the kernel definition.
dict(zip(
    myFeaturesNames,
    c3.PythonSerialization.deserialize(serialized=GPR_trained.trainedModel.model).kernel_.get_params()['k2__length_scale']
))

{'bparam': 1102.0791624590156,
 'acure_autoconv_exp_nd': 8.190325810371723,
 'a_ent_1_rp': 10.753323657036342,
 'acure_dry_dep_acc': 3.196243178754316,
 'acure_dry_dep_ait': 10.339821623432993,
 'acure_dry_dep_so2': 26.447472776447864,
 'acure_bvoc_soa': 7.400479537433278,
 'acure_dms': 26.96266310267672,
 'acure_bl_nuc': 5.695196868432163,
 'acure_ait_width': 100000.00000000001,
 'acure_cloud_ph': 19384.86860172804,
 'dbsdtbs_turb_0': 38.762582138540736,
 'acure_bc_ri': 100000.00000000001,
 'acure_sea_spray': 2.9232593287553,
 'acure_carb_bb_diam': 25980.26328199287,
 'acure_anth_so2_r': 100000.00000000001,
 'acure_prim_so4_diam': 8.737555407472135}

## C3 results

In [264]:
# Launch the platform's learned-parameter extraction job with the same excluded
# features, GSTP filter, target name and technique as the training job.
# NOTE(review): the meaning of the trailing positional `10` is not visible in
# this notebook (possibly a batch size) — confirm against AODGPRModelFinder.
params = c3.AODGPRModelFinder.extractLearnedParametersJob(excludeFeats, gstpFilter, "all", GPR_technique, 10)

In [268]:
# Check the status of the parameter-extraction job.
params.status()

c3.MapReduceStatus(
 started=datetime.datetime(2022, 10, 4, 6, 39, 33, tzinfo=datetime.timezone.utc),
 startedby='jcarzon@andrew.cmu.edu',
 completed=datetime.datetime(2022, 10, 4, 6, 39, 37, tzinfo=datetime.timezone.utc),
 status='failed',
 errors=c3.Arry<JobRunErrorDetail>([c3.JobRunErrorDetail(
           failedActionId='6179.856649736',
           errorMsg='Error executing command: '
                     '/usr/local/share/c3/condaEnvs/dev/tc02d/py-client_1_0_0/bin/python '
                     '/tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: '
                     'any, objs: any, job: any): any>_applyPython.py\n'
                     'Process exited with 3 exit code.\n'
                     'a_id=6179.856650310 a_implementation=python t_tenant=dev '
                     't_tag=tc02d t_type=Lambda<function(batch: any, objs: '
                     'any, job: any): any> t_action=applyPython '
                     'p_logger=action url=http://dev-dti-app-w-006:80

In [269]:
# Print the first error message in full (the status repr above truncates it).
print(params.status().errors[0].errorMsg)

Error executing command: /usr/local/share/c3/condaEnvs/dev/tc02d/py-client_1_0_0/bin/python /tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: any, objs: any, job: any): any>_applyPython.py
Process exited with 3 exit code.
a_id=6179.856650310 a_implementation=python t_tenant=dev t_tag=tc02d t_type=Lambda<function(batch: any, objs: any, job: any): any> t_action=applyPython p_logger=action url=http://dev-dti-app-w-006:8080 connector=null mode="thick" Action failed!
Traceback (most recent call last):
  File "/tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: any, objs: any, job: any): any>_applyPython.py", line 406, in _c3_remote_bootstrap__run_c3_action
    _c3_result = _action()
  File "/tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: any, objs: any, job: any): any>_applyPython.py", line 530, in <lambda>
    action=lambda: applyPython(this = _c3_inputs.get('this'),actuals = _c3_inputs.get('actuals')),
  File "Lambda<function(

In [None]:
# Convert the extraction job's results into a dataframe.
# NOTE(review): this cell was never executed, and `params` reports
# status='failed' above, so there are presumably no results to convert yet.
c3.AODGPRModelFinder.getDataframeFromJob(params)

# Unwind the AODGPRModelFinder method

In [151]:
def cassandra_mapper(batch, objs, job):
    """Map step: look up the GPR pipe for every object in the batch.

    For each element of ``objs``, calls ``c3.AODGPRModelFinder.getPipe`` with
    the ``excludeFeatures``, ``targetName`` and ``technique`` entries of
    ``job.context.value`` plus the object's ``id``.

    Returns
    -------
    dict
        A single-entry mapping ``{batch: [pipe, ...]}`` in the order of ``objs``.
    """
    return {
        batch: [
            c3.AODGPRModelFinder.getPipe(
                job.context.value["excludeFeatures"],
                point.id,
                job.context.value["targetName"],
                job.context.value["technique"],
            )
            for point in objs
        ]
    }

In [152]:
# Wrap the mapper as a C3 Lambda.
# NOTE(review): no `runtime` is passed here, unlike the reducer lambda
# (runtime="gordon-ML_1_0_0"); the failing action in the traceback below has
# the mapper's (batch, objs, job) signature and runs under py-client_1_0_0 —
# check whether the mapper needs the same runtime.
map_lambda = c3.Lambda.fromPython(cassandra_mapper)

In [153]:
def cassandra_reducer(key, interValues, job):
    """Reduce step: pull length scales and location out of every mapped model.

    ``interValues`` is walked three levels deep (intermediate value -> entry ->
    model record). For each model record the pickled model is deserialized,
    its kernel's ``'k2__length_scale'`` is read, and the GeoSurfaceTimePoint
    referenced by the model's data source spec is fetched.

    ``key`` and ``job`` are accepted but not used.

    Returns
    -------
    list of tuple
        ``(length_scales, model_id, latitude, longitude, time)`` per model.
    """
    rows = []
    for mapped in interValues:
        for batch_models in mapped:
            for record in batch_models:
                fitted = c3.PythonSerialization.deserialize(
                    serialized=record["trainedModel"]["model"]
                )
                length_scales = fitted.kernel_.get_params()['k2__length_scale']
                model_id = record["id"]

                # Recover the GSTP id from the target filter text: the part
                # after " == ", with double quotes removed (the filter is
                # presumably of the form `<field> == "<id>"` — TODO confirm).
                spec = c3.GPRDataSourceSpec.get(record["dataSourceSpec"]["id"])
                point_id = spec.targetSpec.filter.split(" == ")[1].replace('"', '')
                point = c3.GeoSurfaceTimePoint.get(point_id)

                rows.append(
                    (length_scales, model_id, point.latitude, point.longitude, point.time)
                )

    return rows

In [154]:
# Wrap the reducer as a C3 Lambda, pinned to the "gordon-ML_1_0_0" runtime.
reduce_lambda = c3.Lambda.fromPython(cassandra_reducer, runtime="gordon-ML_1_0_0")

In [161]:
# Context handed to the map step: cassandra_mapper reads exactly these three
# keys from job.context.value.
job_context = c3.MappObj(
    value={
        'excludeFeatures': excludeFeats,
        'targetName': "all",
        'technique': GPR_technique
    }
)

In [163]:
# Start a dynamic map-reduce over the filtered GeoSurfaceTimePoints, one object
# per batch, using the mapper/reducer lambdas and context defined above.
# NOTE(review): this rebinds `job`, which earlier held the
# AODGaussianMLTrainingJob — consider a distinct name.
job = c3.DynMapReduce.startFromSpec(
    c3.DynMapReduceSpec(
        context=job_context,
        batchSize=1,
        reduceLambda=reduce_lambda,
        mapLambda=map_lambda,
        filter=gstpFilter,
        targetType="GeoSurfaceTimePoint",
    )
)

In [166]:
# Print the first error message reported by the map-reduce job.
print(job.status().errors[0].errorMsg)

Error executing command: /usr/local/share/c3/condaEnvs/dev/tc02d/py-client_1_0_0/bin/python /tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: any, objs: any, job: any): any>_applyPython.py
Process exited with 3 exit code.
a_id=6179.813033673 a_implementation=python t_tenant=dev t_tag=tc02d t_type=Lambda<function(batch: any, objs: any, job: any): any> t_action=applyPython p_logger=action url=http://dev-dti-app-w-006:8080 connector=null mode="thick" Action failed!
Traceback (most recent call last):
  File "/tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: any, objs: any, job: any): any>_applyPython.py", line 406, in _c3_remote_bootstrap__run_c3_action
    _c3_result = _action()
  File "/tmp/pythonActionSourceCache9202922681108544513/Lambda<function(batch: any, objs: any, job: any): any>_applyPython.py", line 530, in <lambda>
    action=lambda: applyPython(this = _c3_inputs.get('this'),actuals = _c3_inputs.get('actuals')),
  File "Lambda<function(

In [149]:
# NOTE(review): the bare name `extractLearnedParametersJob` is not defined in
# any cell visible in this notebook (elsewhere it is called as
# c3.AODGPRModelFinder.extractLearnedParametersJob), so this cell would raise
# NameError on a fresh kernel — restore the local definition or qualify the call.
job = extractLearnedParametersJob(excludeFeats, gstpFilter, "all", GPR_technique, 10)

In [None]:
def getDataframeFromJob(job):
    """
    Iterates over job result and builds dataframe.

    Each entry of ``job.results()`` maps a key to a sequence of tuples
    ``(length_scales, model_id, latitude, longitude, time)``. Every tuple
    becomes one row: the length scales fill the leading integer-labelled
    columns, followed by ``modelId``, ``latitude``, ``longitude`` and ``time``.

    Parameters
    ----------
    job
        Object exposing ``status().status`` and ``results()``.

    Returns
    -------
    pandas.DataFrame or bool
        The dataframe when the job status is ``"completed"``; ``False``
        otherwise (kept for compatibility with existing callers).
    """
    import pandas as pd
    import numpy as np

    # Nothing to read until the job has finished successfully.
    if job.status().status != "completed":
        return False

    lengthScales = []
    ids = []
    lats = []
    lons = []
    times = []
    for _key, value in job.results().items():
        for subvalue in value:
            lengthScales.append(np.array(subvalue[0]).astype(float))
            ids.append(str(subvalue[1]))
            # Store plain Python floats (not 0-d arrays) so the resulting
            # columns are ordinary float columns.
            lats.append(float(np.array(subvalue[2]).astype(float)))
            lons.append(float(np.array(subvalue[3]).astype(float)))
            # Keep the time value as delivered instead of wrapping it in a
            # 0-d array.
            times.append(subvalue[4])

    df = pd.DataFrame(lengthScales)
    df["modelId"] = ids
    df["latitude"] = lats
    df["longitude"] = lons
    df["time"] = times
    return df