# Get a list of GSTPs to consider

My preferred way is to grab the targetSpec filters directly from the parameter table, so all the machinery below is copied in order to collect a list of these strings from that table.

In [3]:
# Spatial bounding box and time window used to select the GSTPs of interest.
lat1, lat2 = 0.0, 5.0
lon1, lon2 = 0.0, 5.0
time1, time2 = "2017-07-01T00:00:00", "2017-07-01T02:59:59"

# Range constraints on latitude, longitude and time, AND-ed together.
gstpFilter = (
    c3.Filter()
    .ge("latitude", lat1)
    .and_().le("latitude", lat2)
    .and_().ge("longitude", lon1)
    .and_().le("longitude", lon2)
    .and_().ge("time", time1)
    .and_().le("time", time2)
)

# Features left out of the model.
excludeFeats = [
    "acure_anth_so2",
    "acure_carb_bb_ems",
    "acure_carb_ff_ems",
    "acure_carb_res_ems",
]
# The kernel takes one length scale per remaining feature.
# NOTE(review): 59 is hard-coded -- presumably the total number of available
# features; confirm against the feature table.
kernelLen = 59 - len(excludeFeats)

# Matern kernel (nu=0.5) with every initial length scale set to 1.0.
GPR_kernel = c3.SklearnGPRKernelMatern(
    lengthScale=[1.0] * kernelLen,
    nu=0.5,
    coefficient=1.0,
).build().kernel

# Regression technique using that kernel, with a fixed random state.
GPR_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

In [4]:
# Submit the parameter-extraction job with the exclusions, GSTP filter and
# technique defined above.
# NOTE(review): "all" presumably names the target and the trailing 10 is not
# explained anywhere visible -- confirm both against
# AODGPRModelFinder.extractLearnedParametersJob.
job = c3.AODGPRModelFinder.extractLearnedParametersJob(
    excludeFeats,
    gstpFilter,
    "all",
    GPR_technique,
    10,
)

In [17]:
# Check the job's state before its results are read in the next cell.
job.status()

c3.MapReduceStatus(
 started=datetime.datetime(2022, 8, 18, 16, 21, 14, tzinfo=datetime.timezone.utc),
 startedby='jcarzon@andrew.cmu.edu',
 completed=datetime.datetime(2022, 8, 18, 16, 26, 50, tzinfo=datetime.timezone.utc),
 status='completed')

In [18]:
# Gather the job's output into a dataframe; later cells read its `modelId` column.
df = c3.AODGPRModelFinder.getDataframeFromJob(job)

In [19]:
# Display the job results table.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,modelId,latitude,longitude,time
0,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,421.668536,100000.000000,...,118.616164,454.392120,100000.00000,145.282857,262.994647,132.077327,054a05df-8934-4080-a744-c5b7ee8d1d9b,0.625,0.9375,2017-07-01T00:20:00
1,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,421.668536,100000.000000,...,118.616164,454.392120,100000.00000,145.282857,262.994647,132.077327,23b7114c-d87d-4e18-a30f-74331bccd193,0.625,0.9375,2017-07-01T00:20:00
2,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,421.668536,100000.000000,...,118.616164,454.392120,100000.00000,145.282857,262.994647,132.077327,304cb71e-ea37-4c63-b25e-b06e46d23fef,0.625,0.9375,2017-07-01T00:20:00
3,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,421.668536,100000.000000,...,118.616164,454.392120,100000.00000,145.282857,262.994647,132.077327,496d7053-c45c-44d1-a81f-1ddf313a1eed,0.625,0.9375,2017-07-01T00:20:00
4,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,100000.000000,421.668536,100000.000000,...,118.616164,454.392120,100000.00000,145.282857,262.994647,132.077327,524505ec-f619-4341-b912-c8aba5e770a8,0.625,0.9375,2017-07-01T00:20:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,27847.443086,156.473661,26067.774163,124.442977,31097.758227,40964.183379,25019.891718,30392.208292,26878.562710,25726.279075,...,107.561677,133.030548,32929.23658,15510.779009,120.943416,325.839865,00a5dfe5-ff78-455f-8838-34638ba021ee,4.375,4.6875,2017-07-01T00:20:00
66,27847.443086,156.473661,26067.774163,124.442977,31097.758227,40964.183379,25019.891718,30392.208292,26878.562710,25726.279075,...,107.561677,133.030548,32929.23658,15510.779009,120.943416,325.839865,2b2a0c75-a5db-4243-849d-d5f743a58395,4.375,4.6875,2017-07-01T00:20:00
67,27847.443086,156.473661,26067.774163,124.442977,31097.758227,40964.183379,25019.891718,30392.208292,26878.562710,25726.279075,...,107.561677,133.030548,32929.23658,15510.779009,120.943416,325.839865,a02b78b5-431c-436a-bfd4-ddfc76cec57f,4.375,4.6875,2017-07-01T00:20:00
68,27847.443086,156.473661,26067.774163,124.442977,31097.758227,40964.183379,25019.891718,30392.208292,26878.562710,25726.279075,...,107.561677,133.030548,32929.23658,15510.779009,120.943416,325.839865,b4a0b553-b96b-4806-acfb-18a33bedbb00,4.375,4.6875,2017-07-01T00:20:00


Below I'm just making up examples of filter strings.

In [20]:
def get_GSTP_filter(k):
    """
    Return the target fetch filter of the model stored at entry ``k`` of ``df``.

    k : int
        Index into ``df.modelId`` (the list of model IDs).

    The model ID is used to fetch its GaussianProcessRegressionPipe, the
    pipe's data source spec is then fetched by id, and that spec's
    ``targetSpec.filter`` is returned.
    """
    model_id = df.modelId[k]
    spec_id = c3.GaussianProcessRegressionPipe.get(model_id).dataSourceSpec.id
    return c3.GPRDataSourceSpec.get(spec_id).targetSpec.filter

# Several models share one GSTP, so collapse their filters to the distinct ones.
# Sorting pins the clause order: a bare set of strings iterates in a different
# order from one interpreter session to the next, which made the combined
# filter string (and the data source spec later upserted with it)
# non-reproducible. Iterating the index labels matches the label-based lookup
# `df.modelId[k]` performed inside get_GSTP_filter.
geoSurfaceTimePoints = sorted({get_GSTP_filter(k) for k in df.index})

# OR the per-GSTP filters together into a single fetch filter string.
gstpFilter = " || ".join(geoSurfaceTimePoints)

In [21]:
# Length, in characters, of the combined filter string.
len(gstpFilter)

752

# Train a single model using these GSTPs

Note that the target set y has length 2000, whereas it should match the length of X, which is 2652.

In [30]:
# Features to ignore when building the model inputs.
excludeFeats = [
    "acure_bl_nuc", "acure_ait_width", "acure_cloud_ph",
    "acure_carb_ff_ems", "acure_carb_ff_ems_eur", "acure_carb_ff_ems_nam",
    "acure_carb_ff_ems_chi", "acure_carb_ff_ems_asi", "acure_carb_ff_ems_mar",
    "acure_carb_ff_ems_r",
    "acure_carb_bb_ems", "acure_carb_bb_ems_sam", "acure_carb_bb_ems_naf",
    "acure_carb_bb_ems_saf", "acure_carb_bb_ems_bnh", "acure_carb_bb_ems_rnh",
    "acure_carb_bb_ems_rsh",
    "acure_carb_res_ems", "acure_carb_res_ems_chi", "acure_carb_res_ems_asi",
    "acure_carb_res_ems_afr", "acure_carb_res_ems_lat", "acure_carb_res_ems_r",
    "acure_carb_ff_diam", "acure_carb_bb_diam", "acure_carb_res_diam",
    "acure_prim_so4_diam", "acure_sea_spray",
    "acure_anth_so2", "acure_anth_so2_chi", "acure_anth_so2_asi",
    "acure_anth_so2_eur", "acure_anth_so2_nam", "acure_anth_so2_r",
    "acure_volc_so2", "acure_bvoc_soa", "acure_dms", "acure_prim_moc",
    "acure_dry_dep_ait", "acure_dry_dep_so2",
    "acure_kappa_oc", "acure_sig_w", "acure_rain_frac",
    "acure_cloud_ice_thresh", "acure_convective_plume_scavenging",
    "acure_scav_diam", "acure_bc_ri",
    "acure_oxidants_oh", "acure_oxidants_o3",
    "bparam", "two_d_fsd_factor", "c_r_correl",
    "acure_autoconv_exp_lwp", "acure_autoconv_exp_nd", "dbsdtbs_turb_0",
    "ai", "m_ci", "a_ent_1_rp",
]

# Number of kernel length scales: one per feature that is not excluded.
# NOTE(review): 59 is hard-coded -- presumably the total feature count; with the
# 58 exclusions above this leaves a single length scale. Confirm the total.
kernelLen = 59 - len(excludeFeats)

# Kernel: Matern (nu=0.5) with one unit length scale per retained feature,
# built and then upserted.
GPR_kernel = (
    c3.SklearnGPRKernelMatern(lengthScale=[1.0] * kernelLen, nu=0.5, coefficient=1.0)
    .build()
    .kernel
    .upsert()
)

# Technique: GPR using the kernel above with a fixed random state, upserted.
GPR_technique = c3.GaussianProcessRegressionTechnique(
    randomState=42,
    kernel=GPR_kernel,
).upsert()

# Data source spec: features come from SimulationModelParameters (minus the
# excluded ones), targets from Simulation3HourlyAODOutput restricted by the
# combined GSTP filter.
# NOTE(review): limit=-1 presumably disables the fetch row limit -- confirm.
GPR_dataspec = c3.GPRDataSourceSpec(
    featuresType=c3.TypeRef(typeName="SimulationModelParameters"),
    featuresSpec=c3.FetchSpec(limit=-1),
    excludeFeatures=excludeFeats,
    targetType=c3.TypeRef(typeName="Simulation3HourlyAODOutput"),
    targetSpec=c3.FetchSpec(filter=gstpFilter, limit=-1),
    targetName="all",
).upsert()

# Pipe: ties the technique to the data source spec (not upserted here).
GPR_pipe = c3.GaussianProcessRegressionPipe(technique=GPR_technique, dataSourceSpec=GPR_dataspec)

In [31]:
import pandas as pd

# Feature table exposed by the pipe, converted to a pandas frame.
X = GPR_pipe.getFeatures()
dfX = c3.Dataset.toPandas(dataset=X)

# Stack one copy of the feature table per distinct GSTP filter, so that X is
# intended to have as many rows as the target.
# NOTE(review): this assumes the target rows come back grouped by GSTP, with the
# simulations in the same order as dfX inside each group; the target fetch sets
# no explicit ordering -- verify that X and y rows actually line up.
X = c3.Dataset.fromPython(
    pd.concat(
        [dfX for _ in geoSurfaceTimePoints],
        axis=0,
        ignore_index=True,
    )
)

# Display the tiled feature table.
c3.Dataset.toPandas(dataset=X)

Unnamed: 0,acure_dry_dep_acc
0,0.500000
1,0.470000
2,0.618559
3,0.407896
4,0.746683
...,...
2647,0.853673
2648,0.511722
2649,0.833222
2650,0.910051


In [32]:
# Target values exposed by the pipe, converted to pandas for inspection.
y = GPR_pipe.getTarget()
dfy = c3.Dataset.toPandas(dataset=y)
# Display the target frame; its row count is meant to equal that of X.
dfy

Unnamed: 0,all
0,0.413134
1,0.411247
2,0.492994
3,0.374601
4,0.308180
...,...
2647,0.324252
2648,0.297467
2649,0.258230
2650,0.160031


# Development

In [23]:
import pandas as pd

# Fetch the stored data source spec by the id referenced from the pipe.
# NOTE(review): presumably needed because the pipe holds only a reference to the
# spec -- confirm.
dataSourceSpec = c3.GPRDataSourceSpec.get(GPR_pipe.dataSourceSpec.id)

In [28]:
# Display the fetched spec to inspect its stored fields.
dataSourceSpec

c3.GPRDataSourceSpec(
 id='S',
 meta=c3.Meta(
        tenantTagId=151,
        tenant='dev',
        tag='tc02d',
        created=datetime.datetime(2022, 8, 18, 16, 28, 55, tzinfo=datetime.timezone.utc),
        createdBy='jcarzon@andrew.cmu.edu',
        updated=datetime.datetime(2022, 8, 18, 16, 28, 55, tzinfo=datetime.timezone.utc),
        updatedBy='jcarzon@andrew.cmu.edu',
        timestamp=datetime.datetime(2022, 8, 18, 16, 28, 55, tzinfo=datetime.timezone.utc),
        fetchInclude='[]',
        fetchType='GPRDataSourceSpec'),
 version=1,
 featuresType=c3.TypeRef(typeName='SimulationModelParameters'),
 featuresSpec=c3.FetchSpec(offset=0, limit=-1),
 excludeFeatures=c3.Arry<string>(['acure_bl_nuc',
                   'acure_ait_width',
                   'acure_cloud_ph',
                   'acure_carb_ff_ems',
                   'acure_carb_ff_ems_eur',
                   'acure_carb_ff_ems_nam',
                   'acure_carb_ff_ems_chi',
                   'acure_carb_ff_ems_

In [29]:
# Resolve the target type named in the spec and fetch its rows using the
# spec's target fetch settings.
targetType = dataSourceSpec.targetType.toType()
outputTableC3 = targetType.fetch(dataSourceSpec.targetSpec).objs.toJson()

# Load the fetched objects into pandas and discard the "version" column so it
# does not take part in later processing.
outputTablePandas = pd.DataFrame(outputTableC3).drop(columns="version")

In [25]:
# Keep only the numeric columns of the fetched target table.
outputTablePandas = outputTablePandas.select_dtypes(include="number")

In [26]:
# Reduce the numeric table to a single target column named after targetName.
if dataSourceSpec.targetName != "all":
    # A specific target was requested: keep just that column.
    outputTablePandas = pd.DataFrame(outputTablePandas[dataSourceSpec.targetName])
else:
    # "all": the target is the row-wise sum over every numeric column.
    outputTablePandas = outputTablePandas.sum(axis=1).to_frame(name=dataSourceSpec.targetName)

In [27]:
# Convert the single-column target frame to a c3.Dataset and display it.
c3.Dataset.fromPython(outputTablePandas)

c3.Dataset(
 shape=c3.Arry<int>([2000, 1]),
 indices=c3.Mapp<int, [string]>({0: c3.Arry<string>(['0', '1', '2', ..., '1997', '1998', '1999']),
           1: c3.Arry<string>(['all'])}),
 m_data=c3.Arry<double>([0.41313433650000003,
          0.41124703549999997,
          0.492994244,
          ...,
          0.22186180700000002,
          0.4444020461,
          0.38097793300000005]))