In [3]:
def get_features_and_outputs_from_simulations(output_name, timestamp):
    """
    Assemble train/test feature and output datasets from the stored simulations.

    The simulation parameters are fetched and loaded into a pandas frame; an
    ad-hoc SimpleMetric averaging `output_name` is then evaluated on
    SimulationSample for the fetched ids at the single instant `timestamp`
    (start == end, interval "SECOND"). Features and metric values are split
    90/10 with a fixed seed and each part is converted with c3.Dataset.fromPython.

    Side effect: the metric definition is printed.

    Inputs:
    - str output_name: name of the variable in SimulationModelOutput under investigation
    - str timestamp: timestamp at which the timeseries is evaluated, e.g. "2017-08-19T09:00:00.000"

    Returns:
    - c3.Dataset X_train: train set of features
    - c3.Dataset X_test: test set of features
    - c3.Dataset y_train: train set of outputs
    - c3.Dataset y_test: test set of outputs

    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Load the fetched simulation parameters into a pandas frame.
    param_frame = pd.DataFrame(c3.SimulationModelParameters.fetch().objs.toJson())
    sim_ids = pd.DataFrame(param_frame['id'])
    # Features are every column from position 5 onward.
    # NOTE(review): presumably the first five columns are id/metadata fields — TODO confirm.
    feature_columns = param_frame.columns[5:]
    features = param_frame[feature_columns]

    # Ad-hoc metric: average of the requested output over the sample's timeseries.
    metric_name = "Average_" + output_name + "_SimulationSample"
    metric = c3.SimpleMetric(
        id=metric_name,
        name=metric_name,
        description="Calculates average of " + output_name + " for a given set of SimulationSample",
        srcType="SimulationSample",
        path="output",  # the timeseries is in the output field of SimSam
        expression="avg(avg(normalized.data." + output_name + "))",
    )
    print(metric)

    # Evaluate the metric for every fetched id at the single requested instant.
    eval_spec = c3.EvalMetricsSpec(
        ids=sim_ids['id'],
        expressions=[metric_name],
        start=timestamp,
        end=timestamp,
        interval="SECOND",
    )
    eval_result = c3.SimulationSample.evalMetricsWithMetadata(
        spec=eval_spec,
        overrideMetrics=[metric],
    )
    outputs = c3.EvalMetricsResult.toPandas(result=eval_result)

    # 90/10 split with a fixed seed; order is (X_train, X_test, y_train, y_test).
    split_parts = train_test_split(features, outputs, test_size=0.1, random_state=42)

    # Convert each pandas part into a c3 Dataset, preserving the split order.
    X_train, X_test, y_train, y_test = [c3.Dataset.fromPython(part) for part in split_parts]

    return X_train, X_test, y_train, y_test

In [4]:
# Build the train/test datasets for the "mass_BC_acc" output at the chosen timestamp.
X_train, X_test, y_train, y_test = get_features_and_outputs_from_simulations("mass_BC_acc", "2017-08-19T09:00:00.000")

c3.SimpleMetric(
 name='Average_mass_BC_acc_SimulationSample',
 expression='avg(avg(normalized.data.mass_BC_acc))',
 description='Calculates average of mass_BC_acc for a given set of '
              'SimulationSample',
 id='Average_mass_BC_acc_SimulationSample',
 srcType=c3.TypeRef(typeName='SimulationSample'),
 path='output')


In [30]:
# Matern kernel with one unit length scale per column of X_train
GPR_kernel = c3.SklearnGPRKernelMatern(
    lengthScale=[1.0] * X_train.shape[1],
    nu=0.5,
    coefficient=1.0,
).build().kernel

# regression technique that uses this kernel, with a fixed randomState
GPReg_technique = c3.GaussianProcessRegressionTechnique(kernel=GPR_kernel, randomState=42)

# wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(technique=GPReg_technique)

# fit the pipe on the training features and outputs
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

## Save model to table

In [6]:
# Persist the trained pipe; the displayed result carries the id assigned to the saved record.
trained_GPReg_pipe.upsert()

c3.GaussianProcessRegressionPipe(
 id='ba830ce6-76a0-4fc8-9f12-4557ca0a094d',
 meta=c3.Meta(
        created=datetime.datetime(2022, 4, 28, 12, 10, tzinfo=datetime.timezone.utc),
        updated=datetime.datetime(2022, 4, 28, 12, 10, tzinfo=datetime.timezone.utc),
        timestamp=datetime.datetime(2022, 4, 28, 12, 10, tzinfo=datetime.timezone.utc)),
 version=1,
 typeIdent='PIPE:LF:GPREG',
 noTrainScore=False,
 untrainableOverride=False)

# Retrieve kernel parameters from trained model

### first build the pipeline and check parameters

In [24]:
# Re-fetch the datasets so this section does not depend on the cells above,
# then assemble an (untrained) pipe with a different Matern configuration.
X_train, X_test, y_train, y_test = get_features_and_outputs_from_simulations(
    "mass_BC_acc",
    "2017-08-19T09:00:00.000",
)

# Matern kernel: unit length scale per column of X_train, nu=2.0, coefficient=10.0
GPR_kernel = c3.SklearnGPRKernelMatern(
    lengthScale=[1.0] * X_train.shape[1],
    nu=2.0,
    coefficient=10.0,
).build().kernel

# technique with a fixed randomState, wrapped in a pipe (not trained yet)
GPReg_technique = c3.GaussianProcessRegressionTechnique(kernel=GPR_kernel, randomState=42)
GPReg_pipe = c3.GaussianProcessRegressionPipe(technique=GPReg_technique)

In [25]:
# Display the untrained pipe to inspect the kernel hyperparameters it was configured with.
GPReg_pipe

c3.GaussianProcessRegressionPipe(
 noTrainScore=False,
 untrainableOverride=False,
 technique=c3.GaussianProcessRegressionTechnique(
             randomState=42,
             kernel=c3.SklearnGPRKernel(
                      name='Matern',
                      hyperParameters=c3.Mapp<string, any>({'coefficient': 10.0,
                                        'lengthScale': c3.Arry<double>([1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                        1.0,
                                                     

### now train the model and check parameters again

In [26]:
# Fit the pipe on the training features and outputs; the result is kept as a separate trained pipe.
trained_GPReg_pipe = GPReg_pipe.train(input=X_train, targetOutput=y_train)

In [27]:
# Deserialize the model stored on the trained pipe so the fitted estimator can be inspected directly.
# NOTE(review): presumably a scikit-learn GaussianProcessRegressor — TODO confirm.
skm = c3.PythonSerialization.deserialize(serialized=trained_GPReg_pipe.trainedModel.model)

In [28]:
# Show the parameters of the estimator's `kernel_` after training, for comparison with the
# initial hyperparameters displayed for the untrained pipe.
skm.kernel_.get_params()

{'k1': 0.00316**2,
 'k2': Matern(length_scale=[6.23e+04, 5.16e+04, 5.13e+04, 1, 6.59e+04, 3.97e+04, 3.57e+04, 4.04e+04, 4.78e+04, 7.45e+04, 1, 6.13e+04, 7.28e+04, 3.15e+04, 3.45e+04, 7.21e+04, 5.3e+04, 1, 6.46e+04, 8.63e+04, 3.74e+04, 5.81e+04, 2.87e+04, 6.51e+04, 3.67e+04, 5.49e+04, 4.01e+04, 4.07e+04, 1, 4.3e+04, 3.98e+04, 5.07e+04, 7.29e+04, 5.78e+04, 5.11e+04, 2.69e+04, 4.7e+04, 2.43e+04, 6.29e+04, 3.33e+04, 7.43e+04, 5.23e+04, 5.5e+04, 6.72e+04, 5.85e+04, 6.47e+04, 1e+05, 4.29e+04, 7.4e+04, 4.58e+04, 6.45e+04, 4.65e+04, 4.98e+04, 4.01e+04, 4.32e+04, 4.01e+04, 5.09e+04, 7.67e+03, 5.01e+04], nu=2),
 'k1__constant_value': 9.999999999999997e-06,
 'k1__constant_value_bounds': (1e-05, 100000.0),
 'k2__length_scale': array([6.23170262e+04, 5.15936775e+04, 5.12662277e+04, 1.00000000e+00,
        6.59283024e+04, 3.96518608e+04, 3.56844325e+04, 4.03760757e+04,
        4.78200726e+04, 7.45059474e+04, 1.00000000e+00, 6.12592604e+04,
        7.27863173e+04, 3.14812698e+04, 3.44927639e+04, 7.21

In [34]:
# upsert model: persist the trained pipe to the table; the id of the saved record
# appears in the displayed result and is needed to retrieve the pipe later
trained_GPReg_pipe.upsert()

c3.GaussianProcessRegressionPipe(
 id='42264fbe-d73f-4143-83ef-0c9d8affed78',
 meta=c3.Meta(
        created=datetime.datetime(2022, 5, 9, 19, 11, 1, tzinfo=datetime.timezone.utc),
        updated=datetime.datetime(2022, 5, 9, 19, 11, 1, tzinfo=datetime.timezone.utc),
        timestamp=datetime.datetime(2022, 5, 9, 19, 11, 1, tzinfo=datetime.timezone.utc)),
 version=1,
 typeIdent='PIPE:LF:GPREG',
 noTrainScore=False,
 untrainableOverride=False)

### Now retrieve the saved model from the table and check its parameters

In [30]:
# use the id that came out of the previous cell
# NOTE(review): the id hard-coded below ('9f5ea300-...') is not the id shown in the upsert
# output above ('42264fbe-...'), and the execution counts indicate the upsert cell was re-run
# after this one, so this appears to fetch a pipe saved by an earlier run. To survive
# Restart & Run All, capture the object returned by `upsert()` and pass its `id` here
# instead of a literal — TODO confirm.
trainedPipe = c3.GaussianProcessRegressionPipe.get('9f5ea300-2f2f-41b6-a973-df6b1dd00781')

In [31]:
# Deserialize the model stored on the pipe retrieved from the table.
skm_after = c3.PythonSerialization.deserialize(serialized=trainedPipe.trainedModel.model)

In [32]:
# Show the kernel parameters of the retrieved model, to compare against the values seen right after training.
skm_after.kernel_.get_params()

{'k1': 0.00316**2,
 'k2': Matern(length_scale=[6.23e+04, 5.16e+04, 5.13e+04, 1, 6.59e+04, 3.97e+04, 3.57e+04, 4.04e+04, 4.78e+04, 7.45e+04, 1, 6.13e+04, 7.28e+04, 3.15e+04, 3.45e+04, 7.21e+04, 5.3e+04, 1, 6.46e+04, 8.63e+04, 3.74e+04, 5.81e+04, 2.87e+04, 6.51e+04, 3.67e+04, 5.49e+04, 4.01e+04, 4.07e+04, 1, 4.3e+04, 3.98e+04, 5.07e+04, 7.29e+04, 5.78e+04, 5.11e+04, 2.69e+04, 4.7e+04, 2.43e+04, 6.29e+04, 3.33e+04, 7.43e+04, 5.23e+04, 5.5e+04, 6.72e+04, 5.85e+04, 6.47e+04, 1e+05, 4.29e+04, 7.4e+04, 4.58e+04, 6.45e+04, 4.65e+04, 4.98e+04, 4.01e+04, 4.32e+04, 4.01e+04, 5.09e+04, 7.67e+03, 5.01e+04], nu=2),
 'k1__constant_value': 9.999999999999997e-06,
 'k1__constant_value_bounds': (1e-05, 100000.0),
 'k2__length_scale': array([6.23170262e+04, 5.15936775e+04, 5.12662277e+04, 1.00000000e+00,
        6.59283024e+04, 3.96518608e+04, 3.56844325e+04, 4.03760757e+04,
        4.78200726e+04, 7.45059474e+04, 1.00000000e+00, 6.12592604e+04,
        7.27863173e+04, 3.14812698e+04, 3.44927639e+04, 7.21

The kernel parameters of the model retrieved from the table match the ones we had right after training.