# Data loading (run this before testing kernels)

In [1]:
def get_features_and_outputs_from_simulations(output_name, timestamp,
                                              test_size=0.1, random_state=42):
    """
    Reads the simulation parameter map, fetches Cassandra to obtain the corresponding output
    for the specified timestamp.
    Splits the data into train/test sets and returns a c3.Dataset for each one of them.


    Inputs:
    - str output_name: name of the variable in SimulationModelOutput under investigation
    - str timestamp: timestamp at which to fetch the timeseries, e.g. "2017-08-19T09:00:00.000"
    - float test_size: forwarded to sklearn's train_test_split (default 0.1)
    - int random_state: forwarded to sklearn's train_test_split (default 42)

    Returns:
    - c3.Dataset X_train: train set of features
    - c3.Dataset X_test: test set of features
    - c3.Dataset y_train: train set of outputs
    - c3.Dataset y_test: test set of outputs

    Raises:
    - ValueError: if the number of feature rows differs from the number of output rows

    """
    # kept local because the notebook has no dedicated import cell
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # fetch simulation parameters and load them into pandas
    parameters = c3.SimulationModelParameters.fetch().objs
    df = pd.DataFrame(parameters.toJson())
    simulation_ids = df['id'].tolist()

    # features are every column from the sixth one onwards
    # NOTE(review): assumes the first five columns are non-feature fields - confirm against the type
    X = df[df.columns[5:]]

    # define simple metric
    metric_name = "Average_" + output_name + "_SimulationSample"
    metric_descr = "Calculates average of " + output_name + " for a given set of SimulationSample"
    metric_expr = "avg(avg(normalized.data." + output_name + "))"
    metric = c3.SimpleMetric(
        id=metric_name,
        name=metric_name,
        description=metric_descr,
        srcType="SimulationSample",
        path="output",  # the timeseries is in the output field of SimSam
        expression=metric_expr,
    )

    # define metric spec: start == end, so the metric is requested at `timestamp` only
    spec = c3.EvalMetricsSpec(
        ids=simulation_ids,
        expressions=[metric_name],
        start=timestamp,
        end=timestamp,
        interval="SECOND",
    )

    # evaluate metric, cast it to pandas
    evalMetricsResult = c3.SimulationSample.evalMetricsWithMetadata(
        spec=spec,
        overrideMetrics=[metric],
    )
    y = c3.EvalMetricsResult.toPandas(result=evalMetricsResult)

    # train_test_split pairs rows by position, so both frames need the same length
    if len(X) != len(y):
        raise ValueError(
            "Got " + str(len(X)) + " feature rows but " + str(len(y))
            + " output rows for " + output_name + " at " + timestamp
        )

    # split into train/test sets
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # cast into c3 Datasets
    X_train = c3.Dataset.fromPython(X_train_df)
    X_test = c3.Dataset.fromPython(X_test_df)
    y_train = c3.Dataset.fromPython(y_train_df)
    y_test = c3.Dataset.fromPython(y_test_df)

    return X_train, X_test, y_train, y_test

In [2]:
# Load the train/test datasets used by every "In the pipe" cell of this notebook
X_train, X_test, y_train, y_test = get_features_and_outputs_from_simulations(
    output_name="mass_BC_acc",
    timestamp="2017-08-19T09:00:00.000",
)

# Constant Kernel

In [1]:
# Build a constant kernel wrapper with constantValue=2
const_kernel = c3.SklearnGPRKernelConstant(constantValue=2).build()

In [2]:
# Display the built wrapper: name, hyperParameters and pickledKernel of the nested kernel
const_kernel

c3.SklearnGPRKernelConstant(
 constantValue=2.0,
 kernel=c3.SklearnGPRKernel(
          name='Constant',
          hyperParameters=c3.Mapp<string, double>({'constantValue': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeZyzs8rLknMK/EG87kKGTQbCxlrC5k0IvgYGBiSobLxZYk5pamFzO4ODBAQIYohG5+UX5qXUlzI4m735MfWjscZH90dfmSBVbcVspYm6QEA8IMxAA=='))

In [3]:
# Deserialize the pickled kernel stored on the wrapper back into a Python object
python_kernel = c3.PythonSerialization.deserialize(serialized=const_kernel.kernel.pickledKernel)

In [4]:
# Check which Python class the deserialized kernel is
type(python_kernel)

sklearn.gaussian_process.kernels.ConstantKernel

In [5]:
# Inspect the parameters carried by the deserialized kernel
python_kernel.get_params()

{'constant_value': 2.0, 'constant_value_bounds': (1e-05, 100000.0)}

### In the pipe

In [None]:
# Kernel under test: constant kernel with constantValue=1.0 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelConstant(constantValue=1.0)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

In [None]:
# Sanity check: the trained pipe must carry a trained model
assert trained_GPReg_pipe.trainedModel is not None

In [None]:
# Inspect the hyperparameters reported by the trained pipe
trained_GPReg_pipe.hyperParams()

**WORKS!**

# Dot Product Kernel

In [6]:
# Build a DotProduct kernel wrapper (sigmaZero=2) and print its c3 representation
kernel = c3.SklearnGPRKernelDotProduct(sigmaZero=2).build()
print(kernel)

# Deserialize the pickled kernel, then print its Python class and its parameters
python_kernel = c3.PythonSerialization.deserialize(
    serialized=kernel.kernel.pickledKernel,
)
print(type(python_kernel), python_kernel.get_params(), sep="\n")

c3.SklearnGPRKernelDotProduct(
 sigmaZero=2.0,
 kernel=c3.SklearnGPRKernel(
          name='DotProduct',
          hyperParameters=c3.Mapp<string, double>({'sigmaZero': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeZyyS8JKMpPKU0u4Spk0GwsZKwtZNKIYGdgYCjOTM9NjDcoZHZ3YICACD6EcHxSfmleSnEhi7vdkx9bOx5nfHR3+JEFVtZWyFqapAcASjEoww=='))
<class 'sklearn.gaussian_process.kernels.DotProduct'>
{'sigma_0': 2.0, 'sigma_0_bounds': (1e-05, 100000.0)}


### In the pipe

In [None]:
# Kernel under test: DotProduct kernel with sigmaZero=1.0 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelDotProduct(sigmaZero=1.0)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

In [None]:
# Sanity check: the trained pipe must carry a trained model
assert trained_GPReg_pipe.trainedModel is not None

In [None]:
# Inspect the hyperparameters reported by the trained pipe
trained_GPReg_pipe.hyperParams()

**WORKS!**

# ExpSineSquared Kernel

In [12]:
# Build an ExpSineSquared kernel wrapper (lengthScale=2, periodicity=2) and print its c3 representation
kernel = c3.SklearnGPRKernelExpSineSquared(
    lengthScale=2,
    periodicity=2,
).build()
print(kernel)

# Deserialize the pickled kernel, then print its Python class and its parameters
python_kernel = c3.PythonSerialization.deserialize(
    serialized=kernel.kernel.pickledKernel,
)
print(type(python_kernel), python_kernel.get_params(), sep="\n")

c3.SklearnGPRKernelExpSineSquared(
 lengthScale=2.0,
 periodicity=2.0,
 kernel=c3.SklearnGPRKernel(
          name='ExpSineSquared',
          hyperParameters=c3.Mapp<string, double>({'lengthScale': 2.0,
                            'periodicity': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeZyrSgIzsxLDS4sTSxKTeEqZNBsLGSsLWTSiOBhYGDISc1LL8mIL05OzEktZHZ3YICACG4gUZBalJmfkpmcWVJZyIKQEkbTFp+UX5qXUlzI6m735MfWjscZH90dfmSB1bYVskUIoZoEU82OTTVHaZIeAKjRRiE='))
<class 'sklearn.gaussian_process.kernels.ExpSineSquared'>
{'length_scale': 2.0, 'periodicity': 2.0, 'length_scale_bounds': (1e-05, 100000.0), 'periodicity_bounds': (1e-05, 100000.0)}


### In the pipe

In [None]:
# Kernel under test: ExpSineSquared kernel with lengthScale=10, periodicity=1 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelExpSineSquared(lengthScale=10, periodicity=1)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

This pipe does not *train*, but the failure comes from an error in the sklearn implementation.

# Matern Kernel

In [8]:
# Build a Matern kernel wrapper (lengthScale=2, nu=2) and print its c3 representation
kernel = c3.SklearnGPRKernelMatern(
    lengthScale=2,
    nu=2,
).build()
print(kernel)

# Deserialize the pickled kernel, then print its Python class and its parameters
python_kernel = c3.PythonSerialization.deserialize(
    serialized=kernel.kernel.pickledKernel,
)
print(type(python_kernel), python_kernel.get_params(), sep="\n")

c3.SklearnGPRKernelMatern(
 lengthScale=2.0,
 nu=2.0,
 kernel=c3.SklearnGPRKernel(
          name='Matern',
          hyperParameters=c3.Mapp<string, double>({'lengthScale': 2.0,
                            'nu': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKebyTSwBsrgKGTQbCxlrC5k0IngYGBhyUvPSSzLii5MTc1ILmd0dGCAgQhhNLj4pvzQvpbiQxd3uyY+tHY8zPro7/MgCq20rZI1gAtJ5pYVscANKk/QAf/kt+Q=='))
<class 'sklearn.gaussian_process.kernels.Matern'>
{'length_scale': 2.0, 'length_scale_bounds': (1e-05, 100000.0), 'nu': 2.0}


### In the pipe

In [None]:
# Kernel under test: Matern kernel with lengthScale=2, nu=1 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelMatern(lengthScale=2, nu=1)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

In [None]:
# Sanity check: the trained pipe must carry a trained model
assert trained_GPReg_pipe.trainedModel is not None

In [None]:
# Inspect the hyperparameters reported by the trained pipe
trained_GPReg_pipe.hyperParams()

**WORKS!**

# RBF Kernel

In [9]:
# Build an RBF kernel wrapper (lengthScale=2) and print its c3 representation
kernel = c3.SklearnGPRKernelRBF(lengthScale=2).build()
print(kernel)

# Deserialize the pickled kernel, then print its Python class and its parameters
python_kernel = c3.PythonSerialization.deserialize(
    serialized=kernel.kernel.pickledKernel,
)
print(type(python_kernel), python_kernel.get_params(), sep="\n")

c3.SklearnGPRKernelRBF(
 lengthScale=2.0,
 kernel=c3.SklearnGPRKernel(
          name='RBF',
          hyperParameters=c3.Mapp<string, double>({'lengthScale': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeYKcnLjKmTQbCxkrC1k0ojgYWBgyEnNSy/JiC9OTsxJLWR2d2CAgAhhNLn4pPzSvJTiQhZ3uyc/tnY8zvjo7vAjC6y2rZC1NEkPAKMMKjE='))
<class 'sklearn.gaussian_process.kernels.RBF'>
{'length_scale': 2.0, 'length_scale_bounds': (1e-05, 100000.0)}


### In the pipe

In [None]:
# Kernel under test: RBF kernel with lengthScale=2 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelRBF(lengthScale=2)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

In [None]:
# Sanity check: the trained pipe must carry a trained model
assert trained_GPReg_pipe.trainedModel is not None

In [None]:
# Inspect the hyperparameters reported by the trained pipe
trained_GPReg_pipe.hyperParams()

**WORKS!**

# Rational Quadratic Kernel

In [10]:
# Build a RationalQuadratic kernel wrapper (lengthScale=2, alpha=1) and print its c3 representation
kernel = c3.SklearnGPRKernelRationalQuadratic(
    lengthScale=2,
    alpha=1,
).build()
print(kernel)

# Deserialize the pickled kernel, then print its Python class and its parameters
python_kernel = c3.PythonSerialization.deserialize(
    serialized=kernel.kernel.pickledKernel,
)
print(type(python_kernel), python_kernel.get_params(), sep="\n")

c3.SklearnGPRKernelRationalQuadratic(
 lengthScale=2.0,
 alpha=1.0,
 kernel=c3.SklearnGPRKernel(
          name='RationalQuadratic',
          hyperParameters=c3.Mapp<string, double>({'alpha': 1.0,
                            'lengthScale': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeYKSizJzM9LzAksTUwpArKTuQoZNBsLGWsLmTQieBgYGHJS89JLMuKLkxNzUguZ3R0YICCCFUgk5hRkJBayuNt/gAoKo2mIT8ovzUspLmR1t3vyY2vH44yP7g4/ssBq2wrZwOaDzYCpY8emjqM0SQ8Ai8hDDQ=='))
<class 'sklearn.gaussian_process.kernels.RationalQuadratic'>
{'length_scale': 2.0, 'alpha': 1.0, 'length_scale_bounds': (1e-05, 100000.0), 'alpha_bounds': (1e-05, 100000.0)}


### In the pipe

In [None]:
# Kernel under test: RationalQuadratic kernel with lengthScale=2, alpha=1 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelRationalQuadratic(lengthScale=2, alpha=1)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

In [None]:
# Sanity check: the trained pipe must carry a trained model
assert trained_GPReg_pipe.trainedModel is not None

In [None]:
# Inspect the hyperparameters reported by the trained pipe
trained_GPReg_pipe.hyperParams()

**WORKS!**

# White Kernel

In [11]:
# Build a White kernel wrapper (noiseLevel=2) and print its c3 representation
kernel = c3.SklearnGPRKernelWhite(noiseLevel=2).build()
print(kernel)

# Deserialize the pickled kernel, then print its Python class and its parameters
python_kernel = c3.PythonSerialization.deserialize(
    serialized=kernel.kernel.pickledKernel,
)
print(type(python_kernel), python_kernel.get_params(), sep="\n")

c3.SklearnGPRKernelWhite(
 noiseLevel=2.0,
 kernel=c3.SklearnGPRKernel(
          name='White',
          hyperParameters=c3.Mapp<string, double>({'noiseLevel': 2.0}),
          pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeYKz8gsSfUGc7gKGTQbCxlrC5k0IrgZGBjy8jOLU+NzUstScwqZ3R0YICBCCFUqPim/NC+luJDF3e7Jj60djzM+ujv8yAIrbStkLU3SAwALsi0P'))
<class 'sklearn.gaussian_process.kernels.WhiteKernel'>
{'noise_level': 2.0, 'noise_level_bounds': (1e-05, 100000.0)}


### In the pipe

In [4]:
# Kernel under test: White kernel with noiseLevel=2 (only the nested kernel is kept)
GPR_kernel = (
    c3.SklearnGPRKernelWhite(noiseLevel=2)
    .build()
    .kernel
)

# Gaussian process regression technique using that kernel, randomState pinned to 42
GPReg_technique = c3.GaussianProcessRegressionTechnique(randomState=42, kernel=GPR_kernel)

# Wrap the technique in a pipe
GPReg_pipe = c3.GaussianProcessRegressionPipe(
    technique=GPReg_technique,
)

# Train the pipe on the training datasets
trained_GPReg_pipe = GPReg_pipe.train(
    input=X_train,
    targetOutput=y_train,
)

In [5]:
# Sanity check: the trained pipe must carry a trained model
assert trained_GPReg_pipe.trainedModel is not None

In [6]:
# Inspect the hyperparameters reported by the trained pipe (kernel and randomState)
trained_GPReg_pipe.hyperParams()

c3.Mapp<string, any>({'kernel': c3.SklearnGPRKernel(
            name='White',
            hyperParameters=c3.Arry<double>([2.0]),
            pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeYKz8gsSfUGc7gKGTQbCxlrC5k0IrgZGBjy8jOLU+NzUstScwqZ3R0YICBCCFUqPim/NC+luJDF3e7Jj60djzM+ujv8yAIrbStkLU3SAwALsi0P'),
 'randomState': 42})