In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Get the data

In [1]:
def get_features_and_outputs_from_simulations(output_name, timestamp, test_size=0.1, random_state=42):
    """
    Build train/test feature and output datasets from the stored simulations.

    Reads the simulation parameter table (c3.SimulationModelParameters), evaluates an
    ad-hoc SimpleMetric on c3.SimulationSample to obtain the value of `output_name`
    at `timestamp` for every simulation id, splits features and outputs into
    train/test sets, and returns a c3.Dataset for each of the four splits.


    Inputs:
    - str output_name: name of the variable in SimulationModelOutput under investigation
    - str timestamp: timestamp at which the timeseries is evaluated, e.g. "2017-08-19T09:00:00.000"
    - float or int test_size: forwarded to sklearn's train_test_split (default 0.1)
    - int random_state: seed forwarded to sklearn's train_test_split (default 42)

    Returns:
    - c3.Dataset X_train: train set of features
    - c3.Dataset X_test: test set of features
    - c3.Dataset y_train: train set of outputs
    - c3.Dataset y_test: test set of outputs

    """
    # Imported locally so the function does not depend on the notebook's import
    # cell having been executed first.
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # fetch simulation parameters and load them into a DataFrame
    parameters = c3.SimulationModelParameters.fetch().objs
    parameters = parameters.toJson()
    df = pd.DataFrame(parameters)
    simulation_ids = df['id']
    # NOTE(review): features are selected by position; this assumes the first 5
    # columns of the parameter table are non-feature fields -- TODO confirm.
    X = df[df.columns[5:]]

    # define a simple metric that averages the requested output variable
    metric_name = "Average_" + output_name + "_SimulationSample"
    metric_descr = "Calculates average of " + output_name + " for a given set of SimulationSample"
    metric_expr = "avg(avg(normalized.data." + output_name + "))"
    metric = c3.SimpleMetric(
        id=metric_name,
        name=metric_name,
        description=metric_descr,
        srcType="SimulationSample",
        path="output",  # the timeseries is in the output field of SimSam
        expression=metric_expr,
    )

    # define metric spec; start and end are both the requested timestamp
    spec = c3.EvalMetricsSpec(
        ids=simulation_ids,
        expressions=[metric_name],
        start=timestamp,
        end=timestamp,
        interval="SECOND",
    )

    # evaluate metric, cast it to pandas
    evalMetricsResult = c3.SimulationSample.evalMetricsWithMetadata(
        spec=spec,
        overrideMetrics=[metric],
    )
    y = c3.EvalMetricsResult.toPandas(result=evalMetricsResult)

    # split into train/test sets
    # NOTE(review): X and y are paired by row position; this assumes the metric
    # result has one row per simulation id, in the same order as `ids` -- TODO confirm.
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # cast into c3 Datasets
    X_train = c3.Dataset.fromPython(X_train_df)
    X_test = c3.Dataset.fromPython(X_test_df)
    y_train = c3.Dataset.fromPython(y_train_df)
    y_test = c3.Dataset.fromPython(y_test_df)

    return X_train, X_test, y_train, y_test

In [2]:
# Fetch features/outputs for one output variable at one timestamp and split them.
X_train, X_test, y_train, y_test = get_features_and_outputs_from_simulations(
    output_name="mass_BC_acc",
    timestamp="2017-08-19T09:00:00.000",
)

# Build the pipeline

In [3]:
# First step of the pipeline: PCA keeping 20 components.
pca_technique = c3.PrincipalComponentAnalysisTechnique(nComponents=20)
pca_pipe = c3.PrincipalComponentAnalysisPipe(technique=pca_technique)
pca = c3.MLStep(name="PCA", pipe=pca_pipe)

In [4]:
# Second step of the pipeline: Gaussian Process regression.
# NOTE(review): the kernel is constant-only. The recorded output of the prediction
# cell shows the same value for every test row, which is what a constant kernel
# would be expected to produce -- consider a non-constant kernel; TODO confirm.
GPR_kernel = c3.SklearnGPRKernelConstant(constantValue=1.0).build().kernel

gpr_technique = c3.GaussianProcessRegressionTechnique(
    randomState=42,
    kernel=GPR_kernel,
)
gpr_pipe = c3.GaussianProcessRegressionPipe(technique=gpr_technique)
gp_reg = c3.MLStep(name="GaussianProcessRegression", pipe=gpr_pipe)

In [5]:
# Chain the two steps into a serial pipeline: PCA first, then GP regression.
pipeline = c3.MLSerialPipeline(steps=[pca, gp_reg])

# Train the pipeline

In [6]:
# Fit the whole pipeline on the training features and their outputs.
trained_pipeline = pipeline.train(
    input=X_train,
    targetOutput=y_train,
)

# Make predictions

In [7]:
# Run the trained pipeline on the held-out features. The raw pipeline result gets
# its own name so that `y_test_preds` only ever refers to the pandas DataFrame.
y_test_preds_dataset = trained_pipeline.process(input=X_test)
y_test_preds = c3.Dataset.toPandas(dataset=y_test_preds_dataset)
y_test_preds

Unnamed: 0,0
0,1.750878e-10
1,1.750878e-10
2,1.750878e-10
3,1.750878e-10
4,1.750878e-10
5,1.750878e-10
6,1.750878e-10
7,1.750878e-10
8,1.750878e-10
9,1.750878e-10


In [8]:
# Show the hyper-parameters reported by the trained pipeline.
fitted_hyper_params = trained_pipeline.hyperParams()
print(fitted_hyper_params)

c3.Mapp<string, any>({'GaussianProcessRegression__kernel': c3.SklearnGPRKernel(
                                       name='Constant',
                                       hyperParameters=c3.Arry<double>([1.0]),
                                       pickledKernel='eJxrYEouzs5JTSzK00tPLC0uzkzMiy8oyk9OLS7Wy04tykvNKeZyzs8rLknMK/EG87kKGTQbCxlrC5k0IvgYGBiSobLxZYk5pamFzO72HxjAIEIUQzY+Kb80L6W4kMXd7smPrR2PMz66O/zIAqtuK2QtTdIDACemMe8='),
 'GaussianProcessRegression__randomState': 42.0})


In [None]:
# NOTE(review): this cell has no execution count (it was never run) and calls
# withHyperParameters() with no arguments and without assigning the result.
# It looks like leftover exploration -- confirm the intended arguments or remove
# the cell before sharing the notebook.
trained_pipeline.withHyperParameters()