In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [2]:
class Emulator:
    """
    Container for methods of training Gaussian Process emulators of various flexibilities and for evaluating the quality
    of these emulators.
    """

    def __init__(
        self,
        targetName="all",
        excludeFeats=[
            'acure_carb_res_ems_asi',
            'acure_carb_res_ems_lat',
            'acure_autoconv_exp_lwp',
            'acure_volc_so2',
            'acure_scav_diam',
            'acure_anth_so2_nam',
            'acure_sig_w',
            'acure_anth_so2_chi',
            'acure_carb_ff_ems_eur',
            'acure_carb_bb_ems_rsh',
            'acure_carb_bb_ems_sam',
            'acure_carb_bb_ems_bnh',
            'acure_oxidants_o3',
            'acure_kappa_oc',
            'acure_carb_bb_ems_rnh',
            'acure_carb_ff_ems_asi',
            'acure_carb_res_ems_r',
            'acure_anth_so2',
            'acure_carb_ff_ems_chi',
            'acure_rain_frac',
            'acure_carb_ff_ems_r',
            'acure_carb_bb_ems_saf',
            'm_ci',
            'acure_carb_ff_ems_nam',
            'c_r_correl',
            'acure_carb_bb_ems_naf',
            'acure_carb_res_ems_chi',
            'acure_carb_ff_ems_mar',
            'acure_carb_res_ems_afr',
            'acure_anth_so2_asi',
            'acure_prim_moc',
            'acure_anth_so2_eur',
            'two_d_fsd_factor',
            'acure_convective_plume_scavenging',
            'ai',
            'acure_carb_ff_diam',
            'acure_carb_res_ems',
            'acure_carb_bb_ems',
            'acure_carb_res_diam',
            'acure_carb_ff_ems',
            'acure_oxidants_oh',
            'acure_cloud_ice_thresh'
        ],
        nu=0.5,
        region=[0, 1, 0, 1],
        start_time="2017-07-01T00:00:00",
        end_time="2017-07-01T02:59:59",
        batchSize=10,
        centerTarget=True
    ):
        """
        Set the parameters of the emulator. Some parameters control the type of emulator in use (e.g. which features to
        include in training, which shape to use for the emulator) while others control where to emulate (in space and
        time).

        We repetitively use filters over the collection of GeoSurfaceTimePoints (a C3 Type) when training and finding
        models, so we declare for a particular Emulator instance what ranges on latitude, longitude, and time we want.
        To study another region in space and time, we must create a new Emulator instance.


        Arguments:

        targetname : string
            Choice of response variable. The default is the sum of all available categories of aerosol optical depth,
            a.k.a. "all."
        excludeFeats : list
            Which physical parameters to exclude from the emulator. The four default exclusions are parameters which
            were not perturbed in the given ensemble.
        nu : float
            The shape parameter for the Gaussian Process model for the emulator.
        region : list
            In the order: starting latitude, ending latitude, starting longitude, ending longitude
        start_time : string
        end_time : string
        batchSize : int
            If self.train=True, use this batch size when training the emulator as a (potentially large) collection of
            pixelwise models.
        centerTarget : bool
            When training, first center the target variable by subtracting off the mean? (Add it back to the predicted
            target to return to the correct scale.)


        Value:

        None
        """

        # Describe which model to fit
        self.targetName = targetName
        self.excludeFeats = excludeFeats
        self.inputs = list(
            set(self.__get_input_names__()) - set(self.excludeFeats)
        )
        self.nu = nu

        # Define where to consider emulation
        self.lon1 = region[0]
        self.lon2 = region[1]
        self.lat1 = region[2]
        self.lat2 = region[3]
        self.time1 = start_time
        self.time2 = end_time

        # Describe how to fit the emulator
        self.batchSize = batchSize
        self.centerTarget = centerTarget

        return



    """
    Training the emulator
    """



    def train(
        self,
        pixelwise=True,
        labels=None,
        lengthScales=None
    ):
        """
        This one method will train an emulator object of choice and return an object which can be used to obtain an
        emulation method. Some use cases require a pixelwise emulator, so in that case the value of this method is the
        batch training job. Other cases require an emulator which can incorporate geospatial correlations, in which case
        the Gaussian Process regression model used as the emulator will be returned.

        Creates attributes self.numLengthscales, self.GPR_kernel, and self.GPR_technique. Will create self.labels and
        self.lengthScales if corresponding arguments are not None.


        Arguments:

        pixelwise : bool
            Which type of emulator is desired, e.g. the pixelwise or full emulator?
        lengthScales : pandas.DataFrame
            If pixelwise is False, then supply a resource for finding which geoSurfaceTimePoints are to be used in
            training the full emulator regionally.
        labels : list
            If pixelwise is False, then supply a resource for finding which geoSurfaceTimePoints belong to each region so
            that the regional emulators may be trained separately.


        Value:

        C3 GaussianProcessRegressionPipe (if not pixelwise), or C3 AODGaussianMLTrainingJob (if pixelwise)
            Either a regional emulator itself or an object with which one can obtain all of the emulators.
        """


        # The regional emulator incorporates latitude, longitude, and time in predictions.
        self.numLengthscales = len(self.inputs)
        if not pixelwise:
            self.numLengthscales += 3


        # Labels and lengthscales help with regional emulation.
        if labels is not None:
            self.labels = labels
        if lengthScales is not None:
            self.lengthscales = lengthScales


        # Create objects used to define the training job.
        self.GPR_kernel = c3.SklearnGPRKernelMatern(
            lengthScale=[1.0]*self.numLengthscales,
            nu=self.nu,
            coefficient=1.0
        ).build().kernel.upsert()

        self.GPR_technique = c3.GaussianProcessRegressionTechnique(
            randomState=42,
            kernel=self.GPR_kernel
        )
        self.GPR_technique.centerTarget = self.centerTarget
        self.GPR_technique.upsert()


        # Start the training job.
        if pixelwise:
            return self.__train_pixelwise__()
        else:
            return self.__train_regional__()


    def __train_pixelwise__(
        self
    ):
        """
        Launch the batch job that trains one emulator per GeoSurfaceTimePoint.

        During initial stages of the experiment, we aim to describe homogeneous regions of the globe over time where
        the simulated data behave similarly, and to do that we use individual emulators for each point in space and
        time. This task is quite large and is exemplified in more detail at https://tc02d-dev.c3dti.ai/jupyter/notebooks/
        research_teams/gordon/applying-methods/BatchedML/AODMLTrainingJobControl.ipynb.

        Reads self.batchSize, self.targetName, self.GPR_technique and self.excludeFeats, and builds the point filter
        with self.__make_filter__(). self.GPR_technique is created by self.train().


        Arguments:

        None


        Value:

        C3 AODGaussianMLTrainingJob
            The upserted job. It has already been started when it is returned, so the caller can poll its status.
        """

        # Collect the job settings first, then build the options object from them
        option_fields = dict(
            batchSize=self.batchSize,
            gstpFilter=self.__make_filter__(),
            targetName=self.targetName,
            gprTechnique=self.GPR_technique,
            excludeFeatures=self.excludeFeats
        )
        training_options = c3.AODGaussianMLTrainingJobOptions(**option_fields)

        # Persist the job, kick it off, and hand it back for status checks
        training_job = c3.AODGaussianMLTrainingJob(options=training_options).upsert()
        training_job.start()

        return training_job


    def __train_regional__(
        self
    ):
        """
        Submit one asynchronous training action per cluster / region.

        Reads self.labels, self.lengthscales and self.GPR_technique, all of which are set by self.train().


        Arguments:

        None


        Value:

        dict
            Maps each model id ("regionalEmulator" followed by the cluster label) to the value returned by
            c3.AsyncAction.submit for that cluster's training action. The trained pipes themselves are not returned.
        """

        # A plain list compared with a scalar gives a single False rather than an element-wise mask, which breaks
        # the .loc selection below. Convert lists/tuples; leave arrays and Series untouched.
        labels = self.labels
        if isinstance(labels, (list, tuple)):
            labels = np.asarray(labels)

        model_training_actions = {}

        for idx in np.unique(labels):

            model_id = "regionalEmulator" + str(idx)

            # Ids of the pixelwise models that belong to this cluster
            gstp_ids = list(
                self.lengthscales.loc[labels == idx, "modelId"]
            )

            # NOTE(review): the upserted spec is never referenced afterwards; kept only because the upsert is a
            # side effect of the original code. Confirm whether it is needed.
            c3.GPRDataSourceSpec(
            ).upsert()

            GPR_pipe = c3.GaussianProcessRegressionPipe(
                technique=self.GPR_technique,
                id=model_id
            )

            # action spec
            spec = c3.AsyncActionSpec(
                typeName="GaussianProcessRegressionPipe",
                action="trainWithListOfAODModels",
                args={
                    "modelIds": gstp_ids,
                    "excludeFeatures": [],
                    "this": GPR_pipe.toJson()
                }
            )

            # submit action
            model_training_actions[model_id] = c3.AsyncAction.submit(spec)

        return model_training_actions


    def fetch_regional_emulators(
        self,
        model_training_actions
    ):

        for action in model_training_actions.values():
            if not action.hasCompleted():
                return

        models = {}
        cluster_idxs = np.unique(self.labels)
        
        for idx, action in zip(cluster_idxs, model_training_actions.keys()):
            models[idx] = c3.GaussianProcessRegressionPipe.get(action)

        return models


    """
    Perform emulation
    """



    def emulate_variant(
        self,
        pixelwise=True,
        synthDataset=None,
        regionalEmulators=None,
        queryVariant=None,
        featureNames=None,
        lengthScales=None,
        labels=None
    ):
        """
        Acquire the emulated response for a particular model variant over the whole map.


        Arguments:
        
        queryVariant : list of floats
            Values of parameters at which to emulate AOD


        Value:

        pandas DataFrame
            Emulated AOD and emulation error standard deviation
        """

        if pixelwise:

            points = lengthScales.loc[:, ["latitude", "longitude", "time"]]
            preds = pd.DataFrame()

            for row in lengthScales.index:

                pipe = c3.GaussianProcessRegressionPipe.get(lengthScales.modelId[row])
                
                df_c = self.__predict_with_single_pipe__(
                    pipe,
                    queryVariant,
                    idx=0,
                    pixelwise=True,
                    lengthScales=None,
                    labels=None
                )
                
                if self.centerTarget:
                    df_c.iloc[:, 0] += pipe.trainedModel.parameters['targetMean']

                preds = pd.concat([preds, df_c], axis=0)

            preds.columns = ['meanResponse', 'sdResponse']
            results = pd.concat(
                [points, preds.reset_index(drop=True)],
                axis=1
            ).sort_values(
                ['time', 'longitude', 'latitude']
            ).reset_index(
                drop=True
            )

            return results

        else:
            
            cluster_idxs = np.unique(labels)
            emulations = {}
        
            for idx in cluster_idxs:
                
                my_emulator = c3.GaussianProcessRegressionPipe.get(regionalEmulators[idx])

                emulations[idx] = self.__predict_with_single_pipe__(
                    my_emulator,
                    queryVariant,
                    idx=idx,
                    pixelwise=pixelwise,
                    lengthScales=lengthScales,
                    labels=labels
                )

            preds = pd.concat(
                list(emulations.values()),
                axis=0
            ).fillna(
                0
            ).sort_values(
                ['time', 'longitude', 'latitude']
            ).reset_index(
                drop=True
            )

            return preds


    def __emulate_pixelwise__(
        self,
        nVariants=50
    ):
        """
        Build a random synthetic dataset and request a prediction job for it.

        Stores the dataset as self.synthDataset. Reads self.excludeFeats, self.targetName, self.GPR_technique and
        self.batchSize; self.GPR_technique (and the self.numLengthscales needed by __make_synth_dataset__) are
        created by self.train().


        Arguments:

        nVariants : int
            Number of synthetic variants (rows) to generate.


        Value:

        Whatever c3.PredictAODGPR.makePredictionsJob returns.
        """
        synthetic = self.__make_synth_dataset__(nVariants)
        self.synthDataset = synthetic

        # Positional order is fixed by makePredictionsJob
        job_args = (
            self.excludeFeats,
            self.__make_filter__(),
            self.targetName,
            self.synthDataset,
            self.GPR_technique,
            self.batchSize,
        )

        return c3.PredictAODGPR.makePredictionsJob(*job_args)


    def __emulate_regional__(
        self        
    ):
        return


    def __predict_with_single_pipe__(
        self,
        pipe,
        queryVariant,
        idx=0,
        pixelwise=True,
        lengthScales=None,
        labels=None
    ):
        """
        Acquire the emulated response of one trained emulator for a particular model variant.


        Arguments:

        pipe : C3 GaussianProcessRegressionPipe type
            Trained emulator.
        queryVariant
            Values of parameters at which to emulate AOD. In the pixelwise case it is handed directly to
            c3.Dataset.fromPython. In the regional case it is repeated with pandas.concat, so it must be a pandas
            object; presumably a one-row DataFrame whose columns follow the order of self.inputs (TODO confirm).
        idx
            Regional case only: the cluster label whose points are predicted.
        pixelwise : bool
            Select the pixelwise (True) or regional (False) prediction path.
        lengthScales : pandas.DataFrame
            Regional case only: table with "latitude", "longitude" and "time" columns. It is not modified.
        labels
            Regional case only: cluster label of each row of lengthScales.


        Value:

        pandas DataFrame
            Pixelwise: the pipe output converted with c3.Dataset.toPandas (no mean is added back here).
            Regional: the selected coordinates, the variant's parameter columns, and meanResponse / sdResponse, with
            the training mean added back to meanResponse when self.centerTarget is set.
        """

        if pixelwise:

            # cast it into a c3.Dataset
            inputs = c3.Dataset.fromPython(pythonData=queryVariant)

            y_c = pipe.process(input=inputs, computeCov=True)

            return c3.Dataset.toPandas(y_c)

        # A plain list compared with a scalar gives a single False rather than an element-wise mask.
        if isinstance(labels, (list, tuple)):
            labels = np.asarray(labels)

        # Copy only the selected rows: this keeps the caller's table untouched and makes the column assignments
        # below operate on an independent frame rather than on a slice.
        predict_points = lengthScales.loc[
            labels == idx, ["latitude", "longitude", "time"]
        ].copy()

        # Repeat the variant once per point to be predicted
        variant_columns = np.array(
            pd.concat(
                [queryVariant]*predict_points.shape[0],
                axis=0
            )
        )

        predict_points[self.inputs] = variant_columns

        # cast it into a c3.Dataset
        inputs = c3.Dataset.fromPython(pythonData=predict_points)

        y_c = pipe.process(input=inputs, computeStd=True)
        df_c = np.array(
            c3.Dataset.toPandas(y_c)
        )

        predict_points[['meanResponse', 'sdResponse']] = df_c

        # Undo the centering applied during training
        if self.centerTarget:
            predict_points['meanResponse'] += pipe.trainedModel.parameters['targetMean']

        return predict_points


    def __make_synth_dataset__(
        self,
        nVariants
    ):
        """
        Draw a random synthetic design and wrap it as a C3 dataset.


        Arguments:

        nVariants : int
            Number of rows to draw.


        Value:

        The result of c3.Dataset.fromPython on an array of shape (nVariants, self.numLengthscales) filled by
        numpy.random.rand.
        """
        design_shape = (nVariants, self.numLengthscales)
        random_design = np.random.rand(*design_shape)

        # cast it into a c3.Dataset
        return c3.Dataset.fromPython(pythonData=random_design)



    """
    Visualize emulator
    """



    def plot_variant(
        self,
        preds
    ):
        """
        Given an emulated variant, plot it.


        Arguments:
        
        preds : pandas DataFrame
            The emulated variant. The result of method self.emulate_variant().


        Value:

        None
        """

        BBox = [self.lon1, self.lon2, self.lat1, self.lat2]

        for time in np.unique(preds.time):

            df = preds.loc[preds.time==time, :]

            data = df[
                (df.longitude >= BBox[0]) &
                (df.longitude <= BBox[1]) &
                (df.latitude >= BBox[2]) &
                (df.latitude <= BBox[3])
            ]
            
            projection = ccrs.PlateCarree(central_longitude=0)
            fig = plt.figure(figsize=(20,20), facecolor='yellow')

            # Draw island
            ax = fig.add_subplot(1, 1, 1, projection=projection)
            ax.coastlines()

            # Produce gridlines, coordinate labels
            ax.set_extent(BBox, ccrs.PlateCarree())
            ax.gridlines(draw_labels=True, crs=projection)

            # Add points along flight path and color for altitude
            scatter = ax.scatter(data.longitude, data.latitude, zorder=1, alpha=1,
                                 c=data.response, cmap="Reds")

            cbar = plt.colorbar(scatter, shrink=0.4)
            cbar.set_label('emulated AOD')

            plt.title(np.unique(preds.time)[0])
            plt.show()
        
        return



    """
    Helper functions
    """

    
    
    def __get_datasets__(
        self,
        N,
        GPR_pipe,
        lengthScales,
        labels
    ):
        """
        Make training datasets for a regional emulator. This requires doctoring the features table a bit beyond how it
        is obtained by default from the "getFeatures" method: the table is repeated once per point, and each copy is
        tagged with that point's latitude, longitude and time (as a POSIX timestamp).

        NOTE(review): reads self.regionalFeatureNames, which no method of this class sets; it must be assigned on the
        instance before calling.


        Arguments:

        N : int
            Number of points to use. Rows of lengthScales are looked up by the labels 0 .. N-1.
        GPR_pipe
            Object providing getFeatures() and getTarget().
        lengthScales : pandas.DataFrame
            Table with "latitude", "longitude" and "time" columns; the "time" entries must offer .timestamp().
        labels
            Not used by this method.


        Value:

        tuple
            (X, y): X is the stacked feature table converted with c3.Dataset.fromPython, y is GPR_pipe.getTarget().
        """
        X_init = GPR_pipe.getFeatures()
        dfX_base = c3.Dataset.toPandas(dataset=X_init).loc[:, self.regionalFeatureNames]

        # Build each per-point copy, then concatenate once; concatenating inside the loop re-copies the growing
        # table on every iteration.
        frames = []
        for k in range(N):
            dfX_k = dfX_base.copy()
            dfX_k[["latitude", "longitude", "time"]] = [
                lengthScales.loc[k, "latitude"],
                lengthScales.loc[k, "longitude"],
                lengthScales.loc[k, "time"].timestamp()
            ]
            frames.append(dfX_k)

        # pd.concat rejects an empty list, so keep the empty-table result for N == 0
        if frames:
            dfX = pd.concat(frames, axis=0, ignore_index=True)
        else:
            dfX = pd.DataFrame()

        X = c3.Dataset.fromPython(dfX)
        y = GPR_pipe.getTarget()

        return X, y


    def __get_input_names__(
        self
    ):
        """
        List the names of the inputs available for GP regression.

        The names are the columns of the table fetched from c3.SimulationModelParameters, minus the bookkeeping
        columns 'id', 'type', 'meta', 'version' and 'ensemble'. No further selection happens here; exclusions chosen
        by the user are applied by the callers.


        Arguments:

        None


        Value:

        pandas Index
            The column labels of the parameter table restricted to the retained inputs, in table order.
        """
        # Fetch the inputs table
        parameter_records = c3.SimulationModelParameters.fetch().objs.toJson()
        parameter_table = pd.DataFrame(parameter_records)

        # Drop the bookkeeping columns, keeping everything else
        bookkeeping = ['id', 'type', 'meta', 'version', 'ensemble']
        retained = [
            str(column)
            for column in parameter_table.columns
            if column not in bookkeeping
        ]

        return parameter_table[retained].columns


    def __make_filter__(
        self
    ):
        """
        Build the filter that selects the GeoSurfaceTimePoints this Emulator covers.

        The bounds come from the instance: latitude in [self.lat1, self.lat2], longitude in [self.lon1, self.lon2]
        and time in [self.time1, self.time2], all inclusive.


        Arguments:

        None


        Value:

        C3 Filter type
            The six bounds chained together with and_().
        """

        gstp_filter = (
            c3.Filter()
            .ge("latitude", self.lat1)
            .and_().le("latitude", self.lat2)
            .and_().ge("longitude", self.lon1)
            .and_().le("longitude", self.lon2)
            .and_().ge("time", self.time1)
            .and_().le("time", self.time2)
        )

        return gstp_filter


    def __retrieve_emulator_pixelwise__(
        self
    ):
        """
        Request the job that collects the pixelwise emulator parameters.

        The kernel and technique are rebuilt from this instance's settings (self.nu, self.centerTarget, the number of
        inputs) so that they describe the same models the pixelwise training job was asked to fit. Also refreshes
        self.inputs.


        Arguments:

        None


        Value:

        Whatever c3.AODGPRModelFinder.extractLearnedParametersJob returns.
        """

        # Filter data for job
        gstpFilter = self.__make_filter__()

        # Describe which model to fit. Keep the order reported by __get_input_names__ rather than the arbitrary
        # order of a set difference; dict.fromkeys drops duplicates as the set did.
        excluded = set(self.excludeFeats)
        self.inputs = list(dict.fromkeys(
            name for name in self.__get_input_names__() if name not in excluded
        ))

        # Recall the objects used to define the training job to find the resultant models. Use the instance's own
        # nu rather than a hard-coded 0.5, which only matched emulators created with the default.
        GPR_kernel = c3.SklearnGPRKernelMatern(
            lengthScale=[1.0]*len(self.inputs),
            nu=self.nu,
            coefficient=1.0
        ).build().kernel

        GPR_technique = c3.GaussianProcessRegressionTechnique(
            randomState=42,
            kernel=GPR_kernel,
            centerTarget=self.centerTarget
        )

        # Initiate the job with the instance's target name and batch size (previously hard-coded to "all" and 10,
        # i.e. the constructor defaults).
        job = c3.AODGPRModelFinder.extractLearnedParametersJob(
            self.excludeFeats,
            gstpFilter,
            self.targetName,
            GPR_technique,
            self.batchSize
        )

        return job


    def __retrieve_pixelwise_parameters__(
        self,
        job
    ):
        """
        If the pixelwise emulator parameters have been successfully collected, return them in a table without redundancy.


        Arguments
        
        job : C3 MapReduce type
            The job assigned to collect parameters, the status of which may be incomplete (in which case we may not return
            the table we request).


        Values

        pandas DataFrame
            The collected parameters, with latitude, longitude, timestamps, and model IDs used for training at that point.
        """

        if job.status().status == 'completed':
            # Make the data frame with a built-in method
            df = c3.AODGPRModelFinder.getDataframeFromJob(job)

            # Change types of descriptive columns
            df[["latitude", "longitude"]] = df[["latitude", "longitude"]].applymap(lambda x:float(x))
            df["time"] = pd.DataFrame(df["time"]).applymap(lambda x:pd.Timestamp(str(x)))

            # Remove duplicates
            df = df.drop_duplicates(
                subset=["latitude", "longitude", "time"]
            ).reset_index(
                drop=True
            )

            # Assign names of parameters to columns. Note that self.inputs is in a user-specified order, but the input
            # table returned by the GaussianProcessRegressionPipe's `getFeatures` method has an intrinsic order on the
            # inputs. The list comphrehension logic below is used in place of simply the `self.inputs` list to respect
            # the intrinsic order.
            df = df.rename(
                columns=dict(zip(
                    df.columns,
                    [x for x in self.__get_input_names__() if x in self.inputs]
                ))
            )

            return df

        else:
            print("Retrieval job not complete.")
            return
