In [33]:
import dp4gp_datasets
import dp4gp
import random
import numpy as np
import GPy
import matplotlib.pyplot as plt
import dp4gp_histogram
import pandas as pd
from sklearn.model_selection import cross_val_score # http://scikit-learn.org/stable/developers/contributing.html#estimators
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.cluster import KMeans
%matplotlib inline

kung = dp4gp_datasets.load_kung()

def dp_unnormalise(y,normalisation_parameters):
    y = y * normalisation_parameters['std']
    y = y + normalisation_parameters['mean']
    return y
    
def dp_normalise(y, sensitivity, clip='midpoint'):
    """new_y,actual_sensitivity,normalisation_parameters = dp_normalise(y, sensitivity)
    
    Normalises the data to have outputs mean zero, std one.
    It also clips the data to lie within half the sensitivity
    of the data's mid point*, thus inforcing the DP assumptions
    for the sensitivity.
    
    *This behaviour can be modified or disabled by setting the clip parameter:
      - None                = don't clip
      - 'midpoint' (default) = a point halfway between the max and min values
      - 'mean'               = use the mean
      - 'median'             = use the median
    
    The method returns the new y values, the new sensitivity (in the now
    normalised range), and a dictionary of the mean and std to allow future
    unnormalisation"""
    
    if clip is not None:
        middley = None
        if clip=='midpoint': middley = (np.max(y)+np.min(y))/2
        if clip=='mean': middley = np.mean(y)
        if clip=='median': middley = np.median(y)
        assert middley is not None, "clip option invalid"
        
        y[y>middley+sensitivity/2] = middley+sensitivity/2
        y[y<middley-sensitivity/2] = middley-sensitivity/2

    #normalise...
    normalisation_parameters = {}
    normalisation_parameters['mean'] = np.mean(y)
    #ysub = (max(y)+min(y))/2.0 #todo decide what's best to use here...
    new_y = y - normalisation_parameters['mean']
    normalisation_parameters['std'] = np.std(y)
    new_y = new_y / normalisation_parameters['std']
    actual_sensitivity = sensitivity/normalisation_parameters['std']
    return new_y,actual_sensitivity,normalisation_parameters

class DPCloaking(BaseEstimator):
    def __init__(self, kern=None, sensitivity=1.0, epsilon=1.0, delta=0.01, inducing=None):
        """
        kern = a GPy kernel, Default: uses a default 1d RBF kernel, with default hyperparameters if not specified.
        inducing = locations of inducing points, default to None - not using inducing points.
        """
        self.kern = kern
        self.sensitivity = sensitivity
        self.epsilon = epsilon
        self.delta = delta
        self.inducing = inducing
        
    def fit(self, X, y, **kwargs):    
        if self.kern is None:
            self.kern = GPy.kern.RBF(1.0)
        if self.inducing is None:
            self.model = GPy.models.GPRegression(X,y,kern,normalizer=None)
            self.dpgp = dp4gp.DPGP_cloaking(self.model,self.sensitivity,self.epsilon,self.delta)
        else:
            if isinstance(self.inducing, list):
                inducinglocs = self.inducing
            else:
                inducinglocs = KMeans(n_clusters=self.inducing, random_state=0).fit(X).cluster_centers_
            self.model = GPy.models.SparseGPRegression(X,y,kern,normalizer=None,Z=inducinglocs)
            self.dpgp = dp4gp.DPGP_inducing_cloaking(self.model,self.sensitivity,self.epsilon,self.delta)
        return self

    def predict(self, X, Nattempts=2, Nits=100):
        ypred,_,_= self.dpgp.draw_prediction_samples(X,Nattempts=Nattempts,Nits=Nits)
        return ypred

y,ac_sens,norm_params = dp_normalise(kung[kung[:,3]==0,0:1],100.0)
X = kung[kung[:,3]==0,1:3]
epsilon = 1.0
delta = 0.01

kern = GPy.kern.RBF(2.0,lengthscale=25.0,variance=1.0)
scores_inducing = -cross_val_score(DPCloaking(sensitivity=ac_sens,kern=kern.copy(),inducing = 5),X,y,scoring='neg_mean_squared_error',cv = 10)
scores_normal = -cross_val_score(DPCloaking(sensitivity=ac_sens,kern=kern.copy()),X,y,scoring='neg_mean_squared_error',cv = 10)

#unnormalise the RMSE
print scores_inducing
print scores_normal
scores_inducing = dp_unnormalise(np.sqrt(np.mean(scores_inducing)),norm_params)
scores_normal = dp_unnormalise(np.sqrt(np.mean(scores_normal)),norm_params)

[[ 11.43019746   3.39622642]
 [ 42.2306705   45.2328125 ]
 [ 43.33315861  25.65810811]
 [ 39.0259217   67.24333333]
 [ 24.54683599  12.86486486]]
(29, 258)
*
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Stopped before convergence
*
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . Stopped before convergence
(3.8846795501345945, 3.2552472614374586, array([[ 1.03212117]]), 1.0, 0.0)
[[ 11.47845483   3.5       ]
 [ 42.43980531  25.916     ]
 [ 42.20779109  45.18166667]
 [ 24.48451817  13.16666667]
 [ 38.39053853  67.01875   ]]
(29, 258)
*
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

In [31]:
scores_inducing

145.10347469466075

In [32]:
scores_normal

157.99371060841199