In [67]:
import dp4gp_datasets
import dp4gp
import random
import numpy as np
import GPy
import dp4gp_histogram
import pandas as pd


def get_citibike_data(n_samples=5000):
    """
    Load the Citibike trips and return a random sample of subscriber journeys.

    Only rows whose 'usertype' is 'Subscriber' are kept; n_samples of them are
    then drawn without replacement using Python's `random` module.

    Arguments:
    n_samples = number of trips to draw (default 5000)

    Returns:
    inputs = array with one row per sampled trip and four columns:
             start latitude, end latitude, start longitude, end longitude
    ys = array of the sampled trips' 'tripduration' values

    Raises:
    ValueError (from random.sample) if fewer than n_samples subscriber trips
    are available.
    """
    allcb = dp4gp_datasets.load_citibike(station=None)
    subcb = allcb[allcb['usertype']=='Subscriber']

    # random.sample wants a plain sequence, so hand it a list of index labels.
    sampled_labels = random.sample(list(subcb.index), n_samples)
    # The sampled values are index *labels*, hence .loc (the old .ix indexer
    # has been removed from pandas).
    cb = subcb.loc[sampled_labels]
    inputs = np.c_[cb['start station latitude'],cb['end station latitude'],cb['start station longitude'],cb['end station longitude']]
    ys = cb['tripduration'].values

    return inputs, ys
    

In [68]:

import dp4gp
import GPy
from sklearn.metrics import mean_squared_error
import numpy as np
import sys
import scipy
from scipy.stats import multivariate_normal
from scipy.optimize import minimize
import matplotlib.pyplot as plt

def bin_data(Xtest,X,step,ys):
    """
    Bin data X into equally sized bins defined by Xtest and step.

    Arguments:
    Xtest = (number of bins x dimensions) array; each row is the coordinates
            of the lower corner of one bin.
    X = (number of points x dimensions) array of data locations.
    step = vector of bin widths, one per dimension (a scalar is broadcast).
    ys = the outputs (to be summed and averaged), one per row of X. Either a
         flat vector or a column vector is accepted.

    A point belongs to a bin when, in every dimension,
    corner <= x <= corner + step. Both edges are inclusive, so a point lying
    exactly on an edge shared by two bins is counted in both of them.

    Returns:
    bincounts = number of points in each bin
    bintotals = sum of ys in each bin
    binaverages = bintotals/bincounts (NaN for bins that contain no points)

    Raises:
    ValueError if X and ys do not describe the same number of points.
    """
    Xtest = np.asarray(Xtest)
    step = np.asarray(step)
    X = np.asarray(X)
    if X.ndim != 2:
        # e.g. an empty list of points: give it the (n, dimensions) layout
        X = X.reshape(-1, Xtest.shape[1])
    # Flatten so that a column vector of outputs contributes plain scalars.
    ys = np.asarray(ys, dtype=float).ravel()
    if X.shape[0] != ys.shape[0]:
        raise ValueError("X has %d rows but ys has %d values" % (X.shape[0], ys.shape[0]))

    bintotals = np.zeros(Xtest.shape[0])
    bincounts = np.zeros(Xtest.shape[0])
    for i,tile in enumerate(Xtest): #loop through the tiles
        # one boolean per data point: is it inside this tile in every dimension?
        intile = np.all((X>=tile) & (X<=tile+step), axis=1)
        bincounts[i] = np.count_nonzero(intile)
        bintotals[i] = ys[intile].sum()
    # 0/0 for empty bins is expected and deliberately left as NaN for the
    # caller to detect, so don't emit a warning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        binaverages = bintotals/bincounts
    return bincounts, bintotals, binaverages

class DPGP_histogram(dp4gp.DPGP):
    """Differentially private regression built on noisy per-bin averages
    (the histogram method)."""

    def __init__(self,sens,epsilon,delta):
        # No model object is handed to the base class (first argument None);
        # the privacy parameters are simply forwarded to it.
        super(DPGP_histogram, self).__init__(None,sens,epsilon,delta)

    def prepare_model(self,Xtest,X,step,ys,variances=1.0,lengthscale=1):
        """
        Bin the training data and store a noisy average for every bin.

        Xtest holds the corner of each bin and step the bin widths; X and ys
        are the training inputs and outputs (see bin_data). The `variances`
        and `lengthscale` arguments are accepted but not used by this method.

        Reads self.sens, self.epsilon and self.delta (presumably set by the
        dp4gp.DPGP constructor - confirm there). Afterwards:
          self.Xtest          = per-bin bounds, laid out column-wise as
                                [upper_0, lower_0, upper_1, lower_1, ...]
          self.dp_binaverages = bin averages with Gaussian noise added,
                                with 0 substituted for bins holding no data
        """
        counts, _totals, averages = bin_data(Xtest,X,step,ys)

        # Noise standard deviation for each bin: the sensitivity of a bin's
        # average shrinks with the number of points that fell in it.
        # TODO (open question from the original author): 1.25 or 2 over delta?
        gauss_factor = np.sqrt(2*np.log(1.25/self.delta))
        noise_sd = gauss_factor*(self.sens/counts)/self.epsilon
        noisy_averages = averages+np.random.randn(averages.shape[0])*noise_sd

        # Bins without any training data come through as NaN; report those
        # averages as zero instead.
        noisy_averages[np.isnan(noisy_averages)] = 0

        # Build the input for the integral kernel: each original dimension
        # becomes a pair of columns (upper edge first, then lower edge).
        n_bins, n_dims = Xtest.shape[0], Xtest.shape[1]
        bounds = np.zeros([n_bins,2*n_dims])
        bounds[:,1::2] = Xtest
        bounds[:,0::2] = Xtest+step

        self.Xtest = bounds
        self.dp_binaverages = noisy_averages

    def draw_prediction_samples(self,Xtest,N=1):
        """
        Unfinished: only validates N. Nothing is computed or returned yet
        (the method falls through and yields None).
        """
        assert N==1, "DPGP_histogram only returns one DP prediction sample (you will need to rerun prepare_model to get an additional sample)"

        # Planned return value, not implemented yet:
        #return mean+self.meanoffset, cov

In [69]:
def get_hist_prediction(training_inputs, training_ys, test_inputs, sens, eps, delta,noise,modvar,kernval,kern_ls,steps):
    """
    Fit the histogram DP model to the training data and predict at test_inputs.

    The bin grid comes from dp4gp.compute_Xtest(training_inputs, steps=steps);
    the resulting step vector is printed. sens, eps and delta are handed to
    dp4gp_histogram.DPGP_histogram and kern_ls is passed on as the
    lengthscale. The noise, modvar and kernval arguments are accepted but not
    used here.

    Returns the (preds, cov) pair produced by draw_prediction_samples.
    """
    bin_corners, _free_inputs, bin_widths = dp4gp.compute_Xtest(training_inputs,steps=steps)
    print(bin_widths)
    model = dp4gp_histogram.DPGP_histogram(sens,eps,delta)
    model.prepare_model(bin_corners,training_inputs,bin_widths,training_ys,lengthscale=kern_ls)
    preds, cov = model.draw_prediction_samples(test_inputs)
    return preds, cov


In [70]:
# Repeat the whole experiment on fresh random samples of the data. Each
# repetition appends one list of un-normalised RMSEs to `vs`, ordered by
# steps (ascending) and, within each steps value, by eps (descending).
labels = ["Histogram"]
label = labels[0]  # was referenced below without ever being defined (NameError on a fresh kernel)
kernvar = 10.0
kern_ls = np.array([0.01,0.01,0.01,0.01])*1.0

vs = []
for its in range(30):
    inputs, ys = get_citibike_data()
    # Clip the durations to [0, 2000]; 2000 is also the sensitivity used below.
    ys = np.clip(ys, 0, 2000)

    # Normalise the outputs to zero mean and unit standard deviation.
    ys_mean = np.mean(ys)
    ys_std = np.std(ys)
    ys = ys - ys_mean
    ys = ys / ys_std

    # Hold out the last 100 sampled trips for testing.
    training_inputs = inputs[0:-100,:]
    training_ys = ys[0:-100][:,None]
    test_inputs = inputs[-100:,:]
    test_ys = ys[-100:][:,None]

    results = {}

    for steps in [3,6,10]:
        results[steps] = {}
        for eps in [0.2,0.5,1.0,1000]:
            print("Steps = %d, eps = %0.1f" % (steps,eps))
            # The very large eps doubles as a baseline with zero sensitivity
            # (presumably so that no DP noise is added - confirm in dp4gp_histogram).
            if eps>100:
                sens = 0
            else:
                sens = 2000

            # The outputs were divided by ys_std, so the sensitivity must be too.
            sens = sens / ys_std

            preds, cov = get_hist_prediction(training_inputs,training_ys,test_inputs,sens,eps,0.01,10.0,1.0, kernvar, kern_ls, steps)
            RMSE = np.sqrt(np.mean((test_ys-preds)**2))
            results[steps][eps] = {'label':label, 'preds':preds, 'cov':cov, 'RMSE':RMSE, 'ys_std':ys_std}
    v = []
    for j in sorted(results):
        for k in sorted(results[j], reverse=True):
            # scale the RMSE back into the original output units
            v.append(results[j][k]['RMSE']*results[j][k]['ys_std'])
    vs.append(v)

Steps = 3, eps = 0.2
[ 0.04332076  0.04332076  0.03489734  0.03489734]
Steps = 3, eps = 0.5
[ 0.04332076  0.04332076  0.03489734  0.03489734]
Steps = 3, eps = 1.0
[ 0.04332076  0.04332076  0.03489734  0.03489734]
Steps = 3, eps = 1000.0
[ 0.04332076  0.04332076  0.03489734  0.03489734]
Steps = 6, eps = 0.2
[ 0.02166038  0.02166038  0.01744867  0.01744867]
Steps = 6, eps = 0.5
[ 0.02166038  0.02166038  0.01744867  0.01744867]


KeyboardInterrupt: 

In [66]:
for j in np.sort(results.keys()):
    for k in np.sort(results[j].keys())[::-1]:
        print str(j)+","+str(k)+":",
        print str(results[j][k]['RMSE']*results[j][k]['ys_std'])+",",


3,1000.0: 626.930832709, 3,1.0: 659.28615086, 3,0.5: 656.084486665, 3,0.2: 781.520064025,


In [63]:
# Show the collected RMSEs as a table: one row per repetition, one column per
# (steps, eps) configuration in the order they were appended to each `v`.
# NOTE(review): this cell's execution count (63) is lower than the loop
# cell's (70) and the saved output has 3 columns, so the output shown was
# produced by an earlier version of the loop - re-run to refresh it.
np.array(vs)

array([[  597.49867664,   866.92360136,   914.87441455],
       [  556.51963002,   573.17012207,   566.6756472 ],
       [  580.80305628,   762.08541878,  1614.43088419],
       [  562.38447   ,   625.89161002,   649.45262003],
       [  626.93083271,   659.28615086,   656.08448667]])