# Experimental Results for Paper

In [1]:
import dp4gp_datasets
import dp4gp
import random
import numpy as np
import GPy
import matplotlib.pyplot as plt
import dp4gp_histogram
import pandas as pd
%matplotlib inline



## House prices


Skip this if you want, and load precomputed data from the csv file below:

In [2]:
#dp4gp_datasets.prepare_preloaded_prices('test.csv', boundingbox=[480e3, 130e3, 580e3, 230e3], N=10000, col_list=['QS501EW'])

Load precomputed dataset:

In [3]:
import pandas as pd
dataset = pd.read_csv('price_dataset10k.csv') #london, 10k purchases

Reduce the size of the dataset and split into training and test data:

In [4]:
#dataset = dataset[dataset['years']>2005]
dataset = dataset.ix[random.sample(dataset.index, 500)]

Get it into a form we'll be able to predict over

In [5]:
#east, north, time, education -> price
inputs = np.vstack([dataset['easting'].values,dataset['northing'].values,dataset['seconds'].values,(dataset['QS501EW_6']/dataset['QS501EW_0']).values]).T
#training_inputs = training_inputs[:,0:3]
ys = dataset['price'].values

training_inputs = inputs[0:-100,:]
training_ys = ys[0:-100]
test_inputs = inputs[-100:,:]
test_ys = ys[-100:]

In [58]:
#squash data into £0k-£1M range
training_ys[training_ys>1e6] = 1e6
training_ys[training_ys<0] = 0
sens = 1e6-0

def get_cloaking_prediction(training_inputs, training_ys, test_inputs, sens, eps, delta):
    rbf = GPy.kern.RBF(training_inputs.shape[1],1000e3,[5e3,5e3,50*31536000,5.0],ARD=True)
    mod = GPy.models.GPRegression(training_inputs,training_ys,rbf)
    mod.Gaussian_noise = 2000e3
    dpgp = dp4gp.DPGP_cloaking(mod,sens,eps,delta)
    preds, mu, cov = dpgp.draw_prediction_samples(test_inputs,1,1,10)
    return preds, cov
    
def get_integral_prediction(training_inputs, training_ys, test_inputs, sens, eps, delta):
    Xtest, free_inputs, step = dp4gp.compute_Xtest(training_inputs,steps=10)
    dpgp = dp4gp_histogram.DPGP_histogram(sens,eps,delta)
    dpgp.prepare_model(Xtest,training_inputs,step,ys,lengthscale=[3e3,3e3])
    #dpgp.optimize()
    dpgp.model.optimize(max_iters=10,messages=True)
    preds, cov = dpgp.draw_prediction_samples(test_inputs)
    return preds, cov

def get_standard_prediction(training_inputs, training_ys, test_inputs, sens, eps, delta):
    rbf = GPy.kern.RBF(training_inputs.shape[1],1000e3,[5e3,5e3,50*31536000,5.0],ARD=True)
    mod = GPy.models.GPRegression(training_inputs,training_ys,rbf)
    mod.Gaussian_noise = 2000e3
    dpgp = dp4gp.DPGP_normal_prior(mod,sens,eps,delta)
    preds, mu, cov = dpgp.draw_prediction_samples(test_inputs,1)
    return preds, cov

def get_pseudo_prediction(training_inputs, training_ys, test_inputs, sens, eps, delta):
    rbf = GPy.kern.RBF(training_inputs.shape[1],1000e3,[5e3,5e3,50*31536000,5.0],ARD=True)
    print training_ys.ndim
    print training_inputs.shape
    mod = GPy.models.SparseGPRegression(training_inputs,training_ys,kernel=rbf,num_inducing=40) #no idea how many inducing!
    mod.inference_method = GPy.inference.latent_function_inference.FITC()
    mod.set_Z(training_inputs[0:40,:]) #grab random inputs as pseudoinputs
    mod.Gaussian_noise = 2000e3    
    dpgp = dp4gp.DPGP_pseudo_prior(mod,sens,eps,delta)
    preds, mu, cov = dpgp.draw_prediction_samples(test_inputs,1)
    return preds, cov

In [59]:
fns = [get_cloaking_prediction,get_pseudo_prediction,get_standard_prediction,get_integral_prediction]
labels = ["Cloaking","Pseudo","Standard","Integral"]
results = []
for fn,label in zip(fns,labels):
    preds, cov = fn(training_inputs,training_ys[:,None],test_inputs,sens,1.0,0.01)
    RMSE = np.sqrt(np.mean((test_ys-preds)**2))
    results.append({'label':label, 'preds':preds, 'cov':cov, 'RMSE':RMSE})

(100, 400)
*
. . . . . . . . . . Stopped before convergence
(1000000.0, 3.2552472614374586, array([[ 1.18160085]]), 1.0, 3.9257908262295129e-249)
2
(400, 4)


In [60]:
for r in results:
    print r['label'],r['RMSE']

Cloaking 752907.946882
Pseudo 369261.55233
Standard 353688.293534
Integral 347599.372364


### Other code

In [None]:
plt.figure(figsize=[13,10])
plt.errorbar(test_ys,preds,1.96*np.sqrt(np.diag(cov)),fmt=None,color='k',alpha=0.2)
plt.plot(test_ys,preds,'+k')
plt.xlim([0,1e6])
plt.ylim([0,1e6])
plt.plot([0,1e6],[0,1e6],'k-')

In [None]:
#dpgp.plot(fixed_inputs=[(2,np.mean(training_inputs[:,2])), (3,np.mean(training_inputs[:,3]))],steps=12)