This class uses the integral kernel to make differentially private (DP) predictions

In [1]:
import dp4gp_datasets
import dp4gp
import dp4gp_histogram

import random
import numpy as np
import GPy
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
# Load the price dataset. Easting and northing form the two input columns,
# price is the target.
c = pd.read_csv('price_dataset10k.csv')
# Optional subsampling, currently disabled:
#c = c.ix[random.sample(c.index, 4000)]
# Other candidate inputs, currently unused: c['seconds'] and
# c['QS501EW_6']/c['QS501EW_0'].

# Boolean mask selecting the rows whose 'years' value is greater than 2015.
recent = c['years'].values > 2015

eastings = c['easting'].values
northings = c['northing'].values
training_inputs = np.vstack([eastings, northings]).T[recent, :]

ys = c['price'].values[recent]
# Cap prices at 1e6: threshold to make sensitivity manageable.
ys[ys > 1e6] = 1e6

Xtest, free_inputs, step = dp4gp.compute_Xtest(training_inputs, steps=10)



In [3]:
# Build the histogram-based DP GP and prepare it on the training data.
# NOTE(review): the three constructor arguments match the sens=1e6,
# epsilon=100.0 and delta=0.01 constants used in make_prediction in this
# notebook -- presumably the same quantities in that order; confirm against
# dp4gp_histogram.DPGP_histogram.
dpgp = dp4gp_histogram.DPGP_histogram(1e6,100.0,0.01)
dpgp.prepare_model(Xtest,training_inputs,step,ys)

In [4]:
# Draw one prediction sample at easting 520e3, northing 240e3.
# NOTE: this rebinds `c`, which held the DataFrame loaded from the CSV in the
# first cell; from here on `c` is the second value returned by
# draw_prediction_sample (presumably a covariance -- confirm in dp4gp_histogram).
p,c = dpgp.draw_prediction_sample(np.array([[520e3,240e3]]))

[[ 520000.       0.  240000.       0.]]
[[-1780.49620608]]
[[  2.62874063e+08]]


In [5]:
# Query the underlying model directly with a 4-column input row.
# NOTE(review): the columns presumably follow the interleaved layout used for
# the integral kernel elsewhere in this notebook (two columns per input
# dimension) -- confirm.
dpgp.model.predict(np.array([[520e3,0,180e3,0]]))

(array([[ 287402.68343799]]), array([[  1.04453618e+08]]))

In [None]:
# Display `c`. If the cells ran in order this is the second value returned by
# draw_prediction_sample, not the DataFrame read from the CSV.
c

In [None]:
# Draw one DP prediction sample at each easting in xs, at a fixed northing
# of 240e3, and collect the scalar predictions.
xs = np.arange(480e3,550e3,10e3)
# Only the first returned value is used. The second is deliberately not bound
# to a name, so `c` (the name that holds the DataFrame in the first cell) is
# not overwritten on every iteration.
preds = [dpgp.draw_prediction_sample(np.array([[x,240e3]]))[0][0,0] for x in xs]

In [None]:
# Display the list of sampled predictions.
preds

In [None]:
# Plot the sampled predictions against the eastings they were drawn at.
plt.plot(xs,np.array(preds))

In [None]:
def make_prediction(Xtest,X,step,ys,sens=1e6,epsilon=100.0,delta=0.01,
                    lengthscale=(3e3,3e3),max_iters=200,xtest=None,northing=240e3):
    """Fit an integral-kernel GP to noise-perturbed bin values and predict along a line.

    The data are binned with bin_data, Gaussian noise scaled by
    sens/bincounts is added to each bin's value, and a GP with a
    Multidimensional_Integral_Limits kernel plus a per-bin
    WhiteHeteroscedastic noise term is fitted to the noisy values
    multiplied by the bin volume np.prod(step).

    Parameters
    ----------
    Xtest, X, step, ys :
        Passed straight to bin_data(Xtest, X, step, ys). Xtest is also used
        as one set of bin limits and Xtest + step as the other.
        (Presumably Xtest holds bin corners, X the training inputs, step the
        bin widths and ys the training targets -- confirm against bin_data.)
    sens : float
        Sensitivity of a single target value; divided by the bin count to
        get the per-bin sensitivity.
    epsilon, delta : float
        Privacy parameters used to scale the added noise:
        sigma = sqrt(2*log(2/delta)) * sens_per_bin / epsilon.
    lengthscale : sequence of float
        Initial lengthscales handed to the integral kernel.
    max_iters : int
        Iteration cap passed to the model's optimize().
    xtest : array-like or None
        First-coordinate values to predict at. Defaults to
        np.arange(480e3, 580e3, 1e3).
    northing : float
        Fixed second-coordinate value used for every prediction.

    Returns
    -------
    list
        One noiseless predicted mean per entry of xtest, with the mean
        offset removed before fitting added back.

    Raises
    ------
    ValueError
        If no bin has a finite noisy value, so there is nothing to fit.
    """
    bincounts, _bintotals, prices = bin_data(Xtest,X,step,ys)

    sens_per_bin = sens/bincounts
    # 2.0 keeps this a true division even if delta is passed as an int.
    noise_scale = np.sqrt(2*np.log(2.0/delta)) #1.25 or 2 over delta?
    bin_sigma = noise_scale*sens_per_bin/epsilon #noise standard deviation to add to each bin
    #add DP noise to the prices
    dp_prices = prices+np.random.randn(prices.shape[0])*bin_sigma

    # Two columns per input dimension: even columns hold Xtest+step, odd
    # columns hold Xtest.
    newXtest = np.zeros([Xtest.shape[0],2*Xtest.shape[1]])
    newXtest[:,0::2] = Xtest+step
    newXtest[:,1::2] = Xtest

    # Drop bins whose noisy value is NaN or infinite. A bin count of zero
    # makes sens_per_bin (and hence bin_sigma) infinite, which would put
    # infinite targets and noise variances into the GP.
    keep = np.isfinite(dp_prices)
    if not np.any(keep):
        raise ValueError("no bins with a finite noisy value; cannot fit the model")
    finalXtest = newXtest[keep,:]

    #the integral kernel takes as y the integral...
    #eg. if there's one dimension we're integrating over, km
    #then we need to give y in pounds.km
    bin_volume = np.prod(step)
    final_dp_prices = dp_prices[keep]
    meanoffset = np.mean(final_dp_prices)
    finalintegralprices = (final_dp_prices-meanoffset)*bin_volume
    finalintegralsigma = bin_sigma[keep]*bin_volume

    kernel = GPy.kern.Multidimensional_Integral_Limits(input_dim=newXtest.shape[1], variances=1.0, lengthscale=list(lengthscale))
    kernel = kernel + GPy.kern.WhiteHeteroscedastic(input_dim=newXtest.shape[1], num_data=len(finalintegralsigma), variance=finalintegralsigma**2)
    m = GPy.models.GPRegression(finalXtest,finalintegralprices[:,None],kernel)
    m.optimize(max_iters=max_iters)

    if xtest is None:
        xtest = np.arange(480e3,580e3,1e3)
    # Predict one point at a time. Each query row is [x, 0, northing, 0]; the
    # zeros in the odd columns are presumably ignored for point predictions --
    # verify against the integral kernel.
    return [m.predict_noiseless(np.array([[x,0,northing,0]]))[0][0,0]+meanoffset
            for x in xtest]

In [None]:
# Run the full bin -> add noise -> fit -> predict pipeline on the training data.
preds = make_prediction(Xtest,training_inputs,step,ys)

In [None]:
# Scratch copy of the filtering/scaling step from make_prediction.
# NOTE(review): dp_prices, newXtest and bin_sigma are not assigned at top level
# in the visible cells; they must already exist in the session.

# Keep only the bins whose noisy price is not NaN.
keep = ~np.isnan(dp_prices)
#keep = bincounts>5
finalXtest = newXtest[keep,:]
final_sigma = bin_sigma[keep]

# Centre the retained noisy prices on zero (in place).
final_dp_prices = dp_prices[keep]
meanoffset = np.mean(final_dp_prices)
final_dp_prices -= meanoffset

# The integral kernel takes the integral as y: with one dimension measured in
# km, y must be given in pounds.km. So prices and their noise standard
# deviations are both multiplied by the bin volume.
finalintegralprices = final_dp_prices * np.prod(step)
finalintegralsigma = final_sigma * np.prod(step)

In [None]:
# Scatter the training points (marker size ys/1e4) and draw every retained bin
# on top, with a line width proportional to its centred noisy price.
plt.figure(figsize=[13,13])
# Uses training_inputs: no top-level `X` is assigned in this notebook, and the
# equivalent scatter further down plots training_inputs against ys.
plt.scatter(training_inputs[:,0],training_inputs[:,1],ys/1e4)

# Columns 0/1 of a row are the two limits of the first dimension, columns 2/3
# the two limits of the second.
# NOTE(review): final_dp_prices has had its mean subtracted, so some line
# widths are negative -- confirm matplotlib renders these as intended.
for row, price in zip(finalXtest, final_dp_prices):
    x_a, x_b, y_a, y_b = row[0], row[1], row[2], row[3]
    l = price/1e5
    # three sides of the bin outline
    plt.plot([x_a,x_a,x_b,x_b],[y_a,y_b,y_b,y_a],'k-',lw=l,alpha=0.6)
    # the two diagonals
    plt.plot([x_a,x_b],[y_a,y_b],'-k',lw=l,alpha=0.6)
    plt.plot([x_b,x_a],[y_a,y_b],'-k',lw=l,alpha=0.6)

In [None]:
# Top-level version of the model fit from make_prediction: integral kernel plus
# a per-bin heteroscedastic white-noise term whose variances are the squared
# scaled noise standard deviations.
n_cols = newXtest.shape[1]
integral_part = GPy.kern.Multidimensional_Integral_Limits(input_dim=n_cols, variances=1.0, lengthscale=[3e3,3e3])
noise_part = GPy.kern.WhiteHeteroscedastic(input_dim=n_cols, num_data=len(finalintegralsigma), variance=finalintegralsigma**2)
kernel = integral_part + noise_part

m = GPy.models.GPRegression(finalXtest, finalintegralprices[:,None], kernel)
#m.Gaussian_noise.variance.fix(1.0)
m.optimize()
#m.predict_noiseless(np.array([[500e3,0,190e3,0]]))

# Noiseless predictions along eastings 480e3..580e3 (step 1e3) at northing
# 240e3, with the mean offset removed before fitting added back.
xtest = np.arange(480e3,580e3,1e3)
preds = []
for p in xtest:
    query = np.array([[p,0,240e3,0]])
    pred,v = m.predict_noiseless(query)
    preds.append(meanoffset + pred[0,0])

In [None]:
# Plot each retained bin's scaled noise standard deviation against its scaled,
# centred noisy price.
plt.plot(finalintegralprices,finalintegralsigma,'x')

In [None]:
# Plot the predictions against xtest, the eastings they were evaluated at.
# Xtest[:,0] holds the bin-corner eastings, which are not the prediction
# locations.
plt.plot(xtest,preds,'x')
#plt.plot(X,ys,'.',alpha=0.1)

In [None]:
# Plot the fitted model with input columns 1 and 3 fixed at 0, then overlay the
# training points with marker size ys/10000.
m.plot_noiseless(fixed_inputs=[(1,0),(3,0)])#,(0,200e3)])
plt.scatter(training_inputs[:,0],training_inputs[:,1],ys/10000,alpha=0.3)

In [None]:
# Toy check of the integral kernel: four 10x10 cells tiling a 20x20 square,
# each row giving two limits per dimension, with integral values
# 100, 200, 200, 100.
AX = np.array([[0,10,0,10],
               [10,20,0,10],
               [0,10,10,20],
               [10,20,10,20]], dtype=float)
Ay = np.array([100,200,200,100], dtype=float)[:,None]

kernel = GPy.kern.Multidimensional_Integral_Limits(input_dim=4, variances=0.1, lengthscale=[1.0,1.0])
m = GPy.models.GPRegression(AX, Ay, kernel)
# Fix the Gaussian noise variance at a very small value instead of optimising.
m.Gaussian_noise.variance.fix(2e-6)
#m.optimize()
# Noiseless prediction for the query row [5, 0, 5, 0].
m.predict_noiseless(np.array([[5.0,0,5.0,0]]))

In [None]:
# Fit a GP with a 2-input integral kernel directly on training_inputs,
# optionally fixing the kernel lengthscale before optimising.
# NOTE(review): `Y` and `lengthscale` are not assigned anywhere in the visible
# cells; this cell looks like a fragment lifted from a function body and will
# raise NameError unless both already exist in the session.
kernel = GPy.kern.Multidimensional_Integral_Limits(input_dim=2, variances=1.0, lengthscale=10.0)
m = GPy.models.GPRegression(training_inputs,Y,kernel)
if lengthscale is not None:
    m.integral.lengthscale.fix(lengthscale)
m.optimize()

In [None]:
# Set prices to NaN wherever nitems is below 10.
# NOTE(review): neither `prices` nor `nitems` is assigned at top level in the
# visible cells.
prices[nitems<10] = np.nan

In [None]:
# Display the shape of sens_per_bin.
# NOTE(review): sens_per_bin is only assigned inside make_prediction in the
# visible cells.
sens_per_bin.shape

In [None]:
# Display the shape of Xtest as returned by dp4gp.compute_Xtest.
Xtest.shape

In [None]:
# Scatter the raw data in red (marker size (ys/20000)**2) and, on top, the
# binned prices at the Xtest locations (marker size (prices/20000)**2).
# NOTE(review): `X` and `prices` are not assigned at top level in the visible
# cells; `X` is presumably training_inputs (it is paired with ys) -- confirm.
plt.figure(figsize=[12,12])
plt.scatter(X[:,0],X[:,1],(ys/20000)**2,'r',alpha=0.1)
plt.scatter(Xtest[:,0],Xtest[:,1],(prices/20000)**2)
