In [142]:
import pandas as pd
import io
import requests
import numpy as np
import random
import libpysal
import pysal
#import pysal.lib
import gpytorch
import torch
import math

In [143]:
data = pd.read_csv("C:/Users/kklemmer/OneDrive - The Alan Turing Institute/SpaceGAN/housing.csv")
#Create ID column
data["id"] = np.asarray(list(range(0,len(data["longitude"])))).reshape(-1,1)

In [144]:
data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2


In [145]:
data_drop = data.head(18713) #Last column is broken!
data.drop(data_drop.tail(1).index,inplace=True)
data = data[np.isfinite(data["total_bedrooms"])]
data = data[np.isfinite(data["total_rooms"])]
data = data[np.isfinite(data["housing_median_age"])]
data = data[np.isfinite(data["population"])]
data = data[np.isfinite(data["households"])]
data = data[np.isfinite(data["median_income"])]
data = data[np.isfinite(data["median_house_value"])]
data.shape

(20432, 11)

In [146]:
#data.to_csv("test.csv")
#data = data.head(100)

In [147]:
train_x = np.asarray(data[["longitude","latitude"]])
train_y = np.asarray(data[["housing_median_age","total_rooms","total_bedrooms","population","households","median_income","median_house_value"]])

In [148]:
from sklearn import preprocessing
train_x = train_x / 100
train_y = preprocessing.MinMaxScaler().fit_transform(train_y)

In [149]:
train_x = torch.from_numpy(train_x).float()
train_y = torch.from_numpy(train_y).float()

In [150]:
#train_x = torch.tensor(data[["longitude","latitude"]].values).float()
#train_y = torch.tensor(data[["housing_median_age","total_rooms","total_bedrooms","population","households","median_income","median_house_value"]].values).float()

In [151]:
train_y.shape

torch.Size([20432, 7])

In [152]:
class MultitaskGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(MultitaskGPModel, self).__init__(train_x, train_y, likelihood)

        # SKI requires a grid size hyperparameter. This util can help with that
        grid_size = gpytorch.utils.grid.choose_grid_size(train_x)

        self.mean_module = gpytorch.means.MultitaskMean(
            gpytorch.means.ConstantMean(), num_tasks=7
        )
        self.covar_module = gpytorch.kernels.MultitaskKernel(
            gpytorch.kernels.GridInterpolationKernel(
                gpytorch.kernels.RBFKernel(ard_num_dim=2,has_lengthscale=True), grid_size=grid_size, num_dims=2,
            ), num_tasks=7, rank=1
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultitaskMultivariateNormal(mean_x, covar_x)


likelihood = gpytorch.likelihoods.MultitaskGaussianLikelihood(num_tasks=7)
model = MultitaskGPModel(train_x, train_y, likelihood)

In [153]:
# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

n_iter = 100
for i in range(n_iter):
    optimizer.zero_grad()
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()
    print('Iter %d/%d - Loss: %.3f' % (i + 1, n_iter, loss.item()))
    optimizer.step()

Iter 1/100 - Loss: 22262.031
Iter 2/100 - Loss: 21528.453
Iter 3/100 - Loss: 20779.219
Iter 4/100 - Loss: 20006.783
Iter 5/100 - Loss: 19236.756
Iter 6/100 - Loss: 18433.982
Iter 7/100 - Loss: 17631.145
Iter 8/100 - Loss: 16802.260
Iter 9/100 - Loss: 15973.414
Iter 10/100 - Loss: 15126.259
Iter 11/100 - Loss: 14265.186
Iter 12/100 - Loss: 13390.544
Iter 13/100 - Loss: 12522.266
Iter 14/100 - Loss: 11625.705
Iter 15/100 - Loss: 10736.096
Iter 16/100 - Loss: 9827.631
Iter 17/100 - Loss: 8921.054
Iter 18/100 - Loss: 8007.621


KeyboardInterrupt: 

In [154]:
train_y[-1:,:]

tensor([[0.2941, 0.0708, 0.0954, 0.0388, 0.0870, 0.1303, 0.1534]])

In [155]:
train_x[-1:,:]

tensor([[-1.2124,  0.3937]])

In [156]:
#mll.likelihood(output).log_prob(train_y).size()

torch.Size([])

In [157]:
# Set into eval mode
model.eval()
likelihood.eval()

MultitaskGaussianLikelihood()

In [158]:
# Make predictions
with torch.no_grad(), gpytorch.fast_pred_var():
    #test_x = torch.linspace(0, 1, 51)
    observed_pred = likelihood(model(train_x))
    # Get mean
    mean = observed_pred.mean
    # Get lower and upper confidence bounds
    lower, upper = observed_pred.confidence_region()



In [159]:
mean

tensor([[ 0.9529, -0.0604, -0.0281,  ...,  0.3429,  0.6419,  2.0207],
        [ 0.9541, -0.0597, -0.0284,  ...,  0.3422,  0.6434,  2.0227],
        [ 0.9565, -0.0581, -0.0278,  ...,  0.3410,  0.6464,  2.0262],
        ...,
        [ 0.7545, -0.1835, -0.0688,  ...,  0.4760,  0.3842,  1.8439],
        [ 0.7633, -0.1800, -0.0653,  ...,  0.4713,  0.3917,  1.8456],
        [ 0.7599, -0.1804, -0.0672,  ...,  0.4708,  0.3915,  1.8428]])

In [160]:
mean[:1,:]

tensor([[ 0.9529, -0.0604, -0.0281,  0.8632,  0.3429,  0.6419,  2.0207]])

In [161]:
train_y[:1,:]

tensor([[0.7843, 0.0223, 0.0199, 0.0089, 0.0206, 0.5397, 0.9023]])

In [None]:
list(data[["housing_median_age","total_rooms","total_bedrooms","population","households","median_income","median_house_value"]])

In [None]:
np.log10(710414720)