## Setup

Load required packages

In [1]:
import pandas as pd
import io
import requests
import numpy as np
import random
import libpysal
import pysal
import pysal.lib
#import sklearn

  from .sqlite import head_to_sql, start_sql


Read data

In [2]:
#Read from URL (Doesnt work for private repos)
#url="https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
#s=requests.get(url).content
#data=pd.read_csv(io.StringIO(s.decode('utf-8')))

#Read from local file
data = pd.read_csv("C:/Users/Konstantin Klemmer/Documents/GitHub/spacegan/data/housing.csv")
#Create ID column
data["id"] = np.asarray(list(range(0,len(data["longitude"])))).reshape(-1,1)

In [3]:
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,4


## Spatial Grid Expansion

This function takes in a regular point grid `data` (with coordinates `longitude` and `latitude`, and features given in the California housing dataset), an array of weight matrices `w_mat` and a number `n` of desired synthetic points. First though, let's create a set of neighbourhood-matrices `w_mat`:

In [9]:
test = data.head(100)

In [12]:
#Train GP interpolation
from sklearn.multioutput import MultiOutputRegressor
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared, ConstantKernel


#Interpolate house price
x = np.asarray(test[["longitude","latitude"]])
y = np.asarray(test[["housing_median_age","total_rooms"]])

kernel = 1 *RBF(length_scale=(0.01,0.01), length_scale_bounds=(1e-3,1e3)) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-2, 1e2))
gp = MultiOutputRegressor(GaussianProcessClassifier(kernel=kernel,n_restarts_optimizer=10,random_state=2))
gp.fit(x,y)



MultiOutputRegressor(estimator=GaussianProcessClassifier(copy_X_train=True,
             kernel=1**2 * RBF(length_scale=[0.01, 0.01]) + WhiteKernel(noise_level=1),
             max_iter_predict=100, multi_class='one_vs_rest', n_jobs=None,
             n_restarts_optimizer=10, optimizer='fmin_l_bfgs_b',
             random_state=2, warm_start=False),
           n_jobs=None)

In [15]:
np.min(test["latitude"])

37.79

In [18]:
y__synth = gp.predict(np.asarray([-122.35,37.89]).reshape(1,-1))
y__synth

array([[ 41., 880.]])

The synthetic point generation can then be done as:

In [None]:
def synth_point_gen(data,w_mat,n):
    
    #Create running variable 
    i = 1
    #Create indicator column for synthetic data
    data["synth"] = 0
    
    #Create synthetic points until condition is met
    while i <= n:
        #Chose random datapoint
        random_sample = data.sample(n=1)
        #Chose random weihgtmatrix from set
        w = random.choice(w_mat)
        #Extract neighbourhood of random point
        temp_id = []
        for k in random_sample.index: 
                temp_id = np.unique(np.concatenate([temp_id,w.neighbors[k]]).ravel().astype(np.int32))
        #for l in temp_id: #Include second degree neighbors
        #       temp_id = np.unique(np.concatenate([temp_id,w.neighbors[l]]).ravel().astype(np.int32))
        temp = data.iloc[temp_id]
        #Bind random point with its neighbourhood
        random_sample = random_sample.append(temp)

        #Create new point using mean interpolation (for continuous variables) and random sampling (for discrete variables)
        new_point = pd.DataFrame(columns=data.columns)
        new_point = pd.DataFrame({"longitude":np.mean(random_sample["longitude"]),
                                  "latitude":np.mean(random_sample["latitude"]),
                                  "housing_median_age":float(int(np.mean(random_sample["housing_median_age"]))),
                                  "total_rooms":float(int(np.mean(random_sample["total_rooms"]))),
                                  "total_bedrooms":float(int(np.mean(random_sample["total_bedrooms"]))),
                                  "population":float(int(np.mean(random_sample["population"]))),
                                  "households":float(int(np.mean(random_sample["households"]))),
                                  "median_income":np.mean(random_sample["median_income"]),
                                  "median_house_value":float(int(np.mean(random_sample["median_house_value"]))),
                                  "ocean_proximity":random.sample(list(random_sample["ocean_proximity"].unique()),1),
                                  "id":np.max(data["id"])+1,
                                  "synth":1})
        #Bind with existing data
        data = data.append(new_point)
        #Increase counter
        i = i+1
    
    #Return data
    return data

Run with the california housing dataset.

In [None]:
new_data = synth_point_gen(data,w_mat,10000)
new_data

Save the newly expanded dataframes.

In [None]:
new_data.to_csv("housing_synth.csv")