## Setup

Load required packages

In [1]:
import pandas as pd
import io
import requests
import numpy as np
import random
import libpysal
import pysal
import pysal.lib

  from .sqlite import head_to_sql, start_sql


Read data

In [3]:
# Alternative: read from URL (doesn't work for private repos)
#url="https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
#s=requests.get(url).content
#data=pd.read_csv(io.StringIO(s.decode('utf-8')))

# Read from local file
data = pd.read_csv("raw_data/housing.csv")
# Create ID column: consecutive integers 0..len(data)-1, one per row.
# A 1-D range is assigned (rather than an (n, 1) array) because a DataFrame
# column holds exactly one value per row, and the row count is taken from the
# frame itself instead of from one particular column.
data["id"] = np.arange(len(data))

In [4]:
# Preview the first five rows (five is the default row count of `head`).
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,4


## Spatial Grid Expansion

The function `synth_point_gen` (defined below) takes a regular point grid `data` (with coordinates `longitude` and `latitude`, and the features of the California housing dataset), a list of spatial weights matrices `w_mat`, and the number `n` of synthetic points to generate. First, though, let's create the set of neighbourhood matrices `w_mat`:

In [5]:
# Build a single k-d tree over the point coordinates and reuse it for every k.
kd = pysal.lib.cg.kdtree.KDTree(np.array(data[["longitude","latitude"]]))
# k-nearest-neighbour weights for three neighbourhood sizes; the observation
# ids are taken from the "id" column.
w_knn15 = pysal.lib.weights.KNN(kd, 15, ids=data["id"])
# NOTE(review): k was 25 here, contradicting the variable name and the
# 15/20/50 progression; set to 20 so name and value agree. If 25 was intended,
# rename the variable instead.
w_knn20 = pysal.lib.weights.KNN(kd, 20, ids=data["id"])
w_knn50 = pysal.lib.weights.KNN(kd, 50, ids=data["id"])
# Candidate weights; one of these is picked at random per synthetic point.
w_mat = [w_knn15,w_knn20,w_knn50]



The synthetic point generation can then be done as:

In [5]:
def synth_point_gen(data,w_mat,n):
    """Expand ``data`` with ``n`` synthetic points interpolated from random neighbourhoods.

    For every synthetic point one original row and one weights object from
    ``w_mat`` are drawn at random. The new point is built from that row and
    its neighbours: ``longitude``, ``latitude`` and ``median_income`` are the
    plain mean, the remaining numeric features are the mean truncated to a
    whole number, and ``ocean_proximity`` is drawn at random from the
    categories present in the neighbourhood.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``longitude``, ``latitude``,
        ``housing_median_age``, ``total_rooms``, ``total_bedrooms``,
        ``population``, ``households``, ``median_income``,
        ``median_house_value``, ``ocean_proximity`` and ``id``. The index
        label of a row is used as the key into ``w.neighbors`` and the
        neighbour ids found there are used as row positions, so ids, index
        labels and row positions are expected to coincide (e.g. a default
        index with ids ``0..len(data)-1``). The frame is not modified.
    w_mat : sequence
        Non-empty sequence of weights objects exposing a ``neighbors``
        mapping (key -> list of neighbour ids).
    n : int
        Number of synthetic points to create; values ``<= 0`` create none.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original rows (``synth == 0``) followed by
        the ``n`` synthetic rows (``synth == 1``). Synthetic ids continue
        from ``max(id) + 1`` and the result carries a fresh ``0..N-1`` index.

    Raises
    ------
    ValueError
        If ``n > 0`` and ``data`` has no rows.

    Notes
    -----
    All random draws go through the ``random`` module, so ``random.seed``
    makes the result reproducible. Seeds are drawn from the original rows
    only; synthetic points are never used to interpolate further points.
    """
    # Work on a copy: adding the indicator column must not alter the caller's frame.
    original = data.copy()
    original["synth"] = 0

    if n <= 0:
        return original
    if original.empty:
        raise ValueError("data must contain at least one row to interpolate from")

    # Features interpolated with a plain mean vs. a mean truncated to a whole number.
    plain_mean_cols = ["longitude","latitude","median_income"]
    truncated_mean_cols = ["housing_median_age","total_rooms","total_bedrooms",
                           "population","households","median_house_value"]
    first_new_id = original["id"].max() + 1

    # Collect the new rows as plain dicts and build the frame once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no
    # longer exists in pandas >= 2.0.
    records = []
    for offset in range(int(n)):
        # Choose a random original datapoint (kept as a one-row frame)
        position = random.randrange(len(original))
        seed_row = original.iloc[[position]]
        # Choose a random weights matrix from the set
        w = random.choice(w_mat)
        # Extract the neighbourhood of the random point (sorted, de-duplicated positions)
        neighbour_pos = np.unique(np.asarray(w.neighbors[seed_row.index[0]]).ravel().astype(np.int32))
        # Bind the random point with its neighbourhood
        neighbourhood = pd.concat([seed_row, original.iloc[neighbour_pos]])

        # Mean interpolation for continuous variables, random sampling for the discrete one
        record = {col: neighbourhood[col].mean() for col in plain_mean_cols}
        for col in truncated_mean_cols:
            record[col] = _truncated_mean(neighbourhood[col])
        record["ocean_proximity"] = random.choice(list(neighbourhood["ocean_proximity"].unique()))
        record["id"] = first_new_id + offset
        record["synth"] = 1
        records.append(record)

    synthetic = pd.DataFrame(records)
    # ignore_index gives every row a unique label instead of repeating label 0
    # for each synthetic point.
    return pd.concat([original, synthetic], ignore_index=True, sort=False)


def _truncated_mean(values):
    """Return the NaN-skipping mean of ``values`` truncated toward zero, as a float.

    Returns NaN when ``values`` holds no valid entries instead of failing on
    the integer conversion.
    """
    mean_value = values.mean()
    if pd.isna(mean_value):
        return float("nan")
    return float(int(mean_value))

Run with the California housing dataset.

In [6]:
# Seed both Python's and NumPy's global RNGs so the run is repeatable no matter
# which of them the sampling draws from (pandas' `DataFrame.sample` uses
# NumPy's global state when no `random_state` is given).
random.seed(99)
np.random.seed(99)
new_data = synth_point_gen(data,w_mat,10000)
new_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id,synth
0,-122.230000,37.880000,41.0,880.0,129.0,322.0,126.0,8.325200,452600.0,NEAR BAY,0,0
1,-122.220000,37.860000,21.0,7099.0,1106.0,2401.0,1138.0,8.301400,358500.0,NEAR BAY,1,0
2,-122.240000,37.850000,52.0,1467.0,190.0,496.0,177.0,7.257400,352100.0,NEAR BAY,2,0
3,-122.250000,37.850000,52.0,1274.0,235.0,558.0,219.0,5.643100,341300.0,NEAR BAY,3,0
4,-122.250000,37.850000,52.0,1627.0,280.0,565.0,259.0,3.846200,342200.0,NEAR BAY,4,0
5,-122.250000,37.850000,52.0,919.0,213.0,413.0,193.0,4.036800,269700.0,NEAR BAY,5,0
6,-122.250000,37.840000,52.0,2535.0,489.0,1094.0,514.0,3.659100,299200.0,NEAR BAY,6,0
7,-122.250000,37.840000,52.0,3104.0,687.0,1157.0,647.0,3.120000,241400.0,NEAR BAY,7,0
8,-122.260000,37.840000,42.0,2555.0,665.0,1206.0,595.0,2.080400,226700.0,NEAR BAY,8,0
9,-122.250000,37.840000,52.0,3549.0,707.0,1551.0,714.0,3.691200,261100.0,NEAR BAY,9,0


Save the newly expanded dataframe.

In [7]:
# Write the expanded frame (original + synthetic rows) to a CSV file in the working directory.
new_data.to_csv(path_or_buf="housing_synth_interpolate.csv")