## Setup

Load required packages

In [1]:
import io
import os
import random

import libpysal
import numpy as np
import pandas as pd
import pysal
import pysal.lib
import requests

# NOTE(review): a relative import cannot resolve from a notebook (no parent package), and
# head_to_sql / start_sql are not used in the cells shown here - confirm whether this is needed.
from .sqlite import head_to_sql, start_sql


Read data

In [2]:
# Read from URL (doesn't work for private repos)
#url="https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
#s=requests.get(url).content
#data=pd.read_csv(io.StringIO(s.decode('utf-8')))

# Read from local file.
# The location is machine-specific, so it can be overridden through the
# SPACEGAN_HOUSING_CSV environment variable; the default is the original path.
DATA_PATH = os.environ.get(
    "SPACEGAN_HOUSING_CSV",
    "C:/Users/Konstantin Klemmer/Documents/GitHub/spacegan/data/housing.csv",
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Expose the row index as an explicit "id" column so rows can be referenced by id later on
data["id"] = data.index

# Preview the first rows (head() shows 5 rows by default)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,id
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,3
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,4


Create spatial points object ([KDTree](https://pysal.readthedocs.io/en/dev/library/cg/kdtree.html)).

In [4]:
# Build a KD-tree over the (longitude, latitude) coordinates of every row.
# `pysal.lib` is imported in the import cell at the top of the notebook.
coords = np.array(data[["longitude", "latitude"]])
kd = pysal.lib.cg.kdtree.KDTree(coords)

Compute the spatial neighbourhood weights matrix from each point's k nearest neighbours (here k = 50)


In [5]:
%%time
wnn = pysal.lib.weights.KNN(kd, 50, ids=data["id"]) #KNN based weights

Wall time: 1min 13s


## Spatial CV: Lat/Lon slicing

Create labels based on latitude / longitude binning and add the labels to the original data

In [6]:
# Cut each coordinate axis into 5 equal-width bins labelled 1..5.
# The order (longitude first, then latitude) fixes the order of the new columns.
for source_col, group_col in (("longitude", "lon_group"), ("latitude", "lat_group")):
    data[group_col] = pd.cut(data[source_col], bins=5, labels=[1, 2, 3, 4, 5])

This method loops over our lat/lon groups, keeping each group as test data and the rest as train data. However, as we want to do spatial cross-validation, we remove the neighbors of the test set from the training data. This can help to prevent model overfitting. As written, the code below removes 1st degree neighbors only; higher-degree neighbors can be included as needed. We create 10 folds (5 lon, 5 lat slicing) and save these in the columns `lon_group[1-5]` and `lat_group[1-5]`. For the values in each of these columns, `1` indicates testing data, `2` training data and `0` indicates data to be removed.

In [7]:
# Spatial cross-validation folds.
# For every slicing column and every label j found in it, a column named
# "<slicing column><j>" is written with: 1 = test, 2 = train, 0 = removed.

# The slicing columns are named explicitly (instead of "the last two columns")
# so that re-running this cell after the fold columns exist still works.
SLICE_COLUMNS = ["lon_group", "lat_group"]

# Number of neighbour "rings" around the test set that are removed from the
# training data: 1 = direct KNN neighbours only, 2 = also their neighbours, ...
NEIGHBOR_DEGREE = 1

for q in SLICE_COLUMNS: #Loop over the two slicing label columns
    data["s_id"] = data[q] #Define which label column to use for slicing

    for j in np.unique(data["s_id"]): #Loop over the unique labels in the slicing column
        fold_col = q + str(j)
        data[fold_col] = 0 #Everything starts as "removed"

        test = data[data["s_id"] == j] #Define test data
        data.loc[data["id"].isin(test["id"]), fold_col] = 1

        #Collect the ids of all neighbours up to NEIGHBOR_DEGREE away from the test data.
        #A set is used so each test row costs one update instead of a full re-deduplication.
        excluded = set()
        frontier = set(test.index)
        for _ in range(NEIGHBOR_DEGREE):
            frontier = {n for k in frontier for n in wnn.neighbors[k]} - excluded
            excluded |= frontier
        temp_id = sorted(excluded)

        train = data[data["s_id"] != j] #Define train data
        train = train.drop(temp_id, errors="ignore") #Exclude neighbors from index
        data.loc[data["id"].isin(train["id"]), fold_col] = 2

        #INSERT DATA AUGMENTATION METHOD HERE
        #train_aug = ...

        #INSERT MODEL TRAINING HERE
        #model1 = f(train)
        #predict = predict(model1,test)

#Remove the helper column. DataFrame.drop returns a new frame, so the result must be
#assigned back - otherwise "s_id" stays in the data and is written to the output file.
data = data.drop(columns="s_id")

In [8]:
# Show only the first rows instead of rendering the whole frame
data.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,lon_group1,lon_group2,lon_group3,lon_group4,lon_group5,lat_group1,lat_group2,lat_group3,lat_group4,lat_group5
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.1200,241400.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,...,2,1,2,2,2,2,2,1,2,2


We can now save the data:

In [9]:
# Write the frame to a CSV file in the working directory, leaving out the row index
output_csv = "grid_aug_housing_knn50.csv"
data.to_csv(output_csv, index=False)