## Setup

Load required packages

In [1]:
import io
import random

import libpysal
import numpy as np
import pandas as pd
import pysal
import pysal.lib
import requests

  from .sqlite import head_to_sql, start_sql


Read data

In [2]:
# Option A - read the CSV over HTTP (does not work for private repos):
#url="https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
#s=requests.get(url).content
#data=pd.read_csv(io.StringIO(s.decode('utf-8')))

# Option B - read the CSV from a local file.
# NOTE(review): this is a machine-specific absolute path; the cell only runs where
# this exact file exists.
data = pd.read_csv(
    filepath_or_buffer="C:/Users/Konstantin Klemmer/Documents/GitHub/spacegan/data/grid_aug.csv"
)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,id,y,longitude,latitude,z,nn0,nn1,nn2,nn3,nn4,...,nn20,nn21,nn22,nn23,nn24,nn25,nn26,nn27,nn28,nn29
0,0,-0.844504,2.5,2.5,1.370958,-0.832568,-0.832449,-0.8455,-0.793995,-0.794071,...,-0.794673,-0.527385,-0.526857,-0.832357,-0.832634,-0.641253,-0.641905,-0.730428,-0.731069,-0.845475
1,1,-0.832568,7.5,2.5,-0.564698,-0.844504,-0.8455,-0.793995,-0.832449,-0.832383,...,-0.641905,-0.79413,-0.641186,-0.832634,-0.832357,-0.730428,-0.641253,-0.793685,-0.845475,-0.387048
2,2,-0.793995,12.5,2.5,0.363128,-0.832568,-0.73034,-0.832383,-0.8455,-0.793699,...,-0.832634,-0.730428,-0.387048,-0.79413,-0.730511,-0.527461,-0.832357,-0.793685,-0.641186,-0.845475
3,3,-0.73034,17.5,2.5,0.632863,-0.793699,-0.793995,-0.641493,-0.730159,-0.832383,...,-0.832634,-0.794071,-0.793685,-0.794082,-0.642062,-0.222415,-0.832357,-0.388055,-0.845475,-0.79413
4,4,-0.641493,22.5,2.5,0.404268,-0.730159,-0.73034,-0.527385,-0.641905,-0.793699,...,-0.844502,-0.526827,-0.832475,-0.832016,-0.730473,-0.844504,-0.031306,-0.845475,-0.832357,-0.832004


Create a pairwise distance matrix (Euclidean) between the points

In [4]:
# Pairwise distance matrix between all (longitude, latitude) points.
dist = pysal.lib.cg.distance_matrix(
    np.array(data[["longitude", "latitude"]])
)

Get the `k` smallest unique distances (taken over the whole matrix, since the points are evenly spaced on a regular grid)

In [5]:
# Number of smallest distinct distances to keep.
k = 10
# Distinct distance values occurring anywhere in the pairwise matrix.
u_dist = np.unique(dist)
# Sort ascending and keep the first k values.
k_min_dist = np.sort(u_dist.ravel())[:k]

In [6]:
# Display the k smallest unique distances computed above.
k_min_dist

array([ 0.        ,  5.        ,  7.07106781, 10.        , 11.18033989,
       14.14213562, 15.        , 15.8113883 , 18.02775638, 20.        ])

Create a spatial points object ([KDTree](https://pysal.readthedocs.io/en/dev/library/cg/kdtree.html)).

In [7]:
# Build a KD-tree over the (longitude, latitude) coordinates; it is the input of the
# distance-band weights computed in the next cell.
# (`pysal.lib` is imported in the import cell at the top of the notebook, because an
# earlier cell already relies on it.)
kd = pysal.lib.cg.kdtree.KDTree(np.array(data[["longitude", "latitude"]]))

Compute the spatial neighbourhood weight matrix using a distance threshold ("radius")


In [8]:
# Alternative neighbourhood definitions, kept for reference:
#wnn = pysal.lib.weights.KNN(kd, 8, ids=data["id"]) #KNN based weights
#wdist= pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[1],binary=False,p=2) #Rook

# Active definition: binary distance band whose radius is the third-smallest
# unique distance (k_min_dist[2]).
wdist = pysal.lib.weights.distance.DistanceBand(
    kd,
    threshold=k_min_dist[2],
    binary=True,
    p=2,
)  # Queen
#wdist= pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[4],binary=True,p=2) #Queen 2nd degree

## Spatial Bootstrap

We create a simple spatial bootstrap method as follows:

* 1: We select one random observation.
* 2: We also select all neighbors (first and second degree) of the random observation from Step 1 and bind them together.
* 3: We select a new random observation, repeat Step 2 and bind the new data to the existing data.
* 4: We repeat Step 3 until the sample has reached the desired number of observations.

In [47]:
def _neighborhood_ids(w, seeds):
    """Return the sorted, de-duplicated ids of the 1st- and 2nd-degree neighbours of `seeds`.

    `w.neighbors[i]` must yield the neighbour ids of observation `i`.
    """
    first_degree = set()
    for seed in seeds:
        first_degree.update(w.neighbors[seed])
    ids = set(first_degree)
    for neighbor in first_degree:
        ids.update(w.neighbors[neighbor])
    return np.array(sorted(ids), dtype=np.int32)


def spatial_bootstrap(data, w, n, random_state=None):
    """Draw a spatial block bootstrap sample from `data`.

    One observation is sampled at random, the block made of its first- and
    second-degree neighbours is collected, and this is repeated until the
    collected blocks hold more than `n` rows in total.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations to resample. The index label of a sampled row is used as the
        key into `w.neighbors`, while the neighbour ids are used as *positions*
        (`data.iloc`), so the frame is expected to carry a default 0..N-1 index
        that matches the ids used by `w`.
    w : object
        Spatial weights exposing `w.neighbors[i]` -> iterable of neighbour ids.
    n : int
        Target size. Sampling continues while the collected row count is `<= n`,
        so the result has strictly more than `n` rows.
    random_state : None, int or numpy.random.RandomState, optional
        Seed or generator that makes the draw reproducible. `None` (default)
        keeps the previous behaviour of using numpy's global random state.

    Returns
    -------
    pandas.DataFrame
        The concatenated blocks with a fresh 0..len-1 index. Rows can occur more
        than once because blocks may overlap.

    Notes
    -----
    * The sampled observation itself is part of its block only when it is listed
      as a neighbour of one of its own neighbours.
    * If no sampled observation ever has neighbours, no rows are collected and the
      loop does not terminate.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # One shared generator, so successive draws differ but the sequence is reproducible.
        rng = np.random.RandomState(random_state)

    blocks = []
    n_rows = 0
    while True:  # at least one block is always drawn
        seed = data.sample(n=1, random_state=rng)
        block = data.iloc[_neighborhood_ids(w, seed.index)]
        if len(block):
            blocks.append(block)
            n_rows += len(block)
        if n_rows > n:
            break

    if not blocks:
        # Only reachable for negative `n` with a neighbourless first draw.
        return data.iloc[:0].reset_index(drop=True)
    # Single concatenation: DataFrame.append was removed in pandas 2.0 and growing a
    # frame inside the loop is quadratic.
    return pd.concat(blocks, ignore_index=True)

In [48]:
# Draw a spatial bootstrap sample (target size 1000) using the distance-band
# weights, then display it.
bootstrap_test = spatial_bootstrap(data=data, w=wdist, n=1000)
bootstrap_test

Unnamed: 0,id,y,longitude,latitude,z,nn0,nn1,nn2,nn3,nn4,...,nn20,nn21,nn22,nn23,nn24,nn25,nn26,nn27,nn28,nn29
0,8,-0.031306,42.5,2.5,2.018424,-0.222415,-0.222141,0.184221,-0.388055,-0.032067,...,-0.641905,-0.730428,0.183894,-0.730473,-0.222806,-0.641493,-0.641284,0.984285,-0.526959,-0.730733
1,9,0.184221,47.5,2.5,-0.062714,-0.031306,-0.032067,0.426173,-0.222141,0.184476,...,0.692964,-0.641255,-0.642062,0.426191,-0.031810,1.302316,-0.526959,-0.527385,-0.641284,-0.387056
2,10,0.426173,52.5,2.5,1.304870,0.184221,0.692910,0.184476,0.426012,-0.032067,...,-0.031810,0.184549,-0.526927,0.692831,-0.526827,-0.387056,-0.387048,1.645473,-0.527461,-0.222600
3,11,0.692910,57.5,2.5,2.286645,0.984285,0.426012,0.426173,0.184476,0.692964,...,-0.222806,0.425268,-0.387220,-0.387768,0.984830,-0.222415,2.014341,-0.222600,1.645038,-0.031456
4,12,0.984285,62.5,2.5,-1.388861,0.692964,0.692910,1.302316,0.426012,0.984494,...,-0.032067,0.692264,1.302595,-0.222057,-0.222806,-0.031456,2.407809,-0.031306,2.013680,-0.222600
5,28,-0.222141,42.5,7.5,0.460097,-0.388055,-0.032067,-0.387768,-0.031306,-0.222415,...,0.426012,-0.641905,0.183894,0.692910,-0.526959,-0.527385,-0.730733,-0.730428,-0.387056,-0.794285
6,29,-0.032067,47.5,7.5,-0.639995,-0.222057,0.184476,-0.222141,0.184221,-0.031603,...,0.692964,-0.526959,-0.642062,-0.387056,-0.641284,0.984285,-0.387048,0.426191,-0.730473,-0.730733
7,30,0.184476,52.5,7.5,0.455450,-0.032067,0.426173,-0.031603,0.426012,0.692910,...,0.984494,-0.388055,0.692831,1.302316,-0.222600,-0.526827,-0.222415,-0.526959,-0.031456,0.425268
8,31,0.426012,57.5,7.5,0.704837,0.184476,0.692964,0.692910,0.183894,0.984285,...,-0.222141,1.302702,0.984830,1.645473,-0.031456,-0.387056,-0.387768,-0.031306,-0.526927,0.184753
9,32,0.692964,62.5,7.5,1.035104,0.984285,0.426012,0.426191,0.984494,1.302316,...,-0.031456,1.645038,2.014341,-0.222057,1.302595,-0.222600,0.184753,0.184221,0.425227,-0.387056


The following cell loops over our lat/lon groups, keeping each group as test data and the rest as train data. However, as we want to do spatial cross-validation, we remove the neighbors of the test set from the training data. This can help to prevent model overfitting. Here, we remove 1st and 2nd degree neighbors, but the method can be adapted as needed. We create 10 folds (5 lon, 5 lat slicing) and save these in the columns `lat_group[1-5]` and `lon_group[1-5]`. For the values in each of these columns, `1` indicates testing data, `2` indicates training data and `0` indicates data to be removed.

In [None]:
# The two right-most columns of `data` are used as the slicing label columns.
# NOTE(review): this loop appends columns to `data`, so re-running the cell would
# pick different "last two" columns; run it once on freshly loaded data.
for group_col in list(data.columns)[-2:]:
    data["s_id"] = data[group_col]  # labels of the slicing column currently in use

    for label in np.unique(data["s_id"]):
        fold_col = group_col + str(label)

        # Every row starts as 0 (= removed); the held-out slice is marked 1 (= test).
        data[fold_col] = 0
        test = data[data["s_id"] == label]
        data.loc[data["id"].isin(np.array(test["id"])), fold_col] = 1

        # First-degree neighbours of all test rows ...
        first_degree = np.unique(
            np.concatenate([[]] + [wdist.neighbors[row] for row in test.index])
            .ravel()
            .astype(np.int32)
        )
        # ... extended by the neighbours of those neighbours (second degree).
        temp_id = np.unique(
            np.concatenate([first_degree] + [wdist.neighbors[nb] for nb in first_degree])
            .ravel()
            .astype(np.int32)
        )

        #for m in temp_id: #Include third degree neighbors
        #    temp_id = np.unique(np.concatenate([temp_id,wdist.neighbors[m]]).ravel().astype(np.int32))

        # Training rows: everything outside the test slice, minus the neighbour buffer.
        # errors="ignore" because the buffer also contains labels that are not in `train`.
        train = data[data["s_id"] != label].drop(temp_id, errors="ignore")
        data.loc[data["id"].isin(np.array(train["id"])), fold_col] = 2

        #INSERT DATA AUGMENTATION METHOD HERE
        #train_aug = ...

        #INSERT MODEL TRAINING HERE
        #model1 = f(train)
        #predict = predict(model1,test)

We can now save the data:

In [None]:
# Write the labelled frame to a CSV file in the current working directory.
data.to_csv(path_or_buf="grid_aug_train.csv")