## Setup

Load required packages

In [5]:
import pandas as pd
import io
import requests
import numpy as np
import random
import libpysal
import pysal
import pysal.lib

Read data

In [2]:
#Option A: read from URL (does not work for private repos)
#url="https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
#s=requests.get(url).content
#data=pd.read_csv(io.StringIO(s.decode('utf-8')))

#Option B (active): read from a local file
#NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists
local_csv = "C:/Users/Konstantin Klemmer/Documents/GitHub/spacegan/data/synth_data_ex1.csv"
data = pd.read_csv(local_csv)

In [3]:
#Preview the first five rows (5 is the default row count of `head`)
data.head()

Unnamed: 0,id,y,z,latitude,longitude
0,1,-0.844504,1.370958,2.5,2.5
1,2,-0.832568,-0.564698,2.5,7.5
2,3,-0.793995,0.363128,2.5,12.5
3,4,-0.73034,0.632863,2.5,17.5
4,5,-0.641493,0.404268,2.5,22.5


Create a pairwise distance matrix (Euclidean) between the points

In [4]:
#dist = pysal.lib.cg.distance_matrix(np.array(data[["longitude","latitude"]]))

Get the `k` smallest distances (of the whole matrix, since the points are equally distributed)

In [6]:
#k=10
#u_dist = np.unique(dist)
#k_min_dist = np.sort(u_dist.flatten())[:k]

In [7]:
#k_min_dist

Create spatial points object ([KDTree](https://pysal.readthedocs.io/en/dev/library/cg/kdtree.html)).

In [8]:
#import pysal.lib
#kd = pysal.lib.cg.kdtree.KDTree(np.array(data[["longitude","latitude"]]))

Compute spatial neighbourhoods weight matrix by distance threshold ("radius")


In [9]:
#wnn = pysal.lib.weights.KNN(kd, 8, ids=data["id"]) #KNN based weights
#wdist= pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[1],binary=False,p=2) #Rook
#wdist= pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[2],binary=True,p=2) #Queen
#wdist= pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[4],binary=True,p=2) #Queen 2nd degree

## Spatial Grid Expansion

Define a function that expands the grid by a factor `fact` and interpolates `y` and `z` at the newly created points:

In [None]:
def _axis_ticks(values, fact):
    """Return the refined, evenly spaced coordinates for one grid axis.

    The original spacing is taken as (max - min) / (number of unique values - 1)
    and divided by `fact`. Ticks are derived from their index instead of by
    repeated addition, so rounding error cannot accumulate and drop the last
    tick. Ticks that coincide (up to rounding) with an original coordinate are
    snapped to that exact value so the later equality join still matches.
    """
    originals = np.unique(values.dropna().to_numpy(dtype=float))
    if originals.size < 2:
        #Degenerate axis (zero or one distinct coordinate): nothing to refine
        return originals
    lo, hi = originals[0], originals[-1]
    step = (hi - lo) / (originals.size - 1) / fact
    #Small tolerance so a ratio like 17.999999999 still counts as 18 steps
    n_ticks = int(np.floor((hi - lo) / step + 1e-9)) + 1
    ticks = lo + step * np.arange(n_ticks)
    #Snap ticks onto original coordinates they are meant to reproduce
    nearest = originals[np.abs(ticks[:, None] - originals[None, :]).argmin(axis=1)]
    snap = np.isclose(ticks, nearest, rtol=1e-12, atol=1e-9 * step)
    ticks[snap] = nearest[snap]
    return ticks


def grid_expand(data, fact):
    """Refine a regular lon/lat grid by `fact` and interpolate the new points.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "longitude", "latitude", "id", "y" and "z".
        The points are assumed to lie on a regular grid.
    fact : int or float
        Refinement factor (> 0); the grid spacing of each axis is divided by it.

    Returns
    -------
    pandas.DataFrame
        One row per point of the refined grid with the columns of `data`
        plus "synth" (0 = original observation, 1 = interpolated point).
        "id" is re-assigned to 0..n-1 in row order.

    Raises
    ------
    ValueError
        If `fact` is not positive.

    Notes
    -----
    Synthetic points are filled in row order with the mean of their
    neighbours' "z" and "y" values (missing values are skipped). Because the
    fill happens in place, points processed later also see values that were
    interpolated earlier in the same pass.
    The neighbourhood threshold is the third-smallest unique pairwise distance
    (0, step, diagonal), which corresponds to a Queen structure only when both
    axes end up with the same spacing.
    """
    if fact <= 0:
        raise ValueError("fact must be positive")

    #Create new grid:
    #Each axis uses its OWN number of unique coordinates to derive the spacing
    lon_ticks = _axis_ticks(data["longitude"], fact)
    lat_ticks = _axis_ticks(data["latitude"], fact)
    #Bind new lat and lon values (longitude varies fastest, latitude ascending)
    new_points = pd.DataFrame({
        "longitude": np.tile(lon_ticks, len(lat_ticks)),
        "latitude": np.repeat(lat_ticks, len(lon_ticks)),
    })
    #Merge with existing dataframe; unmatched grid points get NaN data columns
    new_data = pd.merge(new_points, data, how="left", on=["longitude", "latitude"])
    #Create column to indicate whether observation is original or synthetic
    new_data["synth"] = new_data["id"].isna().astype("int64")
    #Re-assign IDs
    new_data["id"] = np.arange(len(new_data))

    #Spatial operations:
    coords = new_data[["longitude", "latitude"]].to_numpy()
    #Compute distance matrix of new grid (dense n x n matrix)
    dist = pysal.lib.cg.distance_matrix(coords)
    #np.unique returns the sorted distinct distances: [0, step, diagonal, ...]
    queen_threshold = np.unique(dist)[2]
    #Create KD tree
    kd = pysal.lib.cg.kdtree.KDTree(coords)
    #Define neighbourhood (here Queen structure)
    w = pysal.lib.weights.distance.DistanceBand(kd, threshold=queen_threshold, binary=True, p=2)

    #Interpolate missing grid values
    #OPTION 1:
    #Interpolate new grid data based on mean of neighborhood
    #(uses the local weights `w`, not a notebook-level variable)
    for pos in np.flatnonzero(new_data["synth"].to_numpy() == 1):
        pos = int(pos)
        neighbor_pos = np.unique(np.asarray(w.neighbors[pos], dtype=np.int64))
        neighbors = new_data.iloc[neighbor_pos]
        #The merged frame has a default 0..n-1 index, so label == position
        new_data.loc[pos, "z"] = neighbors["z"].mean()
        new_data.loc[pos, "y"] = neighbors["y"].mean() #Delete this line if OPTION 2 is preferred

    #OPTION 2:
    #Interpolate Y using a linear model (works very poorly)

    #from sklearn.linear_model import LinearRegression
    #train = new_data[new_data["synth"]==0]
    #test = new_data[new_data["synth"]==1]
    #x_train = train[["longitude","latitude","z"]]
    #y_train = train[["y"]]
    #x_test = test[["longitude","latitude","z"]]
    #reg = LinearRegression().fit(x_train,y_train)
    #y_pred = reg.predict(x_test)
    #new_data["y"][new_data["synth"]==1] = y_pred

    #Return new dataframe
    return new_data

In [None]:
#Expand the ORIGINAL grid (`data`) by a factor of 2.
#Passing `new_data` here would fail on a fresh kernel (it does not exist yet)
#and would expand the grid again on every re-run of the cell.
new_data = grid_expand(data, 2)

new_data

The cells below repeat the steps of `grid_expand` one at a time. First, expand the grid by a factor of 2 (`exp` holds the resulting grid spacing).

In [None]:
#Expansion factor: the grid spacing of each axis is divided by `fact`
fact = 2

#Expand longitude (double)
dst = (data["longitude"].max() - data["longitude"].min()) / (len(data.longitude.unique())-1)
exp = float(dst/fact)
x = []
d = float(data["longitude"].min())
while d <= float(data["longitude"].max()):
    x.append(d)
    d = d + exp

#Expand latitude (double)
#The spacing is derived from the number of unique LATITUDES (not longitudes)
dst = (data["latitude"].max() - data["latitude"].min()) / (len(data.latitude.unique())-1)
exp = float(dst/fact)
y = []
d = float(data["latitude"].min())
while d <= float(data["latitude"].max()):
    y.append(d)
    d = d + exp

#Cross the two axes: longitude varies fastest, latitude is ascending
lon = x * len(y)
lat = sorted(y*len(x))

new_points = pd.DataFrame({"longitude":lon,"latitude":lat})

In [None]:
#Left-join the original observations onto the expanded grid; grid points
#without a matching original observation get NaN in the data columns
new_data = new_points.merge(data, how="left", on=["longitude","latitude"])

In [None]:
#Flag rows without an original id as synthetic (1); original rows get 0
new_data["synth"] = new_data["id"].isnull().astype("int64")

In [None]:
#Re-number the ids 0..n-1 in row order
new_data[["id"]] = np.arange(len(new_data)).reshape(-1, 1)

In [None]:
#Pairwise distance matrix between all points of the expanded grid
dist = pysal.lib.cg.distance_matrix(new_data[["longitude","latitude"]].to_numpy())

In [None]:
#Keep the k smallest distinct distances of the whole matrix
k = 10
u_dist = np.unique(dist)
k_min_dist = np.sort(u_dist, axis=None)[:k]

In [None]:
#KD tree over the coordinates of the expanded grid
kd = pysal.lib.cg.kdtree.KDTree(new_data[["longitude","latitude"]].to_numpy())

In [None]:
#Binary distance-band weights; the threshold is the third-smallest distinct distance
queen_threshold = k_min_dist[2]
wdist = pysal.lib.weights.distance.DistanceBand(kd, threshold=queen_threshold, p=2, binary=True) #Queen

In [None]:
#Fill every synthetic point with the mean of its neighbours' z and y values.
#Rows are processed in order and written back immediately, so later points
#also see values that were interpolated earlier in this loop.
for k in new_data.index:
    row = new_data.iloc[k]
    if row["synth"] != 1:
        continue
    neighbor_pos = np.unique(np.asarray(wdist.neighbors[k]).ravel().astype(np.int32))
    neighbors = new_data.iloc[neighbor_pos]
    row["z"] = np.mean(neighbors["z"])
    row["y"] = np.mean(neighbors["y"])
    new_data.iloc[k] = row

In [None]:
#Interpolate Y using a linear model (works very poorly)

#from sklearn.linear_model import LinearRegression
#train = new_data[new_data["synth"]==0]
#test = new_data[new_data["synth"]==1]
#x_train = train[["longitude","latitude","z"]]
#y_train = train[["y"]]
#x_test = test[["longitude","latitude","z"]]
#reg = LinearRegression().fit(x_train,y_train)
#y_pred = reg.predict(x_test)
#new_data["y"][new_data["synth"]==1] = y_pred

This method loops over our lat/lon groups, keeping each group as test data and the rest as train data. However, as we want to do spatial cross-validation, we remove neighbors of the test set. This can help to prevent model overfitting. Here, we remove 1st and 2nd degree neighbors, but the method can be adapted as needed. We create 10 folds (5 lon, 5 lat slicing) and save these in the columns `lat_group[1-5]` and `lon_group[1-5]`. For the values in each of these columns, `1` indicates testing data, `2` indicates training data and `0` indicates data to be removed.

In [None]:
#Persist the expanded grid to the working directory.
#With the default settings the DataFrame index is written as an extra first column.
output_csv = "grid_expanded_ex1.csv"
new_data.to_csv(output_csv)