## Setup

Load required packages

In [1]:
import pandas as pd
import io
import requests
import numpy as np
import random
import libpysal
import pysal
import pysal.lib

  from .sqlite import head_to_sql, start_sql


Read data

In [2]:
#Read from URL (Doesnt work for private repos)
#url="https://github.com/konstantinklemmer/spacegan/raw/master/data/synth_data.csv"
#s=requests.get(url).content
#data=pd.read_csv(io.StringIO(s.decode('utf-8')))

#Read from local file
data_ex1 = pd.read_csv("C:/Users/Konstantin Klemmer/Documents/GitHub/spacegan/data/synth_data_ex1.csv")
data_ex2 = pd.read_csv("C:/Users/Konstantin Klemmer/Documents/GitHub/spacegan/data/synth_data_ex2.csv")

In [3]:
data_ex2.head(5)

Unnamed: 0,id,y,z,latitude,longitude
0,1,-0.111466,1.334913,1.75,1.75
1,2,-0.067598,-0.869272,1.75,5.25
2,3,-0.131009,0.055487,1.75,8.75
3,4,-0.054521,0.049067,1.75,12.25
4,5,-0.136375,-0.578356,1.75,15.75


## Spatial Grid Expansion

This function takes in a regular point grid `data` (with coordinates `longitude` and `latitude`, input variable `z` and output variable `y`) and a factor `fact` by which the grid is to be expanded (e.g., `fact=2` doubles the grid cells (roughly))

In [4]:
def grid_expand(data,fact):
    
    #Copy column names from existing grid
    new_points = pd.DataFrame(columns=data[["longitude","latitude"]].columns)
    
    #Create new grid:
    #Expand longitude by factor `fact`
    dst = (data[["longitude"]].max() - data[["longitude"]].min()) / (len(data.longitude.unique())-1)
    exp = float(dst/fact)
    x = []
    d = float(data[["longitude"]].min())
    while d <= float(data[["longitude"]].max()):
        x.append(d)    
        d = d + exp
    #Expand latitude by factor `fact`
    dst = (data[["latitude"]].max() - data[["latitude"]].min()) / (len(data.longitude.unique())-1)
    exp = float(dst/fact)
    y = []
    d = float(data[["latitude"]].min())
    while d <= float(data[["latitude"]].max()):
        y.append(d)    
        d = d + exp
    #Bind new lat and lon values
    lon = x * len(y)
    lat = sorted(y*len(x))
    #Create new points dataframe
    new_points = pd.DataFrame({"longitude":lon,"latitude":lat})
    #Merge with existing dataframe
    new_data = pd.merge(new_points,data, how="left", on=["longitude","latitude"])
    #Create column to indicate whether observation is original or synthetic 
    new_data["synth"] = 0
    new_data.loc[new_data.id.isnull(), "synth"] = 1
    #Re-assign IDs
    new_data[["id"]] = np.asarray(list(range(0,len(new_data["id"])))).reshape(-1,1)
    
    #Spatial operations:
    #Compute distance matrix of new grid
    dist = pysal.lib.cg.distance_matrix(np.array(new_data[["longitude","latitude"]]))
    #Extract and flatten 10 nearest distances
    k=10
    u_dist = np.unique(dist)
    k_min_dist = np.sort(u_dist.flatten())[:k]
    #Create KD tree
    kd = pysal.lib.cg.kdtree.KDTree(np.array(new_data[["longitude","latitude"]]))
    #Define neighbourhood (here Queen structure)
    w = pysal.lib.weights.distance.DistanceBand(kd, threshold=k_min_dist[2],binary=True,p=2) #Queen
    
    #Interpolate missing grid values
    #OPTION 1:
    #Interpolate new grid data based on mean of neighborhood
    for k in new_data.index:
        fill = new_data.iloc[k]
        temp_id = []
        if fill["synth"] == 1:
            temp_id = np.unique(np.concatenate([temp_id,w.neighbors[k]]).ravel().astype(np.int32))
            neighbors = new_data.iloc[temp_id]
            fill["z"] = np.mean(neighbors["z"])
            fill["y"] = np.mean(neighbors["y"]) #Delete this line if OPTION 2 is preferred
            new_data.iloc[k] = fill
        else:
            continue 
            
    #OPTION 2:
    #Interpolate Y using a linear model (works very poorly)

    #from sklearn.linear_model import LinearRegression
    #train = new_data[new_data["synth"]==0]
    #test = new_data[new_data["synth"]==1]
    #x_train = train[["longitude","latitude","z"]]
    #y_train = train[["y"]]
    #x_test = test[["longitude","latitude","z"]]
    #reg = LinearRegression().fit(x_train,y_train)
    #y_pred = reg.predict(x_test)
    #new_data["y"][new_data["synth"]==1] = y_pred
    
    #Return new dataframe
    return new_data

Run with both synthetic grid datasets.

In [5]:
new_data_ex1 = grid_expand(data_ex1,2)
new_data_ex2 = grid_expand(data_ex2,2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Save the newly expanded dataframes.

In [6]:
new_data_ex1.to_csv("grid_expanded_ex1.csv")
new_data_ex2.to_csv("grid_expanded_ex2.csv")