Reference: https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html#local-geary

# Univariate local geary

We follow [this equation](https://www.biomedware.com/files/documentation/spacestat/Statistics/Gearys_C/Geary_s_C_statistic.htm), which explicitly calls for standardization of the input data. Note that we also do NOT divide by 2.

$$ c_i = \sum_j w_{ij} (z_i - z_j)^2 $$ 

where: 

$z_i = (x_i - \bar{x})/m$ and $z_j = (x_j - \bar{x})/m$ are the standardized (z-score) values, with $m$ the standard deviation defined below, and $w_{ij}$ are the elements of the row-standardized binary symmetric spatial weight matrix W. 

or, $$ c_i = (1/m^2) * \sum_j w_{ij} (x_i - x_j)^2 $$

where,

$$ m^2 = \sum_i (x_i - \bar{x})^2 / n $$

## Load in example data

In [1]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import numpy as np
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))

In [2]:
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [3]:
wq[0]

{66: 1.0, 35: 1.0, 68: 1.0, 36: 1.0}

In [4]:
wq.transform = 'r'
wq[0]

{66: 0.25, 35: 0.25, 68: 0.25, 36: 0.25}

In [5]:
x = guerry_ds['Donatns']

print("x_i is", x[0])
print("x_j are", x[66], x[35], x[68], x[36])

x_i is 5098
x_j are 1983 4077 3710 3012


In [6]:
# Calculate score
zscore_x = (x - np.mean(x))/np.std(x)
zscore_x

0    -0.336188
1     0.450441
2     0.879023
3    -0.825375
4     0.049370
        ...   
80    1.512380
81    0.454785
82    1.467288
83   -0.555029
84   -0.506214
Name: Donatns, Length: 85, dtype: float64

# Build observed local geary values

In [7]:
adj_list = wq.to_adjlist(remove_symmetric=False)
adj_list.head()

Unnamed: 0,focal,neighbor,weight
0,0,66,0.25
1,0,35,0.25
2,0,68,0.25
3,0,36,0.25
4,1,48,0.166667


In [8]:
import pandas as pd
zseries = pd.Series(zscore_x, index=wq.id_order)
zseries[0:5]

0   -0.336188
1    0.450441
2    0.879023
3   -0.825375
4    0.049370
Name: Donatns, dtype: float64

In [9]:
# Define z_i
zi = zseries.loc[adj_list.focal].values
zi[0:5]

array([-0.33618783, -0.33618783, -0.33618783, -0.33618783,  0.45044136])

In [10]:
# Define zj
zj = zseries.loc[adj_list.neighbor].values
zj[0:5]

array([-0.98050808, -0.54737594, -0.62328783, -0.76766521, -0.5709562 ])

In [11]:
(zi-zj)[0:5]

array([0.64432025, 0.21118812, 0.2871    , 0.43147738, 1.02139756])

In [12]:
#(zi-zj)**2

Multiply by spatial weights

In [13]:
#sum(list(wq.weights.values()), [])

In [113]:
# sum(list(wq.weights.values()), []) * (zi-zj)**2
diff = zi-zj

In [108]:
# Weighted squared differences, one per (focal, neighbor) pair.  The weights
# come from the adjacency list itself so they are guaranteed to line up with
# `diff`, which was built from the same adjacency list.
test = adj_list.weight.values * (diff)**2

In [120]:
# Pair every weighted squared difference with the ID of its focal unit.
# The weighted values are passed as the *index* and the focal IDs as the
# data, so reset_index() turns them into the columns 'index' (weighted
# squared difference) and 0 (focal ID).
temp = pd.DataFrame(adj_list.focal.values, test).reset_index()
# Column 2 keeps the raw (unsquared, unweighted) focal-minus-neighbor difference
temp[2] = diff
temp.head()

Unnamed: 0,index,0,2
0,0.103787,0,0.64432
1,0.01115,0,0.211188
2,0.020607,0,0.2871
3,0.046543,0,0.431477
4,0.173875,1,1.021398


In [121]:
0.644*0.644

0.41473600000000005

In [122]:
# Temporarily rename the columns
temp.columns = ['E_ij', 'ID', 'Diff_ij']
temp = temp.groupby(by='ID').sum()

In [123]:
temp.E_ij.values[0:5]

array([0.18208704, 0.56001403, 0.97529461, 0.21590694, 0.61737256])

# Create GeoDa quads calculation

from https://github.com/GeoDaCenter/geoda/blob/master/Explore/LocalGearyCoordinator.cpp#L731-L736
```
// assign the cluster
			if (W[i].Size() > 0) {
				if (data1[i] > 0 && Wdata > 0) cluster[i] = 1;
				else if (data1[i] < 0 && Wdata > 0) cluster[i] = 3;
				else if (data1[i] < 0 && Wdata < 0) cluster[i] = 2;
				else cluster[i] = 4; //data1[i] > 0 && Wdata < 0
```

### Cluster 1 high high

In [183]:
import libpysal
# Spatial lag of the standardized variable.  Uses `wq`, the row-standardized
# Queen weights built at the top of the notebook, so this cell does not
# depend on a `w` that is only created further down.
lag = libpysal.weights.lag_spatial(w=wq, y=zscore_x)
lag > 0

array([False, False,  True, False, False, False, False, False, False,
       False, False, False,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True, False,  True,
       False, False, False,  True, False,  True,  True,  True, False,
       False, False,  True, False, False,  True, False, False, False,
       False,  True,  True, False, False, False,  True, False,  True,
        True, False, False,  True,  True, False,  True,  True, False,
       False, False,  True, False, False, False, False, False,  True,
       False, False,  True, False, False, False, False, False,  True,
        True,  True,  True, False])

In [188]:
# Flag units whose summed focal-minus-neighbor difference is positive, whose
# spatial lag is positive, and whose pseudo p-value is at most 0.05.
# NOTE(review): the GeoDa snippet above tests the standardized value itself
# (data1[i]), not the summed difference — confirm Diff_ij is the intended proxy.
ll = (temp.Diff_ij > 0) & (lag > 0) & (functest.p_sim<=0.05)
ll.value_counts()

False    76
True      9
Name: Diff_ij, dtype: int64

### Cluster 2 low low

In [189]:
# Flag units whose summed focal-minus-neighbor difference is negative, whose
# spatial lag is negative, and whose pseudo p-value is at most 0.05.
# NOTE(review): the GeoDa snippet above tests the standardized value itself
# (data1[i]), not the summed difference — confirm Diff_ij is the intended proxy.
ll = (temp.Diff_ij < 0) & (lag < 0) & (functest.p_sim<=0.05)
ll.value_counts()

False    78
True      7
Name: Diff_ij, dtype: int64

### Cluster 3 other

In [190]:
# Flag units whose summed focal-minus-neighbor difference is negative, whose
# spatial lag is positive, and whose pseudo p-value is at most 0.05.
# NOTE(review): the GeoDa snippet above tests the standardized value itself
# (data1[i]), not the summed difference — confirm Diff_ij is the intended proxy.
ll = (temp.Diff_ij < 0) & (lag > 0) & (functest.p_sim<=0.05)
ll.value_counts()

False    83
True      2
Name: Diff_ij, dtype: int64

### Cluster 4 negative

### Cluster 5 undefined

# Start building function

Need to add quads functionality to the statistic!

In [20]:
#%load_ext pycodestyle_magic

In [21]:
#%pycodestyle_off

In [22]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)


PERMUTATIONS = 999


class Local_Geary(BaseEstimator):
    """Local Geary - Univariate"""

    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
                 keep_simulations=True, seed=None):
        """
        Initialize a univariate Local Geary estimator.

        Parameters
        ----------
        connectivity     : spatial weights object (e.g. libpysal ``W``)
                           the connectivity structure describing
                           the relationships between observed units.
                           It must expose ``transform``, ``to_adjlist``
                           and ``id_order``. Need not be row-standardized:
                           ``fit`` sets ``transform = 'r'`` on it.
        permutations     : int
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.

        Attributes
        ----------
        localG          : numpy array
                          array containing the observed univariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit. Only set when
                          permutations are run.
        rlocalG         : numpy array
                          replications under the null. Only set when
                          permutations are run and ``keep_simulations``
                          is True.
        """

        self.connectivity = connectivity
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, x, n_jobs=None, permutations=None):
        """
        Arguments
        ---------
        x                : numpy.ndarray
                           array containing continuous data
        n_jobs           : None/int
                           optional override of the ``n_jobs`` given to the
                           constructor; None (default) uses that value.
        permutations     : None/int
                           optional override of the ``permutations`` given to
                           the constructor; None (default) uses that value.
                           A falsy value skips the inference step.

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = lp.weights.Queen.from_dataframe(guerry_ds)
        >>> y = guerry_ds['Donatns']
        >>> lG = Local_Geary(connectivity=w).fit(y)
        >>> lG.localG[0:5]
        >>> lG.p_sim[0:5]
        """
        x = np.asarray(x).flatten()

        w = self.connectivity
        # NOTE: this row-standardizes the caller's weights object in place.
        w.transform = 'r'

        # Fall back to the values configured on the estimator so that the
        # constructor arguments are actually honoured.
        if n_jobs is None:
            n_jobs = self.n_jobs
        if permutations is None:
            permutations = self.permutations

        self.localG = self._statistic(x, w)

        if permutations:
            # NOTE(review): self.seed is stored but not forwarded here —
            # confirm whether the installed crand accepts a `seed` argument.
            p_sim, rlocalG = _crand_plus(
                z=(x - np.mean(x)) / np.std(x),
                w=w,
                observed=self.localG,
                permutations=permutations,
                keep=self.keep_simulations,
                n_jobs=n_jobs,
                stat_func=_local_geary
            )
            self.p_sim = p_sim
            if self.keep_simulations:
                self.rlocalG = rlocalG

        # The constructor parameters are deliberately left in place: removing
        # them breaks re-fitting and the estimator's parameter introspection.
        return self

    @staticmethod
    def _statistic(x, w):
        """
        Compute the observed local Geary value of every unit.

        Arguments
        ---------
        x : numpy.ndarray
            one value per observation, in ``w.id_order`` order.
        w : spatial weights object providing ``to_adjlist`` and ``id_order``.

        Returns
        -------
        numpy array with the per-unit sums of weighted squared z-score
        differences, ordered by sorted focal ID.
        """
        # Calculate z-scores for x
        zscore_x = (x - np.mean(x)) / np.std(x)
        # Create focal (zi) and neighbor (zj) values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = pd.Series(zscore_x, index=w.id_order)
        zi = zseries.loc[adj_list.focal].values
        zj = zseries.loc[adj_list.neighbor].values
        # Carry out local Geary calculation.  The weights are read from the
        # adjacency list so they are aligned with zi/zj row by row.
        gs = adj_list.weight.values * (zi - zj) ** 2
        # Sum the pairwise contributions of each focal unit
        adj_list_gs = pd.DataFrame({'gs': gs, 'ID': adj_list.focal.values})
        # NOTE(review): groupby sorts by ID, so the result only matches
        # w.id_order when the ids are already sorted — confirm for custom ids.
        adj_list_gs = adj_list_gs.groupby(by='ID').sum()

        localG = adj_list_gs.gs.values

        return localG

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not use the scaling parameter


@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    # Statistic handed to the conditional randomization engine as `stat_func`.
    # `scaling` is part of the expected signature but is not used here.
    focal, shuffled = _prepare_univariate(i, z, permuted_ids, weights_i)
    # Squared focal-vs-permuted differences, combined with the unit's weights
    squared_gap = (focal - shuffled) ** 2
    return squared_gap @ weights_i

In [23]:
functest = Local_Geary(connectivity=wq).fit(x)

In [24]:
functest.localG

array([1.82087039e-01, 5.60014026e-01, 9.75294606e-01, 2.15906938e-01,
       6.17372564e-01, 3.84450059e-02, 2.43181756e-01, 9.71802819e-01,
       4.06447101e-02, 7.24722785e-01, 6.30952854e-02, 2.42104497e-02,
       1.59496916e+01, 9.29326006e-01, 9.65188634e-01, 1.32383286e+00,
       3.31775497e-01, 2.99446505e+00, 9.43946814e-01, 2.99570159e+00,
       3.66702291e-01, 2.09592365e+00, 1.46515861e+00, 1.82118455e-01,
       3.10216680e+00, 5.43063937e-01, 5.74532559e+00, 4.79160197e-02,
       1.58993089e-01, 7.18327253e-01, 1.24297849e+00, 8.72629331e-02,
       7.52809650e-01, 4.56515485e-01, 3.86766562e-01, 1.17632604e-01,
       6.90884685e-01, 2.87206102e+00, 4.10455112e-01, 4.04349959e-01,
       1.14211758e-01, 9.59519953e-01, 3.51347976e-01, 7.30240974e-01,
       4.40370938e-01, 7.20360356e-02, 1.66241706e+00, 5.83258909e+00,
       2.30332507e-01, 4.38369688e-01, 8.41461470e-01, 1.52959486e+00,
       4.32157479e-02, 2.08325903e+00, 1.19722984e+00, 1.28169257e+00,
      

In [25]:
functest.p_sim

array([0.196, 0.056, 0.079, 0.188, 0.457, 0.008, 0.182, 0.43 , 0.033,
       0.284, 0.017, 0.014, 0.146, 0.486, 0.011, 0.033, 0.002, 0.152,
       0.312, 0.055, 0.001, 0.169, 0.314, 0.113, 0.072, 0.353, 0.03 ,
       0.014, 0.028, 0.289, 0.248, 0.008, 0.485, 0.002, 0.122, 0.086,
       0.327, 0.189, 0.294, 0.15 , 0.053, 0.365, 0.192, 0.491, 0.281,
       0.005, 0.199, 0.022, 0.137, 0.249, 0.292, 0.262, 0.027, 0.066,
       0.467, 0.318, 0.068, 0.291, 0.023, 0.01 , 0.251, 0.395, 0.44 ,
       0.137, 0.401, 0.141, 0.078, 0.097, 0.306, 0.192, 0.106, 0.215,
       0.016, 0.276, 0.006, 0.167, 0.036, 0.039, 0.012, 0.001, 0.031,
       0.485, 0.002, 0.328, 0.262])

In [27]:
import libpysal as lp
import geopandas as gpd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
w = lp.weights.Queen.from_dataframe(guerry_ds)
y = guerry_ds['Donatns']
lG = Local_Geary(connectivity=w).fit(y)
lG.localG[0:5]
lG.p_sim[0:5]

array([0.191, 0.053, 0.078, 0.173, 0.473])

## Start working on inference (note: now implemented above)

### 'New' `_crand()` engine

In [28]:
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)

In [29]:
@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    # Statistic handed to the conditional randomization engine as `stat_func`.
    # `scaling` is part of the expected signature but is not used here.
    focal, shuffled = _prepare_univariate(i, z, permuted_ids, weights_i)
    # Squared focal-vs-permuted differences, combined with the unit's weights
    squared_gap = (focal - shuffled) ** 2
    return squared_gap @ weights_i

In [30]:
p_sim, rlocalG = _crand_plus(z=np.array(zscore_x), w=wq, observed=np.array(functest.localG), 
            permutations=999, keep=True, n_jobs=1, 
            stat_func=_local_geary)

print(p_sim)
print(rlocalG)

[0.183 0.061 0.071 0.154 0.464 0.007 0.137 0.46  0.03  0.282 0.017 0.015
 0.132 0.48  0.006 0.027 0.003 0.143 0.328 0.061 0.001 0.169 0.33  0.109
 0.07  0.356 0.032 0.014 0.033 0.3   0.232 0.004 0.485 0.002 0.119 0.092
 0.334 0.193 0.323 0.162 0.056 0.362 0.201 0.486 0.294 0.004 0.209 0.025
 0.144 0.264 0.29  0.25  0.031 0.062 0.449 0.334 0.066 0.307 0.036 0.017
 0.239 0.392 0.449 0.129 0.427 0.14  0.093 0.102 0.287 0.18  0.102 0.215
 0.022 0.254 0.007 0.182 0.047 0.041 0.013 0.001 0.041 0.497 0.003 0.353
 0.269]
[[0.22902088 1.64542964 0.27203742 ... 0.01936886 0.12045171 0.14658819]
 [0.95115526 1.20089016 0.90539951 ... 0.62945623 1.13650425 1.19745782]
 [1.62314946 1.64094721 1.45892817 ... 1.46796409 2.20724474 2.30894762]
 ...
 [2.47226611 3.64330211 1.92229684 ... 2.90022066 4.89098889 3.8971247 ]
 [2.60619344 0.29030848 1.8635947  ... 1.13727414 0.05315186 5.59084174]
 [2.0054071  0.27222139 2.12235478 ... 1.20549358 0.0879714  1.79704415]]


# Multivariate Local Geary

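$$ c_i = \sum_{h=1}^m \sum_j w_{ij} (x_{hi} - x_{hj})^2 $$

where $m$ is the number of variables. Note that the implementation below additionally divides this sum by the number of variables.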

Load in the sample data

In [None]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import pandas as pd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))

In [None]:
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [None]:
x = guerry_ds['Donatns']
y = guerry_ds['Suicids']

In [None]:
variables = [x,y]

Standardize each variable

In [None]:
from scipy import stats
zseries = [stats.zscore(i) for i in variables]

Build the adj lists

In [None]:
adj_list = wq.to_adjlist(remove_symmetric=False)

In [None]:
# The zseries
zseries = [pd.Series(i, index=wq.id_order) for i in zseries]

In [None]:
zseries[0:10]

In [None]:
# The focal values
focal = [zseries[i].loc[adj_list.focal].values for
         i in range(len(variables))]
# The neighbor values
neighbor = [zseries[i].loc[adj_list.neighbor].values for
            i in range(len(variables))]

In [None]:
np.array(focal)[0][0]

In [None]:
np.array(neighbor)[0][0]

In [None]:
(np.array(focal)[0][0] - np.array(neighbor)[0][0])**2

In [None]:
temp = (np.array(focal) - np.array(neighbor))**2
temp[0][0]

In [None]:
# Weighted squared differences for every variable (rows) and every
# (focal, neighbor) pair (columns).  The weights are taken from the adjacency
# list so they line up with the pairs used to build `focal` and `neighbor`.
gs = adj_list.weight.values * (np.array(focal) - np.array(neighbor))**2

In [None]:
temp = pd.DataFrame(gs).T

In [None]:
temp['ID'] = adj_list.focal.values

In [None]:
adj_list_gs = temp.groupby(by='ID').sum()
adj_list_gs.head()

In [None]:
k = len(variables)
k

In [None]:
adj_list_gs.sum(axis=1)/k

In [None]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp

PERMUTATIONS=999

class Local_Geary_MV(BaseEstimator):
    """Local Geary - Multivariate"""

    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
                 keep_simulations=True, seed=None):
        """
        Initialize a multivariate Local Geary estimator.

        Parameters
        ----------
        connectivity     : spatial weights object (e.g. libpysal ``W``)
                           the connectivity structure describing
                           the relationships between observed units.
                           It must expose ``transform``, ``to_adjlist``
                           and ``id_order``. Need not be row-standardized:
                           ``fit`` sets ``transform = 'r'`` on it.
        permutations     : int
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.

        Attributes
        ----------
        variables       : numpy array
                          float copy of the input variables, one row per
                          variable.
        localG          : pandas Series
                          observed multivariate Local Geary values,
                          indexed by focal ID.
        """

        self.connectivity = connectivity
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, variables, n_jobs=1, permutations=999):
        """
        Arguments
        ---------
        variables        : sequence of array-likes
                           one array of continuous data per variable, all
                           of the same length.
        n_jobs           : int
                           currently unused (inference is not implemented).
        permutations     : int
                           currently unused (inference is not implemented).

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = lp.weights.Queen.from_dataframe(guerry_ds)
        >>> x1 = guerry_ds['Donatns']
        >>> x2 = guerry_ds['Suicids']
        >>> lG_mv = Local_Geary_MV(connectivity=w).fit([x1, x2])
        >>> lG_mv.localG[0:5]
        """
        self.variables = np.array(variables, dtype='float')

        w = self.connectivity
        # NOTE: this row-standardizes the caller's weights object in place.
        w.transform = 'r'

        # Use the float array so values are aligned to w.id_order by
        # position, independently of any index carried by the inputs.
        self.localG = self._statistic(self.variables, w)

        # TODO: conditional-permutation inference is not implemented yet, so
        # `n_jobs` and `permutations` have no effect.

        return self

    @staticmethod
    def _statistic(variables, w):
        """
        Compute the observed multivariate local Geary value of every unit.

        Arguments
        ---------
        variables : sequence of array-likes, one per variable, each ordered
                    like ``w.id_order``.
        w         : spatial weights object providing ``to_adjlist`` and
                    ``id_order``.

        Returns
        -------
        pandas Series indexed by focal ID: the per-unit weighted squared
        z-score differences summed over variables and divided by the
        number of variables.
        """
        # Calculate z-scores for input variables
        zseries = [stats.zscore(np.asarray(i, dtype=float))
                   for i in variables]
        # Define denominator adjustment
        k = len(variables)
        # Create focal and neighbor values, using the weights that were
        # passed in rather than any module-level object
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = [pd.Series(i, index=w.id_order) for i in zseries]
        focal = [z.loc[adj_list.focal].values for z in zseries]
        neighbor = [z.loc[adj_list.neighbor].values for z in zseries]
        # Carry out local Geary calculation.  The weights are read from the
        # adjacency list so they are aligned with focal/neighbor pair by pair.
        gs = adj_list.weight.values * \
            (np.array(focal) - np.array(neighbor))**2
        # Reorganize data: one column per variable, one row per pair
        temp = pd.DataFrame(gs).T
        temp['ID'] = adj_list.focal.values
        adj_list_gs = temp.groupby(by='ID').sum()
        localG = adj_list_gs.sum(axis=1)/k

        return localG

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not use the scaling parameter

@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    # Statistic handed to the conditional randomization engine as `stat_func`.
    # `scaling` is part of the expected signature but is not used here.
    focal, shuffled = _prepare_univariate(i, z, permuted_ids, weights_i)
    # Squared focal-vs-permuted differences, combined with the unit's weights
    squared_gap = (focal - shuffled) ** 2
    return squared_gap @ weights_i

In [None]:
functest = Local_Geary_MV(connectivity=wq).fit([x,y])
functest.localG

## Working on inference

In [None]:
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)

In [None]:
@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    # Statistic handed to the conditional randomization engine as `stat_func`.
    # `scaling` is part of the expected signature but is not used here.
    focal, shuffled = _prepare_univariate(i, z, permuted_ids, weights_i)
    # Squared focal-vs-permuted differences, combined with the unit's weights
    squared_gap = (focal - shuffled) ** 2
    return squared_gap @ weights_i

In [None]:
# Trial run of the conditional randomization engine for the multivariate case.
# NOTE(review): `z` is the raw, unstandardized `x` only, while `observed` is
# the multivariate statistic and `stat_func` is the univariate one — confirm
# this comparison is meaningful before relying on the p-values.
p_sim, rlocalG = _crand_plus(z=np.array(x, dtype='float'), w=wq, observed=np.array(functest.localG), 
            permutations=999, keep=True, n_jobs=1, 
            stat_func=_local_geary)

print(p_sim)
print(rlocalG)

In [None]:
x_zscore = stats.zscore(x)
y_zscore = stats.zscore(y)

In [None]:
result = zip(x_zscore, y_zscore)
result = list(result)
result = np.array(result)
result[0:5]

In [None]:
permutations = 999
# Get length based on first variable
n = len(result)
localGs = np.zeros((n, permutations))
n_1 = n - 1
prange = list(range(permutations))
k = wq.max_neighbors + 1
nn = n - 1
rids = np.array([np.random.permutation(nn)[0:k] for i in prange])
ids = np.arange(wq.n)
ido = wq.id_order
w = [wq.weights[ido[i]] for i in ids]
wc = [wq.cardinalities[ido[i]] for i in ids]


Note: below **are** z-score standardized

In [None]:
wq.weights[0]

In [None]:
# Work-in-progress hand-rolled conditional permutation loop: for every unit,
# draw random "neighbors" from the other units and fill row i of localGs.
for i in range(wq.n):
    # All ids except the focal one, shuffled in place
    idsi = ids[ids != i]
    np.random.shuffle(idsi)
    # For each permutation keep as many random ids as unit i has neighbors,
    # then look up their standardized values
    tmp = result[idsi[rids[:, 0:wc[i]]]]
    # define zi?
    # Column 0 of `result` holds the first variable's z-score
    zi = result[i][0]
    # define zrand?
    # NOTE(review): tmp[i] selects the i-th permutation draw only, and column
    # 1 is the second variable while zi is the first — confirm this is intended.
    zrand = tmp[i][:,1]
    # Subtract and square?
    temp = (zi - zrand)**2
    # Multiply by weights?
    # NOTE(review): .sum(0) of a 1-D product looks like it yields a single
    # number that is written to every column of row i — TODO confirm.
    localGs[i] = (temp * wq.weights[i]).sum(0)
    #joins[i] = result[i] * (w[i] * tmp).sum(1)
    #localGs[i] = ((result[i,0] - tmp2)**2 * wq.weights[i][0]).sum(0)
rlocalGs = localGs

In [None]:
rlocalGs

In [None]:
np.transpose(rlocalGs)

In [None]:
sim = np.transpose(rlocalGs)
print(sim[0])
print(sim[1])

In [None]:
sim[0]

In [None]:
larger[False]

In [None]:
# Two-sided pseudo p-values.  `sim` has one row per permutation and one
# column per unit, so it is compared as a whole against the observed values;
# comparing a single row would collapse `larger` to a scalar and break the
# item assignment below.
observed = np.asarray(functest.localG)
above = sim >= observed
# Per unit: number of permutations at least as large as the observed value
larger = above.sum(0)
# Fold units whose observed value sits in the lower tail
low_extreme = (permutations - larger) < larger
larger[low_extreme] = permutations - larger[low_extreme]
p_sim = (larger + 1.0) / (permutations + 1.0)
p_sim

In [None]:
temp2