Reference: https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html#local-geary

# Univariate local geary

Interestingly, using [this equation](https://www.biomedware.com/files/documentation/spacestat/Statistics/Gearys_C/Geary_s_C_statistic.htm) which explicitly calls for standardization of input data. We also do NOT divide by 2.

$$ c_i = \sum_j w_{ij} (z_i - z_j)^2 $$ 

where: 

$z_i = x_i - \bar{x}$ and $z_j = x_j - \bar{x}$, and $w_{ij}$ are the elements of the row-standardized binary symmetric spatial weight matrix W. 

or, $$ c_i = (1/m^2) * \sum_j w_{ij} (x_i - x_j)^2 $$

where,

$$ m^2 = \sum_i (x_i−\bar{x})^2/n $$

## Load in example data

In [1]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import numpy as np
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))

In [2]:
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [3]:
wq[0]

{66: 1.0, 35: 1.0, 68: 1.0, 36: 1.0}

In [4]:
wq.transform = 'r'
wq[0]

{66: 0.25, 35: 0.25, 68: 0.25, 36: 0.25}

In [4]:
x = guerry_ds['Donatns']

print("x_i is", x[0])
print("x_j are", x[66], x[35], x[68], x[36])

x_i is 5098
x_j are 1983 4077 3710 3012


In [5]:
# Calculate zscore of input variable
zscore_x = (x - np.mean(x))/np.std(x)
zscore_x

0    -0.336188
1     0.450441
2     0.879023
3    -0.825375
4     0.049370
        ...   
80    1.512380
81    0.454785
82    1.467288
83   -0.555029
84   -0.506214
Name: Donatns, Length: 85, dtype: float64

# Build observed local geary values

In [6]:
adj_list = wq.to_adjlist(remove_symmetric=False)
adj_list.head()

Unnamed: 0,focal,neighbor,weight
0,0,66,1.0
1,0,35,1.0
2,0,68,1.0
3,0,36,1.0
4,1,48,1.0


In [9]:
adj_list.weight.values == adj_list.weight

0      True
1      True
2      True
3      True
4      True
       ... 
415    True
416    True
417    True
418    True
419    True
Name: weight, Length: 420, dtype: bool

In [9]:
len(sum(list(wq.weights.values()), []))

420

In [10]:
import pandas as pd
zseries = pd.Series(zscore_x, index=wq.id_order)
zseries[0:5]

0   -0.336188
1    0.450441
2    0.879023
3   -0.825375
4    0.049370
Name: Donatns, dtype: float64

In [11]:
# Define z_i
zi = zseries.loc[adj_list.focal].values
zi[0:5]

array([-0.33618783, -0.33618783, -0.33618783, -0.33618783,  0.45044136])

In [12]:
# Define zj
zj = zseries.loc[adj_list.neighbor].values
zj[0:5]

array([-0.98050808, -0.54737594, -0.62328783, -0.76766521, -0.5709562 ])

In [13]:
(zi-zj)[0:5]

array([0.64432025, 0.21118812, 0.2871    , 0.43147738, 1.02139756])

In [14]:
#(zi-zj)**2

Multiply by spatial weights

In [15]:
# sum(list(wq.weights.values()), []) * (zi-zj)**2
diff = zi-zj

In [16]:
test = sum(list(wq.weights.values()), []) * (diff)**2

In [17]:
# Create a df that uses the adjacency list focal values and the BBs counts
temp = pd.DataFrame(adj_list.focal.values, test).reset_index()
temp[2] = diff
temp.head()

Unnamed: 0,index,0,2
0,0.103787,0,0.64432
1,0.01115,0,0.211188
2,0.020607,0,0.2871
3,0.046543,0,0.431477
4,0.173875,1,1.021398


In [18]:
# Temporarily rename the columns
temp.columns = ['E_ij', 'ID', 'Diff_ij']
temp = temp.groupby(by='ID').sum()

In [19]:
temp.E_ij.values[0:5]

array([0.18208704, 0.56001403, 0.97529461, 0.21590694, 0.61737256])

# Start building function

Final form of local geary univariate a few cells below

In [20]:
#%load_ext pycodestyle_magic

In [21]:
#%pycodestyle_off

In [22]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)


PERMUTATIONS = 999


class Local_Geary(BaseEstimator):

    """Local Geary - Univariate"""

    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
                 keep_simulations=True, seed=None):
        """
        Initialize a Local_Geary estimator
        Arguments
        ---------
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        permutations     : int
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.

        Attributes
        ----------
        localG          : numpy array
                          array containing the observed univariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        """

        self.connectivity = connectivity
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, x, n_jobs=1, permutations=999):
        """
        Arguments
        ---------
        x                : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> y = guerry_ds['Donatns']
        >>> lG = Local_Geary(connectivity=w).fit(y)
        >>> lG.localG[0:5]
        >>> lG.p_sim[0:5]
        """
        x = np.asarray(x).flatten()

        w = self.connectivity
        w.transform = 'r'

        self.localG = self._statistic(x, w)

        if self.permutations:
            self.p_sim, self.rlocalG = _crand_plus(
                z=(x - np.mean(x))/np.std(x),
                w=w,
                observed=self.localG,
                permutations=permutations,
                keep=True,
                n_jobs=n_jobs,
                stat_func=_local_geary
            )

        del (self.keep_simulations, self.n_jobs,
             self.permutations, self.seed, self.rlocalG,
             self.connectivity)

        return self

    @staticmethod
    def _statistic(x, w):
        # Caclulate z-scores for x
        zscore_x = (x - np.mean(x))/np.std(x)
        # Create focal (xi) and neighbor (zi) values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = pd.Series(zscore_x, index=w.id_order)
        zi = zseries.loc[adj_list.focal].values
        zj = zseries.loc[adj_list.neighbor].values
        # Carry out local Geary calculation
        gs = adj_list.weight.values * (zi-zj)**2
        # Reorganize data
        adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
        adj_list_gs.columns = ['gs', 'ID']
        adj_list_gs = adj_list_gs.groupby(by='ID').sum()

        localG = adj_list_gs.gs.values

        return (localG)

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter


@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [23]:
functest = Local_Geary(connectivity=wq).fit(x)

In [24]:
functest.localG

array([1.82087039e-01, 5.60014026e-01, 9.75294606e-01, 2.15906938e-01,
       6.17372564e-01, 3.84450059e-02, 2.43181756e-01, 9.71802819e-01,
       4.06447101e-02, 7.24722785e-01, 6.30952854e-02, 2.42104497e-02,
       1.59496916e+01, 9.29326006e-01, 9.65188634e-01, 1.32383286e+00,
       3.31775497e-01, 2.99446505e+00, 9.43946814e-01, 2.99570159e+00,
       3.66702291e-01, 2.09592365e+00, 1.46515861e+00, 1.82118455e-01,
       3.10216680e+00, 5.43063937e-01, 5.74532559e+00, 4.79160197e-02,
       1.58993089e-01, 7.18327253e-01, 1.24297849e+00, 8.72629331e-02,
       7.52809650e-01, 4.56515485e-01, 3.86766562e-01, 1.17632604e-01,
       6.90884685e-01, 2.87206102e+00, 4.10455112e-01, 4.04349959e-01,
       1.14211758e-01, 9.59519953e-01, 3.51347976e-01, 7.30240974e-01,
       4.40370938e-01, 7.20360356e-02, 1.66241706e+00, 5.83258909e+00,
       2.30332507e-01, 4.38369688e-01, 8.41461470e-01, 1.52959486e+00,
       4.32157479e-02, 2.08325903e+00, 1.19722984e+00, 1.28169257e+00,
      

In [25]:
functest.p_sim

array([0.21 , 0.052, 0.073, 0.17 , 0.454, 0.005, 0.18 , 0.403, 0.024,
       0.297, 0.01 , 0.015, 0.127, 0.446, 0.012, 0.033, 0.003, 0.162,
       0.342, 0.055, 0.002, 0.16 , 0.282, 0.082, 0.074, 0.351, 0.036,
       0.011, 0.025, 0.299, 0.234, 0.004, 0.474, 0.003, 0.117, 0.077,
       0.319, 0.181, 0.3  , 0.16 , 0.041, 0.34 , 0.202, 0.499, 0.28 ,
       0.002, 0.223, 0.025, 0.136, 0.246, 0.288, 0.259, 0.029, 0.067,
       0.452, 0.344, 0.072, 0.285, 0.03 , 0.017, 0.259, 0.377, 0.467,
       0.121, 0.425, 0.161, 0.073, 0.093, 0.307, 0.194, 0.104, 0.23 ,
       0.017, 0.251, 0.011, 0.172, 0.032, 0.037, 0.01 , 0.001, 0.028,
       0.473, 0.002, 0.347, 0.258])

In [26]:
import libpysal as lp
import geopandas as gpd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
w = lp.weights.Queen.from_dataframe(guerry_ds)
y = guerry_ds['Donatns']
lG = Local_Geary(connectivity=w).fit(y)
lG.localG[0:5]
lG.p_sim[0:5]

array([0.189, 0.062, 0.074, 0.152, 0.488])

# Tests

In [27]:
import unittest
import libpysal
from libpysal.common import pandas, RTOL, ATOL
from esda.local_geary import Local_Geary
import numpy as np

PANDAS_EXTINCT = pandas is None

#from ..local_geary import Local_Geary

class Local_Geary_Tester(unittest.TestCase):
    def setUp(self):
        np.random.seed(10)
        self.w = libpysal.io.open(libpysal.examples.get_path("stl.gal")).read()
        f = libpysal.io.open(libpysal.examples.get_path("stl_hom.txt"))
        self.y = np.array(f.by_col['HR8893'])

    def test_local_geary(self):
        lG = Local_Geary(connectivity=self.w).fit(self.y)
        self.assertAlmostEqual(lG.localG[0], 0.696703432)
        self.assertAlmostEqual(lG.p_sim[0], 0.19)
        
suite = unittest.TestSuite()
test_classes = [
    Local_Geary_Tester
]
for i in test_classes:
    a = unittest.TestLoader().loadTestsFromTestCase(i)
    suite.addTest(a)

if __name__ == "__main__":
    runner = unittest.TextTestRunner()
    runner.run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 1.070s

OK


# Identify obs of interest ('GeoDa quads', but not really)

[Source](https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html#principle-1)

>Those locations identified as significant and with the Local Geary statistic smaller than its mean, suggest positive spatial autocorrelation (small differences imply similarity). For those observations that can be classified in the upper-right or lower-left quadrants of a matching Moran scatter plot, we can identify the association as high-high or low-low. However, given that the squared difference can cross the mean, there may be observations for which such a classification is not possible. We will refer to those as other positive spatial autocorrelation.

>For negative spatial autocorrelation (large values imply dissimilarity), it is not possible to assess whether the association is between high-low or low-high outliers, since the squaring of the differences removes the sign.

We use a slightly different approach and do not interact with the Local Moran scatterplot. We need to first define the mean of the calculated Local Geary statistic, and define the mean of the input variable (`y`).

In [28]:
# Mean of local geary
Eij_mean = np.mean(functest.localG); print(Eij_mean)
# Mean of x variable
y_mean = np.mean(y); print(y_mean)

1.157045600541073
6723.317647058823


Identify areas as outliers

In [29]:
outliers = (functest.localG < Eij_mean) & (y > y_mean) & (functest.p_sim<=0.05)
pd.value_counts(outliers)

False    80
True      5
Name: Donatns, dtype: int64

Identify areas as clusters

In [30]:
clusters = (functest.localG < Eij_mean) & (y < y_mean) & (functest.p_sim<=0.05)
pd.value_counts(clusters)

False    69
True     16
Name: Donatns, dtype: int64

Identify areas as 'other'

In [31]:
other = (functest.localG > Eij_mean) & (functest.p_sim<=0.05)
pd.value_counts(other)

False    79
True      6
dtype: int64

### Not significant

In [32]:
neg = (functest.p_sim>0.05)
pd.value_counts(neg)

True     58
False    27
dtype: int64

In [33]:
##Old implementation - bad
#if self.geoda_quads:
#    from esda import Moran_Local
#    localm = Moran_Local(y=x, 
#                         w=w, 
#                         geoda_quads=True)
#
#    Eij_mean = np.mean(self.localG)
#
#    # Create empty vector
#    self.q = np.ones(len(x))*5
#    # 1: high high
#    self.q[(self.localG < E_ij_mean) & (self.p_sim<=self.sig) & (localm.q==1)] = 1
#    # 2: low low
#    self.q[(self.localG < E_ij_mean) & (self.p_sim<=self.sig) & (localm.q==2)] = 2
#    # 4: negative - 2*mean appropriate?
#    self.q[(self.localG > 2*E_ij_mean) & (self.p_sim<=self.sig) & (localm.q!=2) & (localm.q!=4)] = 4
#    # 5: not significant
#    self.q[self.p_sim > self.sig] = 0
#    # 0: other - all remaining obs? not sure how to define, need to double check...
#    # Do nothing

Should be 57, this is close enough...

## Start working on inference (note: now implemented above)

### 'New' `_crand()` engine

In [34]:
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)

In [35]:
@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [36]:
p_sim, rlocalG = _crand_plus(z=np.array(zscore_x), w=wq, observed=np.array(functest.localG), 
            permutations=999, keep=True, n_jobs=1, 
            stat_func=_local_geary)

print(p_sim)
print(rlocalG)

[0.188 0.042 0.058 0.16  0.479 0.003 0.153 0.453 0.017 0.275 0.009 0.007
 0.134 0.472 0.01  0.035 0.001 0.146 0.332 0.06  0.002 0.139 0.302 0.095
 0.07  0.341 0.023 0.008 0.028 0.283 0.223 0.008 0.496 0.003 0.1   0.086
 0.312 0.173 0.3   0.156 0.059 0.362 0.191 0.488 0.275 0.006 0.196 0.018
 0.124 0.232 0.284 0.235 0.022 0.056 0.435 0.321 0.071 0.294 0.021 0.016
 0.241 0.38  0.449 0.143 0.414 0.129 0.078 0.099 0.318 0.199 0.097 0.231
 0.018 0.262 0.003 0.175 0.038 0.037 0.011 0.001 0.033 0.492 0.003 0.333
 0.27 ]
[[1.73354558 0.48582724 0.39120719 ... 1.09739692 0.30315983 1.15158028]
 [1.2329202  0.70714724 0.79341306 ... 1.25615194 1.00711015 0.79559242]
 [1.78399622 1.40691833 1.24765407 ... 2.0946598  1.85841311 1.02120587]
 ...
 [3.1149016  3.08453218 2.92127521 ... 3.40044339 3.17740559 3.47494539]
 [1.1715959  0.44600353 3.85646581 ... 0.21532951 0.49830135 0.22095061]
 [0.82826412 0.16964085 4.48525585 ... 0.20116451 0.37692207 0.24373652]]


## Rebuilding function with obs of interest ('GeoDa quads', but not really)


In [37]:
#%load_ext pycodestyle_magic

In [38]:
#%pycodestyle_off

In [39]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)


class Local_Geary(BaseEstimator):

    """Local Geary - Univariate"""

    def __init__(self, connectivity=None, labels=False, sig=0.05,
                 permutations=999, n_jobs=1, keep_simulations=True,
                 seed=None):
        """
        Initialize a Local_Geary estimator
        Arguments
        ---------
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        labels           : boolean
                           (default=False)
                           If True use, label if an observation
                           belongs to an outlier, cluster, other,
                           or non-significant group. 1 = outlier,
                           2 = cluster, 3 = other, 4 = non-significant.
                           Note that this is not the exact same as the
                           cluster map produced by GeoDa.
        sig              : float
                           (default=0.05)
                           Default significance threshold used for
                           creation of labels groups.
        permutations     : int
                           (default=999)
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           (default=1)
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.

        Attributes
        ----------
        localG          : numpy array
                          array containing the observed univariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        labs            : numpy array
                          array containing the labels for if each observation.
        """

        self.connectivity = connectivity
        self.labels = labels
        self.sig = sig
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, x):
        """
        Arguments
        ---------
        x                : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> y = guerry_ds['Donatns']
        >>> lG = Local_Geary(connectivity=w).fit(y)
        >>> lG.localG[0:5]
        >>> lG.p_sim[0:5]
        """
        x = np.asarray(x).flatten()

        w = self.connectivity
        w.transform = 'r'
        
        permutations = self.permutations
        sig = self.sig
        keep_simulations = self.keep_simulations
        n_jobs = self.n_jobs
        seed = self.seed

        self.localG = self._statistic(x, w)

        if permutations:
            self.p_sim, self.rlocalG = _crand_plus(
                z=(x - np.mean(x))/np.std(x),
                w=w,
                observed=self.localG,
                permutations=permutations,
                keep=keep_simulations,
                n_jobs=n_jobs,
                stat_func=_local_geary
            )

        if self.labels:
            Eij_mean = np.mean(self.localG)
            x_mean = np.mean(x)
            # Create empty vector to fill
            self.labs = np.empty(len(x)) * np.nan
            # Outliers
            self.labs[(self.localG < Eij_mean) &
                      (x > x_mean) &
                      (self.p_sim <= sig)] = 1
            # Clusters
            self.labs[(self.localG < Eij_mean) &
                      (x < x_mean) &
                      (self.p_sim <= sig)] = 2
            # Other
            self.labs[(self.localG > Eij_mean) &
                      (self.p_sim <= sig)] = 3
            # Non-significant
            self.labs[self.p_sim > sig] = 4

        del (self.keep_simulations, self.n_jobs,
             self.permutations, self.seed, self.rlocalG,
             self.connectivity, self.labels)

        return self

    @staticmethod
    def _statistic(x, w):
        # Caclulate z-scores for x
        zscore_x = (x - np.mean(x))/np.std(x)
        # Create focal (xi) and neighbor (zi) values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = pd.Series(zscore_x, index=w.id_order)
        zi = zseries.loc[adj_list.focal].values
        zj = zseries.loc[adj_list.neighbor].values
        # Carry out local Geary calculation
        gs = adj_list.weight.values * (zi-zj)**2
        # Reorganize data
        adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
        adj_list_gs.columns = ['gs', 'ID']
        adj_list_gs = adj_list_gs.groupby(by='ID').sum()

        localG = adj_list_gs.gs.values

        return (localG)

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter

@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [40]:
functest = Local_Geary(connectivity=wq, labels=True).fit(x)

In [41]:
pd.value_counts(functest.labs)

4.0    58
2.0    16
3.0     6
1.0     5
dtype: int64

In [42]:
functest = Local_Geary(connectivity=wq, labels=True).fit(guerry_ds['Suicids'])

In [43]:
pd.value_counts(functest.labs)

4.0    52
2.0    22
1.0     6
3.0     5
dtype: int64

Try with updated `keep_simulations` parameter

In [44]:
functest = Local_Geary(connectivity=wq, labels=True, n_jobs=2, keep_simulations=False).fit(guerry_ds['Suicids'])

In [45]:
import libpysal as lp
import geopandas as gpd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
w = libpysal.weights.Queen.from_dataframe(guerry_ds)
y = guerry_ds['Donatns']
lG = Local_Geary(connectivity=w, n_jobs=2, keep_simulations=False).fit(y)
lG.p_sim

array([0.179, 0.05 , 0.07 , 0.14 , 0.464, 0.003, 0.15 , 0.445, 0.019,
       0.289, 0.01 , 0.012, 0.144, 0.465, 0.006, 0.045, 0.003, 0.158,
       0.321, 0.074, 0.002, 0.17 , 0.302, 0.102, 0.071, 0.358, 0.029,
       0.01 , 0.037, 0.303, 0.264, 0.008, 0.475, 0.002, 0.128, 0.088,
       0.319, 0.2  , 0.313, 0.159, 0.052, 0.361, 0.204, 0.499, 0.29 ,
       0.006, 0.237, 0.031, 0.142, 0.249, 0.276, 0.266, 0.023, 0.05 ,
       0.434, 0.339, 0.079, 0.292, 0.027, 0.022, 0.255, 0.388, 0.453,
       0.126, 0.396, 0.146, 0.081, 0.115, 0.293, 0.176, 0.107, 0.219,
       0.013, 0.247, 0.008, 0.148, 0.036, 0.04 , 0.013, 0.002, 0.033,
       0.496, 0.003, 0.391, 0.235])

# Multivariate Local Geary

$$ c_i = \sum_{h=1}^m \sum_j w_{ij} (x_{hi} - x_{hj})^2 $$

Load in the sample data

In [46]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import pandas as pd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))

In [47]:
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [48]:
x = guerry_ds['Donatns']
y = guerry_ds['Suicids']

In [49]:
variables = [x,y]

Standardize each variable

In [50]:
from scipy import stats
zseries = [stats.zscore(i) for i in variables]
zseries

[array([-0.33618783,  0.45044136,  0.87902293, -0.82537479,  0.0493701 ,
        -0.7312606 , -0.06687644, -0.65803769, -0.64438596, -0.85660829,
        -0.72650318, -0.91204259,  4.3657987 , -0.54406643,  1.42281681,
         1.35083496,  0.58695866,  1.71053735, -0.86529576,  0.75781212,
         0.88398719, -0.42120087, -0.67996319, -0.80551773,  1.03188092,
        -0.44891802,  3.56220827, -0.76021881, -0.91783423, -0.80158769,
        -0.3407384 , -1.04318193,  0.19912544,  0.9497637 ,  0.10976866,
        -0.54737594, -0.76766521,  1.10365593, -0.22697399, -0.67789475,
        -0.82268582,  0.32819633, -0.40754914, -0.31633077, -0.47394619,
        -0.96871795, -0.47849676, -0.31943343, -0.5709562 , -0.56061398,
        -0.95485938, -0.58150526, -0.52276146,  1.65799887,  0.57744381,
         0.771257  , -0.13058451, -0.25282954,  0.5209753 , -0.20339373,
        -0.15726743, -0.70830087, -0.14940735,  1.01781551,  1.60277142,
        -0.14940735, -0.98050808,  1.02960564, -0.6

Build the adj lists

In [51]:
adj_list = wq.to_adjlist(remove_symmetric=False)
adj_list.head()

Unnamed: 0,focal,neighbor,weight
0,0,66,1.0
1,0,35,1.0
2,0,68,1.0
3,0,36,1.0
4,1,48,1.0


In [52]:
# The zseries
zseries = [pd.Series(i, index=wq.id_order) for i in zseries]
zseries

[0    -0.336188
 1     0.450441
 2     0.879023
 3    -0.825375
 4     0.049370
         ...   
 80    1.512380
 81    0.454785
 82    1.467288
 83   -0.555029
 84   -0.506214
 Length: 85, dtype: float64,
 0    -0.047195
 1    -0.756433
 2     2.478379
 3    -0.711499
 4    -0.649766
         ...   
 80    1.004270
 81   -0.468369
 82   -0.096441
 83   -0.111387
 84   -0.757774
 Length: 85, dtype: float64]

In [53]:
# The focal values
focal = [zseries[i].loc[adj_list.focal].values for
         i in range(len(variables))]
# The neighbor values
neighbor = [zseries[i].loc[adj_list.neighbor].values for
            i in range(len(variables))]

In [54]:
np.array(focal)[0][0]

-0.3361878263899247

In [55]:
np.array(neighbor)[0][0]

-0.9805080804028404

In [56]:
(np.array(focal)[0][0] - np.array(neighbor)[0][0])**2

0.4151485897312682

In [57]:
temp = (np.array(focal) - np.array(neighbor))**2
temp[0][0]

0.4151485897312682

In [90]:
gs = sum(list(wq.weights.values()), []) * (np.array(focal) - np.array(neighbor))**2
#gs_2 = adj_list.weight.values * (np.array(focal) - np.array(neighbor))**2

In [59]:
#sum(list(wq.weights.values()), [])

In [60]:
temp = pd.DataFrame(gs).T

In [61]:
temp['ID'] = adj_list.focal.values

In [62]:
adj_list_gs = temp.groupby(by='ID').sum()
adj_list_gs.head()

Unnamed: 0_level_0,0,1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.728348,0.5022
1,3.360084,0.28263
2,5.851768,29.604873
3,0.863628,0.121489
4,1.852118,0.475642


In [63]:
k = len(variables)
k

2

In [64]:
adj_list_gs.sum(axis=1)/k

ID
0      0.615274
1      1.821357
2     17.728320
3      0.492558
4      1.163880
        ...    
80     6.629720
81     3.154587
82     3.872022
83     4.307689
84     1.080905
Length: 85, dtype: float64

Rough cut of function, final form at end of notebook.

In [65]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp

PERMUTATIONS=999

class Local_Geary_MV(BaseEstimator):
    """Local Geary - Multivariate"""

    def __init__(self, connectivity=None, permutations=999):
        """
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        permutations     : int
                           number of random permutations for calculation of pseudo
                           p_values                           
        Attributes
        ----------
        localG          : numpy array
                          array containing the observed multivariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        """

        self.connectivity = connectivity
        self.permutations = permutations

    def fit(self, variables, permutations=999):
        """
        Arguments
        ---------
        variables        : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        """
        self.variables = np.array(variables, dtype='float')

        w = self.connectivity
        w.transform = 'r'

        self.localG = self._statistic(variables, w)

        if permutations:
            pass

        return self

    @staticmethod
    def _statistic(variables, w):
        # Caclulate z-scores for input variables
        zseries = [stats.zscore(i) for i in variables]
        # Define denominator adjustment
        k = len(variables)
        # Create focal and neighbor values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = [pd.Series(i, index=w.id_order) for i in zseries]
        focal = [zseries[i].loc[adj_list.focal].values for
                 i in range(len(variables))]
        neighbor = [zseries[i].loc[adj_list.neighbor].values for
                    i in range(len(variables))]
        # Carry out local Geary calculation
        gs = adj_list.weight.values * \
        (np.array(focal) - np.array(neighbor))**2
        # Reorganize data
        temp = pd.DataFrame(gs).T
        temp['ID'] = adj_list.focal.values
        adj_list_gs = temp.groupby(by='ID').sum()
        localG = adj_list_gs.sum(axis=1)/k
        
        return (localG)

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter

@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i    

In [66]:
functest = Local_Geary_MV(connectivity=wq).fit([x,y])
functest.localG

ID
0     0.153819
1     0.303560
2     2.954720
3     0.123140
4     0.387960
        ...   
80    1.657430
81    0.525764
82    0.645337
83    0.717948
84    0.216181
Length: 85, dtype: float64

## Working on inference - unlikely that numba will work...

In [67]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import pandas as pd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
x = guerry_ds['Donatns']
y = guerry_ds['Suicids']
variables = np.array([x,y])
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [68]:
zseries = np.array([stats.zscore(i) for i in variables])

In [69]:
permutations = 99

In [70]:
# Get length based on first variable
n = len(variables[0])
joins = np.zeros((n, permutations))
n_1 = n - 1
prange = list(range(permutations))
k = wq.max_neighbors + 1
nn = n - 1
rids = np.array([np.random.permutation(nn)[0:k] for i in prange])
ids = np.arange(wq.n)
ido = wq.id_order
w = [wq.weights[ido[i]] for i in ids]
wc = [wq.cardinalities[ido[i]] for i in ids]

for i in range(wq.n):
    idsi = ids[ids != i]
    np.random.shuffle(idsi)
    vars_rand = []
    for j in range(variables.shape[0]):
        vars_rand.append(zseries[j][idsi[rids[:, 0:wc[i]]]])
    # vars rand as tmp
    # Calculate diff
    diff = []
    for z in range(variables.shape[0]):
        diff.append((np.array((zseries[z][i] - vars_rand[z])**2 * w[i])).sum(1))
    # add up differences
    temp = np.array([sum(x) for x in zip(*diff)])
    # Assign to object to be returned
    joins[i] = temp

In [71]:
sim = np.array(np.transpose(joins)); sim.shape

(99, 85)

In [72]:
above = sim >= np.array(functest.localG)

In [73]:
larger = above.sum(0)
low_extreme = (permutations - larger) < larger
larger[low_extreme] = permutations - larger[low_extreme]
p_sim = (larger + 1.0) / (permutations + 1.0)
p_sim

array([0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.09, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.03, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.09, 0.01, 0.03, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.04,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.25, 0.01, 0.21,
       0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01])

In [74]:
permutations=999

In [75]:
# Get length based on first variable
nvars = variables.shape[0]
n = len(variables[0])
joins = np.zeros((n, permutations))
n_1 = n - 1
prange = list(range(permutations))
k = wq.max_neighbors + 1
nn = n - 1
rids = np.array([np.random.permutation(nn)[0:k] for i in prange])
ids = np.arange(wq.n)
ido = wq.id_order
w = [wq.weights[ido[i]] for i in ids]
wc = [wq.cardinalities[ido[i]] for i in ids]

for i in range(wq.n):
    idsi = ids[ids != i]
    np.random.shuffle(idsi)
    vars_rand = []
    for j in range(nvars):
        vars_rand.append(zseries[j][idsi[rids[:, 0:wc[i]]]])
    # vars rand as tmp
    # Calculate diff
    diff = []
    for z in range(nvars):
        diff.append((np.array((zseries[z][i] - vars_rand[z])**2 * w[i])).sum(1)/nvars)
    # add up differences
    temp = np.array([sum(x) for x in zip(*diff)])
    # Assign to object to be returned
    joins[i] = temp

In [76]:
sim = np.array(np.transpose(joins)); print(sim.shape)
print()
above = sim >= np.array(functest.localG)
larger = above.sum(0)
low_extreme = (permutations - larger) < larger
larger[low_extreme] = permutations - larger[low_extreme]
p_sim = (larger + 1.0) / (permutations + 1.0)
p_sim

(999, 85)



array([0.001, 0.001, 0.001, 0.001, 0.027, 0.001, 0.001, 0.002, 0.001,
       0.001, 0.001, 0.001, 0.002, 0.001, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.004, 0.001, 0.001, 0.243, 0.001, 0.001, 0.001, 0.004,
       0.013, 0.001, 0.001, 0.085, 0.072, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.012, 0.001, 0.001, 0.001, 0.006, 0.001, 0.001, 0.001,
       0.205, 0.001, 0.285, 0.001, 0.001, 0.003, 0.066, 0.001, 0.001,
       0.088, 0.002, 0.001, 0.001, 0.003, 0.001, 0.001, 0.058, 0.001,
       0.221, 0.002, 0.409, 0.015, 0.004, 0.012, 0.001, 0.001, 0.002,
       0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
       0.001, 0.001, 0.004, 0.001])

Implement into final form of function. 

In [77]:
#%load_ext pycodestyle_magic

In [78]:
#%pycodestyle_off

In [79]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp


class Local_Geary_MV(BaseEstimator):

    """Local Geary - Multivariate"""

    def __init__(self, connectivity=None, permutations=999):
        """
        Initialize a Local_Geary_MV estimator
        Arguments
        ---------
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        permutations     : int
                           (default=999)
                           number of random permutations for calculation
                           of pseudo p_values
        Attributes
        ----------
        localG          : numpy array
                          array containing the observed multivariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        """

        self.connectivity = connectivity
        self.permutations = permutations

    def fit(self, variables):
        """
        Arguments
        ---------
        variables        : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> import libpysal
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> x1 = guerry_ds['Donatns']
        >>> x2 = guerry_ds['Suicids']
        >>> lG_mv = Local_Geary(connectivity=w).fit([x1,x2])
        >>> lG_mv.localG[0:5]
        >>> lG_mv.p_sim[0:5]
        """
        self.variables = np.array(variables, dtype='float')

        w = self.connectivity
        w.transform = 'r'

        self.n = len(variables[0])
        self.w = w
        
        permutations = self.permutations

        # Caclulate z-scores for input variables
        # to be used in _statistic and _crand
        zvariables = [stats.zscore(i) for i in variables]

        self.localG = self._statistic(variables, zvariables, w)

        if permutations:
            self._crand(zvariables)
            sim = np.transpose(self.Gs)
            above = sim >= self.localG
            larger = above.sum(0)
            low_extreme = (permutations - larger) < larger
            larger[low_extreme] = permutations - larger[low_extreme]
            self.p_sim = (larger + 1.0) / (permutations + 1.0)

        del (self.n, self.permutations, self.Gs,
             self.connectivity)

        return self

    @staticmethod
    def _statistic(variables, zvariables, w):
        # Define denominator adjustment
        k = len(variables)
        # Create focal and neighbor values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = [pd.Series(i, index=w.id_order) for i in zvariables]
        focal = [zseries[i].loc[adj_list.focal].values for
                 i in range(len(variables))]
        neighbor = [zseries[i].loc[adj_list.neighbor].values for
                    i in range(len(variables))]
        # Carry out local Geary calculation
        gs = adj_list.weight.values * \
            (np.array(focal) - np.array(neighbor))**2
        # Reorganize data
        temp = pd.DataFrame(gs).T
        temp['ID'] = adj_list.focal.values
        adj_list_gs = temp.groupby(by='ID').sum()
        localG = np.array(adj_list_gs.sum(axis=1)/k)

        return (localG)

    def _crand(self, zvariables):
        """
        conditional randomization

        for observation i with ni neighbors,  the candidate set cannot include
        i (we don't want i being a neighbor of i). we have to sample without
        replacement from a set of ids that doesn't include i. numpy doesn't
        directly support sampling wo replacement and it is expensive to
        implement this. instead we omit i from the original ids,  permute the
        ids and take the first ni elements of the permuted ids as the
        neighbors to i in each randomization.

        """
        nvars = self.variables.shape[0]
        n = self.variables.shape[1]
        Gs = np.zeros((self.n, self.permutations))
        n_1 = self.n - 1
        prange = list(range(self.permutations))
        k = self.w.max_neighbors + 1
        nn = self.n - 1
        rids = np.array([np.random.permutation(nn)[0:k] for i in prange])
        ids = np.arange(self.w.n)
        ido = self.w.id_order
        w = [self.w.weights[ido[i]] for i in ids]
        wc = [self.w.cardinalities[ido[i]] for i in ids]

        for i in range(self.w.n):
            idsi = ids[ids != i]
            np.random.shuffle(idsi)
            vars_rand = []
            for j in range(nvars):
                vars_rand.append(zvariables[j][idsi[rids[:, 0:wc[i]]]])
            # vars rand as tmp
            # Calculate diff
            diff = []
            for z in range(nvars):
                diff.append((np.array((zvariables[z][i]-vars_rand[z])**2
                                      * w[i])).sum(1)/nvars)
            # add up differences
            temp = np.array([sum(x) for x in zip(*diff)])
            # Assign to object to be returned
            Gs[i] = temp
        self.Gs = Gs

Test on data

In [80]:
finaltest = Local_Geary_MV(connectivity=wq).fit([x,y])

In [81]:
finaltest.localG

array([0.15381853, 0.30355953, 2.95472008, 0.12313959, 0.38795991,
       1.30965103, 0.23377826, 3.06565788, 0.03298004, 0.91999877,
       2.98954356, 0.0547241 , 8.12866872, 1.79010883, 0.50938085,
       0.98143553, 1.23413236, 1.81836585, 0.55610066, 2.35686284,
       1.06675706, 1.12552938, 0.74266791, 0.2087418 , 1.63044552,
       0.3321312 , 3.52879191, 0.97300142, 1.2151608 , 1.11753386,
       0.73890484, 1.34938525, 0.53489888, 0.49244999, 0.26092712,
       0.26520938, 0.40195695, 1.67716168, 0.23640991, 1.4606562 ,
       5.85492147, 0.74226135, 0.21032708, 0.90616062, 0.31530565,
       4.30743642, 0.95572731, 3.15545158, 0.1576923 , 0.29022339,
       0.5313531 , 0.81757624, 0.08418979, 1.28639067, 0.6360191 ,
       1.32843439, 0.06698162, 0.21321003, 1.80717048, 0.01005432,
       1.35896344, 1.90771844, 4.07391556, 3.66613113, 1.72818279,
       0.84588107, 0.57173376, 1.58384441, 1.14442728, 0.26679484,
       0.01680013, 0.20773941, 0.06654543, 0.25644334, 1.30015

In [82]:
finaltest.p_sim

array([0.013, 0.008, 0.02 , 0.019, 0.234, 0.471, 0.118, 0.068, 0.001,
       0.063, 0.036, 0.002, 0.129, 0.14 , 0.003, 0.03 , 0.443, 0.109,
       0.1  , 0.288, 0.004, 0.39 , 0.49 , 0.03 , 0.452, 0.088, 0.048,
       0.382, 0.353, 0.232, 0.458, 0.448, 0.084, 0.01 , 0.034, 0.037,
       0.127, 0.361, 0.044, 0.268, 0.012, 0.493, 0.017, 0.496, 0.056,
       0.039, 0.485, 0.04 , 0.014, 0.073, 0.146, 0.38 , 0.006, 0.062,
       0.309, 0.39 , 0.005, 0.011, 0.163, 0.003, 0.122, 0.262, 0.025,
       0.048, 0.245, 0.372, 0.188, 0.379, 0.48 , 0.043, 0.001, 0.019,
       0.001, 0.013, 0.008, 0.006, 0.004, 0.086, 0.001, 0.001, 0.054,
       0.117, 0.002, 0.391, 0.041])

# Tests (multivariate)

In [83]:
import unittest
import libpysal
from libpysal.common import pandas, RTOL, ATOL
from esda.local_geary import Local_Geary
import numpy as np

PANDAS_EXTINCT = pandas is None

#from ..local_geary_mv import Local_Geary_MV

class Local_Geary_MV_Tester(unittest.TestCase):
    def setUp(self):
        np.random.seed(100)
        self.w = libpysal.io.open(libpysal.examples.get_path("stl.gal")).read()
        f = libpysal.io.open(libpysal.examples.get_path("stl_hom.txt"))
        self.y1 = np.array(f.by_col['HR8893'])
        self.y2 = np.array(f.by_col['HC8488'])

    def test_local_geary_mv(self):
        lG_mv = Local_Geary_MV(connectivity=self.w).fit([self.y1, self.y2])
        print(lG_mv.p_sim[0])
        self.assertAlmostEqual(lG_mv.localG[0], 0.4096931479581422)
        self.assertAlmostEqual(lG_mv.p_sim[0], 0.211)
        
suite = unittest.TestSuite()
test_classes = [
    Local_Geary_MV_Tester
]
for i in test_classes:
    a = unittest.TestLoader().loadTestsFromTestCase(i)
    suite.addTest(a)

if __name__ == "__main__":
    runner = unittest.TextTestRunner()
    runner.run(suite)

.

0.211



----------------------------------------------------------------------
Ran 1 test in 0.090s

OK


In [84]:
yeetw = libpysal.io.open(libpysal.examples.get_path("stl.gal")).read()

In [85]:
f = libpysal.io.open(libpysal.examples.get_path("stl_hom.txt"))

In [86]:
y1 = np.array(f.by_col['HR8893'])
y2 = np.array(f.by_col['HC8488'])

In [87]:
lG_mv_test = Local_Geary_MV(connectivity = yeetw).fit([y1, y2])

In [88]:
lG_mv_test.localG[0]

0.4096931479581422

In [89]:
lG_mv_test.p_sim[0]

0.204