Reference: https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html#local-geary

# Univariate local geary

Interestingly, using [this equation](https://www.biomedware.com/files/documentation/spacestat/Statistics/Gearys_C/Geary_s_C_statistic.htm) which explicitly calls for standardization of input data. We also do NOT divide by 2.

$$ c_i = \sum_j w_{ij} (z_i - z_j)^2 $$ 

where: 

$z_i = x_i - \bar{x}$ and $z_j = x_j - \bar{x}$, and $w_{ij}$ are the elements of the row-standardized binary symmetric spatial weight matrix W. 

or, $$ c_i = (1/m^2) * \sum_j w_{ij} (x_i - x_j)^2 $$

where,

$$ m^2 = \sum_i (x_i−\bar{x})^2/n $$

## Load in example data

In [1]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import numpy as np
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))

In [2]:
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [3]:
wq[0]

{66: 1.0, 35: 1.0, 68: 1.0, 36: 1.0}

In [4]:
wq.transform = 'r'
wq[0]

{66: 0.25, 35: 0.25, 68: 0.25, 36: 0.25}

In [5]:
x = guerry_ds['Donatns']

print("x_i is", x[0])
print("x_j are", x[66], x[35], x[68], x[36])

x_i is 5098
x_j are 1983 4077 3710 3012


In [6]:
# Calculate zscore of input variable
zscore_x = (x - np.mean(x))/np.std(x)
zscore_x

0    -0.336188
1     0.450441
2     0.879023
3    -0.825375
4     0.049370
        ...   
80    1.512380
81    0.454785
82    1.467288
83   -0.555029
84   -0.506214
Name: Donatns, Length: 85, dtype: float64

# Build observed local geary values

In [7]:
adj_list = wq.to_adjlist(remove_symmetric=False)
adj_list.head()

Unnamed: 0,focal,neighbor,weight
0,0,66,0.25
1,0,35,0.25
2,0,68,0.25
3,0,36,0.25
4,1,48,0.166667


In [8]:
import pandas as pd
zseries = pd.Series(zscore_x, index=wq.id_order)
zseries[0:5]

0   -0.336188
1    0.450441
2    0.879023
3   -0.825375
4    0.049370
Name: Donatns, dtype: float64

In [9]:
# Define z_i
zi = zseries.loc[adj_list.focal].values
zi[0:5]

array([-0.33618783, -0.33618783, -0.33618783, -0.33618783,  0.45044136])

In [10]:
# Define zj
zj = zseries.loc[adj_list.neighbor].values
zj[0:5]

array([-0.98050808, -0.54737594, -0.62328783, -0.76766521, -0.5709562 ])

In [11]:
(zi-zj)[0:5]

array([0.64432025, 0.21118812, 0.2871    , 0.43147738, 1.02139756])

In [12]:
#(zi-zj)**2

Multiply by spatial weights

In [13]:
# sum(list(wq.weights.values()), []) * (zi-zj)**2
diff = zi-zj

In [14]:
test = sum(list(wq.weights.values()), []) * (diff)**2

In [15]:
# Create a df that uses the adjacency list focal values and the BBs counts
temp = pd.DataFrame(adj_list.focal.values, test).reset_index()
temp[2] = diff
temp.head()

Unnamed: 0,index,0,2
0,0.103787,0,0.64432
1,0.01115,0,0.211188
2,0.020607,0,0.2871
3,0.046543,0,0.431477
4,0.173875,1,1.021398


In [16]:
# Temporarily rename the columns
temp.columns = ['E_ij', 'ID', 'Diff_ij']
temp = temp.groupby(by='ID').sum()

In [17]:
temp.E_ij.values[0:5]

array([0.18208704, 0.56001403, 0.97529461, 0.21590694, 0.61737256])

# Start building function

In [18]:
#%load_ext pycodestyle_magic

In [95]:
#%pycodestyle_off

In [18]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)


PERMUTATIONS = 999


class Local_Geary(BaseEstimator):
    """Local Geary - Univariate"""

    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1,
                 keep_simulations=True, seed=None):
        """
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        permutations     : int
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.

        Attributes
        ----------
        localG          : numpy array
                          array containing the observed univariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        """

        self.connectivity = connectivity
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, x, n_jobs=1, permutations=999):
        """
        Arguments
        ---------
        x                : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> y = guerry_ds['Donatns']
        >>> lG = Local_Geary(connectivity=w).fit(y)
        >>> lG.localG[0:5]
        >>> lG.p_sim[0:5]
        """
        x = np.asarray(x).flatten()

        w = self.connectivity
        w.transform = 'r'

        self.localG = self._statistic(x, w)

        if self.permutations:
            self.p_sim, self.rlocalG = _crand_plus(
                z=(x - np.mean(x))/np.std(x),
                w=w,
                observed=self.localG,
                permutations=permutations,
                keep=True,
                n_jobs=n_jobs,
                stat_func=_local_geary
            )

        del (self.keep_simulations, self.n_jobs,
             self.permutations, self.seed, self.rlocalG,
             self.connectivity)

        return self

    @staticmethod
    def _statistic(x, w):
        # Caclulate z-scores for x
        zscore_x = (x - np.mean(x))/np.std(x)
        # Create focal (xi) and neighbor (zi) values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = pd.Series(zscore_x, index=w.id_order)
        zi = zseries.loc[adj_list.focal].values
        zj = zseries.loc[adj_list.neighbor].values
        # Carry out local Geary calculation
        gs = sum(list(w.weights.values()), []) * (zi-zj)**2
        # Reorganize data
        adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
        adj_list_gs.columns = ['gs', 'ID']
        adj_list_gs = adj_list_gs.groupby(by='ID').sum()

        localG = adj_list_gs.gs.values

        return (localG)

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter


@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [19]:
functest = Local_Geary(connectivity=wq).fit(x)

In [20]:
functest.localG

array([1.82087039e-01, 5.60014026e-01, 9.75294606e-01, 2.15906938e-01,
       6.17372564e-01, 3.84450059e-02, 2.43181756e-01, 9.71802819e-01,
       4.06447101e-02, 7.24722785e-01, 6.30952854e-02, 2.42104497e-02,
       1.59496916e+01, 9.29326006e-01, 9.65188634e-01, 1.32383286e+00,
       3.31775497e-01, 2.99446505e+00, 9.43946814e-01, 2.99570159e+00,
       3.66702291e-01, 2.09592365e+00, 1.46515861e+00, 1.82118455e-01,
       3.10216680e+00, 5.43063937e-01, 5.74532559e+00, 4.79160197e-02,
       1.58993089e-01, 7.18327253e-01, 1.24297849e+00, 8.72629331e-02,
       7.52809650e-01, 4.56515485e-01, 3.86766562e-01, 1.17632604e-01,
       6.90884685e-01, 2.87206102e+00, 4.10455112e-01, 4.04349959e-01,
       1.14211758e-01, 9.59519953e-01, 3.51347976e-01, 7.30240974e-01,
       4.40370938e-01, 7.20360356e-02, 1.66241706e+00, 5.83258909e+00,
       2.30332507e-01, 4.38369688e-01, 8.41461470e-01, 1.52959486e+00,
       4.32157479e-02, 2.08325903e+00, 1.19722984e+00, 1.28169257e+00,
      

In [21]:
functest.p_sim

array([0.179, 0.056, 0.052, 0.145, 0.49 , 0.003, 0.144, 0.458, 0.021,
       0.291, 0.008, 0.011, 0.137, 0.471, 0.012, 0.032, 0.004, 0.14 ,
       0.307, 0.066, 0.001, 0.17 , 0.309, 0.09 , 0.086, 0.335, 0.025,
       0.007, 0.019, 0.283, 0.25 , 0.007, 0.487, 0.001, 0.136, 0.08 ,
       0.319, 0.19 , 0.292, 0.146, 0.043, 0.37 , 0.194, 0.492, 0.268,
       0.004, 0.246, 0.026, 0.129, 0.234, 0.279, 0.255, 0.019, 0.059,
       0.45 , 0.32 , 0.072, 0.275, 0.04 , 0.015, 0.259, 0.403, 0.444,
       0.135, 0.385, 0.151, 0.07 , 0.116, 0.291, 0.191, 0.108, 0.214,
       0.02 , 0.255, 0.004, 0.169, 0.041, 0.041, 0.014, 0.001, 0.037,
       0.495, 0.001, 0.34 , 0.27 ])

In [22]:
import libpysal as lp
import geopandas as gpd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
w = lp.weights.Queen.from_dataframe(guerry_ds)
y = guerry_ds['Donatns']
lG = Local_Geary(connectivity=w).fit(y)
lG.localG[0:5]
lG.p_sim[0:5]

array([0.194, 0.053, 0.061, 0.164, 0.437])

# Tests

In [72]:
import unittest
import libpysal
from libpysal.common import pandas, RTOL, ATOL
from esda.local_geary import Local_Geary
import numpy as np

PANDAS_EXTINCT = pandas is None

#from ..local_geary import Local_Geary

class Local_Geary_Tester(unittest.TestCase):
    def setUp(self):
        np.random.seed(10)
        self.w = libpysal.io.open(libpysal.examples.get_path("stl.gal")).read()
        f = libpysal.io.open(libpysal.examples.get_path("stl_hom.txt"))
        self.y = np.array(f.by_col['HR8893'])

    def test_local_geary(self):
        lG = Local_Geary(connectivity=self.w).fit(self.y)
        self.assertAlmostEqual(lG.localG[0], 0.696703432)
        self.assertAlmostEqual(lG.p_sim[0], 0.19)
        
suite = unittest.TestSuite()
test_classes = [
    Local_Geary_Tester
]
for i in test_classes:
    a = unittest.TestLoader().loadTestsFromTestCase(i)
    suite.addTest(a)

if __name__ == "__main__":
    runner = unittest.TextTestRunner()
    runner.run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 0.033s

OK


# Identify obs of interest ('GeoDa quads', but not really)

[Source](https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html#principle-1)

>Those locations identified as significant and with the Local Geary statistic smaller than its mean, suggest positive spatial autocorrelation (small differences imply similarity). For those observations that can be classified in the upper-right or lower-left quadrants of a matching Moran scatter plot, we can identify the association as high-high or low-low. However, given that the squared difference can cross the mean, there may be observations for which such a classification is not possible. We will refer to those as other positive spatial autocorrelation.

>For negative spatial autocorrelation (large values imply dissimilarity), it is not possible to assess whether the association is between high-low or low-high outliers, since the squaring of the differences removes the sign.

We use a slightly different approach and do not interact with the Local Moran scatterplot. We need to first define the mean of the calculated Local Geary statistic, and define the mean of the input variable (`y`).

In [58]:
# Mean of local geary
Eij_mean = np.mean(functest.localG); print(Eij_mean)
# Mean of x variable
y_mean = np.mean(y); print(y_mean)

1.1243443362378112
6723.317647058823


Identify areas as outliers

In [61]:
outliers = (functest.localG < Eij_mean) & (y > y_mean) & (functest.p_sim<=0.05)
pd.value_counts(outliers)

False    79
True      6
Name: Donatns, dtype: int64

Identify areas as clusters

In [63]:
clusters = (functest.localG < Eij_mean) & (y < y_mean) & (functest.p_sim<=0.05)
pd.value_counts(clusters)

False    64
True     21
Name: Donatns, dtype: int64

### Not significant

In [281]:
neg = (functest.p_sim>0.05)
pd.value_counts(neg)

True     59
False    26
dtype: int64

In [64]:
#Old implementation - bad
if self.geoda_quads:
    from esda import Moran_Local
    localm = Moran_Local(y=x, 
                         w=w, 
                         geoda_quads=True)

    Eij_mean = np.mean(self.localG)

    # Create empty vector
    self.q = np.ones(len(x))*5
    # 1: high high
    self.q[(self.localG < E_ij_mean) & (self.p_sim<=self.sig) & (localm.q==1)] = 1
    # 2: low low
    self.q[(self.localG < E_ij_mean) & (self.p_sim<=self.sig) & (localm.q==2)] = 2
    # 4: negative - 2*mean appropriate?
    self.q[(self.localG > 2*E_ij_mean) & (self.p_sim<=self.sig) & (localm.q!=2) & (localm.q!=4)] = 4
    # 5: not significant
    self.q[self.p_sim > self.sig] = 0
    # 0: other - all remaining obs? not sure how to define, need to double check...
    # Do nothing

NameError: name 'self' is not defined

Should be 57, this is close enough...

## Start working on inference (note: now implemented above)

### 'New' `_crand()` engine

In [28]:
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)

In [29]:
@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [30]:
p_sim, rlocalG = _crand_plus(z=np.array(zscore_x), w=wq, observed=np.array(functest.localG), 
            permutations=999, keep=True, n_jobs=1, 
            stat_func=_local_geary)

print(p_sim)
print(rlocalG)

[0.188 0.042 0.058 0.16  0.479 0.003 0.153 0.453 0.017 0.275 0.009 0.007
 0.134 0.472 0.01  0.035 0.001 0.146 0.332 0.06  0.002 0.139 0.302 0.095
 0.07  0.341 0.023 0.008 0.028 0.283 0.223 0.008 0.496 0.003 0.1   0.086
 0.312 0.173 0.3   0.156 0.059 0.362 0.191 0.488 0.275 0.006 0.196 0.018
 0.124 0.232 0.284 0.235 0.022 0.056 0.435 0.321 0.071 0.294 0.021 0.016
 0.241 0.38  0.449 0.143 0.414 0.129 0.078 0.099 0.318 0.199 0.097 0.231
 0.018 0.262 0.003 0.175 0.038 0.037 0.011 0.001 0.033 0.492 0.003 0.333
 0.27 ]
[[1.73354558 0.48582724 0.39120719 ... 1.09739692 0.30315983 1.15158028]
 [1.2329202  0.70714724 0.79341306 ... 1.25615194 1.00711015 0.79559242]
 [1.78399622 1.40691833 1.24765407 ... 2.0946598  1.85841311 1.02120587]
 ...
 [3.1149016  3.08453218 2.92127521 ... 3.40044339 3.17740559 3.47494539]
 [1.1715959  0.44600353 3.85646581 ... 0.21532951 0.49830135 0.22095061]
 [0.82826412 0.16964085 4.48525585 ... 0.20116451 0.37692207 0.24373652]]


## Rebuilding function with obs of interest ('GeoDa quads', but not really)


In [34]:
#%load_ext pycodestyle_magic

In [35]:
#%pycodestyle_off

In [69]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)


PERMUTATIONS = 999
SIG = 0.05


class Local_Geary(BaseEstimator):
    """Local Geary - Univariate"""

    def __init__(self, connectivity=None, autocorr=False, sig=SIG,
                 permutations=PERMUTATIONS, n_jobs=1, keep_simulations=True,
                 seed=None):
        """
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        autocorr         : boolean
                           (default=False)
                           If True use, indicate if an observation 
                           belongs to an outlier, cluster, undefined,
                           or non-significant group. 1 = outlier, 
                           2 = cluster, 3 = undefined, 4 = non-significant.
        sig              : float
                           (default=0.05)
                           Default significance threshold used for 
                           creation of autocorr groups.
        permutations     : int
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.

        Attributes
        ----------
        localG          : numpy array
                          array containing the observed univariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        """

        self.connectivity = connectivity
        self.autocorr = autocorr
        self.sig = sig
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, x, n_jobs=1, permutations=999):
        """
        Arguments
        ---------
        x                : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> y = guerry_ds['Donatns']
        >>> lG = Local_Geary(connectivity=w).fit(y)
        >>> lG.localG[0:5]
        >>> lG.p_sim[0:5]
        """
        x = np.asarray(x).flatten()

        w = self.connectivity
        w.transform = 'r'

        self.localG = self._statistic(x, w)

        if self.permutations:
            self.p_sim, self.rlocalG = _crand_plus(
                z=(x - np.mean(x))/np.std(x),
                w=w,
                observed=self.localG,
                permutations=permutations,
                keep=True,
                n_jobs=n_jobs,
                stat_func=_local_geary
            )
            
        if self.autocorr:    
            Eij_mean = np.mean(self.localG)
            x_mean = np.mean(x)
            # Create empty vector where default is undefined.
            self.q = np.ones(len(x))*3
            # Outliers
            self.q[(self.localG < Eij_mean) & \
                   (y > y_mean) & \
                   (self.p_sim<=self.sig)] = 1
            # Clusters
            self.q[(self.localG < Eij_mean) & \
                   (y < y_mean) & \
                   (self.p_sim<=self.sig)] = 2
            # Undefined: default value in empty vector
            # Non-significant
            self.q[self.p_sim > self.sig] = 4

        del (self.keep_simulations, self.n_jobs,
             self.permutations, self.seed, self.rlocalG,
             self.connectivity)

        return self

    @staticmethod
    def _statistic(x, w):
        # Caclulate z-scores for x
        zscore_x = (x - np.mean(x))/np.std(x)
        # Create focal (xi) and neighbor (zi) values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = pd.Series(zscore_x, index=w.id_order)
        zi = zseries.loc[adj_list.focal].values
        zj = zseries.loc[adj_list.neighbor].values
        # Carry out local Geary calculation
        gs = sum(list(w.weights.values()), []) * (zi-zj)**2
        # Reorganize data
        adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
        adj_list_gs.columns = ['gs', 'ID']
        adj_list_gs = adj_list_gs.groupby(by='ID').sum()

        localG = adj_list_gs.gs.values

        return (localG)

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter


@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [70]:
functest = Local_Geary(connectivity=wq, autocorr=True).fit(x)

In [71]:
pd.value_counts(functest.q)

4.0    60
2.0    14
3.0     6
1.0     5
dtype: int64

In [53]:
functest = Local_Geary(connectivity=wq, geoda_quads=True).fit(guerry_ds['Suicids'])

In [54]:
pd.value_counts(functest.q)

0.0    52
2.0    27
4.0     5
1.0     1
dtype: int64

# Multivariate Local Geary

$$ c_i = \sum_{h=1}^m \sum_j w_{ij} (x_{hi} - x_{hj})^2 $$

Load in the sample data

In [42]:
import libpysal as lp
import geopandas as gpd
from scipy import stats
import pandas as pd
guerry = lp.examples.load_example('Guerry')
guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))

In [43]:
wq = lp.weights.Queen.from_dataframe(guerry_ds)

In [44]:
x = guerry_ds['Donatns']
y = guerry_ds['Suicids']

In [45]:
variables = [x,y]

Standardize each variable

In [46]:
from scipy import stats
zseries = [stats.zscore(i) for i in variables]

Build the adj lists

In [47]:
adj_list = wq.to_adjlist(remove_symmetric=False)

In [48]:
# The zseries
zseries = [pd.Series(i, index=wq.id_order) for i in zseries]

In [49]:
zseries[0:10]

[0    -0.336188
 1     0.450441
 2     0.879023
 3    -0.825375
 4     0.049370
         ...   
 80    1.512380
 81    0.454785
 82    1.467288
 83   -0.555029
 84   -0.506214
 Length: 85, dtype: float64,
 0    -0.047195
 1    -0.756433
 2     2.478379
 3    -0.711499
 4    -0.649766
         ...   
 80    1.004270
 81   -0.468369
 82   -0.096441
 83   -0.111387
 84   -0.757774
 Length: 85, dtype: float64]

In [50]:
# The focal values
focal = [zseries[i].loc[adj_list.focal].values for
         i in range(len(variables))]
# The neighbor values
neighbor = [zseries[i].loc[adj_list.neighbor].values for
            i in range(len(variables))]

In [51]:
np.array(focal)[0][0]

-0.3361878263899247

In [52]:
np.array(neighbor)[0][0]

-0.9805080804028404

In [53]:
(np.array(focal)[0][0] - np.array(neighbor)[0][0])**2

0.4151485897312682

In [54]:
temp = (np.array(focal) - np.array(neighbor))**2
temp[0][0]

0.4151485897312682

In [55]:
gs = sum(list(wq.weights.values()), []) * (np.array(focal) - np.array(neighbor))**2

In [56]:
temp = pd.DataFrame(gs).T

In [57]:
temp['ID'] = adj_list.focal.values

In [58]:
adj_list_gs = temp.groupby(by='ID').sum()
adj_list_gs.head()

Unnamed: 0_level_0,0,1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.728348,0.5022
1,3.360084,0.28263
2,5.851768,29.604873
3,0.863628,0.121489
4,1.852118,0.475642


In [59]:
k = len(variables)
k

2

In [60]:
adj_list_gs.sum(axis=1)/k

ID
0      0.615274
1      1.821357
2     17.728320
3      0.492558
4      1.163880
        ...    
80     6.629720
81     3.154587
82     3.872022
83     4.307689
84     1.080905
Length: 85, dtype: float64

In [61]:
import numpy as np
import pandas as pd
import warnings
from scipy import sparse
from scipy import stats
from sklearn.base import BaseEstimator
import libpysal as lp

PERMUTATIONS=999

class Local_Geary_MV(BaseEstimator):
    """Local Geary - Multivariate"""

    def __init__(self, connectivity=None, permutations=PERMUTATIONS, n_jobs=1, 
                 keep_simulations=True, seed=None):
        """
        connectivity     : scipy.sparse matrix object
                           the connectivity structure describing
                           the relationships between observed units.
                           Need not be row-standardized.
        permutations     : int
                           number of random permutations for calculation of pseudo
                           p_values
        n_jobs           : int
                           Number of cores to be used in the conditional randomisation. If -1,
                           all available cores are used.    
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under the null 
                           is stored in memory and accessible; otherwise, replications 
                           are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional randomizations. 
                           Must be set here, and not outside of the function, since numba 
                           does not correctly interpret external seeds 
                           nor numpy.random.RandomState instances.  
                           
        Attributes
        ----------
        localG          : numpy array
                          array containing the observed multivariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        """

        self.connectivity = connectivity
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed

    def fit(self, variables, n_jobs=1, permutations=999):
        """
        Arguments
        ---------
        variables        : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        """
        self.variables = np.array(variables, dtype='float')

        w = self.connectivity
        w.transform = 'r'

        self.localG = self._statistic(variables, w)

        if permutations:
            pass

        return self

    @staticmethod
    def _statistic(variables, w):
        # Caclulate z-scores for input variables
        zseries = [stats.zscore(i) for i in variables]
        # Define denominator adjustment
        k = len(variables)
        # Create focal and neighbor values
        adj_list = w.to_adjlist(remove_symmetric=False)
        zseries = [pd.Series(i, index=wq.id_order) for i in zseries]
        focal = [zseries[i].loc[adj_list.focal].values for
                 i in range(len(variables))]
        neighbor = [zseries[i].loc[adj_list.neighbor].values for
                    i in range(len(variables))]
        # Carry out local Geary calculation
        gs = sum(list(wq.weights.values()), []) * \
        (np.array(focal) - np.array(neighbor))**2
        # Reorganize data
        temp = pd.DataFrame(gs).T
        temp['ID'] = adj_list.focal.values
        adj_list_gs = temp.groupby(by='ID').sum()
        localG = adj_list_gs.sum(axis=1)/k
        
        return (localG)

# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter

@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i    

In [62]:
functest = Local_Geary_MV(connectivity=wq).fit([x,y])
functest.localG

ID
0     0.153819
1     0.303560
2     2.954720
3     0.123140
4     0.387960
        ...   
80    1.657430
81    0.525764
82    0.645337
83    0.717948
84    0.216181
Length: 85, dtype: float64

## Working on inference

In [63]:
from esda.crand import (
    crand as _crand_plus,
    njit as _njit,
    _prepare_univariate
)

In [64]:
@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):
    zi, zrand = _prepare_univariate(i, z, permuted_ids, weights_i)
    return (zi-zrand)**2 @ weights_i

In [65]:
p_sim, rlocalG = _crand_plus(z=np.array(x, dtype='float'), w=wq, observed=np.array(functest.localG), 
            permutations=999, keep=True, n_jobs=1, 
            stat_func=_local_geary)

print(p_sim)
print(rlocalG)

[0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
 0.001]
[[1.27213695e+07 8.58893325e+06 1.30386525e+07 ... 1.30844169e+08
  1.13140950e+06 2.42693825e+06]
 [1.69938767e+07 3.32651060e+07 1.66522085e+07 ... 8.43416562e+07
  2.32759417e+07 1.98182905e+07]
 [3.40947833e+07 5.06547113e+07 2.89550538e+07 ... 9.39840535e+07
  4.68801657e+07 3.93634665e+07]
 ...
 [8.07803185e+07 9.72163752e+07 1.01242821e+08 ... 8.87716332e+07
  8.86466285e+07 7.53740587e+07]
 [1.52353105e+07 2.03496012e+07 1.69014767e+06 ... 6.78358172e+07
  1.14213645e+

In [66]:
x_zscore = stats.zscore(x)
y_zscore = stats.zscore(y)

In [67]:
result = zip(x_zscore, y_zscore)
result = list(result)
result = np.array(result)
result[0:5]

array([[-0.33618783, -0.04719523],
       [ 0.45044136, -0.75643306],
       [ 0.87902293,  2.47837872],
       [-0.82537479, -0.71149891],
       [ 0.0493701 , -0.64976635]])

In [68]:
permutations = 999
# Get length based on first variable
n = len(result)
localGs = np.zeros((n, permutations))
n_1 = n - 1
prange = list(range(permutations))
k = wq.max_neighbors + 1
nn = n - 1
rids = np.array([np.random.permutation(nn)[0:k] for i in prange])
ids = np.arange(wq.n)
ido = wq.id_order
w = [wq.weights[ido[i]] for i in ids]
wc = [wq.cardinalities[ido[i]] for i in ids]


Note: below **are** z-score standardized

In [69]:
wq.weights[0]

[0.25, 0.25, 0.25, 0.25]

In [70]:
for i in range(wq.n):
    idsi = ids[ids != i]
    np.random.shuffle(idsi)
    tmp = result[idsi[rids[:, 0:wc[i]]]]
    # define zi?
    zi = result[i][0]
    # define zrand?
    zrand = tmp[i][:,1]
    # Subtract and square?
    temp = (zi - zrand)**2
    # Multiply by weights?
    localGs[i] = (temp * wq.weights[i]).sum(0)
    #joins[i] = result[i] * (w[i] * tmp).sum(1)
    #localGs[i] = ((result[i,0] - tmp2)**2 * wq.weights[i][0]).sum(0)
rlocalGs = localGs

In [71]:
rlocalGs

array([[0.39467951, 0.39467951, 0.39467951, ..., 0.39467951, 0.39467951,
        0.39467951],
       [1.38828188, 1.38828188, 1.38828188, ..., 1.38828188, 1.38828188,
        1.38828188],
       [0.7247351 , 0.7247351 , 0.7247351 , ..., 0.7247351 , 0.7247351 ,
        0.7247351 ],
       ...,
       [3.33840238, 3.33840238, 3.33840238, ..., 3.33840238, 3.33840238,
        3.33840238],
       [0.47662382, 0.47662382, 0.47662382, ..., 0.47662382, 0.47662382,
        0.47662382],
       [0.04553023, 0.04553023, 0.04553023, ..., 0.04553023, 0.04553023,
        0.04553023]])

In [72]:
np.transpose(rlocalGs)

array([[0.39467951, 1.38828188, 0.7247351 , ..., 3.33840238, 0.47662382,
        0.04553023],
       [0.39467951, 1.38828188, 0.7247351 , ..., 3.33840238, 0.47662382,
        0.04553023],
       [0.39467951, 1.38828188, 0.7247351 , ..., 3.33840238, 0.47662382,
        0.04553023],
       ...,
       [0.39467951, 1.38828188, 0.7247351 , ..., 3.33840238, 0.47662382,
        0.04553023],
       [0.39467951, 1.38828188, 0.7247351 , ..., 3.33840238, 0.47662382,
        0.04553023],
       [0.39467951, 1.38828188, 0.7247351 , ..., 3.33840238, 0.47662382,
        0.04553023]])

In [73]:
sim = np.transpose(rlocalGs)
print(sim[0])
print(sim[1])

[3.94679513e-01 1.38828188e+00 7.24735104e-01 4.34630077e-01
 5.16477510e-03 1.01757022e+00 4.81693340e+00 6.08396914e+00
 1.25170386e+00 2.42329804e-01 6.10227684e-01 1.34031612e+00
 2.02077777e+01 2.31271040e+00 3.11752317e+00 3.78103076e+00
 3.46281849e+00 4.58414279e+00 2.54214333e-01 2.10543748e+00
 2.01484508e+00 3.86507493e+00 2.44678752e-01 7.23409136e-01
 2.22396942e+00 3.41729422e+00 1.43601707e+01 3.40992728e+00
 2.04345248e-01 3.68532305e+00 2.28919438e+00 5.07079414e-01
 6.03929237e-01 1.37146724e+00 1.06516964e+00 1.40489165e-01
 2.19803861e-02 3.12804762e+00 1.38185076e+00 3.25632042e+00
 3.35860137e-01 9.67914873e-01 1.97070794e+00 1.48251478e+00
 1.65412794e+00 4.49757570e+00 5.65705426e-01 5.95248203e-02
 3.69374414e+00 8.71297887e-01 1.12014317e+00 7.80649981e-02
 1.19519156e-01 4.57036649e+00 1.19229560e+00 1.15045976e+00
 2.98708594e-01 1.67009390e-01 7.14187339e-01 1.34892608e+01
 2.47525651e+00 2.21219865e-01 3.01762150e-02 5.17466226e+00
 3.37969757e+00 3.672482

In [74]:
sim[0]

array([3.94679513e-01, 1.38828188e+00, 7.24735104e-01, 4.34630077e-01,
       5.16477510e-03, 1.01757022e+00, 4.81693340e+00, 6.08396914e+00,
       1.25170386e+00, 2.42329804e-01, 6.10227684e-01, 1.34031612e+00,
       2.02077777e+01, 2.31271040e+00, 3.11752317e+00, 3.78103076e+00,
       3.46281849e+00, 4.58414279e+00, 2.54214333e-01, 2.10543748e+00,
       2.01484508e+00, 3.86507493e+00, 2.44678752e-01, 7.23409136e-01,
       2.22396942e+00, 3.41729422e+00, 1.43601707e+01, 3.40992728e+00,
       2.04345248e-01, 3.68532305e+00, 2.28919438e+00, 5.07079414e-01,
       6.03929237e-01, 1.37146724e+00, 1.06516964e+00, 1.40489165e-01,
       2.19803861e-02, 3.12804762e+00, 1.38185076e+00, 3.25632042e+00,
       3.35860137e-01, 9.67914873e-01, 1.97070794e+00, 1.48251478e+00,
       1.65412794e+00, 4.49757570e+00, 5.65705426e-01, 5.95248203e-02,
       3.69374414e+00, 8.71297887e-01, 1.12014317e+00, 7.80649981e-02,
       1.19519156e-01, 4.57036649e+00, 1.19229560e+00, 1.15045976e+00,
      

In [None]:
above = sim[0] >= functest.localG
larger = above.sum(0)
low_extreme = (permutations - larger) < larger
larger[low_extreme] = permutations - larger[low_extreme]
p_sim = (larger + 1.0) / (permutations + 1.0)
p_sim

In [None]:
temp2