# Purpose of this notebook

The purpose of this notebook is to migrate the workbook pseudo code of `LOSH_*.ipynb` and `OLJC_*.ipynb` into functions that match the `PySAL` structure. These will be expanded over time and built out.

## Example function from pysal

In [4]:
"""
Spatial autocorrelation for binary attributes
"""
__author__ = "Sergio J. Rey <srey@asu.edu> , Luc Anselin <luc.anselin@asu.edu>"

from libpysal.weights.spatial_lag import lag_spatial
from esda.tabular import _univariate_handler # change from .tabular to esda.tabular when working on independent machine
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import pandas as pd

__all__ = ['Join_Counts']

PERMUTATIONS = 999

class Join_Counts(object):
    """Binary Join Counts
    Parameters
    ----------
    y               : array
                      binary variable measured across n spatial units
    w               : W
                      spatial weights instance
    permutations    : int
                      number of random permutations for calculation of pseudo-p_values
    Attributes
    ----------
    y            : array
                   original variable
    w            : W
                   original w object
    permutations : int
                   number of permutations
    bb           : float
                   number of black-black joins
    ww           : float
                   number of white-white joins
    bw           : float
                   number of black-white joins
    J            : float
                   number of joins
    sim_bb       : array
                   (if permutations>0)
                   vector of bb values for permuted samples
    p_sim_bb     : array
                  (if permutations>0)
                   p-value based on permutations (one-sided)
                   null: spatial randomness
                   alternative: the observed bb is greater than under randomness
    mean_bb      : float
                   average of permuted bb values
    min_bb       : float
                   minimum of permuted bb values
    max_bb       : float
                   maximum of permuted bb values
    sim_bw       : array
                   (if permutations>0)
                   vector of bw values for permuted samples
    p_sim_bw     : array
                   (if permutations>0)
                   p-value based on permutations (one-sided)
                   null: spatial randomness
                   alternative: the observed bw is greater than under randomness
    mean_bw      : float
                   average of permuted bw values
    min_bw       : float
                   minimum of permuted bw values
    max_bw       : float
                   maximum of permuted bw values
    chi2         : float
                   Chi-square statistic on contingency table for join counts
    chi2_p       : float
                   Analytical p-value for chi2
    chi2_dof     : int
                   Degrees of freedom for analytical chi2
    crosstab     : DataFrame
                   Contingency table for observed join counts
    expected     : DataFrame
                   Expected contingency table for the null 
    p_sim_chi2   : float
                   p-value for chi2 under random spatial permutations
    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> w = libpysal.weights.lat2W(4, 4)
    >>> y = np.ones(16)
    >>> y[0:8] = 0
    >>> np.random.seed(12345)
    >>> from esda.join_counts import Join_Counts
    >>> jc = Join_Counts(y, w)
    >>> jc.bb
    10.0
    >>> jc.bw
    4.0
    >>> jc.ww
    10.0
    >>> jc.J
    24.0
    >>> len(jc.sim_bb)
    999
    >>> round(jc.p_sim_bb, 3)
    0.003
    >>> round(np.mean(jc.sim_bb), 3)
    5.547
    >>> np.max(jc.sim_bb)
    10.0
    >>> np.min(jc.sim_bb)
    0.0
    >>> len(jc.sim_bw)
    999
    >>> jc.p_sim_bw
    1.0
    >>> np.mean(jc.sim_bw)
    12.811811811811811
    >>> np.max(jc.sim_bw)
    24.0
    >>> np.min(jc.sim_bw)
    7.0
    >>> round(jc.chi2_p, 3)
    0.004
    >>> jc.p_sim_chi2
    0.002
    Notes
    -----
    Technical details and derivations can be found in :cite:`cliff81`.
    """
    def __init__(self, y, w, permutations=PERMUTATIONS):
        y = np.asarray(y).flatten()
        w.transformation = 'b'  # ensure we have binary weights
        self.w = w
        self.adj_list = self.w.to_adjlist(remove_symmetric=True) # a function of the weights method in pysal, 
        # to_adjlist(self[, remove_symmetric, …]) which computes an adjacency list representation of a weights object.
        self.y = y
        self.permutations = permutations
        self.J = w.s0 / 2.
        results = self.__calc(self.y)
        self.bb = results[0]
        self.ww = results[1]
        self.bw = results[2]
        self.chi2 = results[3]
        self.chi2_p = results[4]
        self.chi2_dof = results[5]
        self.autocorr_pos = self.bb + self.ww
        self.autocorr_neg = self.bw

        crosstab = pd.DataFrame(data=results[-1])
        id_names = ['W', 'B']
        idx = pd.Index(id_names, name='Focal')
        crosstab.set_index(idx, inplace=True)
        crosstab.columns = pd.Index(id_names, name='Neighbor')
        self.crosstab = crosstab
        expected = pd.DataFrame(data=results[6])
        expected.set_index(idx, inplace=True)
        expected.columns = pd.Index(id_names, name='Neighbor')
        self.expected = expected
        self.calc = self.__calc

        if permutations:
            sim = []
            i = 0
            while i < permutations:
                try:
                    res = self.__calc(np.random.permutation(self.y))
                    sim.append(res)
                    i += 1
                except ValueError:
                    # expected count of 0 -> inadmissible
                    pass
            sim_jc = np.array(sim, dtype=object)
            self.sim_bb = sim_jc[:, 0]
            self.min_bb = np.min(self.sim_bb)
            self.mean_bb = np.mean(self.sim_bb)
            self.max_bb = np.max(self.sim_bb)
            self.sim_bw = sim_jc[:, 2]
            self.min_bw = np.min(self.sim_bw)
            self.mean_bw = np.mean(self.sim_bw)
            self.max_bw = np.max(self.sim_bw)
            self.sim_autocurr_pos = sim_jc[:, 0]+sim_jc[:, 1]
            self.sim_autocurr_neg = sim_jc[:, 2]
            self.sim_chi2 = sim_jc[:, 3]

            stat = ((self.autocorr_pos - np.mean(self.sim_autocurr_pos))**2 / np.mean(self.sim_autocurr_pos)**2 +
                                              (self.autocorr_neg - np.mean(self.sim_autocurr_neg))**2 / np.mean(self.sim_autocurr_pos)**2)
            self.sim_autocorr_chi2 = 1 - chi2.cdf(stat, 1)

            p_sim_bb = self.__pseudop(self.sim_bb, self.bb)
            p_sim_bw = self.__pseudop(self.sim_bw, self.bw)
            p_sim_chi2 = self.__pseudop(self.sim_chi2, self.chi2)
            p_sim_autocorr_pos = self.__pseudop(self.sim_autocurr_pos, self.autocorr_pos)
            p_sim_autocorr_neg = self.__pseudop(self.sim_autocurr_neg, self.autocorr_neg)
            self.p_sim_bb = p_sim_bb
            self.p_sim_bw = p_sim_bw
            self.p_sim_chi2 = p_sim_chi2
            self.p_sim_autocorr_pos = p_sim_autocorr_pos
            self.p_sim_autocorr_neg = p_sim_autocorr_neg

    def __calc(self, z):
        adj_list = self.adj_list
        zseries = pd.Series(z, index=self.w.id_order)
        focal = zseries.loc[adj_list.focal].values
        neighbor = zseries.loc[adj_list.neighbor].values
        sim = focal == neighbor
        dif = 1 - sim
        bb = (focal * sim).sum()
        ww = ((1-focal) * sim).sum()
        bw = (focal * dif).sum()
        wb = ((1-focal) * dif).sum()
        table = [[ww, wb],
                [bw, bb]]
        chi2 = chi2_contingency(table)
        stat, pvalue, dof, expected = chi2
        return (bb, ww, bw+wb, stat, pvalue, dof, expected, np.array(table))

    def __pseudop(self, sim, jc):
        above = sim >=jc
        larger = sum(above)
        psim = (larger + 1.) / (self.permutations + 1.)
        return psim

    @property
    def _statistic(self):
        return self.bw

    @classmethod
    def by_col(cls, df, cols, w=None, inplace=False, pvalue='sim', outvals=None, **stat_kws):
        """
        Function to compute a Join_Count statistic on a dataframe
        Arguments
        ---------
        df          :   pandas.DataFrame
                        a pandas dataframe with a geometry column
        cols        :   string or list of string
                        name or list of names of columns to use to compute the statistic
        w           :   pysal weights object
                        a weights object aligned with the dataframe. If not provided, this
                        is searched for in the dataframe's metadata
        inplace     :   bool
                        a boolean denoting whether to operate on the dataframe inplace or to
                        return a series contaning the results of the computation. If
                        operating inplace, the derived columns will be named
                        'column_join_count'
        pvalue      :   string
                        a string denoting which pvalue should be returned. Refer to the
                        the Join_Count statistic's documentation for available p-values
        outvals     :   list of strings
                        list of arbitrary attributes to return as columns from the
                        Join_Count statistic
        **stat_kws  :   keyword arguments
                        options to pass to the underlying statistic. For this, see the
                        documentation for the Join_Count statistic.
        Returns
        --------
        If inplace, None, and operation is conducted on dataframe in memory. Otherwise,
        returns a copy of the dataframe with the relevant columns attached.
        """
        if outvals is None:
            outvals = []
            outvals.extend(['bb', 'p_sim_bw', 'p_sim_bb'])
            pvalue = ''
        return _univariate_handler(df, cols, w=w, inplace=inplace, pvalue=pvalue,
                                   outvals=outvals, stat=cls,
                                   swapname='bw', **stat_kws)

## LJC

### Univariate

In [8]:
"""
Spatial autocorrelation for binary attributes
"""

__author__ = "Sergio J. Rey <srey@asu.edu> , Luc Anselin <luc.anselin@asu.edu>"

from libpysal.weights.spatial_lag import lag_spatial
# from esda.tabular import _univariate_handler # don't need in my functions at the moment - if in df then yes!
#from scipy.stats import chi2_contingency
#from scipy.stats import chi2
import numpy as np
import pandas as pd

__all__ = ['Join_Counts_Local', # assumed univariate
           'Join_Counts_BV', # assumed bivariate
           'Join_Counts_MV' # assumed multivariate
          ]

# PERMUTATIONS = 999

class Join_Counts_Local_old(object):
    """Univariate Local Join Counts
    Parameters
    ----------
    y               : array
                      binary variable measured across n spatial units
    w               : W
                      spatial weights instance
    permutations    : int
                      number of random permutations for calculation of pseudo-p_values
    Attributes
    ----------
    y            : array
                   original variable
    w            : W
                   original w object
    permutations : int
                   number of permutations
    bb           : float
                   number of black-black joins
    J            : float
                   number of joins
    sim_bb       : array
                   (if permutations>0)
                   vector of bb values for permuted samples
    p_sim_bb     : array
                  (if permutations>0)
                   p-value based on permutations (one-sided)
                   null: spatial randomness
                   alternative: the observed bb is greater than under randomness
    mean_bb      : float
                   average of permuted bb values
    min_bb       : float
                   minimum of permuted bb values
    max_bb       : float
                   maximum of permuted bb values
    chi2         : float
                   Chi-square statistic on contingency table for join counts
    chi2_p       : float
                   Analytical p-value for chi2
    chi2_dof     : int
                   Degrees of freedom for analytical chi2
    crosstab     : DataFrame
                   Contingency table for observed join counts
    expected     : DataFrame
                   Expected contingency table for the null 
    p_sim_chi2   : float
                   p-value for chi2 under random spatial permutations
    Notes
    -----
    Technical details and derivations can be found in :cite:`anselinli2019`.
    """
    def __init__(self, y, w):
        y = np.asarray(y).flatten()
        w.transformation = 'b'  # ensure we have binary weights
        self.w = w
        self.adj_list = self.w.to_adjlist(remove_symmetric=False) # this differs from esda.Join_Counts() function
        self.y = y
        #self.permutations = permutations
        #self.J = w.s0 / 2.
        results = self.__calc(self.y)
        self.bb = results # as there is only one item being returned right now, 
                          # we just use results. once more things are returned in last line of __calc this should return to results[0]
        #self.chi2 = results[3]
        #self.chi2_p = results[4]
        #self.chi2_dof = results[5]
        #self.autocorr_pos = self.bb + self.ww
        #self.autocorr_neg = self.bw
    
    def __calc(self, z):
        adj_list = self.adj_list
        zseries = pd.Series(z, index=self.w.id_order)
        focal = zseries.loc[adj_list.focal].values
        neighbor = zseries.loc[adj_list.neighbor].values
        BB = (focal == 1) & (neighbor == 1)
        adj_list_BB = pd.DataFrame(adj_list.focal.values, BB.astype('uint8')).reset_index()
        adj_list_BB.columns = ['BB', 'ID']
        adj_list_BB = adj_list_BB.groupby(by='ID').sum()
        BB = adj_list_BB.BB.values
        #print(BB)
        return (BB)

Above function is working but is in the 'old' `moran.py` or `join_counts.py` formatting style. Levi suggested making them in the form of scikit-learn or scipy. I'm leaning torwards the scikit-learn style and so I'm emulating `lee.py`.

In [13]:
# Based on format of: https://github.com/pysal/esda/blob/master/esda/lee.py
import numpy
from sklearn.base import BaseEstimator
import libpysal

class Local_Join_Count(BaseEstimator):
    """Local Join Count Statistic"""

    def __init__(self, connectivity=None):
        """
        Initialize a Join_Counts_Local estimator
        Arguments
        ---------
        connectivity:   scipy.sparse matrix object
                        the connectivity structure describing the relationships
                        between observed units. Need not be row-standardized. 
        Attributes
        ----------
        BB_:  numpy.ndarray (1,)
              array containing the estimated Local Join Count coefficients, 
              where element [0,0] is the number of Local Join Counts, ...
        """
        
        self.connectivity = connectivity

    def fit(self, y):
        """
        Arguments
        ---------
        y       :   numpy.ndarray
                    array containing binary (0/1) data
        Returns
        -------
        the fitted estimator.
        Notes
        -----
        Technical details and derivations can be found in :cite:`AnselinLi2019`.
        """
        y = np.asarray(y).flatten()
        
        w = self.connectivity
        w.transformation = 'b' # Ensure we have binary weights
        
        self.BB_ = self._statistic(y, w) # Calculate the statistic
        
        # Need the >>> return self to get the associated .BB_ attribute 
        # (as well as significance in future, i.e. self.reference_distribution_ in lee.py)
        return self
        
    @staticmethod
    def _statistic(y, w):
        adj_list = w.to_adjlist(remove_symmetric=False) # remove_symmetric=False differs from esda.Join_Counts() function
        zseries = pd.Series(y, index=w.id_order)
        focal = zseries.loc[adj_list.focal].values
        neighbor = zseries.loc[adj_list.neighbor].values
        BB = (focal == 1) & (neighbor == 1)
        adj_list_BB = pd.DataFrame(adj_list.focal.values, BB.astype('uint8')).reset_index()
        adj_list_BB.columns = ['BB', 'ID']
        adj_list_BB = adj_list_BB.groupby(by='ID').sum()
        BB = adj_list_BB.BB.values
        return (BB)

Test both the old and new function with some inputs...

In [9]:
import numpy as np
import libpysal
import pandas as pd
# Create a 16x16 grid
w = libpysal.weights.lat2W(4, 4)
y_1 = np.ones(16)
# Set the first 9 of the ones to 0
y_1[0:8] = 0
print('new y_1', y_1)

new y_1 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]


In [10]:
Join_Counts_Local_old(y_1, w)

<__main__.Join_Counts_Local_old at 0x20656e68>

In [11]:
test_ljc_uni = Join_Counts_Local_old(y_1, w)
vars(test_ljc_uni)
print(test_ljc_uni.bb)

[0 0 0 0 0 0 0 0 2 3 3 2 2 3 3 2]


In [12]:
temp = Local_Join_Count(connectivity=w).fit(y_1)
temp.BB_

array([0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 2, 2, 3, 3, 2], dtype=uint64)

Test to ensure equivalency

In [156]:
test_ljc_uni.bb == temp.BB_

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [149]:
# Compare speed of two functions
%alias_magic t timeit

Created `%t` as an alias for `%timeit`.
Created `%%t` as an alias for `%%timeit`.


In [147]:
%t Local_Join_Count(connectivity=w).fit(y_1)

5.9 ms ± 214 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [148]:
%t Join_Counts_Local_old(y_1, w)

5.76 ms ± 178 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


No apparent difference in speed?

### Bivariate Local Join Count

In [175]:
# https://github.com/pysal/esda/blob/master/esda/lee.py
import numpy
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn import utils

class Local_Join_Count_BV(BaseEstimator):
    """Global Spatial Pearson Statistic"""

    def __init__(self, connectivity=None):
        """
        Initialize a Join_Counts_Local estimator
        Arguments
        ---------
        connectivity:   scipy.sparse matrix object
                        the connectivity structure describing the relationships
                        between observed units. Will be row-standardized. 
        Attributes
        ----------
        association_: numpy.ndarray (2,2)
                      array containg the estimated Lee spatial pearson correlation
                      coefficients, where element [0,1] is the spatial correlation
                      coefficient, and elements [0,0] and [1,1] are the "spatial
                      smoothing factor"
        """
        
        self.connectivity = connectivity

    def fit(self, x, z):
        """
        Arguments
        ---------
        y       :   numpy.ndarray
                    array containing binary (0/1) data
        Returns
        -------
        the fitted estimator.
        Notes
        -----
        Technical details and derivations can be found in :cite:`Lee2001`.
        """
        x = np.asarray(x).flatten()
        z = np.asarray(z).flatten()
        
        w = self.connectivity
        w.transformation = 'b' # Ensure we have binary weights   
        
        self.LJC_ = self._statistic(x, z, w) # Calculate the statistic
        
        # Need the >>> return self to get the associated .BB_ attribute (as well as significance in future, i.e. self.reference_distribution_ in lee.py)
        return self
        
    @staticmethod
    def _statistic(x, z, w):
        adj_list = w.to_adjlist(remove_symmetric=False) # remove_symmetric=False differs from esda.Join_Counts() function
        
        # First, set up a series that maps the y values (input as self.y) to the weights table 
        zseries_x = pd.Series(x, index=w.id_order)
        zseries_z = pd.Series(z, index=w.id_order)

        # Next, map the y values to the focal (i) values 
        focal_x = zseries_x.loc[adj_list.focal].values
        focal_z = zseries_z.loc[adj_list.focal].values

        # Repeat the mapping but for the neighbor (j) values
        neighbor_x = zseries_x.loc[adj_list.neighbor].values
        neighbor_z = zseries_z.loc[adj_list.neighbor].values
        
        # Calculate Case 1
        BJC = (focal_x == 1) & (focal_z == 0) & (neighbor_x == 0) & (neighbor_z == 1)
        adj_list_BJC = pd.DataFrame(adj_list.focal.values, BJC.astype('uint8')).reset_index()
        adj_list_BJC.columns = ['BJC', 'ID']
        adj_list_BJC = adj_list_BJC.groupby(by='ID').sum()
        
        # Calculate Case 2
        CLC = (focal_x == 1) & (focal_z == 1) & (neighbor_x == 1) & (neighbor_z == 1)
        adj_list_CLC = pd.DataFrame(adj_list.focal.values, CLC.astype('uint8')).reset_index()
        adj_list_CLC.columns = ['CLC', 'ID']
        adj_list_CLC = adj_list_CLC.groupby(by='ID').sum()
        
        # Return values
        return (adj_list_BJC.BJC.values, adj_list_CLC.CLC.values)

In [282]:
# https://github.com/pysal/esda/blob/master/esda/lee.py
import numpy
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn import utils

class Local_Join_Count_BV_v2(BaseEstimator):
    """Global Spatial Pearson Statistic"""

    def __init__(self, connectivity=None):
        """
        Initialize a Join_Counts_Local estimator
        Arguments
        ---------
        connectivity:   scipy.sparse matrix object
                        the connectivity structure describing the relationships
                        between observed units. Will be row-standardized. 
        Attributes
        ----------
        association_: numpy.ndarray (2,2)
                      array containg the estimated Lee spatial pearson correlation
                      coefficients, where element [0,1] is the spatial correlation
                      coefficient, and elements [0,0] and [1,1] are the "spatial
                      smoothing factor"
        """
        
        self.connectivity = connectivity

    def fit(self, x, z, case=None):
        """
        Arguments
        ---------
        y       :   numpy.ndarray
                    array containing binary (0/1) data
        Returns
        -------
        the fitted estimator.
        Notes
        -----
        Technical details and derivations can be found in :cite:`Lee2001`.
        """
        x = np.asarray(x).flatten()
        z = np.asarray(z).flatten()
        
        w = self.connectivity
        w.transformation = 'b' # Ensure we have binary weights   
                
        self.LJC_ = self._statistic(x, z, w, case=case) # Calculate the statistic
        
        return self
        
    @staticmethod
    def _statistic(x, z, w, case=None):        
        adj_list = w.to_adjlist(remove_symmetric=False) # remove_symmetric=False differs from esda.Join_Counts() function
        
        # First, set up a series that maps the y values (input as self.y) to the weights table 
        zseries_x = pd.Series(x, index=w.id_order)
        zseries_z = pd.Series(z, index=w.id_order)

        # Next, map the y values to the focal (i) values 
        focal_x = zseries_x.loc[adj_list.focal].values
        focal_z = zseries_z.loc[adj_list.focal].values

        # Repeat the mapping but for the neighbor (j) values
        neighbor_x = zseries_x.loc[adj_list.neighbor].values
        neighbor_z = zseries_z.loc[adj_list.neighbor].values
        
        if case=="BJC":
            BJC = (focal_x == 1) & (focal_z == 0) & (neighbor_x == 0) & (neighbor_z == 1)
            adj_list_BJC = pd.DataFrame(adj_list.focal.values, BJC.astype('uint8')).reset_index()
            adj_list_BJC.columns = ['BJC', 'ID']
            adj_list_BJC = adj_list_BJC.groupby(by='ID').sum()
            return adj_list_BJC.BJC.values
        elif case=="CLC": 
            CLC = (focal_x == 1) & (focal_z == 1) & (neighbor_x == 1) & (neighbor_z == 1)
            adj_list_CLC = pd.DataFrame(adj_list.focal.values, CLC.astype('uint8')).reset_index()
            adj_list_CLC.columns = ['CLC', 'ID']
            adj_list_CLC = adj_list_CLC.groupby(by='ID').sum()
            return (adj_list_CLC.CLC.values)
        else:
            print("Please specify which type of bivariate Local Join Count you would like to calculate (either 'BJC' or 'CLC'). See Anselin and Li 2019 p. 9-10 for more information")

Test some values...

In [271]:
x = y_1
z = [0,1,0,1,1,1,1,1,0,0,1,1,0,0,1,1]

print('x', x)
print('z', z)

x [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
z [0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]


In [272]:
temp = Local_Join_Count_BV(connectivity=w).fit(x,z)
temp.LJC_

(array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0], dtype=uint64),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2], dtype=uint64))

In [286]:
# Case 1
temp2 = Local_Join_Count_BV_v2(connectivity=w).fit(x,z, case="BJC")
print(temp2.LJC_)
# Case 2
temp2 = Local_Join_Count_BV_v2(connectivity=w).fit(x,z, case="CLC")
print(temp2.LJC_)

[0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 2 2 0 0 2 2]


In [289]:
# Try with a purposefully wrong input or blnak
# Improper input
print(Local_Join_Count_BV_v2(connectivity=w).fit(x,z, case="ThisIsWrong"))
# No input for case
print(Local_Join_Count_BV_v2(connectivity=w).fit(x,z))

Please specify which type of bivariate Local Join Count you would like to calculate (either 'BJC' or 'CLC'). See Anselin and Li 2019 p. 9-10 for more information
Local_Join_Count_BV_v2(connectivity=<libpysal.weights.weights.W object at 0x1BA20628>)
Please specify which type of bivariate Local Join Count you would like to calculate (either 'BJC' or 'CLC'). See Anselin and Li 2019 p. 9-10 for more information
Local_Join_Count_BV_v2(connectivity=<libpysal.weights.weights.W object at 0x1BA20628>)


### Multivariate Local Join Count

In [321]:
# https://github.com/pysal/esda/blob/master/esda/lee.py
import numpy
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn import utils

class Local_Join_Count_MV(BaseEstimator):
    """Global Spatial Pearson Statistic"""

    def __init__(self, connectivity=None):
        """
        Initialize a Join_Counts_Local estimator
        Arguments
        ---------
        connectivity:   scipy.sparse matrix object
                        the connectivity structure describing the relationships
                        between observed units. Will be row-standardized. 
        Attributes
        ----------
        association_: numpy.ndarray (2,2)
                      array containg the estimated Lee spatial pearson correlation
                      coefficients, where element [0,1] is the spatial correlation
                      coefficient, and elements [0,0] and [1,1] are the "spatial
                      smoothing factor"
        """
        
        self.connectivity = connectivity

    def fit(self, variables):
        """
        Arguments
        ---------
        y       :   numpy.ndarray
                    array containing binary (0/1) data
        Returns
        -------
        the fitted estimator.
        Notes
        -----
        Technical details and derivations can be found in :cite:`Lee2001`.
        """
            
        # Need not be flattened?
        
        w = self.connectivity
        w.transformation = 'b' # Ensure we have binary weights   
                
        self.MJC_ = self._statistic(variables, w) # Calculate the statistic
        
        return self
        
    @staticmethod
    def _statistic(variables, w):
        
        adj_list = w.to_adjlist(remove_symmetric=False) # remove_symmetric=False differs from esda.Join_Counts() function
        
        # The zseries
        zseries = [pd.Series(i, index=w.id_order) for i in variables]
        # The focal values
        focal = [zseries[i].loc[adj_list.focal].values for i in range(len(variables))]
        # The neighbor values
        neighbor = [zseries[i].loc[adj_list.neighbor].values for i in range(len(variables))]
        
        # Find instances where all surrounding focal and neighbor values == 1
        focal_all = np.multiply(*focal)
        neighbor_all = np.multiply(*neighbor)
        MCLC = (focal_all == 1) & (neighbor_all == 1)

        # Create a df that uses the adjacency list focal values and the BBs counts
        adj_list_MCLC = pd.DataFrame(adj_list.focal.values, MCLC.astype('uint8')).reset_index()
        # Temporarily rename the columns
        adj_list_MCLC.columns = ['MCLC', 'ID']
        adj_list_MCLC = adj_list_MCLC.groupby(by='ID').sum()
        
        return (adj_list_MCLC.MCLC.values)

Test inputs

In [322]:
x = x.astype(np.int32)
print('x', x)
print('z', z)
y = [0,1,1,1,1,1,1,1,0,0,0,1,0,0,1,1]
y = np.asarray(y).flatten()
print('y', y)

x [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
z [0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
y [0 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1]


In [323]:
temp = Local_Join_Count_MV(connectivity=w).fit([x,y,z])
temp.MJC_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2], dtype=uint64)

## LOSH

In [14]:
# https://github.com/pysal/esda/blob/master/esda/lee.py
import numpy
from scipy import sparse
from sklearn.base import BaseEstimator
from sklearn import utils
import pysal.lib as lp

class LOSH(BaseEstimator):
    """Local Join Count Statistic"""

    def __init__(self, connectivity=None):
        """
        Initialize a Join_Counts_Local estimator
        Arguments
        ---------
        connectivity:   scipy.sparse matrix object
                        the connectivity structure describing the relationships
                        between observed units. Will be row-standardized. 
        Attributes
        ----------
        LOSH_:  numpy.ndarray (1,)
              array containing the estimated Local Join Count coefficients, 
              where element [0,0] is the number of Local Join Counts, ...
        """
        
        self.connectivity = connectivity

    def fit(self, y):
        """
        Arguments
        ---------
        y       :   numpy.ndarray
                    array containing continuous data
        Returns
        -------
        the fitted estimator.
        Notes
        -----
        Technical details and derivations can be found in :cite:`OrdGetis2012`.
        """
        
        # Define variable of interest
        y = np.asarray(y).flatten()
        
        # Define weights of interest
        w = self.connectivity
        
        # Row standardize the weights
        w.transform = 'r' # Ensure we have binary weights   
        
        self.LOSH_ = self._statistic(y, w) # Calculate the statistic
        
        # Need the >>> return self to get the associated .BB_ attribute (as well as significance in future, i.e. self.reference_distribution_ in lee.py)
        return self
        
    @staticmethod
    def _statistic(y, w):
        ylag = lp.weights.lag_spatial(w,y)
        w_lens = [len(w[i]) for i in range(len(dict(w)))]
        ymean = ylag/w_lens
        yresid = y-ymean
        # Scenario 1: a = 1, an absolute deivations measure $H_{i} = 1$
        sc1 = (w_lens*(abs(yresid)**1))/w_lens
        # Scenario 2: a = 2, a variance measure $H_{i} = 2$
        sc2 = (w_lens*(abs(yresid)**2))/w_lens
        return (sc1, sc2)

Test values based on existing Global Spatial Autocorrelation notebook.

In [328]:
# Load modules
import pandas as pd
import geopandas as gpd
import pysal.lib as lp
import matplotlib.pyplot as plt
import rasterio as rio
import numpy as np
import shapely.geometry as geom
%matplotlib inline

In [330]:
df = gpd.read_file('C:/Users/jeffe/Dropbox/Maryland/PhD_Courses/GEOG788P/MnM4SDS_Fall2019/lectures/data/neighborhoods.gpkg')
listings = gpd.read_file('C:/Users/jeffe/Dropbox/Maryland/PhD_Courses/GEOG788P/MnM4SDS_Fall2019/lectures/data/listings.gpkg')
listings['price'] = listings.price.str.replace('$', '').str.replace(',','_').astype(float)
median_price = gpd.sjoin(listings[['price', 'geometry']], df, op='within')\
                  .groupby('index_right').price.median()
df['median_pri'] = median_price.values
# Make sure missing values are taken care of
pd.isnull(df['median_pri']).sum()
df = df
df['median_pri'].fillna((df['median_pri'].mean()), inplace=True)
y = df['median_pri']

In [331]:
w = lp.weights.Queen.from_dataframe(df)

Pass through function

In [341]:
temp = LOSH(connectivity=w).fit(y)
temp.LOSH_

(array([111.66666667,  51.875     , 204.68      ,  92.22222222,
         84.92      ,  22.22222222,  73.5       ,  88.75      ,
        169.48      ,  44.        ,  83.11111111,  57.5       ,
         46.22222222,  78.1875    , 161.08024691,  83.4609375 ,
        120.        ,  70.15625   , 102.16666667, 159.26      ,
         33.22222222, 100.08333333, 216.15277778,  82.86111111,
        129.2265625 ,  70.32      ,  51.08      ,  87.5703125 ,
         67.3       , 168.265625  ,  60.83333333, 100.421875  ,
         64.08      , 302.81944444,  79.23611111,  84.55555556,
        198.28      ,  98.3046875 ,  72.66326531,  98.16326531,
         72.66      ,  65.55555556, 139.5       , 202.87755102]),
 array([12469.44444444,  2691.015625  , 41893.9024    ,  8504.9382716 ,
         7211.4064    ,   493.82716049,  5402.25      ,  7876.5625    ,
        28723.4704    ,  1936.        ,  6907.45679012,  3306.25      ,
         2136.49382716,  6113.28515625, 25946.84594574,  6965.72808838,
      