# See the section after the copy-pasted class for a detailed walkthrough of how the join counts are calculated!

In [1]:
# examining pysal esda implementation of global join counts

"""
Spatial autocorrelation for binary attributes
"""
__author__ = "Sergio J. Rey <srey@asu.edu> , Luc Anselin <luc.anselin@asu.edu>"

from libpysal.weights.spatial_lag import lag_spatial
from esda.tabular import _univariate_handler # change from .tabular to esda.tabular when working on independent machine
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import numpy as np
import pandas as pd

__all__ = ['Join_Counts']

PERMUTATIONS = 999


class Join_Counts(object):
    """Binary Join Counts

    Parameters
    ----------
    y               : array
                      binary variable measured across n spatial units
    w               : W
                      spatial weights instance
    permutations    : int
                      number of random permutations for calculation of pseudo-p_values

    Attributes
    ----------
    y            : array
                   original variable
    w            : W
                   original w object
    adj_list     : DataFrame
                   adjacency list of w with symmetric duplicates removed
    permutations : int
                   number of permutations
    bb           : float
                   number of black-black joins
    ww           : float
                   number of white-white joins
    bw           : float
                   number of black-white joins (black-white and white-black
                   pairs are counted together)
    J            : float
                   number of joins
    autocorr_pos : float
                   number of concordant joins (bb + ww)
    autocorr_neg : float
                   number of discordant joins (bw)
    sim_bb       : array
                   (if permutations>0)
                   vector of bb values for permuted samples
    p_sim_bb     : float
                   (if permutations>0)
                   p-value based on permutations (one-sided)
                   null: spatial randomness
                   alternative: the observed bb is greater than under randomness
    mean_bb      : float
                   (if permutations>0)
                   average of permuted bb values
    min_bb       : float
                   (if permutations>0)
                   minimum of permuted bb values
    max_bb       : float
                   (if permutations>0)
                   maximum of permuted bb values
    sim_bw       : array
                   (if permutations>0)
                   vector of bw values for permuted samples
    p_sim_bw     : float
                   (if permutations>0)
                   p-value based on permutations (one-sided)
                   null: spatial randomness
                   alternative: the observed bw is greater than under randomness
    mean_bw      : float
                   (if permutations>0)
                   average of permuted bw values
    min_bw       : float
                   (if permutations>0)
                   minimum of permuted bw values
    max_bw       : float
                   (if permutations>0)
                   maximum of permuted bw values
    sim_chi2     : array
                   (if permutations>0)
                   vector of chi2 values for permuted samples
    sim_autocurr_pos, sim_autocorr_pos : array
                   (if permutations>0)
                   vector of bb + ww values for permuted samples; the
                   misspelled name is kept for backward compatibility
    sim_autocurr_neg, sim_autocorr_neg : array
                   (if permutations>0)
                   vector of bw values for permuted samples; the
                   misspelled name is kept for backward compatibility
    sim_autocorr_chi2 : float
                   (if permutations>0)
                   upper-tail probability, under a chi-square distribution
                   with 1 degree of freedom, of the Pearson statistic that
                   compares autocorr_pos and autocorr_neg with the means of
                   their permuted values
    p_sim_autocorr_pos : float
                   (if permutations>0)
                   one-sided pseudo p-value for autocorr_pos
    p_sim_autocorr_neg : float
                   (if permutations>0)
                   one-sided pseudo p-value for autocorr_neg
    chi2         : float
                   Chi-square statistic on contingency table for join counts
    chi2_p       : float
                   Analytical p-value for chi2
    chi2_dof     : int
                   Degrees of freedom for analytical chi2
    crosstab     : DataFrame
                   Contingency table for observed join counts
    expected     : DataFrame
                   Expected contingency table for the null
    p_sim_chi2   : float
                   p-value for chi2 under random spatial permutations

    Examples
    --------
    >>> import numpy as np
    >>> import libpysal
    >>> w = libpysal.weights.lat2W(4, 4)
    >>> y = np.ones(16)
    >>> y[0:8] = 0
    >>> np.random.seed(12345)
    >>> from esda.join_counts import Join_Counts
    >>> jc = Join_Counts(y, w)
    >>> jc.bb
    10.0
    >>> jc.bw
    4.0
    >>> jc.ww
    10.0
    >>> jc.J
    24.0
    >>> len(jc.sim_bb)
    999
    >>> round(jc.p_sim_bb, 3)
    0.003
    >>> round(np.mean(jc.sim_bb), 3)
    5.547
    >>> np.max(jc.sim_bb)
    10.0
    >>> np.min(jc.sim_bb)
    0.0
    >>> len(jc.sim_bw)
    999
    >>> jc.p_sim_bw
    1.0
    >>> np.mean(jc.sim_bw)
    12.811811811811811
    >>> np.max(jc.sim_bw)
    24.0
    >>> np.min(jc.sim_bw)
    7.0
    >>> round(jc.chi2_p, 3)
    0.004
    >>> jc.p_sim_chi2
    0.002

    Notes
    -----
    Technical details and derivations can be found in :cite:`cliff81`.

    Constructing an instance sets ``w.transformation = 'b'`` on the weights
    object that was passed in; the caller's ``w`` is modified, not a copy.
    """
    def __init__(self, y, w, permutations=PERMUTATIONS):
        y = np.asarray(y).flatten()
        # Side effect: the transformation is set on the caller's object.
        w.transformation = 'b'  # ensure we have binary weights
        self.w = w
        # Adjacency list (focal, neighbor, weight) with the reversed duplicate
        # of every symmetric pair dropped, so each join appears only once.
        self.adj_list = self.w.to_adjlist(remove_symmetric=True)
        self.y = y
        self.permutations = permutations
        self.J = w.s0 / 2.

        (self.bb, self.ww, self.bw, self.chi2, self.chi2_p,
         self.chi2_dof, expected, observed) = self.__calc(self.y)
        self.autocorr_pos = self.bb + self.ww
        self.autocorr_neg = self.bw

        self.crosstab = self._label_table(observed)
        self.expected = self._label_table(expected)
        self.calc = self.__calc

        if permutations:
            draws = []
            while len(draws) < permutations:
                try:
                    draws.append(self.__calc(np.random.permutation(self.y)))
                except ValueError:
                    # expected count of 0 -> inadmissible; draw again so that
                    # exactly `permutations` admissible samples are kept
                    continue

            # Pull each scalar statistic into its own float vector instead of
            # packing the mixed scalar/array result tuples into one object array.
            self.sim_bb = np.array([d[0] for d in draws], dtype=float)
            sim_ww = np.array([d[1] for d in draws], dtype=float)
            self.sim_bw = np.array([d[2] for d in draws], dtype=float)
            self.sim_chi2 = np.array([d[3] for d in draws], dtype=float)

            self.min_bb = np.min(self.sim_bb)
            self.mean_bb = np.mean(self.sim_bb)
            self.max_bb = np.max(self.sim_bb)
            self.min_bw = np.min(self.sim_bw)
            self.mean_bw = np.mean(self.sim_bw)
            self.max_bw = np.max(self.sim_bw)

            # Historical (misspelled) attribute names, kept for compatibility.
            self.sim_autocurr_pos = self.sim_bb + sim_ww
            self.sim_autocurr_neg = self.sim_bw
            # Correctly spelled aliases, consistent with autocorr_pos/neg.
            self.sim_autocorr_pos = self.sim_autocurr_pos
            self.sim_autocorr_neg = self.sim_autocurr_neg

            # Pearson statistic sum((O - E)**2 / E) over the two categories
            # (concordant, discordant); each term is scaled by its OWN
            # expectation, estimated by the mean over the permuted samples.
            exp_pos = np.mean(self.sim_autocurr_pos)
            exp_neg = np.mean(self.sim_autocurr_neg)
            stat = ((self.autocorr_pos - exp_pos) ** 2 / exp_pos +
                    (self.autocorr_neg - exp_neg) ** 2 / exp_neg)
            # Upper-tail probability with 1 degree of freedom.
            self.sim_autocorr_chi2 = chi2.sf(stat, 1)

            self.p_sim_bb = self.__pseudop(self.sim_bb, self.bb)
            self.p_sim_bw = self.__pseudop(self.sim_bw, self.bw)
            self.p_sim_chi2 = self.__pseudop(self.sim_chi2, self.chi2)
            self.p_sim_autocorr_pos = self.__pseudop(self.sim_autocurr_pos,
                                                     self.autocorr_pos)
            self.p_sim_autocorr_neg = self.__pseudop(self.sim_autocurr_neg,
                                                     self.autocorr_neg)

    @staticmethod
    def _label_table(values):
        """Wrap a 2x2 table in a DataFrame labelled W/B by focal (rows) and neighbor (columns)."""
        labels = ['W', 'B']
        return pd.DataFrame(data=values,
                            index=pd.Index(labels, name='Focal'),
                            columns=pd.Index(labels, name='Neighbor'))

    def __calc(self, z):
        """Compute the join counts and contingency-table test for the values z.

        Parameters
        ----------
        z : array
            binary values, one per id in ``self.w.id_order``

        Returns
        -------
        tuple
            ``(bb, ww, bw + wb, stat, pvalue, dof, expected, table)`` where the
            last four statistics come from ``chi2_contingency`` called with its
            default arguments on ``[[ww, wb], [bw, bb]]`` and ``table`` is that
            observed table as an array.

        Raises
        ------
        ValueError
            propagated from ``chi2_contingency`` (the permutation loop in
            ``__init__`` treats this as an inadmissible sample).
        """
        adj_list = self.adj_list
        zseries = pd.Series(z, index=self.w.id_order)
        # Value at each end of every join in the adjacency list.
        focal = zseries.loc[adj_list.focal].values
        neighbor = zseries.loc[adj_list.neighbor].values
        same = focal == neighbor
        differ = 1 - same
        bb = (focal * same).sum()          # both ends 1
        ww = ((1 - focal) * same).sum()    # both ends 0
        bw = (focal * differ).sum()        # focal 1, neighbor 0
        wb = ((1 - focal) * differ).sum()  # focal 0, neighbor 1
        table = [[ww, wb],
                 [bw, bb]]
        stat, pvalue, dof, expected = chi2_contingency(table)
        return (bb, ww, bw + wb, stat, pvalue, dof, expected, np.array(table))

    def __pseudop(self, sim, jc):
        """Return the one-sided pseudo p-value of jc against the simulated values.

        Counts the simulated values that are greater than or equal to ``jc``
        and returns ``(count + 1) / (self.permutations + 1)``.
        """
        larger = np.sum(np.asarray(sim) >= jc)
        return (larger + 1.) / (self.permutations + 1.)

    @property
    def _statistic(self):
        """The statistic reported for this class: the black-white join count."""
        return self.bw

    @classmethod
    def by_col(cls, df, cols, w=None, inplace=False, pvalue='sim', outvals=None, **stat_kws):
        """
        Function to compute a Join_Count statistic on a dataframe

        Arguments
        ---------
        df          :   pandas.DataFrame
                        a pandas dataframe with a geometry column
        cols        :   string or list of string
                        name or list of names of columns to use to compute the statistic
        w           :   pysal weights object
                        a weights object aligned with the dataframe. If not provided, this
                        is searched for in the dataframe's metadata
        inplace     :   bool
                        a boolean denoting whether to operate on the dataframe inplace or to
                        return a series contaning the results of the computation. If
                        operating inplace, the derived columns will be named
                        'column_join_count'
        pvalue      :   string
                        a string denoting which pvalue should be returned. Refer to the
                        the Join_Count statistic's documentation for available p-values.
                        Ignored (replaced by '') when outvals is not given.
        outvals     :   list of strings
                        list of arbitrary attributes to return as columns from the
                        Join_Count statistic. Defaults to
                        ['bb', 'p_sim_bw', 'p_sim_bb'].
        **stat_kws  :   keyword arguments
                        options to pass to the underlying statistic. For this, see the
                        documentation for the Join_Count statistic.

        Returns
        --------
        If inplace, None, and operation is conducted on dataframe in memory. Otherwise,
        returns a copy of the dataframe with the relevant columns attached.
        """
        if outvals is None:
            outvals = ['bb', 'p_sim_bw', 'p_sim_bb']
            # With the default outputs the p-values are already requested as
            # explicit columns, so the pvalue selector is blanked.
            pvalue = ''
        return _univariate_handler(df, cols, w=w, inplace=inplace, pvalue=pvalue,
                                   outvals=outvals, stat=cls,
                                   swapname='bw', **stat_kws)

# Working through the docstring example

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import numpy as np
import libpysal
# Create the weights for a 4x4 lattice (16 spatial units)
w = libpysal.weights.lat2W(4, 4)
# Show the neighbor dictionaries of a few units
print(w[0])
print(w[1])
print(w[2])
print(w[3])
print(w[4])
print(w[5])
# ...
print(w[15])
# Create a vector of 16 ones
y = np.ones(16)
print('original y', y)
# Set the first 8 of the ones (indices 0-7) to 0
y[0:8] = 0
print('adulterated y', y)


{4: 1.0, 1: 1.0}
{0: 1.0, 5: 1.0, 2: 1.0}
{1: 1.0, 6: 1.0, 3: 1.0}
{2: 1.0, 7: 1.0}
{0: 1.0, 8: 1.0, 5: 1.0}
{1: 1.0, 4: 1.0, 9: 1.0, 6: 1.0}
{11: 1.0, 14: 1.0}
original y [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
adulterated y [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]


In [4]:
# Seed NumPy's global random state before the permutations are drawn
np.random.seed(12345)
# NOTE: this import rebinds the name Join_Counts, so the class pasted into the
# first cell is shadowed by the installed esda implementation from here on
from esda.join_counts import Join_Counts
jc = Join_Counts(y,w)

In [5]:
# print number of black-black (1-1) joins
print(jc.bb)
# print number of white-white (0-0) joins
print(jc.ww)
# print number of discordant joins (black-white and white-black counted together)
print(jc.bw)


10.0
10.0
4.0


# So what is going on here?

In [6]:
# print out the input vector y and the weights object w
print(y)
print(w)

[0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
<libpysal.weights.weights.W object at 0x20E72250>


In [8]:
# The function begins like many other pysal functions

# Flatten the input vector y
y = np.asarray(y).flatten()

# ensure weights are binary transformed
w.transformation = 'b' 

# 'new' step to me: the adjacency list
# this creates a table of unique focal-neighbor pairs. 
# remove_symmetric=True ensures that there are no duplicated (but reversed) adjacency pairs
adj_list = w.to_adjlist(remove_symmetric=True) 
print(adj_list)
print(w[0])

# From this list we can validate neighbors. For example, in our 
# 4x4 grid, we know that the upper-left hand corner of the grid (w[0]) 
# only touches its right neighbor and its bottom neighbor (note: we are NOT using queen contiguity in this example). 
# Thus, the weights entry for unit 0 captures these relationships, and they are reflected in the adj_list table: 
# see the rows [0 1 1.0] and [4 0 1.0]


    focal  neighbor  weight
1       0         1     1.0
3       1         5     1.0
5       2         1     1.0
7       2         3     1.0
10      4         0     1.0
12      4         5     1.0
16      5         6     1.0
17      6         2     1.0
21      7         3     1.0
22      7         6     1.0
23      7        11     1.0
24      8         4     1.0
25      8        12     1.0
27      9         5     1.0
28      9         8     1.0
31     10         6     1.0
32     10         9     1.0
33     10        14     1.0
34     10        11     1.0
37     11        15     1.0
39     12        13     1.0
40     13         9     1.0
44     14        13     1.0
45     14        15     1.0
{4: 1.0, 1: 1.0}


Now that we have an adjacency setup and values in `y`, we need to calculate the various bb, bw, and ww combinations. `Join_Counts` does this in a series of clever maneuvers. 

In [9]:
# First, set up a series that holds the y values (self.y inside the class), indexed by the ids in w.id_order
zseries = pd.Series(y, index=w.id_order)
# Next, look up the y value of the focal (i) unit of every row of the adjacency list
focal = zseries.loc[adj_list.focal].values
# Repeat the lookup for the neighbor (j) unit of every row
neighbor = zseries.loc[adj_list.neighbor].values

Let's spend a bit of time examining what exactly is going on here. The `focal` object has length 24, which is the same as the number of rows in the `adj_list` printed above (one entry per join). The command:

`focal = zseries.loc[adj_list.focal].values`

is using the **numerical value** of each focal id as an **index** into the `zseries` (i.e. the y values) object. This means that each focal is assigned the y value stored at the matching location in `zseries`. For example, a row whose focal id is 10 should get the `zseries[10]` value. Let's check:

In [10]:
# y value at position 16 of `focal`; focal id 10 fills positions 15-18 of the adjacency list
print(focal[16])
# Value of zseries at id 10
print(zseries[10])

1.0
1.0


This process is repeated based on neighbors:

`neighbor = zseries.loc[adj_list.neighbor].values`

Although the lookup is the same, the ids in the neighbor column are not sorted the way the focal ids are, so the resulting values follow a more irregular pattern. The `focal` and `neighbor` vectors therefore differ:

In [11]:
# y values at the focal end and at the neighbor end of each join, row by row
print(focal)
print(neighbor)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.]


Now that we have defined these nice vectors, how do we actually identify the different bb/ww/bw counts? Well, we first run an element-wise comparison that checks **whether each focal value equals its neighbor value** (captured in the `sim` object):

In [12]:
# True where the focal and neighbor values of a join are equal
sim = focal == neighbor
sim

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True, False,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True])

At first glance, this array of T/F may not seem like much. But, it is very powerful! If we combine it with the focal-neighbor information from before, we can **identify which focal-neighbor pairs are both equal to 1 (i.e. 1-1 or black-black), are both equal to 0 (i.e. 0-0 or white-white), or are discordant (i.e. 1-0 or 0-1, black-white).**

Before working through the logic of each, we need to create an object called `dif`. `dif` stands for difference and is computed as `1 - sim`: where `dif` is 0, the focal and neighbor values match (both 0 or both 1); where `dif` is 1, we know the pair is discordant.

In [13]:
# 1 where the focal and neighbor values differ, 0 where they match
dif = 1 - sim
dif

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0])

Now that we have all of our relevant elements defined, we can work through the logic of each bb/ww/bw/wb combination. 

**bb black-black 1-1**

`bb = (focal * sim).sum()`

We can get the count of bb values by multiplying the `focal` object (1/0s) by the `sim` object (T/Fs). The product is 1 only where the focal value is 1 **and** the focal and neighbor values agree, i.e. where both are 1. By taking the sum, we count exactly those 1 values. This provides us with a count of the focal-neighbor pairs that are both 1.

**ww white-white 0-0**

`ww = ((1-focal) * sim).sum()`

We first want to identify what parts of the focal are indeed equal to 0. To create this vector, we subtract each element of the `focal` vector from 1. This will result in a 1 value where the `focal` value is originally 0, and a 0 value where the `focal` value is originally 1. We can now multiply this 'flipped' `focal` vector by the `sim` object to identify where both the `focal` vector and `neighbor` vector have the same values (these values could be 0 or 1s, but now we have reduced the `focal` vector to only its instances of 0). As the agreements have now been recoded to 1, we can again take the sum to get a count of the focal-neighbors that are both 0. 

**bw black-white 1-0**

`bw = (focal * dif).sum()`

Note that the above equation is the same as the bb equation, except we swap in the `dif` vector. If the `dif` vector is equal to 1, we know the pairs are discordant. We multiply `focal` by `dif` to identify where original `focal` values of 1 do not agree with neighbor values (i.e. 0). This is a series of 1 times 0 or 1 operations. The resulting temporary vector identifies where the original `focal` value is 1 and the neighbor is 0, and we take the sum of these instances to get the `bw` counts. 

**wb white-black 0-1**

`wb = ((1-focal) * dif).sum()`

Note that the above equation is the same as the ww equation, except we swap in the `dif` vector. We first want to identify what parts of the focal are indeed equal to 0. To create this vector, we subtract each element of the `focal` vector from 1. This will result in a 1 value where the `focal` value is originally 0, and a 0 value where the `focal` value is originally 1. We can now multiply this 'flipped' `focal` vector by the `dif` vector to create a temporary vector where the original `focal` values of 0 (now coded as 1) agree with the `dif` vector. Again, the `dif` vector captures discordance generally, we use the flipped `focal` values to identify where the discordance is 0-1. We take the sum of these instances to get the `wb` counts. 

    If you want to visually confirm the process, these commands render the `wb` counts step by step:

    `print((1-focal))`

    `print(dif)`

    `print((1-focal)*dif) # we can see that the sum of this vector would be 1`


In [14]:
# Calculate each of the combinations of interest (bb, ww, bw, wb) and print each count
bb = (focal * sim).sum(); print(bb)
ww = ((1-focal) * sim).sum(); print(ww)
bw = (focal * dif).sum(); print(bw)
wb = ((1-focal) * dif).sum(); print(wb)

10.0
10.0
3.0
1.0


We can then input these values into a 2x2 table and use an existing SciPy function called `chi2_contingency` to get some inference values. **Note that bw and wb are reported TOGETHER!**

In [15]:
# Observed 2x2 table: rows are the focal value (W, B), columns the neighbor value (W, B)
table = [[ww, wb],
        [bw, bb]]
# Feed into the chi2_contingency function. The result gets its own name so that
# the `chi2` distribution imported from scipy.stats is not overwritten.
chi2_result = chi2_contingency(table)
# Extract values from object
stat, pvalue, dof, expected = chi2_result
# Print
print(bb, 
      ww, 
      bw+wb, 
      stat, 
      pvalue, 
      dof, 
      expected, 
      np.array(table))


10.0 10.0 4.0 8.479632255856034 0.003591446953916693 1 [[5.95833333 5.04166667]
 [7.04166667 5.95833333]] [[10.  1.]
 [ 3. 10.]]


# Concluding thought

Quite an interesting breakdown! There is still a bit to go with regard to identifying how to exploit this approach for the OLJC, but I feel much better having walked through this example.

