In [None]:
# conda install --channel conda-forge esda

# Spatial Statistics: Spatial Autocorrelation

## Overview
In this lecture, we will learn spatial autocorrelation with two well-known methods: Moran's I and Local Indicator of Spatial Association (LISA). 
* **Global Moran's I** demonstrates how geographical phenomena are correlated over space, meaning whether closer things are more related than distant things. The method provides an index with the range -1 to 1; namely, -1 is a strong negative spatial autocorrelation and 1 is a strong positive spatial autocorrelation. 
* While Global Moran's I only provides one index to demonstrate spatial autocorrelation, **Local Indicator of Spatial Association (LISA)**, also known as Local Moran's I, explains where high (i.e., HH Cluster) and low (i.e., LL Cluster) values are clustered. 

This document uses the following new packages:
* `PySAL` is the overarching project and it is a name of a Python library for spatial data science. 
* `libpysal` provides foundational algorithms and data structures that support the rest of the `PySAL` library. 
* `esda` provides exploratory spatial data analysis methods, including Moran's I and LISA, under the umbrella of `PySAL`.
* `tqdm` is a library for progress bars and is used to show the progress of the loop in the code.

In [None]:
import esda
import geopandas as gpd
import libpysal
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
from tqdm import tqdm
# warnings.filterwarnings('ignore')

## Spatial Autocorrelation: Global Moran's I

Spatial autocorrelation investigates how geographical phenomena **are spatially related** to each other based on Tobler's First Law of Geography: everything is related to everything else, but near things are more related than distant things. There are several indices that indicate the degree of spatial autocorrelation (e.g., Geary's C or Getis-Ord Gi*). Here, we study Moran's I, which is the most well-known method. 

$$I = \frac{n}{W} \frac{\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}(x_i - \overline{x})(x_j - \overline{x})}{\sum_{i=1}^{n}(x_i - \overline{x})^2}$$

where <br>
$n$ - the number of objects in space, <br>
$W$ - the sum of spatial weights, <br>
$w$ - a spatial weight for a pair of objects, <br>
$x_i, x_j$ - values of an attribute for objects i and j, <br>
$\overline{x}$ - a mean value of an attribute. <br>




In [None]:
# Load the study-area polygons; later cells read the 'ADM_NM', 'ex_ratio' and 'geometry' columns.
gdf = gpd.read_file('./data/sgg_income_ex_ratio.geojson')
gdf

In [None]:
# Choropleth of the extinction ratio ('ex_ratio'), classed with natural breaks
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 15))

choropleth_kwargs = dict(column='ex_ratio', scheme='NaturalBreaks', cmap='Blues', legend=True)
gdf.plot(ax=ax, **choropleth_kwargs)

### Backbone of calculating Moran's I with Python

```python
import libpysal
import esda

## 1. Calculate weights (w) of geographical units
w = libpysal.weights.Queen() # based on Queen's case contiguity
w = libpysal.weights.DistanceBand() # based on a fixed distance

## 2. Define value to calculate spatial autocorrelation
y = df['Variable']

## 3. Calculate the final index
mi = esda.moran.Moran(y, w)
print(mi.I) # Moran's I value
print(mi.p_norm) # p-value of the current Moran's I 
print(mi.z_norm) # Z Score of the current Moran's I 

```

### Compute weights of a geographical unit over the other geographical units
#### Based on contiguity: Queen's case

For more information, visit <a href=https://pysal.org/libpysal/generated/libpysal.weights.Queen.html>libpysal.weights.Queen</a>.

In [None]:
# Bounding box of every unit whose ADM_NM starts with '서울' (Seoul); used below to zoom the weights map.
xmin, ymin, xmax, ymax = gdf.loc[gdf['ADM_NM'].str.startswith('서울')].union_all().bounds

In [None]:
# Queen's case contiguity weights, keyed by the GeoDataFrame index
w_queen = libpysal.weights.Queen.from_dataframe(gdf, use_index=True)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))

# Unit boundaries as a backdrop
gdf.boundary.plot(ax=ax, ls=':', color='black')

# Draw only the neighbor links (node markers suppressed)
link_style = {'color': 'r', 'linestyle': ':', 'linewidth': 1}
node_style = {'marker': ''}
w_queen.plot(gdf, ax=ax, edge_kws=link_style, node_kws=node_style)

# Zoom in to the bounding box computed in the previous cell
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
plt.show()

You can use `.neighbors` attribute to check the neighbor of each geographical unit and `.weights` attribute to check their weights. For contiguity, weights are automatically assigned to equal values. 

In [None]:
# Weights of each unit's neighbors, keyed by unit id
w_queen.weights

In [None]:
# Neighbor ids of each unit, keyed by unit id
w_queen.neighbors

In [None]:
# Look up the row for Gangnam-gu (강남구); its index label is 124, which the following cells hard-code
gdf.loc[gdf['ADM_NM'].str.contains('강남구')]

In [None]:
# Neighbor ids of Gangnam-gu (강남구); 124 is its index label, hard-coded from the lookup above
w_queen.neighbors[124]

In [None]:
# Attribute rows of the Queen neighbors of Gangnam-gu (강남구, index label 124)
gdf.loc[w_queen.neighbors[124]]

In [None]:
# Cross-check the Queen neighbors geometrically: names of all units whose
# geometry intersects Gangnam-gu (강남구); the unit itself is included in the result.
is_gangnam = gdf['ADM_NM'].str.contains('강남구')
temp_geom = gdf.loc[is_gangnam, 'geometry'].values[0]

touches_gangnam = gdf['geometry'].intersects(temp_geom)
gdf.loc[touches_gangnam, 'ADM_NM'].values

In [None]:
# Weights of the neighbors of Gangnam-gu (강남구, index label 124), in the same order as w_queen.neighbors[124]
w_queen.weights[124]

#### Calculate Moran's I

To calculate Moran's I, you can simply enter the attribute of interest (i.e., `ex_ratio`) and weight matrix (i.e., `w_queen`) to <a href=https://pysal.org/esda/generated/esda.Moran.html>esda.moran.Moran()</a>.

In [None]:
# Variable of interest
y = gdf['ex_ratio']

# Global Moran's I under Queen's case contiguity
mi_queen = esda.moran.Moran(y, w_queen)

queen_i = round(mi_queen.I, 3)
queen_p = round(mi_queen.p_norm, 3)
print(f"Moran's I with Queen's case contiguity: {queen_i}, p-value: {queen_p}")

### Use of Fixed distance to calculate neighbors

The drawback of contiguity-based neighbors is that they do not consider distance decay when calculating weights. Here, we examine a way to incorporate a distance decay function. 

The distance decay function in this package is shown below. Here, the alpha value should be negative. The more negative alpha is, the faster the weight decays with distance.

$$w_{ij} = d_{ij}^\alpha$$

In [None]:
# The effect of the power (alpha) on distance decay: weight = distance ** alpha
x = np.linspace(1, 100, 100)

# One curve per exponent. The vectorized power on the array replaces three
# copy-pasted list comprehensions; the legend labels are unchanged.
for power in (-0.05, -0.1, -0.15):
    plt.plot(x, x ** power, label=f'alpha = {power}')

plt.xlabel('distance')
plt.ylabel('weight')
plt.legend(fontsize=15)
plt.show()

In [None]:
# Inspect the CRS. The distance threshold used below is expressed in CRS units,
# which are expected to be meters — confirm this in the output.
gdf.crs

In [None]:
threshold_dist = 50000 # distance band (50km)
alpha_val = -0.1 # the power of distance decay function, should be negative value. 

# Centroid coordinates of each geographical unit, computed in one vectorized
# call instead of a row-by-row apply. The result is a list of (x, y) tuples in row order.
centroids = gdf['geometry'].centroid
coords = list(zip(centroids.x, centroids.y))

# Distance-decay weights: w_ij = d_ij ** alpha for every pair within the band.
# Units are identified by position (0..n-1), so id 124 below matches the
# index label 124 only while gdf keeps its default integer index.
w = libpysal.weights.DistanceBand(data=coords, # coordinates of each geographical unit
                                  threshold=threshold_dist, # distance band
                                  binary=False, # False -> use distance decay instead of 0/1 weights
                                  alpha=alpha_val, # distance decay parameter for weight (default -1.0)
                                #   silence_warnings=True
                                 )
print(w.weights[124])
print(w.neighbors[124])

In [None]:
len(w.neighbors[124]) # number of neighbors of unit 124 within the distance band

In [None]:
# Distribution of the distance-decay weights of unit 124's neighbors
fig, ax = plt.subplots()
ax.hist(w.weights[124], bins=20)
plt.show()

In [None]:
# The average number of neighbors per unit within the distance band
neighbor_counts = [len(ids) for ids in w.neighbors.values()]
sum(neighbor_counts) / len(w.neighbors)

In [None]:
# Example of Gangneung (강릉) — presumably the first row (index 0), which the next cells inspect
gdf.head(3)

In [None]:
# Neighbor ids of unit 0 within the distance band
w.neighbors[0]

In [None]:
# The neighbors of Gangneung (강릉, unit 0). The ids are read from the weights
# object, so the cell stays correct if the threshold or the data change.
gdf.loc[w.neighbors[0]]

In [None]:
# Distance between Gangneung (강릉, unit 0) and its neighbors, centroid to centroid.
# Neighbor ids come from the weights object rather than a hard-coded list.
gdf.loc[0,'geometry'].centroid.distance(gdf.loc[w.neighbors[0]].centroid)

In [None]:
# Global Moran's I with the distance-band weights
mi = esda.moran.Moran(y, w)

# Raw results: Moran's I, its p-value and its z-score (normality assumption)
for statistic in (mi.I, mi.p_norm, mi.z_norm):
    print(statistic)

# Side-by-side summary of the distance-band and Queen's case results
summaries = [
    (f"Moran's I with {threshold_dist} meter radius", mi),
    ("Moran's I with Queen's case contiguity", mi_queen),
]
for label, moran in summaries:
    print(f"{label}: {round(moran.I, 3)}, p-value: {round(moran.p_norm, 3)}, z-score: {round(moran.z_norm, 3)}")

In [None]:
# Map the neighbor links produced by the distance band
fig, ax = plt.subplots(figsize=(10, 10))

# Unit boundaries as a backdrop
gdf.boundary.plot(ax=ax, ls=':', color='black')

# Links only; node markers are suppressed
band_link_style = {'color': 'blue', 'linestyle': ':', 'linewidth': 1}
w.plot(gdf, ax=ax, edge_kws=band_link_style, node_kws={'marker': ''})

plt.show()

## Local Indicators of Spatial Association (LISA): Local Moran's I

Moran's I is a characteristic of the complete spatial pattern and does not provide an indication of the location of the clusters. The concept of a local indicator of spatial association, or LISA, was suggested in Anselin (1995) to remedy this situation. A LISA is seen as having two important characteristics. 
1. It provides a statistic for each location with an assessment of significance. 
2. It establishes a proportional relationship between the sum of the local statistics and a corresponding global statistic.

Source: https://geodacenter.github.io/workbook/6a_local_auto/lab6a.html <br>
Anselin, Luc. 1995. “Local Indicators of Spatial Association — LISA.” Geographical Analysis 27: 93–115.

<a href=https://pysal.org/esda/generated/esda.Moran_Local.html>esda.moran.Moran_Local</a> will help you calculate LISA. It returns two important pieces of information in the `.q` and `.p_sim` attributes. `.q` provides the quadrant classification of each unit (1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'), and `.p_sim` provides the permutation-based pseudo p-value of each classification. 

In [None]:
# Local Moran's I (LISA) with Queen's case contiguity
# https://pysal.org/esda/generated/esda.Moran_Local.html
y = gdf['ex_ratio']  # variable of interest
w_queen = libpysal.weights.Queen.from_dataframe(gdf, use_index=True)  # contiguity weights

# The seed is fixed at 17 so the simulated p-values are the same on every run
lm_queen = esda.moran.Moran_Local(y, w_queen, seed=17)

# First the LISA quadrant codes, then the significance of each classification
for lisa_output in (lm_queen.q, lm_queen.p_sim):
    print(lisa_output)

In [None]:
# Quadrant code -> label for the values found in lm_queen.q
lm_dict = {1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'}

# Keep the quadrant label only where the p-value is below 0.05;
# everything else is marked as not significant.
lisa_queen = [
    lm_dict[quadrant] if p_value < 0.05 else 'Not_Sig'
    for quadrant, p_value in zip(lm_queen.q, lm_queen.p_sim)
]

gdf['lisa_queen'] = lisa_queen
gdf

In [None]:
# Colors for each LISA class
lisa_color = {'HH': 'red', 'LL': 'blue', 'HL': 'orange', 'LH': 'skyblue', 'Not_Sig': 'lightgrey'}

fig, ax = plt.subplots(figsize=(10, 10))

# Draw one layer per class. Classes with no units are skipped so that
# geopandas is never asked to plot an empty frame.
for key, color in lisa_color.items():
    subset = gdf.loc[gdf['lisa_queen'] == key]
    if not subset.empty:
        subset.plot(ax=ax, color=color)

# geopandas ignores `legend=True` when an explicit `color` is passed, so the
# legend is built by hand from the color dictionary.
legend_handles = [Patch(facecolor=color, label=key) for key, color in lisa_color.items()]
ax.legend(handles=legend_handles, title='LISA cluster', loc='lower right')

plt.show()

---
### *Exercise*

Let's investigate how the local indicators of spatial association (LISA) varies with different weight (i.e., the fixed bandwidth of 50000). Utilize the codes mentioned above and create a map of LISA. The following describes the steps you need to do.

* Lisa with Fixed Band Width (50000 meters; 50 km)
1. Extract points coordinates from the GeoDataFrame.
2. Calculate weight with `libpysal.weights.DistanceBand()` method. 
3. Run `esda.moran.Moran_Local()` to obtain LISA with `.q` and `.p_sim` attribute. 
4. Select label (i.e., `.q`) with a certain significance in `.p_sim` attribute.
5. Display the result

Check out the following websites for more information.
* https://pysal.org/esda/generated/esda.Moran_Local.html
* https://pysal.org/libpysal/generated/libpysal.weights.Queen.html
* https://pysal.org/libpysal/generated/libpysal.weights.DistanceBand.html


In [None]:
# Your code here 
# The display cell below reads gdf['lisa_dist'], so store the distance-band LISA labels in that column.

gdf


In [None]:
# Display your results here
# Side-by-side LISA maps: fixed bandwidth (left) vs Queen's case (right).
# Requires the 'lisa_dist' column (from the exercise) and the 'lisa_queen' column.

lisa_color = {'HH': 'red', 'LL': 'blue', 'HL': 'orange', 'LH': 'skyblue', 'Not_Sig': 'lightgrey'}
panels = {'lisa_dist': "Fixed bandwidth", 'lisa_queen': "Queen's case"}

fig, axes = plt.subplots(1,2, figsize=(15, 8))

for ax, (column, title) in zip(axes, panels.items()):
    # One layer per class; empty classes are skipped to avoid plotting an empty frame
    for key, color in lisa_color.items():
        subset = gdf.loc[gdf[column] == key]
        if not subset.empty:
            subset.plot(ax=ax, color=color)

    ax.set_title(title, fontsize=15)
    ax.get_xaxis().set_visible(False)  # Remove ticks and labels
    ax.get_yaxis().set_visible(False)  # Remove ticks and labels

# geopandas ignores `legend=True` when an explicit `color` is passed, so the
# shared legend is built by hand from the color dictionary.
legend_handles = [Patch(facecolor=color, label=key) for key, color in lisa_color.items()]
axes[1].legend(handles=legend_handles, title='LISA cluster', loc='lower right')

plt.tight_layout()

plt.show()

## Challenge: manually write codes for Moran's I

Here we want to challenge ourselves to write codes for calculating Moran's I. As mentioned earlier, the equation looks like the one below. 

$$I = \frac{n}{W} \frac{\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}(x_i - \overline{x})(x_j - \overline{x})}{\sum_{i=1}^{n}(x_i - \overline{x})^2}$$

where <br>
$n$ - the number of objects in space, <br>
$W$ - the sum of spatial weights, <br>
$w$ - a spatial weight for a pair of objects, <br>
$x_i, x_j$ - values of an attribute for objects i and j, <br>
$\overline{x}$ - a mean value of an attribute. <br>

It may sound very challenging, but you can achieve it by following the steps below. 
1. $\overline{x}$: Calculate the mean of the variable interested (Extinction Ratio in our case). <br><br>
2. $(x_i - \overline{x})$: Calculate for each object a difference between single value and a mean. <br><br>
3. ${\sum_{i=1}^{n}(x_i - \overline{x})^2}$: Square each difference calculated at the previous step and to get a sum of these squares. <br><br>
4. $w_{ij} = d_{ij}^\alpha$: Calculate the distance decay of each pair of locations using the power $\alpha$ (the code below uses $\alpha = -0.1$, the same value as in the `DistanceBand` example above). <br><br>
5. $W = \sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}$: Sum the spatial weights. <br><br>
6. $\frac{n}{W}$: Simply divide the number of objects ($n$) by the sum of spatial weights ($W$). <br><br>
7. $\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}(x_i - \overline{x})(x_j - \overline{x})$: Finalize the numerator <br><br>
8. Combine all functions together. 


### Step 1 — 3

In [None]:
# Step 1: mean of the variable of interest
mean_score = gdf['ex_ratio'].mean()
print(mean_score)

# Step 2: deviation from the mean, shown for the first unit
print(gdf.at[0, 'ex_ratio'] - mean_score)

# Step 3: sum of squared deviations (the denominator of Moran's I)
deviations = [gdf.at[i, 'ex_ratio'] - mean_score for i in range(gdf.shape[0])]

square_diff = 0
for deviation in deviations:
    square_diff += deviation ** 2

print(square_diff)

### Step 4 | $w_{ij} = d_{ij}^\alpha$

Let's calculate the distance decay of each pair of locations i and j, if the distance between two places is less than a threshold bandwidth. Suppose we want to formulate the data structure as shown below. 

```python
{i_1: {j_1: distance_decay_1,
       j_2: distance_decay_2,
       j_3: distance_decay_3,
       ...
     }
 ...
 i_87: {j_1: distance_decay_1,
        j_2: distance_decay_2,
        j_3: distance_decay_3,
        ...
       }
}

```

In [None]:
# Manual distance-decay weights: _w[i][j] = d_ij ** alpha for every pair (i, j)
# whose centroid distance is within the threshold.
_w = {}
threshold_dist = 50000
alpha = -0.1

# Compute every centroid once instead of recomputing two centroids for each
# of the n*n pairs inside the loop.
n_units = gdf.shape[0]
centroids = [gdf.at[i, 'geometry'].centroid for i in range(n_units)]

for i in tqdm(range(n_units)):
    temp_dict = {}
    for j in range(n_units):
        if i == j:
            continue  # a unit is not its own neighbor
        temp_dist = centroids[i].distance(centroids[j])
        if temp_dist <= threshold_dist:
            temp_dict[j] = temp_dist ** alpha

    _w[i] = temp_dict

_w

In [None]:
# Validation: the manual weights of unit 0 should match libpysal's DistanceBand

# Centroid coordinates as a list of (x, y) tuples in row order, computed in one
# vectorized call instead of a row-by-row apply.
centroid_points = gdf['geometry'].centroid
coords = list(zip(centroid_points.x, centroid_points.y))

# Calculate weights of each geographical unit based on the distance decay method.
w = libpysal.weights.DistanceBand(data=coords, # coordinates of each geographical unit
                                  threshold=threshold_dist, # threshold bandwidth
                                  binary=False, # False -> use distance decay instead of 0/1 weights
                                  alpha=alpha, # distance decay parameter for weight (default -1.0)
                                #   silence_warnings=True
                                 )
print(w.neighbors[0])
print(w.weights[0])

# libpysal result and manual result for unit 0, both as {neighbor: weight}
print("-------")
print(dict(zip(w.neighbors[0], w.weights[0])))
print(_w[0])

In [None]:
from copy import deepcopy

In [None]:
# Row-standardize: divide each weight by the sum of its row, so the weights of
# every unit that has neighbors sum to 1.
__w = deepcopy(_w)

for i in tqdm(_w.keys()):
    # The row sum does not depend on j, so it is computed once per row.
    temp_sum = sum(_w[i].values())
    for j in _w[i].keys():
        __w[i][j] = _w[i][j] / temp_sum

In [None]:
# Unit 0 before and after standardization: the weights, then their sum
separator = "-------"
print(
    _w[0],
    separator,
    sum(_w[0].values()),
    separator,
    __w[0],
    separator,
    sum(__w[0].values()),
    sep="\n",
)

In [None]:
# Reassign the standardized weights of distance decay.
# From here on _w and __w refer to the same dict, so the steps below use the standardized weights.
_w = __w

### Step 5 | $W = \sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}$

In [None]:
# W: total of all spatial weights, accumulated row by row
W = 0
for i in range(gdf.shape[0]):
    for weight_ij in _w[i].values():
        W += weight_ij

print(W)

### Step 6 | $\frac{n}{W}$

In [None]:
# n: number of spatial units; n / W is the scaling factor in front of Moran's I
n = gdf.shape[0]

n/W

### Step 7 | $\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}(x_i - \overline{x})(x_j - \overline{x})$

In [None]:
# Numerator of Moran's I: sum over all neighbor pairs of
# w_ij * (x_i - mean) * (x_j - mean)
numerator = 0

for i in tqdm(range(gdf.shape[0])):
    # The deviation of unit i does not depend on j, so it is computed once per row.
    diff_1 = gdf.at[i, 'ex_ratio'] - mean_score
    for j, weight_ij in _w[i].items():
        diff_2 = gdf.at[j, 'ex_ratio'] - mean_score

        numerator += weight_ij * diff_1 * diff_2

print(numerator)

### Step 8: Finalize Moran's I 
$$I = \frac{n}{W} \frac{\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}(x_i - \overline{x})(x_j - \overline{x})}{\sum_{i=1}^{n}(x_i - \overline{x})^2}$$

In [None]:
# Moran's I: the n/W scaling factor times the ratio of the weighted
# cross-products (numerator) to the sum of squared deviations (square_diff)
I = (n/W) * (numerator / square_diff)
I

In [None]:
# Validation: compare the manually computed Moran's I with esda's result

# Centroid coordinates as a list of (x, y) tuples in row order, computed in one
# vectorized call instead of a row-by-row apply.
centroid_points = gdf['geometry'].centroid
coords = list(zip(centroid_points.x, centroid_points.y))

# Calculate weights of each geographical unit based on the distance decay method.
w = libpysal.weights.DistanceBand(data=coords, # coordinates of each geographical unit
                                  threshold=threshold_dist, # threshold bandwidth
                                  binary=False, # False -> use distance decay instead of 0/1 weights
                                  alpha=alpha, # distance decay parameter for weight (default -1.0)
                                #   silence_warnings=True
                                 )
# Focused variable
y = gdf['ex_ratio']

# Calculate Moran's I
mi = esda.moran.Moran(y, w)

# Print results
print(mi.I) # Moran's I value
print(mi.p_norm) # p-value of the current Moran's I 
print(mi.z_norm) # Z Score of the current Moran's I 

print("---------------")
print(f"Moran's I with {threshold_dist} meter radius: {round(mi.I, 3)}, p-value: {round(mi.p_norm, 3)}, z-score: {round(mi.z_norm, 3)}")
print("---------------")

# Put the manual result next to esda's so the validation is explicit.
print(f"Manual Moran's I: {I}, matches esda: {np.isclose(I, mi.I)}")

