# K-Means Clustering 
### Using Satellite Images

Author: Kanika Chopra

In [43]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import rasterio as rio

from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

## Importing the Image
We begin with some analysis of the satellite image that we have.

In [44]:
# Import image with rasterio
input_path = r"C:\Users\RQ\Documents\PCA\kmeans-b8.tif"
sat_data = rio.open(input_path)

In [45]:
# Calculating the dimensions of the image on Earth in metres
width_in_projected_units = sat_data.bounds.right - sat_data.bounds.left
height_in_projected_units = sat_data.bounds.top - sat_data.bounds.bottom

print('Width: {}, Height: {}'.format(width_in_projected_units, height_in_projected_units))

Width: 15920.0, Height: 11160.0


In [46]:
# Number of bands
print('Bands: {}'.format(sat_data.count))

# Sequence of band indexes
print(sat_data.indexes)

Bands: 1
(1,)


## Data Preprocessing

In [47]:
src_meta = sat_data.meta

src_meta

{'driver': 'GTiff',
 'dtype': 'uint16',
 'nodata': None,
 'width': 1592,
 'height': 1116,
 'count': 1,
 'crs': CRS.from_dict(init='epsg:32630'),
 'transform': Affine(10.0, 0.0, 276690.0,
        0.0, -10.0, 4008360.0)}

In [48]:
img = sat_data.read()

img.shape

(1, 1116, 1592)

In [49]:
n_bands = src_meta['count']
height = src_meta['height']
width = src_meta['width']

n_bands, height, width

(1, 1116, 1592)

### Reshaping Array

In [50]:
# We want to convert (n_bands, height,width) into an array that is (n_pixel, n_bands) where n_pixel = height * width
flattened_img = img.reshape(n_bands, -1)
flattened_img = flattened_img.T 
flattened_img.shape

(1776672, 1)

In [52]:
flattened_img

array([[2268],
       [2696],
       [2471],
       ...,
       [ 156],
       [ 150],
       [ 152]], dtype=uint16)

In [53]:
# The user will input the number of clusters, but for our case let's try out 5 
n_clusters = 2 

### Standardizing Data
We need to remove the null data from the image and then standardize the data so we can use the non-nulls for the PCA.

In [54]:
# Mask all NaNs
notnull_array = flattened_img[~np.isnan(flattened_img).all(axis=1)]

In [55]:
# Create an empty array for the KMeans results
KMeans_results = np.ones(flattened_img.shape) * -1

KMeans_results

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [56]:
# First we standardize the not null data for the KMeans Clustering 
std_array = scale(notnull_array, axis=0)

std_array.shape

(1776672, 1)

In [57]:
# Let's find out how many rows of NaNs we dropped
flattened_img.shape[0] - std_array.shape[0]

0

So in this case, we did not have any nulls in the data but we should always check because some satellite images do.

## K-means Clustering

In [59]:
# K Means 
Kmean = KMeans(n_clusters=n_clusters, init='k-means++')

Kmean.fit(std_array)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

## Evaluation Metrics
Let's check the inertia

In [61]:
Kmean.inertia_

264580.4260529023

In [63]:
X_clustered = Kmean.labels_
X_clustered = X_clustered.reshape(img.shape)

X_clustered

array([[[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]])

In [68]:
dst_meta = src_meta.copy()
dst_meta['dtype'] = 'int32'

# Open a new file in 'write' mode and unpack (**) the destination metadata
dst_fp = r"C:\Users\RQ\Documents\PCA\kmeans-b8-finished.tif"
with rio.open(dst_fp, 'w', **dst_meta) as dst:
    dst.write(X_clustered)

And we are done with completing K-means clustering on our satellite image! 