## K-Means Clustering

Author: Kanika Chopra

Date: December 5, 2019

In [1]:
import math
import rasterio as rio

import numpy as np

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans

# For the mathematical side of PCA
import operator

### Importing the Image

In [2]:
image_file = r"C:\Users\RQ\Documents\PCA\GC_Landsat8_SR.tif"
sat_data = rio.open(image_file)

#### Calculating the dimensions of the image on Earth in metres

In [3]:
width_in_projected_units = sat_data.bounds.right - sat_data.bounds.left
height_in_projected_units = sat_data.bounds.top - sat_data.bounds.bottom

print('Width: {}, Height: {}'.format(width_in_projected_units, height_in_projected_units))

Width: 0.08556453081239113, Height: 0.1691078522355003


#### Convert pixel co-ordinates to longtidues and latitudes

In [4]:
# Upper left pixel
row_min = 0
col_min = 0

# Lower right pizel. Rows and colums are zero indexing.
row_max = sat_data.height - 1 
col_max = sat_data.width - 1 

# Transform coordinates with the dataset's affine transformation
topleft = sat_data.transform * (row_min, col_min)
botright = sat_data.transform * (row_max, col_max)

print("Top left corner coordinates: {}".format(topleft))
print("Bottom right corner coordinates: {}".format(botright))

Top left corner coordinates: (-74.7413587747272, 7.166669505177131)
Bottom right corner coordinates: (-74.57238566978432, 7.081239721657364)


#### Bands
Let's check how many bands are in our image.

In [5]:
print('Bands: {}'.format(sat_data.count))

# Sequence of band indexes
print(sat_data.indexes)

Bands: 12
(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)


### Data Preprocessing

In [6]:
src_meta = sat_data.meta

src_meta

{'driver': 'GTiff',
 'dtype': 'float64',
 'nodata': None,
 'width': 635,
 'height': 1255,
 'count': 12,
 'crs': CRS.from_dict(init='epsg:4326'),
 'transform': Affine(0.00013474729261792824, 0.0, -74.7413587747272,
        0.0, -0.00013474729261792824, 7.166669505177131)}

In [7]:
img = sat_data.read()

img.shape

(12, 1255, 635)

In [8]:
img

array([[[  nan,   nan,   nan, ...,   nan,   nan,   nan],
        [310. , 310. , 235. , ..., 229. , 274.5, 274.5],
        [310. , 310. , 235. , ..., 243. , 285. , 285. ],
        ...,
        [230. , 237. , 237. , ..., 148. , 148. , 153.5],
        [230. , 237. , 237. , ..., 148. , 148. , 153.5],
        [230. , 306. , 306. , ..., 143.5, 143.5, 167.5]],

       [[  nan,   nan,   nan, ...,   nan,   nan,   nan],
        [360. , 360. , 245. , ..., 255.5, 298. , 298. ],
        [360. , 360. , 245. , ..., 264. , 296. , 296. ],
        ...,
        [314. , 308. , 308. , ..., 215.5, 215.5, 232.5],
        [314. , 308. , 308. , ..., 215.5, 215.5, 232.5],
        [310. , 396. , 396. , ..., 236.5, 236.5, 284. ]],

       [[  nan,   nan,   nan, ...,   nan,   nan,   nan],
        [652. , 652. , 526. , ..., 457. , 558.5, 558.5],
        [652. , 652. , 526. , ..., 467.5, 553. , 553. ],
        ...,
        [712. , 706.5, 706.5, ..., 595. , 595. , 628.5],
        [712. , 706.5, 706.5, ..., 595. , 595

In [9]:
n_bands = src_meta['count']
height = src_meta['height']
width = src_meta['width']

n_bands, height, width

(12, 1255, 635)

In [83]:
# Goal: We want to convert (n_bands, height,width) = (12, 1255, 635) into an array that is (n_pixel, n_bands) = (796925, 12)
flattened_img = img.reshape(n_bands, -1)
flattened_img = flattened_img.T 
n_pixels = flattened_img.shape[0]

flattened_img.shape

(796925, 12)

In [84]:
flattened_img

array([[  nan,   nan,   nan, ...,   nan,   nan,   nan],
       [  nan,   nan,   nan, ...,   nan,   nan,   nan],
       [  nan,   nan,   nan, ...,   nan,   nan,   nan],
       ...,
       [143.5, 236.5, 693. , ..., 130. , 322. ,   0. ],
       [143.5, 236.5, 693. , ..., 130. , 322. ,   0. ],
       [167.5, 284. , 793.5, ..., 160. , 322. ,   0. ]])

In [85]:
# The user will input the number of clusters, but for our case let's try out 5 
n_clusters = 5 

### Standardizing the Data
We need to remove the null data from the image and then standardize the data so we can use the non-nulls for the PCA.

In [138]:
# Create an empty array for the results
KMeans_results = np.ones((n_pixels, 1)) * -1 

KMeans_results

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [153]:
# Create mask for null values
null_mask = np.isnan(flattened_img).all(axis=1)

notnull_array = flattened_img[~null_mask]

In [140]:
# First we standardize the not null data for the PCA 
std_array = scale(notnull_array, axis=0)

std_array.shape

(796290, 12)

In [141]:
# Let's find out how many rows of NaNs we dropped
flattened_img.shape[0] - std_array.shape[0]

635

### K-Means Clustering
Let's perform the K-means clustering now to our data with non-nulls.

In [142]:
# K Means 
Kmean= KMeans(n_clusters=n_clusters, init='k-means++')
Kmean.fit(std_array)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [143]:
# Finding the centroids 
Kmean.cluster_centers_

array([[ 7.31836203e-01,  8.01238507e-01,  8.87354722e-01,
         8.76319064e-01,  1.47169417e-01,  1.14098109e+00,
         1.05103891e+00,  6.48093878e-01,  5.33904215e-01,
         1.48317917e-01, -7.14552323e-03,  0.00000000e+00],
       [-6.46963826e-01, -7.26292445e-01, -8.55352809e-01,
        -7.99739660e-01, -1.29226471e-01, -9.66677060e-01,
        -9.26454438e-01, -8.40474372e-01, -7.48498553e-01,
        -1.25604361e-01, -4.82528936e-03,  0.00000000e+00],
       [ 4.09826736e+00,  4.08046708e+00,  2.97759368e+00,
         2.46418944e+00, -8.73129753e-01,  8.53462584e-01,
         1.57967208e+00, -1.72052691e+00, -1.98929667e+00,
         6.53983725e-01,  1.39837001e+02,  0.00000000e+00],
       [ 3.83530710e+00,  3.96165700e+00,  3.35701389e+00,
         3.71094823e+00, -1.56772610e+00,  1.55677921e+00,
         2.74656465e+00,  1.55190192e+00,  1.33383503e+00,
         1.29941542e+00, -7.14552323e-03,  0.00000000e+00],
       [-1.68197196e-01, -1.48894589e-01, -3.8345714

In [144]:
# Labels of the clusters for the array
X_clustered = Kmean.labels_
point = X_clustered.shape[0]
X_clustered = X_clustered.reshape(point,1)

X_clustered.shape

(796290, 1)

### Creating the New Image Array

We need to replace the correct rows of KMeans_results with -1 for NaNs and with the appropriate K-means clusters. Then, we reshape the data into (1, height, width) so we can export it as a single-band image.

In [147]:
KMeans_results.shape

(796925, 1)

In [165]:
# Change the not null values to the clusters 
KMeans_results[~null_mask] = X_clustered

IndexError: boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 796925

In [149]:
# Reshape the data to (1, height, width) so it is a single-band image
KMeans_results = KMeans_results.reshape(1, height, width)

KMeans_results

array([[[-1., -1., -1., ..., -1., -1., -1.],
        [ 0.,  0.,  4., ...,  1.,  4.,  4.],
        [ 0.,  0.,  4., ...,  1.,  4.,  4.],
        ...,
        [ 0.,  0.,  0., ...,  1.,  1.,  1.],
        [ 0.,  0.,  0., ...,  1.,  1.,  1.],
        [ 0.,  0.,  0., ...,  1.,  1.,  4.]]])

In [150]:
# Check the shape again
KMeans_results.shape

(1, 1255, 635)

In [151]:
# Make a copy of the source dictionary 
dst_meta = src_meta.copy()

# Change dtypes to int32, nodata=-1 since we replaced all NaNs and count to 1 since it is a single-band image
dst_meta['dtype'] = 'float64'
dst_meta['nodata'] = -1
dst_meta['count'] = 1 

dst_meta

{'driver': 'GTiff',
 'dtype': 'float64',
 'nodata': -1,
 'width': 635,
 'height': 1255,
 'count': 1,
 'crs': CRS.from_dict(init='epsg:4326'),
 'transform': Affine(0.00013474729261792824, 0.0, -74.7413587747272,
        0.0, -0.00013474729261792824, 7.166669505177131)}

In [152]:
# Open a new file in 'write' mode and unpack (**) the destination metadata
dst_fp = r"C:\Users\RQ\Documents\PCA\GC_Landsat8_KMeans.tif"
with rio.open(dst_fp, 'w', **dst_meta) as dst:
    dst.write(KMeans_results)

This completes our K-means clustering for satellite images.

### Generalization
We want to create a general function that will allow us to input the input filepath, the number of clusters, a list of bands to use and an output path to export the single-band image to.

In [172]:
# -*- coding: utf-8 -*-
"""
Created on Thu Dec  5 15:20:52 2019

@author: Kanika Chopra
"""

import os
import rasterio as rio
import numpy as np

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans

def clustering_image(input_fp, n_clusters, output_fp=None, band_lst=None):
    ''' 
    Takes an image of the format .TIF from the input_fp and conducts k-means 
    clustering with n_clusters to get a single-band image outputting into the 
    output_fp. If band_lst is specified, it will only conduct the k-means 
    clustering on the specified bands.
    
    
    Args: 
        input_fp (str): The file path to retrieve the .TIF file from 
        n_clusters (int): The number of clusters for k-means clustering
        output_fp (str): The file path to export the image to once it is 
        transformed
            None, _kmeans + n_clusters will be added to the input filepath 
            (default: None)
        band_lst (lst): The list of bands in the image that you want to transform
            None, all of the bands will be used. 
            (default: None)
    Returns:
        None, it will print that the New Kmeans image is stored in the 
        output_fp
        
    @author: Kanika Chopra
    '''
    # Import and open the image
    sat_data = rio.open(input_fp)
    
    # Collect important data from metadata of original image
    src_meta = sat_data.meta
    n_bands = src_meta['count']
    height = src_meta['height']
    width = src_meta['width']
    
    # Read the image
    if band_lst is not None:
        img = sat_data.read(band_lst)
        n_bands = len(band_lst)
    else:
        img = sat_data.read()
        
    print('Image has been read in')
        
       
    # We convert (n_bands, height, width) to an array of shape (n_pixel, n_bands)
    flattened_img = img.reshape(n_bands, -1)
    flattened_img = flattened_img.T
    n_pixels = flattened_img.shape[0]
    
    # Create an empty array for the results
    KMeans_results = np.ones((n_pixels, 1)) * -1 

    # Collecting the Data that only has the non-nulls
    null_mask = np.isnan(flattened_img).all(axis=1)
    notnull_array = flattened_img[~null_mask]
    
    # K-Means Clustering with the non Null array
    results = kmeans(notnull_array, n_clusters, n_pixels)
    
    # Replacing the correct rows of the KMeans_results with the clustering
    KMeans_results[~null_mask] = results
    
    # Reshaping data to a single-band image (1, height, width)
    KMeans_results = KMeans_results.reshape(1, height, width)

    # Export the image 
    if output_fp is None:
        output_fp = append_file_suffix(input_fp, 'kmeans' + str(n_clusters))
    
    # Update destination metadata
    dst_meta = src_meta.copy()
    dst_meta['dtype'] = 'float64'
    dst_meta['nodata'] = -1
    dst_meta['count'] = 1 

    with rio.open(output_fp, 'w', **dst_meta) as dst:
        dst.write(KMeans_results)
    
    
    print('New Image after K-Means Clustering stored in ' + output_fp)
    
    
def kmeans(array, n_clusters, n_pixels):
    '''
    Standardizes the array and performs K-Means Clustering with n_clusters 
    
    Args: 
        array (np.array): The array for K-Means Clustering (has no Nulls).
        n_clusters (int): The number of clusters to organize the data into 
        when clustering.
        n_pixels (int): The number of pixels in the satellite image.
        
    Returns:
        An array after the K-Means Clustering where the values are cluster values 
        from (0, 1, 2, ..., n_clusters-1)
            
    @author: Kanika Chopra
    '''

    # Standardize the array 
    std_array = scale(array, axis=0)
    
    # K-Means Clustering
    Kmean = KMeans(n_clusters = n_clusters, init='k-means++')
    Kmean.fit(std_array)
    
    X_clustered = Kmean.labels_    
    X_clustered = X_clustered.reshape(n_pixels,1)
    
    return X_clustered 

    
def append_file_suffix(filepath, suffix=None):
    ''' Appends a suffix to a filepath. 
    Args: 
        filepath (str): The file path to modify. 
        suffix (str): The suffix to be appended to the filepath string 
            None, no suffix is appended (default: None)
    Returns: 
        str: The file path with added suffix
    @author: charles
    '''
    name, ext = os.path.splitext(filepath)
    if suffix is not None: 
        filepath = "{name}_{uid}{ext}".format(name=name, uid=suffix, ext=ext)
        
    return filepath

In [161]:
test = clustering_image(image_file, 5)

Image has been read in
New Image after K-Means Clustering stored in C:\Users\RQ\Documents\PCA\GC_Landsat8_SR_kmeans5.tif


In [169]:
test

array([[[-1., -1., -1., ..., -1., -1., -1.],
        [ 0.,  0.,  3., ...,  3.,  0.,  0.],
        [ 0.,  0.,  3., ...,  3.,  3.,  3.],
        ...,
        [ 0.,  0.,  0., ...,  3.,  3.,  3.],
        [ 0.,  0.,  0., ...,  3.,  3.,  3.],
        [ 0.,  0.,  0., ...,  4.,  4.,  4.]]])