# K-Means Clustering

### Import the libraries

In [None]:
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt
from PIL import Image
from IPython.display import display, HTML

### Sample Input Data Visualization

In [None]:
image_filenames = ['1.jpg', '2.jpg', '3.jpg']

# Target width (in pixels) of every preview; the height is derived from the
# image's aspect ratio. It does not change per image, so it is set once here.
desired_width = 250

# Open, shrink and display each image
for filename in image_filenames:
    # The context manager closes the underlying file handle as soon as the
    # preview has been built instead of leaving it open until garbage collection.
    with Image.open(filename) as image:
        # PIL's size is (width, height): keep the aspect ratio, and never let
        # the computed height collapse to 0 rows for extremely wide images.
        aspect_ratio = float(image.size[1]) / float(image.size[0])
        desired_height = max(1, int(aspect_ratio * desired_width))

        # resize() returns a new in-memory image, so it stays usable after
        # the source file has been closed.
        resized_image = image.resize((desired_width, desired_height))

    # Display the resized image
    display(resized_image)


### Read the images

In [None]:
#________________This code is for getting information of images

# Load the three inputs with OpenCV. cv.imread does NOT raise when a file is
# missing or cannot be decoded - it returns None - so the results are checked
# explicitly to fail with a readable message instead of an obscure cvtColor error.
input_img_1_info = cv.imread('1.jpg')
input_img_2_info = cv.imread('2.jpg')
input_img_3_info = cv.imread('3.jpg')

for _path, _loaded in (('1.jpg', input_img_1_info),
                       ('2.jpg', input_img_2_info),
                       ('3.jpg', input_img_3_info)):
    if _loaded is None:
        raise FileNotFoundError(f"cv.imread could not read image file: {_path!r}")

#Converting BGR to RGB(OpenCV uses BGR format by default)
input_img_1 = cv.cvtColor(input_img_1_info, cv.COLOR_BGR2RGB)
input_img_2 = cv.cvtColor(input_img_2_info, cv.COLOR_BGR2RGB)
input_img_3 = cv.cvtColor(input_img_3_info, cv.COLOR_BGR2RGB)

# Adjust this value to change the size of the shrunk images.
# NOTE: nothing in this cell applies it; it is only a knob for optional resizing.
scaling_factor = 0.3


#Information of images
print("shapes of images:")
print('Image 1: ', input_img_1_info.shape)
print('Image 2: ', input_img_2_info.shape)
print('Image 3: ', input_img_3_info.shape)



Expected Output: <br>
shapes of images: <br>
Image 1:  (744, 750, 3) <br>
Image 2:  (533, 800, 3)<br>
Image 3:  (393, 700, 3)

### Understanding the data

In [None]:
# Peek at the top pixel row of image 1 exactly as cv.imread returned it:
# a header line, the row's shape, then the row's values, one item per line.
print("Image 1 : ROW1:", input_img_1_info[0].shape, input_img_1_info[0], sep="\n")
# Last expression of the cell, so the notebook shows the container type.
type(input_img_1_info)

### Preprocessing the data

In [None]:
#_________________This code will need your attention to run properly____________

# Flatten every RGB image into a 2D array with one row per pixel and one
# column per colour channel; -1 lets NumPy infer the number of pixels.
input_img_1_2d, input_img_2_2d, input_img_3_2d = (
    rgb_image.reshape((-1, 3))
    for rgb_image in (input_img_1, input_img_2, input_img_3)
)

# Report the flattened shape of each image on a single line
print("Image 1 : 2D array:", input_img_1_2d.shape)
print("Image 2 : 2D array:", input_img_2_2d.shape)
print("Image 3 : 2D array:", input_img_3_2d.shape)




##### Expected Output

Image 1 : 2D array:
(558000, 3) <br>
Image 2 : 2D array:
(426400, 3) <br>
Image 3 : 2D array:
(275100, 3) <br>

### Implementation of K-Means Clustering

#### Helper Functions

In [None]:

def random_centroids_initializer(data, k):
    """Pick k rows of ``data`` at random to serve as the starting centroids.

    Args:
        data: array whose first axis enumerates the data points.
        k: number of centroids to draw.

    Returns:
        The k selected rows of ``data``, in the order they were drawn.
    """
    # Sampling row positions without replacement guarantees k different rows
    # (although two different rows may still hold identical values).
    n_points = data.shape[0]
    chosen_rows = np.random.choice(n_points, size=k, replace=False)

    return data[chosen_rows]


In [None]:

def assign_data_points_to_nearest_centroid(data, centroids):
    """Label every data point with the index of its closest centroid.

    Closeness is measured with the Manhattan (L1) distance.

    Args:
        data: array of shape (n_points, n_features).
        centroids: array of shape (k, n_features).

    Returns:
        Integer array of shape (n_points,) holding, for each point, the index
        of the nearest centroid. Ties go to the lowest centroid index.
    """
    # Work in float64: image pixels are uint8, and subtracting two unsigned
    # arrays wraps around modulo 256 (e.g. 0 - 1 == 255) instead of going
    # negative, which silently corrupts the distances.
    points = np.asarray(data, dtype=np.float64)
    centers = np.asarray(centroids, dtype=np.float64)

    # Broadcasting (n_points, 1, n_features) against (1, k, n_features) yields
    # every point-to-centroid difference; summing |diff| over the feature axis
    # gives the (n_points, k) Manhattan distance table.
    distances = np.abs(points[:, np.newaxis, :] - centers[np.newaxis, :, :]).sum(axis=2)

    # Column index of the smallest distance in each row
    return np.argmin(distances, axis=1)

In [None]:

def update_centroids_by_mean(data, assigned_centroids, k, previous_centroids=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        data: array of shape (n_points, n_features).
        assigned_centroids: array of shape (n_points,) with the cluster index
            (0 .. k-1) of every data point.
        k: number of clusters.
        previous_centroids: optional array of shape (k, n_features). When
            given, a cluster that received no points keeps its previous
            centroid.

    Returns:
        Float array of shape (k, n_features) with the updated centroids. A
        cluster with no assigned points gets its previous centroid when
        ``previous_centroids`` is supplied, otherwise the mean of all data.
    """
    # np.empty leaves the memory uninitialised, so every row must be written
    # below - including the rows of clusters that received no points.
    centroids = np.empty((k, data.shape[1]))
    overall_mean = None  # computed lazily, only if an empty cluster needs it

    for i in range(k):
        # Boolean mask selects the points currently labelled with cluster i
        assigned_points = data[assigned_centroids == i]

        if len(assigned_points) > 0:
            centroids[i] = np.mean(assigned_points, axis=0)
        elif previous_centroids is not None:
            # Empty cluster: stay where it was
            centroids[i] = previous_centroids[i]
        else:
            # Empty cluster and no history: use a deterministic, in-range value
            # instead of leaving uninitialised memory in the result.
            if overall_mean is None:
                overall_mean = np.mean(data, axis=0)
            centroids[i] = overall_mean

    return centroids


#### Main Function

In [None]:


def kmeans_clustering(data, k, max_iterations=100):
    """Cluster the rows of ``data`` into k groups with K-means.

    Args:
        data: array of shape (n_points, n_features).
        k: number of clusters.
        max_iterations: upper bound on the number of centroid updates.

    Returns:
        (labels, centroids): ``labels`` has shape (n_points,) and holds the
        cluster index of every point; ``centroids`` is a float array of shape
        (k, n_features). The labels are always the assignment computed against
        the returned centroids.
    """
    # Randomly initialize centroids. They are converted to float64 because the
    # pixel data is uint8, and distance arithmetic on unsigned integers wraps
    # around instead of producing negative differences.
    centroids = np.asarray(random_centroids_initializer(data, k), dtype=np.float64)

    # Assign once before the loop so that `labels` exists even when
    # max_iterations is 0.
    labels = assign_data_points_to_nearest_centroid(data, centroids)

    for _ in range(max_iterations):
        # Update centroids by taking the mean of the assigned data points
        new_centroids = update_centroids_by_mean(data, labels, k)

        # A cluster that attracted no points has no mean; keep its previous
        # centroid rather than whatever the update step left in that row.
        empty_clusters = np.bincount(labels, minlength=k) == 0
        new_centroids[empty_clusters] = centroids[empty_clusters]

        # Check convergence: the centroids stopped moving entirely
        converged = np.array_equal(new_centroids, centroids)
        centroids = new_centroids
        if converged:
            break

        # Re-assign each data point to the nearest (moved) centroid
        labels = assign_data_points_to_nearest_centroid(data, centroids)

    return labels, centroids


#### Verification of the output

In [None]:
# Change the value of k to compare your output with the expected output.

# Seed NumPy's global RNG so the random centroid initialisation - and with it
# the printed result - is identical on every "Restart & Run All".
np.random.seed(42)

# Flattened pixel arrays, one per input image (a plain list is enough here)
images = [input_img_1_2d, input_img_2_2d, input_img_3_2d]
k = 2
for i in range(len(images)):
    print(f'For input_image_{i+1}:')
    labels, centroids = kmeans_clustering(images[i], k)
    print('k', k)
    print('unique labels',np.unique(labels))
    print('Labels shape: ', labels.shape)
    print('Centroids shape: ', centroids.shape)
    print('centroids',centroids)
    print('\n\n')

#### Expected Output
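Note: The output will vary if you use a different distance metric or a reduced image size. Because the initial centroids are chosen at random, the clusters may also appear in a different order than shown here.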

For input_image_1: <br>
<hr>

- k 2
- unique labels [0 1]
- Labels shape:  (558000,)
- Centroids shape:  (2, 3)
> centroids [[ 98.15321625  72.44336854  63.15206393] <br>
      [182.32910336 168.23132632 150.00980018]]
<hr>

For input_image_2: <br>

- k 2
- unique labels [0 1]
- Labels shape:  (426400,)
- Centroids shape:  (2, 3)
> centroids [[226.34647018 223.46249381 220.90380275] <br>
 [193.99718075 161.61216588 146.87529588]]

 <hr>
 For input_image_3: <br>

- k 2
- unique labels [0 1]
- Labels shape:  (275100,)
- Centroids shape:  (2, 3)
> centroids [[118.32229891 158.51120919 238.83084683] <br>
      [ 79.00552162 107.03481755  28.29886674]]

## BATCH RUN

### K-Means Clustering on all the images

NOTE: IF YOUR CODE IS TAKING TOO MUCH TIME TO RUN, TRY RESIZING THE IMAGES TO A SMALLER SIZE.

In [None]:
# Numbers of colours (clusters) to try for every image
k_values = [2, 5, 10,20]
# Flattened pixel arrays and the matching original images, in the same order
images = [input_img_1_2d, input_img_2_2d, input_img_3_2d]
org_images = [input_img_1, input_img_2, input_img_3]

for i, (image, org_image) in enumerate(zip(images, org_images)):
    for k in k_values:
        labels, centroids = kmeans_clustering(image, k)
        # Replace pixel values with centroid values
        new_image_data = centroids[labels].reshape(org_image.shape)
        # Convert the data back to image format. Centroids are float means, and
        # a bare astype(np.uint8) truncates toward zero (98.9 -> 98), so round
        # to the nearest level and clip to the valid 0..255 range first.
        new_image_data = np.clip(np.rint(new_image_data), 0, 255)
        new_image = Image.fromarray(new_image_data.astype(np.uint8))
        # Display the new image
        display(HTML(f'<h2>Clustered Image of input {i+1} (k={k})</h2>'))
        display(new_image)


### Expected Output


![Example Image](1.O.jpg)
![Example Image](2.O.jpg)
![Example Image](3.O.jpg)
