# EDA

In [None]:
import os
import matplotlib.pyplot as plt
from skimage.io import imread, imshow
from skimage.color import rgb2gray, rgb2hsv, rgb2lab

In [None]:
# Root folder of the dataset; each sub-folder holds the images of one category.
base_path = "../data/dataset/EuroSAT/"
# os.listdir returns entries in arbitrary order and also lists stray files.
# Keep only directories and sort them so every figure below uses the same,
# reproducible category order.
categories = sorted(
    entry
    for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)

### Image composition

First and foremost, it's important to visualize the images and know their dimensions. As we can see in the example below, the images are 64x64 pixels with 3 spectral bands. To the human eye the resolution is quite disappointing, but we can still get a lot of information from it.

In [None]:
# Read one sample image, report its array shape, then display it.
im = imread(f"{base_path}Industrial/Industrial_1.jpg")
print(im.shape)
imshow(im)

We can also convert the original image to different color spaces, such as grayscale, black & white, CIELAB or HSV.

In [None]:
# Convert the RGB sample to a single-channel grayscale image and display it.
im_gray = rgb2gray(im)
imshow(im_gray)

Apart from the visual point of view, we need to know how the image data is structured in the different color spaces.

In [None]:
# Dump the grayscale array row by row to inspect the raw values.
list(im_gray)

In [None]:
# Binarize the grayscale image: True where the value exceeds 0.75.
im_bw = im_gray > 0.75
imshow(im_bw)

In [None]:
# Dump the boolean mask row by row to inspect the raw values.
list(im_bw)

In [None]:
# Dump the sample converted with rgb2lab, row by row, to inspect the raw values.
list(rgb2lab(im))

In [None]:
# Load one representative image per category, in the same order as `categories`.
images = []
for cat in categories:
    cat_dir = os.path.join(base_path, cat)
    # os.listdir gives no ordering guarantee; taking the lexicographically
    # smallest name makes the chosen sample reproducible across runs/machines.
    first_file = min(os.listdir(cat_dir))
    images.append(imread(os.path.join(cat_dir, first_file)))

In [None]:
# Draw the stored RGB sample of every category on a 2x5 grid, titled by category.
_, axs = plt.subplots(nrows=2, ncols=5, figsize=(12, 5.5), layout="constrained")
axs = axs.ravel()
for idx, label in enumerate(categories):
    panel = axs[idx]
    panel.imshow(images[idx])
    panel.set_title(label)
plt.show()

In [None]:
# Grayscale version of every category sample on a 2x5 grid.
_, axs = plt.subplots(2, 5, figsize=(12, 5.5), layout="constrained")
axs = axs.flatten()
for i, cat in enumerate(categories):
    # Single-channel data is colormapped by matplotlib: without cmap="gray" the
    # default colormap tints the image. Fixing the scale to [0, 1] keeps the
    # brightness comparable across panels instead of stretching each image.
    axs[i].imshow(rgb2gray(images[i]), cmap="gray", vmin=0, vmax=1)
    axs[i].set_title(cat)
plt.show()

In [None]:
# Black & white mask (grayscale > 0.75) of every category sample on a 2x5 grid.
_, axs = plt.subplots(2, 5, figsize=(12, 5.5), layout="constrained")
axs = axs.flatten()
for i, cat in enumerate(categories):
    mask = rgb2gray(images[i]) > 0.75
    # Render False as black and True as white. The explicit [0, 1] scale keeps
    # an all-True or all-False mask from being autoscaled to a misleading tone.
    axs[i].imshow(mask, cmap="gray", vmin=0, vmax=1)
    axs[i].set_title(cat)
plt.show()

In [None]:
# HSV conversion of every category sample on a 2x5 grid.
# NOTE(review): imshow receives the three HSV channels as if they were RGB, so
# the panels are a false-colour view of the HSV data - confirm this is intended.
_, axs = plt.subplots(nrows=2, ncols=5, figsize=(12, 5.5), layout="constrained")
axs = axs.ravel()
for idx, label in enumerate(categories):
    hsv_sample = rgb2hsv(images[idx])
    axs[idx].imshow(hsv_sample)
    axs[idx].set_title(label)
plt.show()

In [None]:
# rgb2lab conversion of every category sample on a 2x5 grid.
# The raw converted values are passed straight to imshow. A min-max normalised
# alternative, (lab - lab.min()) / (lab.max() - lab.min()), was tried for
# visualization only and left disabled because it is not accurate.
# NOTE(review): matplotlib presumably clips out-of-range float RGB input, so
# these panels are only a rough visual aid - confirm.
_, axs = plt.subplots(nrows=2, ncols=5, figsize=(12, 5.5), layout="constrained")
axs = axs.ravel()
for idx, label in enumerate(categories):
    lab_sample = rgb2lab(images[idx])
    panel = axs[idx]
    panel.imshow(lab_sample)
    panel.set_title(label)
plt.show()

If we show an example of each category in all the different color spaces that we want to work with, we can start to see some differences that may indicate that some categories will be easier to classify than others, since their image information is quite different from that of the other categories. But as we know, that's the job of the algorithm; here it's important to identify which features we can extract and how.

In [None]:
# Side-by-side comparison: one row per category, one column per color space
# (original colour, grayscale, black & white mask, HSV, LAB).
n_rows = len(categories)
# squeeze=False keeps `axs` two-dimensional even with a single category.
# The figure height scales with the row count (25 for the usual 10 categories).
_, axs = plt.subplots(
    n_rows, 5, figsize=(13, 2.5 * n_rows), layout="constrained", squeeze=False
)

# Single-channel panels need an explicit gray colormap and a fixed [0, 1]
# scale; otherwise matplotlib applies its default colormap and autoscaling.
gray_style = {"cmap": "gray", "vmin": 0, "vmax": 1}

for row, cat in zip(axs, categories):
    cat_dir = os.path.join(base_path, cat)
    # os.listdir order is arbitrary; min() picks a reproducible sample.
    image = imread(os.path.join(cat_dir, min(os.listdir(cat_dir))))
    gray = rgb2gray(image)

    # (title tag, pixel data, extra imshow keyword arguments) for each column.
    views = (
        ("C", image, {}),
        ("G", gray, gray_style),
        ("B/W", gray > 0.75, gray_style),
        ("HSV", rgb2hsv(image), {}),
        ("LAB", rgb2lab(image), {}),
    )
    for ax, (tag, data, style) in zip(row, views):
        ax.imshow(data, **style)
        ax.set_title(f"{cat} ({tag})")

plt.show()

As previously mentioned, the original images have 3 spectral bands (RGB). A histogram is a good representation of the color data: it lets us visualize the color distribution and decide which color features to apply.

In [None]:
# Per-channel intensity histograms (R, G, B) for one sample image per category.
n_rows = len(categories)
# squeeze=False keeps `axs` two-dimensional even with a single category.
# The figure height scales with the row count (20 for the usual 10 categories).
_, axs = plt.subplots(
    n_rows, 3, figsize=(8, 2 * n_rows), layout="constrained", squeeze=False
)

# (title tag, bar colour) for channel indices 0, 1 and 2.
channels = (("R", "red"), ("G", "green"), ("B", "blue"))

for row, cat in zip(axs, categories):
    cat_dir = os.path.join(base_path, cat)
    # os.listdir order is arbitrary; min() picks a reproducible sample.
    image = imread(os.path.join(cat_dir, min(os.listdir(cat_dir))))

    for idx, (ax, (tag, color)) in enumerate(zip(row, channels)):
        values = image[:, :, idx].ravel()
        # An explicit range gives one bin per integer level and the same
        # x-axis in every panel. Without it the 256 bins span only this
        # channel's min-max, giving fractional bin edges.
        # Assumes 8-bit channels (JPEG input) - TODO confirm.
        ax.hist(values, bins=256, range=(0, 256), color=color)
        ax.set_title(f"{cat} ({tag})")

plt.show()

Finally, it's essential to know how the dataset is distributed and populated. In this particular dataset, all categories have between 2000 and 3000 images; this considerable difference indicates that the dataset is not perfectly balanced. It may be worth taking this into account when creating the train/test splits.

In [None]:
# Count the images in each category and collect every distinct image shape.
sizes = []   # distinct array shapes, in first-seen order
length = {}  # category name -> number of files in its folder

for cat in categories:
    cat_dir = os.path.join(base_path, cat)
    files = os.listdir(cat_dir)
    # Only the count is recorded; printing the full file list here would dump
    # every filename of the category into the cell output.
    length[cat] = len(files)
    for file_name in files:
        # Dedicated local names on purpose: reusing `im` / `image` would
        # overwrite the sample arrays loaded by earlier cells.
        shape = imread(os.path.join(cat_dir, file_name)).shape
        if shape not in sizes:
            sizes.append(shape)

print(sizes)
print(length)