# Image Features Analysis with PCA and pretrained MBNV2
## Libraries importations and preprocessing
Note : All elements in this first section have already been described in the basic_model notebook.

In [1]:
import tensorflow as tf
from tensorflow import keras as tfk
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sn
from matplotlib import cm
from plotly import graph_objects as go

import albumentations as A
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import random
import cv2 as cv
import torch
from torchvision import transforms, models
from PIL import Image

np.random.seed(42)
tf.random.set_seed(42)

In [2]:
data = np.load('training_set.npz')

X = data['images']
y = data['labels']
print(X.shape, y.shape)

labels = {0:'Basophil', 1:'Eosinophil', 2:'Erythroblast', 3:'Immature granulocytes', 4:'Lymphocyte', 5:'Monocyte', 6:'Neutrophil', 7:'Platelet'}

(13759, 96, 96, 3) (13759, 1)


In [3]:
import hashlib
missing_targets = np.isnan(y.flatten()).any()
image_hashes = [hashlib.md5(img.tobytes()).hexdigest() for img in X]
df = pd.DataFrame({'image_hash': image_hashes, 'label': y.flatten()})
unique_df = df.drop_duplicates(subset = 'image_hash').reset_index(drop=True)

if not missing_targets :
    print("There are no missing or NaN values in the target vector.")

X_unique = X[unique_df.index]
y_unique = y[unique_df.index]
print(f'labels reduced from {len(y)} to {len(y_unique)}')

There are no missing or NaN values in the target vector.
labels reduced from 13759 to 11953


## Features extraction
We extract the features of a subset of the images (1000) with a pretrained MBNV2 by keeping only the output of the GAP layer. The goal is then to visualize those features in a reduced-dimensions space.

In [4]:
X_sample = X_unique[:1000]

In [5]:
# Load pre-trained MBNV2 with the top layers included
mbv2 = tf.keras.applications.MobileNetV2(include_top=True, weights='imagenet')
mbv2.trainable = False  # Freeze the model

print(mbv2.layers)
# Extract the output from the GAP layer (input to the first FC layer)
feature_extractor = tf.keras.Model(
    inputs=mbv2.input,
    outputs=mbv2.get_layer('global_average_pooling2d').output
)

# Define a function to preprocess images
def preprocess_images(images):
    """
    Preprocess a batch of images for VGG16:
    - Resize to (224, 224)
    - Scale pixel values to [0, 1]
    - Normalize using ImageNet mean and std
    - Stack into a single batch tensor
    """
    preprocessed_batch = []
    for img in images:
        img = tf.image.resize(img, (224, 224))  # Resize to VGG input size
        img = tf.keras.applications.mobilenet_v2.preprocess_input(img)
        preprocessed_batch.append(img)
    return tf.stack(preprocessed_batch, axis=0)  # Combine into a batch tensor

# Assuming X_unique is provided
features = []
batch_size = 64  # Adjust batch size as per your hardware capability

# Process images in batches
for start in range(0, len(X_sample), batch_size):
    end = start + batch_size
    batch = X_sample[start:end]  # Get the batch of images

    # Preprocess the batch of images
    preprocessed_batch = preprocess_images(batch)

    # Extract features for the batch
    batch_features = feature_extractor(preprocessed_batch)
    batch_features = batch_features.numpy()  # Convert to NumPy

    # Append features to the list
    features.extend(batch_features)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224.h5
[<keras.engine.input_layer.InputLayer object at 0x0000024ACDACBF40>, <keras.layers.convolutional.conv2d.Conv2D object at 0x0000024ACDADAE50>, <keras.layers.normalization.batch_normalization.BatchNormalization object at 0x0000024ACDADA700>, <keras.layers.activation.relu.ReLU object at 0x0000024ACDADAB50>, <keras.layers.convolutional.depthwise_conv2d.DepthwiseConv2D object at 0x0000024AF8332D90>, <keras.layers.normalization.batch_normalization.BatchNormalization object at 0x0000024A8AB6C6D0>, <keras.layers.activation.relu.ReLU object at 0x0000024A8AB6C700>, <keras.layers.convolutional.conv2d.Conv2D object at 0x0000024A8AB6C460>, <keras.layers.normalization.batch_normalization.BatchNormalization object at 0x0000024A8AB90850>, <keras.layers.convolutional.conv2d.Conv2D object at 0x0000024A8AB6C9D0>, <keras.layers.normalization.batch_norma

In [7]:
features_df = []
for i in range(int(len(features))):
    features_df.append(features[i].tolist())

features_df = np.asarray(features_df)
print(features_df.shape)

(1000, 1280)


## PCA for dimension reduction
We will keep only the first 3 components to be able to visualize the features in space. It will be a glance at how the model sees the data before sending it to its classifier.

In [8]:
N_COMPONENTS = 10

pca = PCA(n_components=N_COMPONENTS)
pca.fit(features_df)
x = list(range(1, N_COMPONENTS+1))
y = pca.explained_variance_ratio_
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=x,
        y=y
    )
)
fig.update_layout(
    title_text='PCA scree plot',
    xaxis=dict(title_text='Principle Component',
               tickmode='array', tickvals=x),
    yaxis=dict(title_text='Explained Variance Ratio'),
)

We can clearly see how some classes are much more easily separable in this space than others. The Immature Granulocytes for example are difficult to distinguish from Monocytes, Basophils and Eosinophils for example.

In [10]:
fitted = pca.transform(features_df)
fig = px.scatter_3d(
            x=fitted[:, 0],
            y=fitted[:, 1],
            z=fitted[:, 2],
            color= [labels[label] for label in y_unique[:1000].flatten()],
            title='3D PCA',
            labels = {'x': f'PC1 ({pca.explained_variance_ratio_[0]:.2f})',
                      'y': f'PC2 ({pca.explained_variance_ratio_[1]:.2f})',
                      'z': f'PC3 ({pca.explained_variance_ratio_[2]:.2f})'})
fig.update_traces(marker_size=4)
fig.show()