# Base

In [3]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
import cv2

import joblib
from sklearn.cluster import KMeans
from scipy.cluster.vq import vq

np.random.seed(0)  # reproducibility

# BoVW class

In [21]:
class BagOfVisualWords:
    """Bag-of-visual-words embedder for a labeled image folder.

    Pipeline: local descriptors (SIFT / ORB / SURF) -> k-means codebook ->
    per-image histogram of nearest visual words.

    The dataset is described by ``<root_dir>/image-labels.csv`` which must
    provide the columns ``Video file``, ``Organ``, ``Classification`` and
    ``Finding`` (these are the columns read by :meth:`_get_item`).
    """

    # Maps the "Organ" column to the sub-directory name used on disk.
    _GI_DIRS = {"Lower GI": "lower-gi-tract", "Upper GI": "upper-gi-tract"}

    def __init__(
        self,
        root_dir: str = "/kaggle/input/the-hyper-kvasir-dataset/labeled_images",
        all_descriptors_dir: str = None,
        codebook_dir: str = None,
    ):
        """Constructor method

        :param root_dir: str, dataset root containing ``image-labels.csv``
        :param all_descriptors_dir: str (optional), path to file including all computed descriptors (vectors)
        :param codebook_dir: str (optional), path to visual vocabulary, stored as a ``(k, codebook)`` tuple

        """
        self.root_dir = root_dir
        self.df = pd.read_csv(f"{root_dir}/image-labels.csv")
        self.labels = tuple(self.df["Finding"].unique())

        # Define every attribute up front so later reads never hit AttributeError.
        self.method = None
        self.extractor = None
        self.all_descriptors = None
        self.k = None
        self.codebook = None
        self.idf = None  # lazily computed by _tf_idf()

        # NOTE: joblib.load unpickles the file - only load files you trust.
        # n descriptors of m images extracted from given extractor
        if all_descriptors_dir is not None:
            self.all_descriptors = joblib.load(all_descriptors_dir)

        # codebook (lookup table)
        if codebook_dir is not None:
            self.k, self.codebook = joblib.load(codebook_dir)

        # Indices of the images sampled by extract_descriptors().
        # ``samples_idx`` is kept as an alias of the historical attribute name.
        self.sample_idx = self.samples_idx = []

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _make_extractor(method: str, **extractor_kwargs):
        """Instantiate the OpenCV feature extractor named by ``method``.

        :raises ValueError: if ``method`` is not one of 'sift', 'orb', 'surf'
        """
        if method == "sift":
            return cv2.SIFT_create(**extractor_kwargs)
        if method == "orb":
            return cv2.ORB_create(**extractor_kwargs)
        if method == "surf":
            return cv2.xfeatures2d.SURF_create(**extractor_kwargs)
        raise ValueError(f"Unsupported feature extracting method: {method}")

    @staticmethod
    def _to_gray(img, grayscale: bool = True):
        """Return ``img`` as a single-channel image when ``grayscale`` is set."""
        if grayscale and img.ndim == 3:
            # _get_item decodes with PIL, whose channel order is RGB (not BGR).
            return cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        return img

    @staticmethod
    def _select_strongest(keypoints, descriptors, strongest_percent: float):
        """Keep the descriptors of the strongest keypoints.

        :param keypoints: sequence of objects exposing a ``.response`` attribute
        :param descriptors: array-like, one row per keypoint
        :param strongest_percent: fraction of keypoints to keep, ranked by response
        :return: np.ndarray with ``int(len(keypoints) * strongest_percent)`` rows,
            strongest first (ties keep their original order)
        """
        responses = np.fromiter(
            (kp.response for kp in keypoints), dtype=float, count=len(keypoints)
        )
        order = np.argsort(-responses, kind="stable")
        top = int(len(keypoints) * strongest_percent)
        return np.asarray(descriptors)[order[:top]]

    @staticmethod
    def _compute_idf(all_embeddings):
        """Inverse document frequency of every visual word.

        :param all_embeddings: 2-D array, one raw-count histogram per image
        :return: np.ndarray, ``log(N / df)`` per word; ``df`` is clamped to 1 so
            words that occur in no image do not produce a division by zero
        """
        n_images = all_embeddings.shape[0]
        doc_freq = np.count_nonzero(all_embeddings > 0, axis=0)
        return np.log(n_images / np.maximum(doc_freq, 1))

    def _embed_descriptors(self, img_descriptors, normalized: bool = False):
        """Quantize descriptors against the codebook into a length-``k`` histogram.

        :param img_descriptors: 2-D array of descriptors, or None when the
            extractor found no keypoint
        :param normalized: bool, if True return relative frequencies (sum to 1)
        :return: np.ndarray of length ``self.k``; entry ``i`` counts the
            descriptors whose nearest codebook entry is ``i``
        """
        if img_descriptors is None or len(img_descriptors) == 0:
            return np.zeros(self.k, dtype=float if normalized else np.int64)

        img_visual_words, _ = vq(img_descriptors, self.codebook)
        # bincount keeps one slot per codebook entry; np.histogram(bins=k) would
        # instead spread k bins over the observed min..max word id.
        counts = np.bincount(img_visual_words, minlength=self.k)
        if normalized:
            return counts / counts.sum()
        return counts

    # --------------------------------------------------------------- public API
    def extract_descriptors(self, method: str = 'sift',
                                sample_size: int = 2000,
                                grayscale: bool = True,
                                strongest_percent: float = 1,
                                **extractor_kwargs
                        ) -> np.array:
        """Extract descriptors from sample_size randomly chosen images

        :param method: str, method to extract feature descriptors: 'sift', 'orb' or 'surf'
        :param sample_size: number of images sampled without replacement
            (a small sample is usually enough when the whole dataset is big)
        :param grayscale: bool - if True, convert to gray for efficient computing
        :param strongest_percent: float - fraction of the strongest descriptors
            (ranked by keypoint ``.response``) kept per image
        :param extractor_kwargs: forwarded to the OpenCV extractor factory

        :return: np.ndarray of shape (total kept descriptors, descriptor size)
        :raises ValueError: unsupported ``method``, or no descriptor extracted at all

        # TODO: sample for building visual vocabulary must be balance between classes
        every class include at least one image
        """
        # ------ extracting algorithm --------
        self.extractor = self._make_extractor(method, **extractor_kwargs)
        self.method = method

        # ----- sampling -------
        self.sample_idx = self.samples_idx = np.random.choice(
            len(self.df), size=sample_size, replace=False
        ).tolist()

        # ----- extraction: one (n_i, d) array per image, joined once at the end
        per_image = []
        for idx in self.sample_idx:
            img, _ = self._get_item(idx)
            img = self._to_gray(img, grayscale)

            img_keypoints, img_descriptors = self.extractor.detectAndCompute(img, None)
            if img_descriptors is None:  # no keypoint detected in this image
                continue

            strongest = self._select_strongest(
                img_keypoints, img_descriptors, strongest_percent
            )
            if len(strongest):
                per_image.append(strongest)

        if not per_image:
            raise ValueError("No descriptors could be extracted from the sampled images")

        return np.concatenate(per_image, axis=0)

    def build_codebook(
        self,
        all_descriptors: np.array,
        cluster_algorithm: str = 'kmean',
        k: int = 200,
    ) -> np.array:
        """Building visual vocabulary (visual words)

        The result is also stored on the instance (``self.codebook``,
        ``self.k``) so that :meth:`get_embedding` can be called right away.

        :param all_descriptors: array of descriptors
        :param cluster_algorithm: currently ignored - K-means is always used
        :param k: #cluster (centroids)

        :return: np.ndarray, the k cluster centres (the codebook)

        """
        kmeans = KMeans(n_clusters=k, random_state=123)
        kmeans.fit(all_descriptors)

        self.k = k
        self.codebook = kmeans.cluster_centers_
        self.idf = None  # idf weights belong to the previous codebook

        return self.codebook

    def get_embedding(self, idx: int, normalized: bool = False, tfidf: bool = False):
        """Get embeddings of image[idx] (image > descriptors > project in codebook > frequencies vectors)

        :param idx: int, image index
        :param normalized: bool, if True, return relative frequencies in [0, 1]
        :param tfidf: bool, if True, reweight the histogram by the inverse
            document frequency of each visual word (computed once, then cached)

        :return: np.array, frequencies vector (can consider as embedding)
        """
        img_descriptors = self._get_descriptors_one_img(idx)
        img_frequency_vector = self._embed_descriptors(img_descriptors, normalized)

        if tfidf:
            if self.idf is None:
                self._tf_idf()
            img_frequency_vector = img_frequency_vector * self.idf

        return img_frequency_vector

    def _tf_idf(self):
        """Compute and cache the idf weight of every visual word.

        Embeds every image of ``self.df`` once (raw counts), so this is
        expensive; the result is stored in ``self.idf``.

        :return: np.ndarray of length ``self.k``
        """
        all_embeddings = np.stack(
            [self.get_embedding(i) for i in range(len(self.df))]
        )
        self.idf = self._compute_idf(all_embeddings)

        return self.idf

    def _get_item(self, idx) -> tuple:
        """Return pair (image(arr), label)

        :param idx: index of data

        :return: tuple, (image: np.array, label: int position in ``self.labels``)
        :raises FileNotFoundError: if the image file is missing
        """
        # get path of image
        img = self.df["Video file"][idx]
        gi_tract = self._GI_DIRS[self.df["Organ"][idx]]
        classification = self.df["Classification"][idx]
        finding = self.df["Finding"][idx]
        path = f"""{self.root_dir}/{gi_tract}/{classification}/{finding}/{img}.jpg"""
        # explicit raise: an assert would be stripped under ``python -O``
        if not os.path.exists(path):
            raise FileNotFoundError(f"{path} does not exist")

        # read image; the context manager releases the file handle
        with Image.open(path) as pil_image:
            image = np.array(pil_image)
        label = self.labels.index(finding)

        return image, label

    def _get_descriptors_one_img(self, idx, grayscale=True):
        """Extracting descriptors for each image[idx]

        :param idx: image index
        :param grayscale: bool, convert to gray before extraction

        :return: descriptors, or None when no keypoint is detected
        :rtype: np.array
        """
        if self.extractor is None:
            # Model restored from a saved codebook without extract_descriptors():
            # fall back to ``self.method`` or the default method ('sift').
            self.method = self.method or "sift"
            self.extractor = self._make_extractor(self.method)

        # get image
        img, _ = self._get_item(idx)
        # preprocessing: convert to grayscale for efficient computing
        img = self._to_gray(img, grayscale)

        # descriptors extracting
        _, img_descriptors = self.extractor.detectAndCompute(img, None)

        return img_descriptors

# Pipeline

## 1. Extracting descriptors

In [43]:
model = BagOfVisualWords(
        root_dir="/media/mountHDD2/lamluuduc/endoscopy/dataset/hyperKvasir/labeled-images",
#         codebook_dir="/kaggle/input/bag-of-visual-words/bovw_codebook_sift.pkl",
    )
# ----- 1. extracting descriptors -------------
# --- hyperparameters ---
method = 'sift'
sample_size = len(model.df)  # test: sample every image of the dataset
strongest_percent = 1
# --- end hyperparameters ---

all_descriptors = model.extract_descriptors(method=method,
                                            sample_size=sample_size,
                                            strongest_percent=strongest_percent)

# == saving all descriptors ==
ckpt_path = '../checkpoints'
descriptors_dir = f'{ckpt_path}/descriptors'
# Create the target directory first: otherwise a missing folder makes the dump
# fail only after the whole (long) extraction has finished.
os.makedirs(descriptors_dir, exist_ok=True)
# name convention for saving sample_size-method-strongest.pkl
# joblib.dump returns the list of file names it wrote.
all_descriptors_path = joblib.dump(all_descriptors,
                                    f'{descriptors_dir}/{sample_size}_img-{model.method}_extractor-{strongest_percent*100}%_strongest.pkl',
                                    compress=3)
print('='*20, 'Completely extracting descriptors', '='*20)



## 2. Building codebook

In [48]:
from tqdm import tqdm

a = np.arange(10)
# Scratch cell: walk over every descriptor index with a progress bar;
# the loop body intentionally does nothing.
n_descriptors = len(all_descriptors)
for i in tqdm(range(n_descriptors)):
    pass

100%|██████████| 10568760/10568760 [00:01<00:00, 7530680.87it/s]


In [7]:
# ------- 2. building visual vocabulary -----------
# --- hyperparameters ---
k = 200  # number of clusters = embedding vector size
cluster_algorithm = 'kmean'
# --- end hyperparameters ---

# joblib.dump() returned a list of written file names; load the first one.
all_descriptors = joblib.load(all_descriptors_path[0])

# Cluster once, using the hyperparameters declared above (keyword arguments:
# the second positional parameter of build_codebook is cluster_algorithm, not k).
codebook = model.build_codebook(all_descriptors=all_descriptors,
                                cluster_algorithm=cluster_algorithm,
                                k=k)
# get_embedding() reads model.codebook / model.k, so publish the result on the model.
model.k, model.codebook = k, codebook
print('='*20, '\nCompletely building codebook', '='*20)
print(codebook.shape)
joblib.dump((k, codebook), f'bovw_codebook_{model.method}.pkl', compress=3) # saving codebook

# Sanity check: plot the visual-word histogram of the first image.
embedding = model.get_embedding(0, normalized=True)
plt.bar(list(range(len(embedding))),embedding)

Unnamed: 0,Video file,Organ,Finding,Classification
0,000e3fee-7f5c-4819-9f9c-4c983b68888a,Lower GI,cecum,anatomical-landmarks
1,001a41c2-2a5d-40b1-8fd5-b5f2f292277b,Lower GI,cecum,anatomical-landmarks
2,006af0aa-2044-4477-964d-10d9e043fb78,Lower GI,cecum,anatomical-landmarks
3,00832522-ab8e-4b98-bfce-93a777929571,Lower GI,cecum,anatomical-landmarks
4,012ab888-64e6-4361-9745-f52b4a03ba75,Lower GI,cecum,anatomical-landmarks
...,...,...,...,...
10657,f7dd198b-88f0-4566-b8f4-81c8c2fee1ed,Upper GI,esophagitis-b-d,pathological-findings
10658,f9a06ca3-3500-4e5e-ac46-110b52963a99,Upper GI,esophagitis-b-d,pathological-findings
10659,fb31e2c2-c8db-42b3-bbf1-564e42076a8e,Upper GI,esophagitis-b-d,pathological-findings
10660,fe6c191e-3da0-4f18-9e38-8f7a11097a3b,Upper GI,esophagitis-b-d,pathological-findings


In [None]:
headers = [f'feature{i}' for i in range(model.k)]
embeddings = []
labels = []

# Embedding entire dataset
for idx in range(len(model.df)):
    # get_embedding() already decodes the image; take the label straight from
    # the dataframe instead of decoding the image a second time via _get_item().
    embeddings.append(model.get_embedding(idx, normalized=True))
    labels.append(model.labels.index(model.df["Finding"][idx]))

# Build the frame in one go: appending rows with .loc re-allocates on every insert.
embedding_df = pd.DataFrame(embeddings, columns=headers)
embedding_df['label'] = pd.Series(labels, dtype='int')

embedding_df.to_csv('embeddings_with_labels.csv', index=False)

# embedding_df

# Classification with embeddings

In [2]:
import pandas as pd

# 1. Load dataset: read the embeddings CSV into a dataframe
embeddings_csv = '/kaggle/input/bag-of-visual-words/embeddings_with_labels.csv'
df = pd.read_csv(embeddings_csv)

In [3]:
# These names are used below but were never imported in the notebook.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 2. Extract features, labels
# The label is the last column; every column before it is a feature. Slicing
# relative to the end keeps this working when the codebook size is not 200.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Step 3: Divide the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train a basic model (Random Forest Classifier in this case)
# NOTE(review): this rebinds `model`, which earlier cells use for the
# BagOfVisualWords instance - confirm that is intended.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Predict on the test set
y_pred = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 58.51%


In [None]:
# cross_val_score is used below but was never imported in the notebook.
from sklearn.model_selection import cross_val_score

# Step 7: Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
print(f'Cross-validation Accuracy: {cv_scores.mean() * 100:.2f}%')


# Test

In [4]:

# =========== Sanity check =================================
# ====================== Unit tests =================================================
def test_attributes():
    """Check the size of the label dataframe and the number of distinct labels."""
    expected_shape = (10662, 4)  # dataframe
    expected_label_count = 23  # #labels
    assert model.df.shape == expected_shape
    assert len(model.labels) == expected_label_count


# test _get_item method
def test_get_item():
    """Check that _get_item returns a 3-D image array and an integer label in range."""
    image, label = model._get_item(0)
    assert image.ndim == 3  # image is a 3-dimensional array (h, w, c)
    # isinstance is the idiomatic type check (also accepts int subclasses)
    assert isinstance(label, int) and 0 <= label <= 22  # label


# test _get_descriptors method
def test_get_descriptors():
    """Check that the descriptors of one image form a 2-D array."""
    # The method is named _get_descriptors_one_img; `_get_descriptors` does not exist.
    img_descriptors = model._get_descriptors_one_img(0)
    assert len(img_descriptors.shape) == 2


# test extract all descriptors process method
def test_extract_desciptors():
    """Check that the descriptors stored on the model form a 2-D array."""
    # all_descriptors = model.extract_descriptors() # ensure output is 2d
    n_dims = len(model.all_descriptors.shape)
    assert n_dims == 2, "Invalid extracting process"
    # assert len(model.sample_idx) == 1000, 'Invalid sampling'


# test build_codebook method
def test_build_codebook():
    """Check that the codebook holds model.k rows of 128 values each."""
    expected_shape = (model.k, 128)
    assert model.codebook.shape == expected_shape, "Invalid building codebook process"


# test get_embedding method
def test_get_embedding():
    """Check that the embedding of image 0 has one entry per codebook word."""
    first_embedding = model.get_embedding(0)
    embedding_length = first_embedding.shape[0]
    assert embedding_length == model.k
    
# Run the sanity checks in order; the first failing assertion stops the run.
# test_extract_desciptors is left out, as in the original call list.
for sanity_check in (
    test_attributes,
    test_get_item,
    test_get_descriptors,
    test_build_codebook,
    test_get_embedding,
):
    sanity_check()