In [4]:
import os
from tqdm import tqdm

import cv2
import numpy as np
import matplotlib.pyplot as plt

from utils.dataloader import DataLoader
from utils.vis import MatplotlibVisualizer
from utils.transforms import HairRemoval, Composer
from descriptors.shape import HOGDescriptor
from descriptors.texture import LBPDescriptor, GLCMDescriptor
from utils.segmentation import ThresholdingSegmentation
from descriptors.color import ColorDescriptor, ColorLayoutDescriptor, ColorCooccurrenceMatrixDescriptor

In [5]:
## Class names; passed as `target_names` to the classification reports further down
CLASSES = [
    'nevus',
    'others',
]

## Working folder (directory the kernel runs in) and dataset location relative to it
work_folder = os.getcwd()
data_folder = os.path.join(
    work_folder,
    '..',
    'Data/Challenge1',
)

## Visualization helper from the project's utils package
matplotlib_visualizer = MatplotlibVisualizer()

In [6]:
## Preprocessing pipeline made of a single hair-removal step.
## NOTE(review): the feature-extraction cell builds its DataLoader with transforms=None,
## so this composer is not applied there.
hair_removal_step = HairRemoval()
transforms_composer = Composer([hair_removal_step])

## Segmentation helper configured with method='otsu'.
## NOTE(review): its only use in the extraction loop is commented out (mask is None).
otsu_thresholding = ThresholdingSegmentation(method='otsu')

## Feature Extraction

### Color and Texture Descriptors

In [7]:
modes = ['train', 'val']

## Descriptors (built once, reused for every image of every split)
color_descriptor = ColorDescriptor(bins=(8, 12, 3))
color_cooccurrence_matrix_descriptor = ColorCooccurrenceMatrixDescriptor(distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], levels=8)
glcm_descriptor = GLCMDescriptor(distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], levels=8, visualize=False, grid_x=4, grid_y=4)
lbp_descriptor = LBPDescriptor(radius=3, n_points=16, grid_x=3, grid_y=3, visualize=False)

## Sampling options. They are the same for both splits, so they are set once here
## instead of through per-mode conditionals whose two branches were identical.
max_samples = None  # no cap on the number of samples
balance = False     # no class balancing

for mode in modes:
    ## Folder that receives the features of this split.
    ## NOTE: 'feautres' is misspelled, but the training and testing cells load from
    ## exactly this path, so the spelling is kept on purpose.
    features_dir = os.path.join('feautres', mode, 'color')
    os.makedirs(features_dir, exist_ok=True)

    ## Data loader for the current split
    dataloader = DataLoader(data_folder, mode,
                            shuffle=True,
                            ignore_folders=['black_background', '.DS_Store'],
                            max_samples=max_samples,
                            balance=balance,
                            transforms=None)

    ## Extract features
    features = []
    labels = []
    for img, label, _path in tqdm(dataloader, total=len(dataloader), desc=f'Extracting features for {mode}'):
        ## Segmentation masks are currently disabled; to re-enable them use
        ## mask = cv2.bitwise_not(otsu_thresholding(img))
        mask = None

        color_features = color_descriptor.extract(img, mask=mask)
        color_cooccurrence_matrix_features = color_cooccurrence_matrix_descriptor.extract(img, mask=mask)
        ## GLCM is always extracted without a mask; the second return value is unused here
        glcm_features, _glcm_img = glcm_descriptor.extract(img, mask=None)
        lbp_features, _lbp_img = lbp_descriptor.extract(img, mask=mask)

        ## One feature vector per image: color, color co-occurrence, LBP, then GLCM
        features.append(np.concatenate([color_features, color_cooccurrence_matrix_features, lbp_features, glcm_features]))
        labels.append(label)

    ## Save features to disk, with the label appended as the last column
    features = np.array(features)
    labels = np.array(labels)
    features_with_labels = np.concatenate([features, labels.reshape(-1, 1)], axis=1)
    np.save(os.path.join(features_dir, 'features.npy'), features_with_labels)

Extracting features for train: 100%|██████████| 10759/10759 [19:36<00:00,  9.15it/s]
Extracting features for val: 100%|██████████| 2635/2635 [04:46<00:00,  9.19it/s]


## Training

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

In [21]:
## Split whose saved features are loaded for fitting
mode = 'train'

## Gradient-boosted tree classifier.
## Alternative left disabled: SVC(kernel='rbf', C=5.0, random_state=42, degree=5)
## NOTE(review): no random_state is set here, unlike the grid-search model; consider pinning one.
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 500,
    'learning_rate': 0.05,
    'n_jobs': -1,
}
model = xgb.XGBClassifier(**xgb_params)


In [22]:
## Load the saved matrix for `mode`; every column but the last is a feature, the last is the label
features_path = os.path.join('feautres', mode, 'color', 'features.npy')
features_with_labels = np.load(features_path)
features, labels = features_with_labels[:, :-1], features_with_labels[:, -1]

## Hold out 20% of the rows, stratified on the label, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

## Fit on the training part of the split
model.fit(X_train, y_train)

## Validation

In [23]:
## Predict the held-out part of the train split and print per-class metrics
y_pred = model.predict(X_val)
val_report = classification_report(y_val, y_pred, target_names=CLASSES)
print(val_report)

              precision    recall  f1-score   support

       nevus       0.71      0.65      0.68       860
      others       0.78      0.82      0.80      1292

    accuracy                           0.75      2152
   macro avg       0.74      0.74      0.74      2152
weighted avg       0.75      0.75      0.75      2152



## Testing

In [24]:
## Evaluate on the features saved for the 'val' split.
## The path is hard-coded to 'val' instead of reassigning `mode`.
test_features_path = os.path.join('feautres', 'val', 'color', 'features.npy')
features_with_labels_test = np.load(test_features_path)
features_test, labels_test = features_with_labels_test[:, :-1], features_with_labels_test[:, -1]

## Predict and print per-class metrics
y_pred = model.predict(features_test)
test_report = classification_report(labels_test, y_pred, target_names=CLASSES)
print(test_report)

              precision    recall  f1-score   support

       nevus       0.73      0.65      0.69      1052
      others       0.78      0.84      0.81      1583

    accuracy                           0.76      2635
   macro avg       0.76      0.75      0.75      2635
weighted avg       0.76      0.76      0.76      2635



## Grid Search

In [35]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned below
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_jobs=-1,
    random_state=42,
)

# Search space: 3 * 3 * 2 * 2 = 36 candidate combinations
param_grid = dict(
    n_estimators=[100, 300, 500],      # number of trees
    learning_rate=[0.01, 0.05, 0.1],   # step size shrinkage
    subsample=[0.8, 1.0],              # fraction of samples used for each tree
    colsample_bytree=[0.8, 1.0],       # fraction of features used for each tree
)

# Exhaustive search scored by accuracy with 5-fold cross-validation.
# NOTE(review): both the estimator and the search request n_jobs=-1; confirm that
# this nested parallelism does not oversubscribe the CPU.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training part of the earlier train/validation split
grid_search.fit(X_train, y_train)

# Report the best parameter combination and its cross-validated score
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=1.0; total time=   6.6s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=0.8; total time=   6.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=0.8; total time=   6.7s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=1.0; total time=   6.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=0.8; total time=   6.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=0.8; total time=   7.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=1.0; total time=   7.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=0.8; total time=   7.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, n_estimators=100, subsample=1.0; total time=   6.7s
[CV] END co



[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=500, subsample=1.0; total time=  22.1s
[CV] END colsample_bytree=1.0, learning_rate=0.05, n_estimators=500, subsample=1.0; total time=  21.4s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=0.8; total time=   5.4s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=0.8; total time=   5.3s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=0.8; total time=   5.5s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=1.0; total time=   5.0s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=1.0; total time=   5.1s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=1.0; total time=   5.0s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=1.0; total time=   5.4s
[CV] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, subsample=1.

In [36]:
# Rebind `model` to the best estimator selected by the grid search above
model = grid_search.best_estimator_