In [7]:
import os
from tqdm import tqdm

import cv2
import numpy as np
import matplotlib.pyplot as plt

from utils.dataloader import DataLoader
from utils.vis import MatplotlibVisualizer
from utils.transforms import HairRemoval, Composer
from utils.utils import export_experiment
from descriptors.shape import HOGDescriptor
from utils.segmentation import ThresholdingSegmentation
from descriptors.stats import IntensityStatsGridDescriptor
from descriptors.texture import LBPDescriptor, GLCMDescriptor, GaborFilterDescriptor, TextureDescriptor_update
from descriptors.color import ColorDescriptor, ColorLayoutDescriptor, ColorCooccurrenceMatrixDescriptor, ColorDescriptor_Update
from descriptors.fourier import FourierTransformExtractor, FFTExtractor

In [2]:
## Classes
CLASSES = ['nevus', 'others']

## Work folfer
work_folder = os.getcwd()
data_folder = os.path.join(work_folder, '..', 'Data')

## Visualizer
matplotlib_visualizer = MatplotlibVisualizer()
exp_name = 'binary_classification'

In [3]:
print(work_folder)

/Users/sumeetdash/MAIA/Semester_3/CAD/Skin-Lesion-Classification


In [4]:
transforms_composer = Composer([
                                HairRemoval(),
                                ])

## Featrure Extraction

### Descriptors

In [5]:
## Define parameters
params = {
    'color_layout': {
        'grid_x': 1,
        'grid_y': 1,
    },
    'intensity_stats': {
        'grid_x': 3,
        'grid_y': 3,
    },
    'color': {
        'bins': (8, 12, 3),
        'grid_x': 1,
        'grid_y': 1,
    },
    'glcm': {
        'distances': [1],
        'angles': [0, np.pi/4, np.pi/2, 3*np.pi/4],
        'levels': 8,
        'grid_x': 3,
        'grid_y': 3,
    },
    'lbp': {
        'radius': 3,
        'n_points': 16,
        'grid_x': 3,
        'grid_y': 3,
    },
}

In [6]:
# modes = ['train', 'val']
modes = ['test']

## Descriptors
color_layout_descriptor = ColorLayoutDescriptor(**params['color_layout'])
intensity_stats_grid_descriptor = IntensityStatsGridDescriptor(**params['intensity_stats'])
color_descriptor = ColorDescriptor(**params['color'])
glcm_descriptor = GLCMDescriptor(**params['glcm'])
lbp_descriptor = LBPDescriptor(**params['lbp'])
color_cooccurrence_matrix_descriptor = ColorCooccurrenceMatrixDescriptor(distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], levels=8, grid_x=3, grid_y=3)
# gabor_filter_descriptor = GaborFilterDescriptor(frequencies=[0.1, 0.2, 0.3], orientations=[0, np.pi/4, np.pi/2, 3*np.pi/4])


features_dict = {}

for mode in modes:

    ## Data loader
    ### Limit the number of samples to 200 for training and load all samples for validation
    max_samples = None
    ### Balance the dataset for training
    balance = False 
    dataloader = DataLoader(data_folder, mode, 
                            shuffle=False, 
                            ignore_folders=['black_background', '.DS_Store'], 
                            max_samples=max_samples, 
                            balance=balance,
                            transforms=None, 
                            classes=CLASSES, 
                            mask=True)

    ## Extract features
    features = []
    labels = []
    paths = []
    for i, (img, label, mask, path) in tqdm(enumerate(dataloader), total=len(dataloader), desc=f'Extracting features for {mode}'):
        # matplotlib_visualizer.show_multiple_images([img, mask], labels=['org', 'mask'])
        paths.append(path)
        color_features = color_descriptor.extract(img, mask=mask)
        color_features_inverse = color_descriptor.extract(img, mask=cv2.bitwise_not(mask))
        # color_layout_features = color_layout_descriptor.extract(img, mask=mask)
        # intensity_stats_grid_features = intensity_stats_grid_descriptor.extract(img, mask=None)
        # glcm_features, glcm_img = glcm_descriptor.extract(img, mask=None)
        # lbp_features, lbp_img = lbp_descriptor.extract(img, mask=None)
        color_cooccurrence_matrix_features = color_cooccurrence_matrix_descriptor.extract(img, mask=None)
        glcm_features, glcm_img = glcm_descriptor.extract(img, mask=None)
        lbp_features, lbp_img = lbp_descriptor.extract(img, mask=None)
        features.append(np.concatenate([lbp_features, glcm_features, color_features, color_features_inverse, color_cooccurrence_matrix_features], axis=0))              
        ## add label
        # labels.append(label)
    
    ## Save features to disk
    features = np.array(features)
    # labels = np.array(labels)
    features_with_labels = np.concatenate([features, labels.reshape(-1, 1)], axis=1)
    features_dict[mode] = features_with_labels

Extracting features for test:   0%|          | 0/6340 [00:00<?, ?it/s]

Extracting features for test:   0%|          | 0/6340 [00:00<?, ?it/s]


IndexError: list index out of range

In [73]:
np.save('features_val.npy', features_dict['val'])
np.save('paths_val.npy', paths)



In [70]:
features_dict[mode] = features_with_labels


In [72]:
features_dict['val'][:, -1]

array([1., 1., 1., ..., 0., 0., 0.])

In [None]:
dest_folder = os.path.join(data_folder, 'test')

indices = []
results = []

### Load test images
for img_name in tqdm(os.listdir(test_folder), desc='Predicting test images ...'):
    image_idx = int(img_name.split('.')[0].replace('xxx', ''))
    indices.append(image_idx)
    
    ## Load image
    img_path = os.path.join(test_folder, img_name)
    img = cv2.imread(img_path)
    
    ## Extract features
    color_features = color_descriptor.extract(img, mask=mask)
    color_features_inverse = color_descriptor.extract(img, mask=cv2.bitwise_not(mask))
    # color_layout_features = color_layout_descriptor.extract(img, mask=mask)
    # intensity_stats_grid_features = intensity_stats_grid_descriptor.extract(img, mask=None)
    # glcm_features, glcm_img = glcm_descriptor.extract(img, mask=None)
    # lbp_features, lbp_img = lbp_descriptor.extract(img, mask=None)
    color_cooccurrence_matrix_features = color_cooccurrence_matrix_descriptor.extract(img, mask=None)
    glcm_features, glcm_img = glcm_descriptor.extract(img, mask=None)
    lbp_features, lbp_img = lbp_descriptor.extract(img, mask=None)
    
    ## Concatenate features
    features = np.concatenate([lbp_features, glcm_features, color_features, color_features_inverse, color_cooccurrence_matrix_features], axis=0)
    features = features.reshape(1, -1)
    
    ## Predict
    y_pred = best_model.predict(features)
    results.append(y_pred[0])

In [51]:
import pickle

# Specify the path to your pickle file
pickle_file_path = "/Users/sumeetdash/MAIA/Semester_3/CAD/Skin-Lesion-Classification/features_dict_lbp_glcm_color_mask.pkl"

# Load the pickle file
with open(pickle_file_path, "rb") as file:
    features_dict_u = pickle.load(file)

# Now you can use the loaded data
print(features_dict_u.keys())


dict_keys(['train', 'val'])


In [None]:
# modes = ['train', 'val']

# ## Descriptors
# # color_layout_descriptor = ColorLayoutDescriptor(**params['color_layout'])
# # intensity_stats_grid_descriptor = IntensityStatsGridDescriptor(**params['intensity_stats'])
# # color_descriptor = ColorDescriptor(**params['color'])
# # glcm_descriptor = GLCMDescriptor(**params['glcm'])
# # lbp_descriptor = LBPDescriptor(**params['lbp'])
# # color_cooccurrence_matrix_descriptor = ColorCooccurrenceMatrixDescriptor(distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4], levels=8, grid_x=3, grid_y=3)
# # gabor_filter_descriptor = GaborFilterDescriptor(frequencies=[0.1, 0.2, 0.3], orientations=[0, np.pi/4, np.pi/2, 3*np.pi/4])
# # color_descriptor_u = ColorDescriptor_Update()
# texture_descriptor_u = TextureDescriptor_update()

# features_dict = {}

# for mode in modes:

#     ## Data loader
#     ### Limit the number of samples to 200 for training and load all samples for validation
#     max_samples = None
#     ### Balance the dataset for training
#     balance = False 
#     dataloader = DataLoader(data_folder, mode, 
#                             shuffle=True, 
#                             ignore_folders=['black_background', '.DS_Store'], 
#                             max_samples=max_samples, 
#                             balance=balance,
#                             transforms=None, 
#                             classes=CLASSES, 
#                             mask=True)

#     ## Extract features
#     features = []
#     labels = []
#     for i, (img, label, mask, path) in tqdm(enumerate(dataloader), total=len(dataloader), desc=f'Extracting features for {mode}'):
#         # matplotlib_visualizer.show_multiple_images([img, mask], labels=['org', 'mask'])

#         texture_features = texture_descriptor_u.extract(img, mask=mask)
#         texture_features_inverse = texture_descriptor_u.extract(img, mask=cv2.bitwise_not(mask))
#         # color_features_u = color_descriptor_u.extract(img, mask=mask)
#         # color_features_inverse_u = color_descriptor_u.extract(img, mask=cv2.bitwise_not(mask))
#         # color_layout_features = color_layout_descriptor.extract(img, mask=mask)
#         # intensity_stats_grid_features = intensity_stats_grid_descriptor.extract(img, mask=None)
#         # glcm_features, glcm_img = glcm_descriptor.extract(img, mask=None)
#         # lbp_features, lbp_img = lbp_descriptor.extract(img, mask=None)
#         # color_cooccurrence_matrix_features = color_cooccurrence_matrix_descriptor.extract(img, mask=None)
#     #     # gabors_features = gabor_filter_descriptor.extract(img, mask=None)
#         # features.append(np.concatenate([lbp_features, glcm_features, color_features, color_features_inverse, color_cooccurrence_matrix_features], axis=0))        
#         features.append(np.concatenate([texture_features, texture_features_inverse], axis=0))        
#         ## add label
#         labels.append(label)
        
    
#     ## Save features to disk
#     features = np.array(features)
#     labels = np.array(labels)
#     features_with_labels = np.concatenate([features, labels.reshape(-1, 1)], axis=1)
#     features_dict[mode] = features_with_labels

Extracting features for train: 100%|██████████| 15195/15195 [1:58:08<00:00,  2.14it/s]  
Extracting features for val: 100%|██████████| 3796/3796 [27:10<00:00,  2.33it/s]


In [49]:
print(features_dict['train'].shape)
final_features = {}
final_features['train'] = np.concatenate([features_dict['train'][:,:-1], colour_u['train']], axis=1)
final_features['val'] = np.concatenate([features_dict['val'][:,:-1], colour_u['val']], axis=1)
print(final_features['train'].shape)

(15195, 883)
(15195, 906)


In [None]:
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

In [None]:
# # scaler = StandardScaler()
# # pca = PCA(n_components=288)

# # train_scaled = scaler.fit_transform(features_dict['train'][:, :-1])
# # val_scaled = scaler.transform(features_dict['val'][:, :-1])

# # features = pca.fit_transform(train_scaled)
# # features_test = pca.transform(val_scaled) 
# feat = features_dict['train'][:, :-1]
# labels = features_dict['train'][:, -1]

# print(feat.shape)
# print(labels.shape)

(15195, 576)
(15195,)


In [None]:
# from sklearn.feature_selection import RFECV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import StratifiedKFold

# # Assuming 'feat' is your feature matrix and 'target' is your target variable
# # feat = ... (your feature matrix)
# target = labels

# # Step 2: Initialize the classifier
# classifier = RandomForestClassifier()

# # Step 3: Perform RFECV
# rfecv = RFECV(estimator=classifier, step=1, cv=StratifiedKFold(5), scoring='accuracy')
# rfecv.fit(feat, target)

# # Step 4: Visualize the results
# plt.figure(figsize=(10, 6))
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross-validation score (accuracy)")
# plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
# plt.title('RFECV - Number of Features vs. Cross-Validation Score')
# plt.show()

# # Step 5: Get the selected features
# selected_features = feat.columns[rfecv.support_]
# print("Selected features:", selected_features)

KeyboardInterrupt: 

## Training

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb

In [53]:
# from catboost import CatBoostClassifier
mode = 'train'
# model = SVC(kernel='rbf', C=5.0, random_state=42, degree=5)
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', n_estimators=1500, learning_rate=0.2, n_jobs=-1)
# cat_clf = CatBoostClassifier(
#     iterations=1000,
#     learning_rate=0.2,
#     task_type="CPU",  # Use "GPU" if you have GPU available
#     loss_function='Logloss',  # For binary classification
#     verbose=100  # Adjust to see progress during training
# )

In [54]:
features = features_dict_u[mode][:, :-1]
# features = final_features[mode]
labels = features_dict_u[mode][:, -1]

X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

In [55]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((12156, 882), (3039, 882), (12156,), (3039,))

In [56]:
# scaler = MinMaxScaler()
# model = Pipeline([('scaler', scaler), ('model', xgb_clf)])
model = xgb_clf
# model = cat_clf


In [57]:
model.fit(X_train, y_train)

## Validation

In [58]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred, target_names=CLASSES))


              precision    recall  f1-score   support

       nevus       0.85      0.85      0.85      1545
      others       0.85      0.85      0.85      1494

    accuracy                           0.85      3039
   macro avg       0.85      0.85      0.85      3039
weighted avg       0.85      0.85      0.85      3039



## Testing

In [59]:
mode = 'val'
features_test = features_dict[mode][:, :-1]
labels_test = features_dict[mode][:, -1]

y_pred = model.predict(features_test)
print(classification_report(labels_test, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

       nevus       0.85      0.84      0.84      1931
      others       0.84      0.84      0.84      1865

    accuracy                           0.84      3796
   macro avg       0.84      0.84      0.84      3796
weighted avg       0.84      0.84      0.84      3796



## Cross Validation

In [35]:
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold

# Initialize XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', objective='binary:logistic', n_estimators=500, learning_rate=0.1, n_jobs=-1)

# Define cross-validation strategy (e.g., 5-fold stratified)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get accuracy scores for each fold
features_train = features_dict['train'][:, :-1]
labels_train = features_dict['train'][:, -1]
cv_scores = cross_val_score(xgb_clf, features_train, labels_train, cv=cv, scoring='accuracy')

# Output the results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std())

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Accuracy Scores: [0.84830536 0.84369859 0.84007897 0.84666009 0.83810464]
Mean CV Accuracy: 0.843369529450477
Standard Deviation of CV Accuracy: 0.00384305398372002


In [36]:
# Cross-validation predictions (optional)
features_test = features_dict['val'][:, :-1]
labels_test = features_dict['val'][:, -1]
cv_predictions = cross_val_predict(xgb_clf, features_test, labels_test, cv=cv)
print("Classification Report for CV Predictions:\n", classification_report(labels_test, cv_predictions))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Classification Report for CV Predictions:
               precision    recall  f1-score   support

         0.0       0.82      0.82      0.82      1931
         1.0       0.81      0.81      0.81      1865

    accuracy                           0.81      3796
   macro avg       0.81      0.81      0.81      3796
weighted avg       0.81      0.81      0.81      3796



## Grid Search

In [53]:
from sklearn.model_selection import GridSearchCV

# Initialize the base model (XGBClassifier)
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', n_jobs=-1, random_state=42)
scaler_cv = StandardScaler()

model = Pipeline([('scaler', scaler_cv), ('model', xgb_clf)])

# Define the parameter grid for Grid Search
param_grid = {
    'model__n_estimators': [1000, 1500],  # Number of trees
    'model__learning_rate': [0.1, 0.2],  # Step size shrinkage
    # 'reg_lambda': [1.0, 0.8],  # L2 regularization term on weights
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                        scoring='accuracy',  # Use 'accuracy' as the evaluation metric
                        cv=5,  # 5-fold cross-validation
                        verbose=4, 
                        n_jobs=-1)  # Parallel processing

features_train = features_dict['train'][:, :-1]
labels_train = features_dict['train'][:, -1]
grid_search.fit(features_train, labels_train)

# Get the best parameters and best score from the grid search
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 2/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=0.838 total time= 2.5min
[CV 1/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=0.855 total time= 2.6min
[CV 4/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=0.844 total time= 2.6min
[CV 5/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=0.854 total time= 2.6min
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=0.837 total time= 2.6min
[CV 2/5] END model__learning_rate=0.1, model__n_estimators=1500;, score=0.838 total time= 3.6min
[CV 1/5] END model__learning_rate=0.1, model__n_estimators=1500;, score=0.856 total time= 3.7min
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=1500;, score=0.840 total time= 3.7min
[CV 1/5] END model__learning_rate=0.2, model__n_estimators=1000;, score=0.852 total time= 2.4min
[CV 2/5] END model__learning_rate=0.2, model__n_estimators=1000;, s

In [None]:
from sklearn.model_selection import GridSearchCV

# Initialize the base model (XGBClassifier)
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', n_jobs=-1, random_state=42)
scaler_cv = StandardScaler()

# model = Pipeline([('scaler', scaler_cv), ('model', xgb_clf)])
model = xgb_clf
# Define the parameter grid for Grid Search
param_grid = {
    'model__n_estimators': [1000, 1500],  # Number of trees
    'model__learning_rate': [0.1, 0.2],  # Step size shrinkage
    # 'reg_lambda': [1.0, 0.8],  # L2 regularization term on weights
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                        scoring='accuracy',  # Use 'accuracy' as the evaluation metric
                        cv=5,  # 5-fold cross-validation
                        verbose=4, 
                        n_jobs=-1)  # Parallel processing

features_train = features_dict['train'][:, :-1]
labels_train = features_dict['train'][:, -1]
grid_search.fit(features_train, labels_train)

# Get the best parameters and best score from the grid search
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

In [38]:
best_model = grid_search.best_estimator_
features_test = features_dict['val'][:, :-1]
labels_test = features_dict['val'][:, -1]

y_pred = best_model.predict(features_test)
print(classification_report(labels_test, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

       nevus       0.84      0.86      0.85      1931
      others       0.85      0.83      0.84      1865

    accuracy                           0.85      3796
   macro avg       0.85      0.85      0.85      3796
weighted avg       0.85      0.85      0.85      3796



## Exporting Experiment

In [None]:
## Export experiment
notebook_name = 'BinaryClass.ipynb'
export_experiment(name=exp_name, params=params, feature_dict=features_dict , model=best_model, notebook_name=notebook_name)

<IPython.core.display.Javascript object>

Experiment 'binary_classification' saved at experiments/binary_classification_20241029_165927


## Feature Selection

In [None]:
features_train = features_dict['train'][:, :-1]
labels_train = features_dict['train'][:, -1]
X_train, X_val, y_train, y_val = train_test_split(features_train, labels_train, test_size=0.1, random_state=42, stratify=labels)

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd


# Train random forest and get feature importances
model = RandomForestClassifier()
model.fit(X_train, y_train)
importances = model.feature_importances_

# Display feature importances
feature_importances = pd.Series(importances)
print(feature_importances.sort_values(ascending=False))

2786    0.010357
1379    0.010292
1457    0.009750
1454    0.009593
590     0.009396
          ...   
1019    0.000000
1020    0.000000
1021    0.000000
1022    0.000000
2834    0.000000
Length: 2835, dtype: float64


In [None]:
#get the most important features
top_features = feature_importances.sort_values(ascending=False).index[:400]

# Retrain the model using only the top features
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=1000, learning_rate=0.2, n_jobs=-1)
model.fit(X_train[:, top_features], y_train)

# Evaluate the model
y_pred = model.predict(X_val[:, top_features])
print(classification_report(y_val, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

       nevus       1.00      0.67      0.80         3
      others       0.67      1.00      0.80         2

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5



In [None]:
# test the model
features_test = features_dict['val'][:, :-1]
labels_test = features_dict['val'][:, -1]

y_pred = model.predict(features_test[:, top_features])
print(classification_report(labels_test, y_pred, target_names=CLASSES))

              precision    recall  f1-score   support

       nevus       0.58      0.84      0.69        25
      others       0.71      0.40      0.51        25

    accuracy                           0.62        50
   macro avg       0.65      0.62      0.60        50
weighted avg       0.65      0.62      0.60        50

