# K-means clusters for color images

In this notebook I tried using a CNN Efficient Net B0 to extract features that can be cluster using the K-means algorithm.

**Conclusion: It is not suitable, the accuracy score obtained was 0.4298245614035088**

Connect to google drive because the data is stored there.

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
% cd './drive/My Drive/'
% ls

In [3]:
# Requirements
! pip install torch
! pip install efficientnet-pytorch

Collecting efficientnet-pytorch
  Downloading https://files.pythonhosted.org/packages/b8/cb/0309a6e3d404862ae4bc017f89645cf150ac94c14c88ef81d215c8e52925/efficientnet_pytorch-0.6.3.tar.gz
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.6.3-cp36-none-any.whl size=12422 sha256=d4a5552662ec13f575e56b4795d2650350ee4230e764779d5cf1cb265a3591b1
  Stored in directory: /root/.cache/pip/wheels/42/1e/a9/2a578ba9ad04e776e80bf0f70d8a7f4c29ec0718b92d8f6ccd
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.6.3


Functions

In [0]:
import random
import os
import numpy as np
import torch


def myseed(seed=42):
    """
    Make results reproducible.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [0]:
import os
import pandas as pd

from glob import glob


def load_data(data_dir='/content/drive/My Drive/skin'):
    """ 
    Find .jpg files and create a dataframe.
    """
    myseed(seed=42)
    
    # Get filenames
    filenames = glob(os.path.join(data_dir, '*','*.jpg'))
    if len(filenames) == 0:
      filenames = glob(os.path.join(data_dir, '*', '*','*.jpg'))

    # Create a dataframe
    df = pd.DataFrame(data=filenames, columns=['filenames'])
    df['label'] = df['filenames'].str.extract(r'skin\W(\w+)\W')
    
    # Get the labels and their one hot encoded values
    try:
      df['label'] = df['label'].astype('category')
      mapping = dict(enumerate(df['label'].cat.categories ))
    except AttributeError as err:
      mapping = ''
      
    df['label_code'] = pd.Categorical(df['label']).codes
    return df, mapping

In [0]:
import torchvision
from torch.utils.data import DataLoader,Dataset
from PIL import Image
from torchvision import transforms


def get_loaders(size=100, batch_size=1, num_workers=1):
    """
    Put data into the dataloaders.
    """
    myseed(seed=42)

    # Custom Pytorch dataloader for this dataset
    class Derm(Dataset):
        """
        Read a pandas dataframe with
        images paths and labels
        """
        def __init__(self, df, transform=None):
            self.df = df
            self.transform = transform

        def __len__(self):
            return len(self.df)

        def __getitem__(self, index):
            # Load image data and get label
            X = Image.open(self.df['filenames'][index]).convert('RGB')
            y = torch.tensor(int(self.df['label_code'][index]))
            
            if self.transform:
                X = self.transform(X)

            return X, y
    
    # ImageNet statistics
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    # Transforms
    data_transforms = {x : transforms.Compose([transforms.Resize(size),
                                              transforms.CenterCrop((size,size)),
                                              transforms.ToTensor(),
                                              transforms.Normalize(mean,std)]) for x in ['train', 'unknown']}
      
    # Frames
    df = {'train': train, 'unknown': unknown}                                    
    # Sets
    image_datasets = {x: Derm(df[x], transform=data_transforms[x]) for x in ['train','unknown']}
    # Sizes
    dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'unknown']}
    # Loaders
    dataloaders = {x: DataLoader(image_datasets[x], batch_size, num_workers) for x in ['train','unknown']}
    return data_transforms, df, image_datasets, dataloaders, dataset_sizes

In [0]:
import torch

from efficientnet_pytorch import EfficientNet


def mymodel(unfreeze=True):
  """
  Unfreeze(True) all the model weights or freeze(False) the convolutional 
  layers only
  """
  myseed(seed=42)
  
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  model = EfficientNet.from_pretrained('efficientnet-b0')
  model = model.to(device)
  for param in model.parameters():
    param.requires_grad = unfreeze
  return model

In [0]:
def get_num_parameters(model):
  """
  Get the total number of parameters in a neural network.
  """
  myseed(seed=42)
  
  total_params = sum(p.numel() for p in model.parameters())
  total_trainable_params = sum(
  p.numel() for p in model.parameters() if p.requires_grad)
  return total_params, total_trainable_params

In [0]:
def make_predictions(mapping, df, phase='test'):
  """
  Use a model to predict a class
  Put the class in column predictions
  """
  myseed(seed=42)

  with torch.no_grad():

    features = []
    for data in dataloaders[phase]:
      images,_ = data
      images = images.to(device)

      # Use the model to extract the features
      extracted = model.extract_features(images)
      features.append(extracted.cpu().numpy().flatten())
  
  df['features'] = features
  return df

In [0]:
# Functions to display images inside a pandas dataframe
import base64

from PIL import Image
from io import BytesIO
from IPython.display import HTML


def get_thumbnail(path):
    myseed(seed=42)
    i = Image.open(path).convert('RGB')
    i.thumbnail((150, 150), Image.LANCZOS)
    return i

def image_base64(im):
    myseed(seed=42)
    if isinstance(im, str):
        im = get_thumbnail(im)
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    myseed(seed=42)
    return f'<img src="data:image/jpeg;base64,{image_base64(im)}">'

# Main

In [22]:
import time

since = time.time()


myseed(seed=42)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# DATA
train, mapping = load_data(data_dir='/content/drive/My Drive/skin')
unknown, _= load_data(data_dir='/content/drive/My Drive/dermatology')
data_transforms, df, image_datasets, dataloaders, dataset_sizes = get_loaders(size=224, batch_size=1, num_workers=4)

# MODEL
model = mymodel(unfreeze=False)
total_params, total_trainable_params = get_num_parameters(model) 
print(f'{total_params:,} total parameters.')
print(f'{total_trainable_params:,} training parameters.')

last_df = make_predictions(mapping, df=train, phase='train')
display(last_df.head())

time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

Loaded pretrained weights for efficientnet-b0
5,288,548 total parameters.
0 training parameters.


Unnamed: 0,filenames,label,label_code,features
0,/content/drive/My Drive/skin/skin/00000000.jpg,skin,1,"[-0.024662528, -0.25344232, 1.1171257, -0.1078..."
1,/content/drive/My Drive/skin/skin/00000001.jpg,skin,1,"[-0.26015037, -0.042534508, -0.17842826, 4.334..."
2,/content/drive/My Drive/skin/skin/00000002.jpg,skin,1,"[-0.061070863, 1.2369761, -0.2691417, 0.592121..."
3,/content/drive/My Drive/skin/skin/00000003.jpg,skin,1,"[-0.0011058161, -0.007300381, -0.045351334, -0..."
4,/content/drive/My Drive/skin/skin/00000004.jpg,skin,1,"[-0.10162022, -0.18533117, -0.22678417, -0.035..."


Training complete in 0m 12s


## K-means clustering 

In [23]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score


# Cluster
X = last_df['features'].tolist()
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
y_pred = kmeans.labels_
y_true = last_df['label_code'].tolist()

# Calculate the accuracy of the clusters
print(f'Accuracy score {accuracy_score(y_true, y_pred)}')


last_df['image'] = last_df.filenames.map(lambda f: get_thumbnail(f))
last_df['cluster'] = y_pred
last_df['new_label'] = last_df['cluster'].map(mapping)

# Show a df with images
HTML(last_df[['label', 'label_code', 'new_label', 'cluster','image']].to_html(formatters={'image': image_formatter}, escape=False))

Accuracy score 0.4298245614035088


Unnamed: 0,label,label_code,new_label,cluster,image
0,skin,1,not_skin,0,
1,skin,1,not_skin,0,
2,skin,1,not_skin,0,
3,skin,1,not_skin,0,
4,skin,1,skin,1,
5,skin,1,not_skin,0,
6,skin,1,not_skin,0,
7,skin,1,not_skin,0,
8,skin,1,skin,1,
9,skin,1,not_skin,0,
