In [1]:
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from functools import partial

In [2]:
# Layout and text settings for the comparison collage. The meanings follow the
# `image_matrix_draw` parameters of the same name. In the cells shown here these
# module-level values are not passed to the drawing functions, which declare their
# own defaults.
SZ = 512                        # side length of one image tile
MRGN = 20                       # margin between the three major column groups
SMALL_MRGN = 0                  # margin between neighbouring columns
HDR = 100                       # height of the header strip
BKGR_COLOR = (0, 0, 0)          # background colour as an RGB tuple
FNT = "RobotoSlab-Regular.ttf"  # TrueType font file
FNT_SZ = 45                     # font size

In [3]:
# Text helper: draws a label with a thin black outline so it stays readable on any tile.
def write_file_name(results_image, text, x, y, x_shift=0, y_shift=0, font_cr='rgb(0,0,0)', FNT_SZ = 45, font='RobotoSlab-Regular.ttf'): 
    """Write `text` onto `results_image` with a one-pixel black outline.

    The text is first drawn in black four times, shifted by one pixel to the left,
    right, top and bottom, and then once more in `font_cr` at the exact position.

    Parameters:
    results_image : image that is drawn on (modified in place)
    text          : string to write
    x, y          : anchor position of the text
    x_shift       : offset added to x before drawing
    y_shift       : offset added to y before drawing
    font_cr       : fill colour of the foreground text
    FNT_SZ        : font size handed to ImageFont.truetype
    font          : font file handed to ImageFont.truetype
    """
    typeface = ImageFont.truetype(font, FNT_SZ)
    left = x + x_shift
    top = y + y_shift
    pen = ImageDraw.Draw(results_image)

    # Outline pass: left, right, up, down.
    for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        pen.text((left + dx, top + dy), text, font=typeface, fill='rgb(0,0,0)')

    # Foreground pass in the requested colour.
    pen.text((left, top), text, fill=font_cr, font=typeface)

In [4]:
def _sample_file_names(file_names, limit):
    """Return up to `limit` entries of the pandas Series `file_names`, sampled without replacement."""
    if len(file_names) == 0:
        return []
    return list(file_names.sample(n=min(limit, len(file_names)), replace=False))


def _load_tiles(file_names, slots, INPUT_PATH, SZ, BKGR_COLOR):
    """Open `file_names` as thumbnails and pad with blank tiles until `slots` entries exist.

    Returns (images, names); padding entries carry an empty name.
    """
    images, names = [], []
    for i in range(slots):
        if i < len(file_names):
            name = file_names[i]
            img = Image.open(os.path.join(INPUT_PATH, name))
        else:
            name = ""
            img = Image.new('RGB', (SZ, SZ), BKGR_COLOR)
        img.thumbnail((SZ, SZ), Image.LANCZOS)  # shrink in place, keeping the aspect ratio
        images.append(img)
        names.append(name)
    return images, names


def _draw_grid(results_image, images, names, first_col, text_color, SZ, MRGN, SMALL_MRGN, HDR, FNT_SZ, Font):
    """Paste up to 12 tiles row by row into the 4 x 3 grid starting at column `first_col` and label each one."""
    cells = [(row, col) for row in range(4) for col in range(first_col, first_col + 3)]
    for (row, col), img, name in zip(cells, images, names):
        # Columns 1-3 sit behind one major margin, columns 4-6 behind two.
        x = col*(SZ+SMALL_MRGN) + (col//4 + 1)*MRGN
        y = row*SZ + HDR
        results_image.paste(img, (x, y))
        write_file_name(results_image, name, x, y, x_shift=5, y_shift=-35,
                        font_cr=text_color, FNT_SZ=FNT_SZ, font=Font)


def image_matrix_draw(val_image_df, train_df, INPUT_PATH, SZ=512, MRGN=20, SMALL_MRGN=0, HDR=100, BKGR_COLOR=(245,245,245), Font='RobotoSlab-Regular.ttf', FNT_SZ=25, top_k=None):
    """
    Build one collage image for exploring a single prediction of a classifier.

    The validation image is compared with train images of its predicted class and,
    when the prediction is wrong, with train images of its ground truth class.
    The collage has 7 columns of 4 tiles each, below a header strip:

    The 1st column holds 4 copies of the validation image.
    The 2nd to 4th columns hold randomly sampled images of the predicted class.
    The 5th to 7th columns hold randomly sampled images of the ground truth class.
    When the prediction is correct there is no separate ground truth group, so up to
    24 images of the predicted class are sampled and the 13th onwards fill columns 5-7.
    Unused tiles are filled with the background colour.

    Header and predicted-class labels are green for a correct prediction and purple
    for a wrong one; ground truth labels are blue.

    Parameters:
    val_image_df    :   pandas Series describing the validation image:
                            Image: file name
                            Id   : ground truth label
                            nbs  : predicted label
                            d    : score (anything accepted by float())
    train_df        :   pandas DataFrame with the labels of the whole dataset (train + validation):
                            Image: file name
                            Id   : label
    INPUT_PATH      :   Directory that contains all dataset images
    SZ              :   Size of each image tile
    MRGN            :   Margin between the 3 major column groups (validation, predicted class, ground truth class)
    SMALL_MRGN      :   Margin between neighbouring columns
    HDR             :   Height of the header strip
    BKGR_COLOR      :   Background color
    Font            :   Font file used for all written text
    FNT_SZ          :   Font size
    top_k           :   Rank shown in the header as "top-<k>"; when None, the notebook-level K is used

    Returns:
    The composed image.
    """
    if top_k is None:
        top_k = K  # notebook-level setting, read at call time

    prediction_correct = val_image_df.Id == val_image_df.nbs

    # Sample the file names to show. `replace=False` means no file is picked twice.
    predicted_pool = train_df.Image[train_df.Id == val_image_df.nbs]
    if prediction_correct:
        predicted_names = _sample_file_names(predicted_pool, 24)
        ground_truth_names = []
    else:
        predicted_names = _sample_file_names(predicted_pool, 12)
        ground_truth_names = _sample_file_names(train_df.Image[train_df.Id == val_image_df.Id], 12)

    results_image = Image.new('RGB', (SZ*7+MRGN*2+SMALL_MRGN*6, SZ*4+HDR), BKGR_COLOR)

    # First column: the validation image, repeated on every row.
    val_img = Image.open(os.path.join(INPUT_PATH, val_image_df.Image))
    val_img.thumbnail((SZ, SZ), Image.LANCZOS)
    for row in range(4):
        results_image.paste(val_img, (0, row*SZ+HDR))

    preds_imgs, preds_file_names = _load_tiles(predicted_names, 24, INPUT_PATH, SZ, BKGR_COLOR)
    if ground_truth_names:
        right_imgs, right_names = _load_tiles(ground_truth_names, 12, INPUT_PATH, SZ, BKGR_COLOR)
    else:
        # No ground truth group: continue with the second half of the predicted-class tiles.
        right_imgs, right_names = preds_imgs[12:], preds_file_names[12:]

    # Purple text for a wrong prediction, green for a correct one.
    font_color = 'rgb(0,150,0)' if prediction_correct else 'rgb(148,0,211)'
    right_color = font_color if prediction_correct else 'rgb(0,0,150)'

    # Header:
    #   file name : (real class)
    #   top-k   score : (pred class)
    write_file_name(results_image, f"{val_image_df.Image} : ({val_image_df.Id})", 10, 10,
                    font_cr=font_color, FNT_SZ=FNT_SZ, font=Font)
    write_file_name(results_image, f"top-{top_k}    {float(val_image_df.d):.3f}   : ({val_image_df.nbs})", 10, 45,
                    font_cr=font_color, FNT_SZ=FNT_SZ, font=Font)
    write_file_name(results_image, f"predicted ({val_image_df.nbs})", SZ*2.25+MRGN+SMALL_MRGN, 25,
                    font_cr=font_color, FNT_SZ=FNT_SZ, font=Font)
    if not prediction_correct:
        write_file_name(results_image, f"ground truth ({val_image_df.Id})", SZ*5.25+MRGN*2+SMALL_MRGN*3, 25,
                        font_cr='rgb(0,0,150)', FNT_SZ=FNT_SZ, font=Font)

    # Columns 2-4: predicted class. Columns 5-7: ground truth class (or more predicted-class tiles).
    _draw_grid(results_image, preds_imgs, preds_file_names, 1, font_color,
               SZ, MRGN, SMALL_MRGN, HDR, FNT_SZ, Font)
    _draw_grid(results_image, right_imgs, right_names, 4, right_color,
               SZ, MRGN, SMALL_MRGN, HDR, FNT_SZ, Font)

    return results_image


In [5]:
# Which nearest neighbour to inspect: 1 = closest, 2 = second closest, 3 = ...
K = 1
# Number of worker threads used when saving the collages
NW = 16

# Dataset images, their labels, and the root folder for the generated collages
INPUT_PATH = "../input/humpback-whale-identification/train/"
LABELS = "../input/humpback-whale-identification/train.csv"
OUTPUT_PATH = "../output/"



In [6]:
# Labels of the dataset: columns Image, Id
train_df = pd.read_csv(LABELS)

# Validation results: columns Image, Id, nbs, d
# (image file name, label, space-separated nearest neighbours, space-separated distances)
data = pd.read_csv('val-hard2.csv')

# Keep only the K-th nearest neighbour and its distance for every validation image.
# To inspect another rank, change K and run again; the collages for each K are
# saved in their own folder.
nearest_nb = pd.DataFrame(data['nbs'].apply(lambda cell: str.split(cell, " ")[K-1]))
nearest_nb_distance = pd.DataFrame(data['d'].apply(lambda cell: str.split(cell, " ")[K-1]))

# One row per validation image: Image, Id, nbs (K-th neighbour), d (its distance)
topk = data[['Image','Id']].join(nearest_nb).join(nearest_nb_distance)

In [7]:
# Output folders for the collages: one for correct and one for incorrect predictions,
# both specific to the current K.
CORRECT_PRED_PATH = f"{OUTPUT_PATH}top{K}_correct/"
INCORRECT_PRED_PATH = f"{OUTPUT_PATH}top{K}_incorrect/"

# exist_ok=True reuses a folder that is already there; with exist_ok=False an
# existing folder would raise an error instead.
os.makedirs(CORRECT_PRED_PATH, exist_ok=True)
os.makedirs(INCORRECT_PRED_PATH, exist_ok=True)

In [8]:
def save_topk_images(val_image_df, train_df, INPUT_PATH, CORRECT_PRED_PATH, INCORRECT_PRED_PATH, **kwargs):
    """Draw the collage for one validation image and save it as a JPEG.

    The file goes to CORRECT_PRED_PATH when the predicted label (`nbs`) equals the
    ground truth label (`Id`), otherwise to INCORRECT_PRED_PATH. It is named
    score_predclass_realclass_validationfilename. Extra keyword arguments are
    forwarded to `image_matrix_draw`.
    """
    collage = image_matrix_draw(val_image_df, train_df, INPUT_PATH, **kwargs)

    is_hit = val_image_df.Id == val_image_df.nbs
    target_dir = CORRECT_PRED_PATH if is_hit else INCORRECT_PRED_PATH
    file_name = '_'.join((val_image_df.d, val_image_df.nbs, val_image_df.Id, val_image_df.Image))

    collage.save(target_dir + file_name, format='JPEG')

In [38]:
# Bind every argument except the validation row, so `func` can be mapped over rows.
func = partial(
    save_topk_images,
    train_df=train_df,
    INPUT_PATH=INPUT_PATH,
    CORRECT_PRED_PATH=CORRECT_PRED_PATH,
    INCORRECT_PRED_PATH=INCORRECT_PRED_PATH,
    BKGR_COLOR='rgb(100,100,100)',
)

In [39]:
%%time 
with ThreadPoolExecutor(NW) as e:
    # Render and save one collage per validation row, NW rows at a time.
    # For a quick smoke test, iterate over `topk.head(10).iterrows()` instead.
    #
    # Executor.map only re-raises a worker's exception when its result is retrieved,
    # so the results are drained with list(); otherwise a failure (missing image,
    # missing font, ...) would pass silently and the collage would just be absent.
    list(e.map(func, [val_image_df for (_, val_image_df) in topk.iterrows()]))
         

CPU times: user 13 s, sys: 192 ms, total: 13.2 s
Wall time: 7.54 s
