In [1]:
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from functools import partial
import time

In [2]:
def expand_path(path, search_dirs=None):
    """
    Resolve an image file name to a path inside one of the image folders.

    Parameters:
    path        :    file name of the image (e.g., 'cfcc21b0e.jpg')
    search_dirs :    optional iterable of directory prefixes, tried in order. Each prefix is
                     concatenated with `path` as-is, so it must end with a path separator.
                     When omitted, the notebook-level TRAIN and TEST constants are used.

    Returns:
    The first `prefix + path` that is an existing file, otherwise `path` unchanged.
    """
    if search_dirs is None:
        # Looked up at call time so the function can be defined before the config cell runs
        search_dirs = (TRAIN, TEST)
    for prefix in search_dirs:
        candidate = prefix + path
        if os.path.isfile(candidate):
            return candidate
    # The fall-through used to return an undefined name (`p`) and raised NameError
    return path

In [3]:
# draw a text on the image, surrounded by a one pixel black outline
def write_on_image(results_image, text, x, y, x_shift=0, y_shift=0, font_cr='rgb(0,0,0)', FNT_SZ = 45, font='RobotoSlab-Regular.ttf'): 
    """
    Write `text` on `results_image` (modified in place) at (x + x_shift, y + y_shift).

    The text is first drawn four times in black, shifted by one pixel to the left, right,
    top and bottom, and then once in `font_cr` on top, which produces the outline effect.

    Parameters:
    results_image :    PIL image to draw on
    text          :    string to write
    x, y          :    anchor position of the text
    x_shift       :    offset added to x
    y_shift       :    offset added to y
    font_cr       :    fill color of the text itself
    FNT_SZ        :    font size
    font          :    TrueType font file passed to ImageFont.truetype
    """
    typeface = ImageFont.truetype(font, FNT_SZ)
    left, top = x + x_shift, y + y_shift
    canvas = ImageDraw.Draw(results_image)
    # black outline: left, right, above, below
    for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        canvas.text((left + dx, top + dy), text, font=typeface, fill='rgb(0,0,0)')
    # the text itself on top of the outline
    canvas.text((left, top), text, fill=font_cr, font=typeface)

In [4]:
# When multiple neighbours are passed to the plotting function, a lookup can return a
# pandas Series in some cases instead of a single value (e.g., a string).

def extract_from_series(sr):
    """
    Return the first element of `sr` if it is a pandas Series, otherwise `sr` itself.

    The first element is taken by position (`iloc[0]`), so it does not matter how the
    Series is indexed. An empty Series is returned unchanged.
    """
    if isinstance(sr, pd.Series):
        if len(sr) == 0:
            return sr
        # Positional access: `sr[0]` is a label lookup and failed (silently, because of a
        # bare except) whenever the index did not contain the label 0.
        return sr.iloc[0]
    return sr

In [5]:
def draw_box(results_image, x, y, size, enlarge=0, line_color= 'rgb(0,0,150)'):
    """
    Draw a rectangle outline on `results_image` (modified in place).

    Parameters:
    results_image :    PIL image to draw on
    x, y          :    top-left corner of the box
    size          :    (width, height) of the box
    enlarge       :    number of pixels the box is grown by on every side
    line_color    :    color of the outline
    """
    width, height = size[0], size[1]
    # The height used to be taken from size[0], which is only right for square boxes
    area = (x-enlarge, y-enlarge, x+width+enlarge, y+height+enlarge)
    # Honor line_color; the outline used to be hard-coded to black
    ImageDraw.Draw(results_image).rectangle(area, outline=line_color)

In [6]:
def image_matrix_draw(val_image_df, train_df, TRAIN, TEST, trn_vs_val_df = None, topk = False, ascending = True, 
                      dcut = 0, dcut_label = '', SZ=512, MRGN=20, SMALL_MRGN=0, HDR=100,
                      BKGR_COLOR=(245,245,245), Font='RobotoSlab-Regular.ttf', FNT_SZ=25):
    """
    This is a function to explore the predictions of a classifier. It compares the image that has been classified
    (from the validation or test set) with train images of the predicted class and with train images of the
    ground truth class of the image in question.
    It draws 7 columns of images. Each column has 4 images, with size (SZ).

    The 1st column is composed of 4 replicates of the validation image that we want to explore.
    The 2nd to 4th columns are composed of train images that have the predicted class.
    The 5th to 7th columns are composed of train images that have the ground truth class of the validation image.
    When the ground truth columns are not shown (test set, or correct single prediction) the 5th to 7th columns
    continue the predicted images (up to 24 in total).

    # For the pred columns we have 3 states:
    # 1. sort by topk. Req. : topk == True && hasattr(val_image_df, 'nbs_Image')
    # 2. sort by class. Req.: topk == False (or hasattr(val_image_df, 'nbs_Image')=False) &&  trn_vs_val_df is not None
    # 3. unsorted randomly picked from pred class.

    # For the ground truth columns:
    # 1. sort by class. Req.: trn_vs_val_df is not None
    # 2. unsorted randomly picked from the ground truth class

    Parameters:
    val_image_df    :    pandas series that has the following information about the validation/test image
                         (the series is copied, the caller's object is not modified):
                            Image     : file name of the validation image
                            Id        : ground truth label (if Id is not present, the image is regarded as a test image
                                        instead of a validation image)
                            nbs       : predicted labels (space separated if multiple labels are provided for topk=True,
                                         otherwise only one).
                            d         : score (space separated if multiple labels are provided for topk=True,
                                         otherwise only one)
                            nbs_Image : file names of the images of the predicted labels (space separated). If this entry
                                        is not provided, topk plotting is disabled.

    train_df        :    pandas dataframe of the whole dataset's labels (train without validation) with the columns:
                            Image: file name
                            Id   : label
    TRAIN, TEST     :    image folders. NOTE(review): not read in this function; expand_path() is called with the
                         file name only.

    trn_vs_val_df   :    pandas dataframe matrix of scores between all train images and validation images.
                         If provided, images are sorted ascending by the score column of the validation image.
                         Columns:
                            Image     : file name of the train images
                            Id        : ground truth label
                            file names: multiple columns: one column per validation file name

    topk            :    True         : Plotting the images of the top-1, top-2, top-3  ...  instead of analyzing only one of the top-k
                         False        : Analyzing a single prediction (top-1 or top-2 or ...). Which one is determined by the
                                        data that is passed in val_image_df

    ascending       :    accepted but currently not used (sorting is always ascending)
    dcut            :    accepted but currently not used
    dcut_label      :    accepted but currently not used
    SZ              :    Size of each image
    MRGN            :    Size of the margin between the major 3 column groups
    SMALL_MRGN      :    Size of the margin between each of the 7 columns
    BKGR_COLOR      :    Background color
    HDR             :    Height of the header of the total image
    Font            :    Font of the written text
    FNT_SZ          :    Font size

    Returns:
    PIL image with the whole matrix.

    Raises:
    ValueError      :    for a validation image, if the number of neighbours in 'nbs' does not match the plotting mode
                         (exactly one for single prediction mode, more than one for topk mode).

    NOTE: in single prediction mode the header text reads the notebook-level constant K.

    TODO: ascending = False
    dcut <> 0 and dcut_label <>'' else if dcut <>0 and dcut_label ='' raise error
    """

    # Work on a private copy: a label is added below for test images and the caller's row must stay untouched
    val_image_df = val_image_df.copy()

    test_set = not hasattr(val_image_df, 'Id')  # a test set has no labels info
    neighbours = val_image_df['nbs'].split(" ")

    if topk == True and hasattr(val_image_df, 'nbs_Image'):
        pred_sorted_by_topk = True
    else:
        pred_sorted_by_topk = False  # Without nbs_Image file name info we cannot plot as topk so disable topk or if topk chosen as False
        topk = False

    # Cannot sort anything by class if trn_vs_val_df is not provided
    y_sorted_by_class = trn_vs_val_df is not None
    pred_sorted_by_class = (topk == False) and (trn_vs_val_df is not None)

    if test_set:
        pred_correct = 0
        # To make this function work for a test set the top-1 prediction is used as label,
        # so it behaves like a validation image.
        val_image_df['Id'] = neighbours[0]
    elif topk == False:
        if len(neighbours) != 1:
            # Used to fail later with an UnboundLocalError on pred_correct
            raise ValueError('You should pass only one neighbor image data in the columns(nbs, d, nbs_Image) of val_image_df '
                             'when topk is False (topk is also disabled when nbs_Image is missing)')
        pred_correct = 1 if val_image_df.Id == val_image_df.nbs else -1
    else:
        if len(neighbours) == 1:
            raise ValueError('if you want to plot all topk you should pass more than one neighbor image data in the '
                             'columns(nbs, d, nbs_Image) of val_image_df  [space seperated]')
        pred_correct = 2 if neighbours[0] == val_image_df['Id'] else -2

    if test_set:
        y_cls_show = False  # y_class is not shown, so instead those columns will continue the pred columns
    elif topk:
        y_cls_show = True   # topk on a validation image: always show the ground truth columns
    elif pred_correct in (0, 1, 2):
        # the pred columns already show the ground truth class, so continue them instead of repeating
        y_cls_show = False
    else:
        y_cls_show = True

    pred_scores   = []
    pred_trn_imgs = []
    pred_classes  = []
    y_scores      = []
    y_trn_imgs    = []
    y_classes     = []

    if pred_sorted_by_class == False and pred_sorted_by_topk == False: # pick randomly for preds columns
        pred_trn_imgs = train_df.Image[train_df.Id == val_image_df.nbs]
        y_trn_imgs = train_df.Image[train_df.Id == val_image_df.Id]

        if not y_cls_show:
            pred_trn_imgs = pd.DataFrame(pred_trn_imgs.sample(n=min(24, len(pred_trn_imgs)), replace = False))
            y_trn_imgs = []
        else:
            pred_trn_imgs = pd.DataFrame(pred_trn_imgs.sample(n=min(12, len(pred_trn_imgs)), replace = False))
            y_trn_imgs = pd.DataFrame(y_trn_imgs.sample(n=min(12, len(y_trn_imgs)), replace = False))
            if trn_vs_val_df is not None:
                y_scores = trn_vs_val_df[trn_vs_val_df.Image.isin(y_trn_imgs.Image)][[val_image_df.Image]]
                y_trn_imgs =  trn_vs_val_df[trn_vs_val_df.Image.isin(y_trn_imgs.Image)][['Image']] # take the images in the same order as the scores
                y_scores = y_scores.reset_index()
            y_trn_imgs = y_trn_imgs.reset_index()

        if trn_vs_val_df is not None:
            pred_scores = trn_vs_val_df[trn_vs_val_df.Image.isin(pred_trn_imgs.Image)][[val_image_df.Image]]
            pred_trn_imgs = trn_vs_val_df[trn_vs_val_df.Image.isin(pred_trn_imgs.Image)][['Image']] # take the images in the same order as the scores
            pred_scores = pred_scores.reset_index()
        pred_trn_imgs = pred_trn_imgs.reset_index()
        pred_top1_img = "nothing"

    elif pred_sorted_by_class == True and pred_sorted_by_topk == False: # sort images picked from pred class images
        temp_df = trn_vs_val_df[['Image', 'Id',val_image_df.Image]]
        pred_trn_imgs = temp_df[temp_df.Id.isin([val_image_df.nbs])].sort_values(by=[val_image_df.Image])
        if not y_cls_show:
            pred_trn_imgs, pred_scores = pred_trn_imgs.head(24)['Image'], pred_trn_imgs.head(24)[val_image_df.Image]
            y_trn_imgs = []
        else:
            pred_trn_imgs, pred_scores = pred_trn_imgs.head(12)['Image'], pred_trn_imgs.head(12)[val_image_df.Image]
            temp_df = trn_vs_val_df[['Image', 'Id', val_image_df.Image]]
            y_trn_imgs = temp_df[temp_df.Id.isin([val_image_df.Id])].sort_values(by=[val_image_df.Image])
            y_trn_imgs, y_scores = y_trn_imgs.head(12)['Image'], y_trn_imgs.head(12)[val_image_df.Image]
            y_scores = y_scores.reset_index()
            y_trn_imgs = y_trn_imgs.reset_index()
        pred_scores = pred_scores.reset_index()
        pred_trn_imgs = pred_trn_imgs.reset_index()
        pred_top1_img = pred_trn_imgs.loc[0].Image
    elif pred_sorted_by_topk == True: # sort images in the pred column by top-1, top-2,...
        pred_trn_imgs = pd.DataFrame(val_image_df['nbs_Image'].split(" "), columns=['Image'])
        pred_scores   = pd.DataFrame(val_image_df['d'].split(" "), columns=[[val_image_df.Image]])
        pred_classes  = pd.DataFrame(val_image_df['nbs'].split(" "), columns=['nbs'])
        pred_top1_img = "nothing"

        if not y_cls_show:
            pred_trn_imgs = pred_trn_imgs.head(24)
            pred_scores   = pred_scores.head(24)
            pred_classes = pred_classes.head(24)
            y_trn_imgs = []
        else:
            pred_trn_imgs = pred_trn_imgs.head(12)
            pred_scores   = pred_scores.head(12)
            pred_classes = pred_classes.head(12)
            if y_sorted_by_class == True:
                temp_df = trn_vs_val_df[['Image', 'Id', val_image_df.Image]]
                y_trn_imgs = temp_df[temp_df.Id.isin([val_image_df.Id])].sort_values(by=[val_image_df.Image])
                y_trn_imgs, y_scores = y_trn_imgs.head(12)['Image'], y_trn_imgs.head(12)[val_image_df.Image]
                y_scores = y_scores.reset_index()
                y_trn_imgs = y_trn_imgs.reset_index()
            else:
                y_trn_imgs = train_df.Image[train_df.Id == val_image_df.Id]
                y_trn_imgs = pd.DataFrame(y_trn_imgs.sample(n=min(12, len(y_trn_imgs)), replace = False))
                if trn_vs_val_df is not None:
                    y_scores = trn_vs_val_df[trn_vs_val_df.Image.isin(y_trn_imgs.Image)][[val_image_df.Image]]
                    y_trn_imgs =  trn_vs_val_df[trn_vs_val_df.Image.isin(y_trn_imgs.Image)][['Image']] # take the images in the same order as the scores
                    y_scores = y_scores.reset_index()
                y_trn_imgs = y_trn_imgs.reset_index()

    results_image = Image.new('RGB', (SZ*7+MRGN*2+SMALL_MRGN*6, SZ*4+HDR), BKGR_COLOR)

    img = Image.open(expand_path(val_image_df.Image))
    img.thumbnail((SZ,SZ),Image.LANCZOS)  # LANCZOS : means Antialias

    # draw the validation image
    for i in range(4):
        results_image.paste(img, (0, i*SZ+HDR))

    # collect up to 24 predicted-class thumbnails (padded with blank tiles)
    preds_imgs = []
    preds_file_names = []
    pred_scores_list = []
    pred_class_list = []
    scr = ''
    cls = ''
    for i in range(24):
        if i < len(pred_trn_imgs):
            name = extract_from_series(pred_trn_imgs.loc[i].Image)
            img = Image.open(expand_path(name))
            if len(pred_scores) > 0:
                scr  = extract_from_series(pred_scores.loc[i][val_image_df.Image])
            if len(pred_classes) > 0:
                cls  = extract_from_series(pred_classes.loc[i]['nbs'])
            else:
                cls  = ""
        else:
            img = Image.new('RGB', (SZ, SZ), BKGR_COLOR)
            name = ""
            scr  = ""
            cls  = ""
        img.thumbnail((SZ,SZ),Image.LANCZOS)
        preds_imgs.append(img)
        preds_file_names.append(name)
        pred_scores_list.append(scr)
        pred_class_list.append(cls)

    # collect up to 12 ground-truth-class thumbnails (padded with blank tiles)
    ground_truth_imgs = []
    ground_truth_file_names = []
    ground_truth_scores_list = []
    ground_truth_class_list = []
    scr = ''
    cls = ''

    if len(y_trn_imgs) > 0:
        for i in range(12):
            if i < len(y_trn_imgs):
                name = extract_from_series(y_trn_imgs.loc[i].Image)
                img = Image.open(expand_path(name))
                if len(y_scores) > 0:
                    scr  = extract_from_series(y_scores.loc[i][val_image_df.Image])
                if len(y_classes) > 0:
                    cls  = extract_from_series(y_classes.loc[i]['nbs'])
            else:
                img = Image.new('RGB', (SZ, SZ), BKGR_COLOR)
                name = ""
                scr  = ""
                cls  = ""
            img.thumbnail((SZ,SZ),Image.LANCZOS)
            ground_truth_imgs.append(img)
            ground_truth_file_names.append(name)
            ground_truth_scores_list.append(scr)
            ground_truth_class_list.append(cls)

    # Color the fonts with blue for a test image, green if the prediction is correct, purple if it is wrong
    if test_set:
        font_color  = 'rgb(0,0,180)'
    elif pred_correct > 0:
        font_color  = 'rgb(0,150,0)'
    else:
        font_color  = 'rgb(148,0,211)'

    def _put_text(text, x, y, x_shift=0, y_shift=0):
        # All labels share color, size and font; Font used to be ignored by the drawing calls
        write_on_image(results_image, text, x, y, x_shift=x_shift, y_shift=y_shift,
                       font_cr=font_color, FNT_SZ=FNT_SZ, font=Font)

    # Put at the header of the whole image:
    # file name : (real class)
    # top-x   score : (pred class)

    if test_set:
        _put_text(f"{val_image_df.Image}   (Test set)", 10, 5)
    else:
        _put_text(f"{val_image_df.Image} : ({val_image_df.Id})   (Validation set)", 10, 5)
    if topk:
        _put_text("top-k", 10, 40)
    else:
        # NOTE: K is the notebook-level constant (which K-th neighbour is being analyzed)
        _put_text(f"top-{K}    {float(val_image_df.d):.3f}   : ({val_image_df.nbs})", 10, 40)
    if not pred_sorted_by_topk:
        _put_text(f"predicted ({val_image_df.nbs})", SZ*2.25+MRGN+SMALL_MRGN, 5)
    else:
        _put_text(f"predicted top-1 ({val_image_df.nbs.split(' ')[0]})", SZ*2.25+MRGN+SMALL_MRGN, 5)
    if y_cls_show:
        if not pred_sorted_by_topk:
            _put_text(f"ground truth ({val_image_df.Id})", SZ*5.25+MRGN*2+SMALL_MRGN*3, 5)
        else:
            _put_text(f"ground truth ({val_image_df.Id.split(' ')[0]})", SZ*5.25+MRGN*2+SMALL_MRGN*3, 5)

    # draw predicted class train images (columns 2 to 4)
    index = 0
    for i in range(4):
        for j in range(1,4):
            x = j*(SZ+SMALL_MRGN)+int(j/4+1)*MRGN
            y = i*SZ+HDR
            results_image.paste(preds_imgs[index], (x, y))
            if len(pred_scores_list)>0:
                _put_text(preds_file_names[index], x, y, x_shift=5, y_shift=-70)
                _put_text(str(pred_scores_list[index]), x, y, x_shift=5, y_shift=-35)
            else:
                _put_text(preds_file_names[index], x, y, x_shift=5, y_shift=-35)
            if len(pred_class_list)>0:
                _put_text(str(pred_class_list[index]), x, y, x_shift=5, y_shift=5)

            if pred_top1_img == preds_file_names[index]:
                draw_box(results_image, x, y, size=(SZ,SZ), enlarge=0, line_color= font_color)
            index += 1
            if len(preds_imgs)<=index:
                break
        if len(preds_imgs)<=index:
            break

    # draw ground truth train images (columns 5 to 7)
    if len(y_trn_imgs) > 0:
        index = 0
        for i in range(4):
            for j in range(4,7):
                x = j*(SZ+SMALL_MRGN)+int(j/4+1)*MRGN
                y = i*SZ+HDR
                results_image.paste(ground_truth_imgs[index], (x, y))
                if len(ground_truth_scores_list)>0:
                    _put_text(ground_truth_file_names[index], x, y, x_shift=5, y_shift=-70)
                    _put_text(str(ground_truth_scores_list[index]), x, y, x_shift=5, y_shift=-35)
                else:
                    _put_text(str(ground_truth_file_names[index]), x, y, x_shift=5, y_shift=-35)
                if len(ground_truth_class_list)>0:
                    _put_text(str(ground_truth_class_list[index]), x, y, x_shift=5, y_shift=5)
                index += 1
                if len(ground_truth_imgs)<=index:
                    break
            if len(ground_truth_imgs)<=index:
                break
    # no ground truth columns: continue with predicted images 13 to 24 instead
    elif len(preds_imgs) > 12:
        index = 12
        for i in range(4):
            for j in range(4,7):
                x = j*(SZ+SMALL_MRGN)+int(j/4+1)*MRGN
                y = i*SZ+HDR
                results_image.paste(preds_imgs[index], (x, y))
                if len(str(pred_scores_list[12]))>0:
                    _put_text(preds_file_names[index], x, y, x_shift=5, y_shift=-70)
                    _put_text(str(pred_scores_list[index]), x, y, x_shift=5, y_shift=-35)
                else:
                    _put_text(preds_file_names[index], x, y, x_shift=5, y_shift=-35)
                if len(str(pred_class_list[12]))>0:
                    _put_text(str(pred_class_list[index]), x, y, x_shift=5, y_shift=5)

                index += 1
                if len(preds_imgs)<=index:
                    break
            if len(preds_imgs)<=index:
                break

    return results_image

In [7]:
def save_single_topk_images(val_image_df, train_df, TRAIN, TEST, CORRECT_PRED_PATH, INCORRECT_PRED_PATH, TOPK_CORRECT_PRED_PATH,
                            TOPK_INCORRECT_PRED_PATH, TEST_SET_PRED_PATH, TEST_SET_TOPK_PRED_PATH, topk=False, **kwargs):
    """
    Draw the image matrix of one validation/test image with image_matrix_draw() and save it as JPEG.

    The output folder depends on the kind of image:
        validation image, topk=False : CORRECT_PRED_PATH if Id == nbs, otherwise INCORRECT_PRED_PATH
        validation image, topk=True  : TOPK_CORRECT_PRED_PATH if the top-1 neighbour == Id, otherwise TOPK_INCORRECT_PRED_PATH
        test image (no 'Id' entry)   : TEST_SET_PRED_PATH (topk=False) or TEST_SET_TOPK_PRED_PATH (topk=True)
    The folder is created if it does not exist. Paths are used as prefixes, so they must end with a path separator.

    The file is named: score_predclass_realclass_validationfilename (realclass is omitted for test images).
    For topk=True the score and the predicted class are the ones of the top-1 neighbour.

    Parameters:
    val_image_df    :    pandas series with Image, nbs, d and optionally Id and nbs_Image (see image_matrix_draw)
    train_df, TRAIN, TEST, topk and **kwargs are forwarded to image_matrix_draw.

    Raises:
    ValueError      :    if topk is False and more than one neighbour is passed, or topk is True and only one
                         neighbour is passed. This is checked before anything is drawn.
    """
    test_set = not hasattr(val_image_df, 'Id')  # a test set has no labels info
    neighbours = val_image_df['nbs'].split(" ")

    # Validate first: this used to run only after the (expensive) drawing, which itself fails on such input
    if topk == False:
        if len(neighbours) != 1:
            # means more than one neighbor passed
            raise ValueError('You should pass only one neighbor image data in the columns(nbs, d, nbs_Image)  of val_image_df  if you do not want to plot all topk: top-1, top2, top3 in the same result image')
    elif len(neighbours) == 1:
        # means only one neighbor passed
        raise ValueError('if you want to plot all topk you should pass more than one neighbor image data in the columns(nbs, d, nbs_Image) of val_image_df  [space seperated]')

    # Read everything that goes into the file name before drawing
    if topk == False:
        score = float(val_image_df.d)
        pred_label = val_image_df.nbs
    else:
        score = float(val_image_df.d.split(' ')[0])
        pred_label = neighbours[0]

    if test_set:
        out_dir = TEST_SET_PRED_PATH if topk == False else TEST_SET_TOPK_PRED_PATH
        file_name = f"{score:012.5f}_{pred_label}_{val_image_df.Image}"
    elif topk == False:
        out_dir = CORRECT_PRED_PATH if val_image_df.Id == val_image_df.nbs else INCORRECT_PRED_PATH
        file_name = f"{score:012.5f}_{pred_label}_{val_image_df.Id}_{val_image_df.Image}"
    else:
        out_dir = TOPK_CORRECT_PRED_PATH if neighbours[0] == val_image_df['Id'] else TOPK_INCORRECT_PRED_PATH
        file_name = f"{score:012.5f}_{pred_label}_{val_image_df.Id.split(' ')[0]}_{val_image_df.Image}"

    result = image_matrix_draw(val_image_df, train_df, TRAIN, TEST, topk=topk, **kwargs)

    if out_dir:
        os.makedirs(out_dir, exist_ok = True)
    result.save(f"{out_dir}{file_name}", format='JPEG')

## **Customize your options**

In [8]:
## CHOOSE your config of the output
# NOTE(review): the `partial(...)` built in the "CHOOSE one of the following 2 cases" cell does not
# forward these names (it only passes BKGR_COLOR='rgb(255,255,255)'), so image_matrix_draw() runs
# with its own defaults. Pass them there (SZ=SZ, MRGN=MRGN, ..., Font=FNT, FNT_SZ=FNT_SZ) to make
# the values below take effect.
SZ = 512                          # size of each image tile
MRGN = 20                         # margin between the 3 major column groups
SMALL_MRGN = 0                    # margin between each of the 7 columns
HDR = 100                         # height of the header of the result image
BKGR_COLOR = (0,0,0)              # background color
FNT = 'RobotoSlab-Regular.ttf'    # font file of the written text
FNT_SZ = 45                       # font size

In [9]:
# K selects the K-th (1-based) entry of the space separated 'nbs' / 'd' / 'nbs_Image' columns,
# names the output folders ('top<K>_correct/', ...) and is printed in the header of every result image.
K = 2          # CHOOSE top-K nearest neighbours (1,2,3...)
NW = 16        # number of workers (threads of the ThreadPoolExecutor that saves the images)

# Folders are used as plain string prefixes (folder + file name), so keep the trailing '/'
TRAIN = '../input/humpback-whale-identification/train/' # train images
TEST = '../input/humpback-whale-identification/test/'    # test images
LABELS = '../input/humpback-whale-identification/train.csv'
OUTPUT = '../output/'                                    # root folder of all saved result images

In [10]:
### CHOOSE to load either validation set or test set to analyze ####

### LOAD your dataset

# You can skip loading the 'trn_vs_val_df' scores matrix if you don't have it,
# but the analysis will then be by random pick rather than sorted by scores.
# NOTE: train_df below is derived from trn_vs_val_df, so it has to be loaded from another
# source (columns Image, Id) in that case.

# Load: Image, Id, nbs, d   (Image file name, label, nearest neighbours, distances)    
# val_df = pd.read_csv('Iofoss-COMMIT7-baseline-sz512-val-v7-iafoss-sz512-dcut17.csv')    # for validation set analysis
# trn_vs_val_df = pd.read_csv('trn_vs_val-v7-iafoss-sz512-dcut17.csv')                    # for validation set analysis
val_df = pd.read_csv('Iofoss-COMMIT7-baseline-sz512-test-v7-iafoss-sz512-dcut17.csv')   # for test set analysis
trn_vs_val_df = pd.read_csv('trn_vs_test-v7-iafoss-sz512-dcut17.csv')                   # for test set analysis

# Load dataframe: Image, Id (file name and label of every train image)

train_df = trn_vs_val_df[['Image','Id']]

In [11]:
# CHOOSE the K-th nearest neighbours: every result image is built from the K-th neighbour only

# To check another K-th nearest neighbour, change K and run again;
# the results of each K are saved in their own folder.
def _kth_entry(field):
    """Return the K-th (1-based) item of a space separated string."""
    return str.split(field, " ")[K-1]

has_nb_images = hasattr(val_df, 'nbs_Image')

nearest_nb = pd.DataFrame(val_df['nbs'].apply(_kth_entry))
nearest_nb_distance = pd.DataFrame(val_df['d'].apply(_kth_entry))
if has_nb_images:
    nearest_nb_img = pd.DataFrame(val_df['nbs_Image'].apply(_kth_entry))

# A validation set also carries the ground truth label, a test set only the file name
data = val_df[['Image','Id']] if hasattr(val_df, 'Id') else val_df[['Image']]

data = data.join(nearest_nb).join(nearest_nb_distance)
if has_nb_images:
    data = data.join(nearest_nb_img)
# data = val_df  # If you want to plot topk THEN you should do this, because we want all the info and not only the k-th neighbor

In [12]:
## CHOOSE your output paths

# all the resulted images are saved below OUTPUT, one folder per kind of result:
CORRECT_PRED_PATH = f"{OUTPUT}top{K}_correct/"
INCORRECT_PRED_PATH = f"{OUTPUT}top{K}_incorrect/"
TOPK_CORRECT_PRED_PATH = f"{OUTPUT}topk_correct/"
TOPK_INCORRECT_PRED_PATH = f"{OUTPUT}topk_incorrect/"
TEST_SET_PRED_PATH = f"{OUTPUT}Test_top{K}/"
TEST_SET_TOPK_PRED_PATH = f"{OUTPUT}Test_topk/"

# set exist_ok = False to raise an error, preventing accidental overwriting
for out_dir in (CORRECT_PRED_PATH, INCORRECT_PRED_PATH, TOPK_CORRECT_PRED_PATH, TOPK_INCORRECT_PRED_PATH):
    os.makedirs(out_dir, exist_ok = True)

In [13]:
############# CHOOSE one of the following 2 cases  ########

## in case you want top-1 or top-2 or ... according to your choice of K in the previous constants cell
# `func` takes one row of `data` and saves the result image of that row.
func = partial(save_single_topk_images,
               train_df=train_df,
               TRAIN=TRAIN,
               TEST=TEST,
               CORRECT_PRED_PATH=CORRECT_PRED_PATH,
               INCORRECT_PRED_PATH=INCORRECT_PRED_PATH,
               TOPK_CORRECT_PRED_PATH=TOPK_CORRECT_PRED_PATH, 
               TOPK_INCORRECT_PRED_PATH=TOPK_INCORRECT_PRED_PATH,
               TEST_SET_PRED_PATH=TEST_SET_PRED_PATH, 
               TEST_SET_TOPK_PRED_PATH=TEST_SET_TOPK_PRED_PATH,  # was bound to TEST_SET_PRED_PATH by mistake
               BKGR_COLOR='rgb(255,255,255)', 
               trn_vs_val_df=trn_vs_val_df, # pass None here if you dont have this matrix, but analysis of the ground truth images (if training set is analyzed) will be by random pick rather than sorted by scores
               topk=False)

### in case you want topk
# data = val_df 
# func = partial(save_single_topk_images,
#                train_df=train_df,
#                TRAIN=TRAIN,
#                TEST=TEST,
#                CORRECT_PRED_PATH=CORRECT_PRED_PATH,
#                INCORRECT_PRED_PATH=INCORRECT_PRED_PATH,
#                TOPK_CORRECT_PRED_PATH=TOPK_CORRECT_PRED_PATH, 
#                TOPK_INCORRECT_PRED_PATH=TOPK_INCORRECT_PRED_PATH,
#                TEST_SET_PRED_PATH=TEST_SET_PRED_PATH, 
#                TEST_SET_TOPK_PRED_PATH=TEST_SET_TOPK_PRED_PATH, 
#                BKGR_COLOR='rgb(255,255,255)', 
#                trn_vs_val_df=trn_vs_val_df, # pass None here if you dont have this matrix, but analysis of the ground truth images (if training set is analyzed) will be by random pick rather than sorted by scores
#                topk=True)           

In [14]:
# Preview the first row of `data`; a row like this is what `func` receives for every image
data.loc[0]

Image        cfcc21b0e.jpg
nbs              w_5665c80
d                6.0546875
nbs_Image    732b981cb.jpg
Name: 0, dtype: object

In [15]:
# Run for one image first, to check paths and settings before processing the whole dataset
func(data.loc[0])

In [16]:
### RUN FOR THE WHOLE DATASET ###
tic = time.time()

with ThreadPoolExecutor(NW) as e: 
    # To check that everything is okay on the 1st 20 images only, iterate over data.head(20).iterrows()
    futures = [(row_label, e.submit(func, val_image_df)) for (row_label, val_image_df) in data.iterrows()]

toc = time.time()
print("Saving all images time: ", (toc - tic) / 60.,' min.')

# An exception raised inside a worker is stored on its future. With e.map(...) whose result
# is never read, failed images went unnoticed, so report them explicitly.
failures = [(row_label, future.exception()) for (row_label, future) in futures if future.exception() is not None]
if failures:
    print(f"{len(failures)} of {len(futures)} images failed, first errors:")
    for row_label, error in failures[:5]:
        print(f"  row {row_label}: {error!r}")

Saving all images time:  18.66553550561269  min.


In [17]:
# Optional notification when the run is finished. `my_python_tricks` is a local helper module
# (not part of this notebook); only the one name that is used gets imported, and the notebook
# still runs to the end when the module is not available.
try:
    from my_python_tricks import notify_me
except ImportError:
    print('interpreter saving images finished (my_python_tricks not available, no notification sent)')
else:
    notify_me('interpreter saving images finished')