In [1]:
%reset -f
%reload_ext autoreload
%autoreload 2
# %matplotlib inline
%config Completer.use_jedi = False

In [2]:
from pathlib import Path
from fastbook import *
from fastai.vision.all import *
from pyanis.fileops.utils import copy_files

# Cleaning with FastAI

In [3]:
def get_iw_info(
    learn,
    ds_idx:int=0 # Index in `learn.dls`
) -> list:
    "For every image in `dls` `zip` it's `Path`, target and loss"
    dl = learn.dls[ds_idx].new(shuffle=False, drop_last=False)
    probs,targs,preds,losses = learn.get_preds(dl=dl, with_input=False, with_loss=True, with_decoded=True)
    targs = [dl.vocab[t] for t in targs]
    return L([dl.dataset.items,targs,losses]).zip()

def label_func(fname):
    return "insect" if "/test/insect" in str(fname) or "/train/insect" in str(fname) else "not-insect"

In [4]:
# path = Path(f'/mnt/HGST_4TB/Datasets/insect_vs_noinsect/insect_vs_noinsect_copy/')
path = Path(f'/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect/')

In [5]:
# CLEANING INVALID FILES
# fns = get_image_files(path)
# failed = verify_images(fns)
# assert not len(failed)
# failed.map(Path.unlink);

In [6]:
trials = 10
epochs = 6
th_loss = 2.5

for i in range(trials):
    insects = DataBlock(
        # provide a tuple where we specify what types we want for the independent and dependent variables:
        blocks=(ImageBlock, CategoryBlock), # What kinds of data we are working with
        get_items=get_image_files, # How to get the list of items
        splitter=RandomSplitter(valid_pct=0.2, seed=42), # How to create the validation set
        get_y=label_func, # How to label these items, gets the name of the folder a file is in
        item_tfms=Resize(150)) # Transformation: picture resize

    ds = insects.datasets(path)
    dls = insects.dataloaders(path) # the path where the images can be found
    # dls.train.show_batch(max_n=6, nrows=1) # check training set
    # dls.valid.show_batch(max_n=6, nrows=1) # check validation set
    # dls.show_batch(max_n=6, nrows=1) # check data

    print(f"Train data: {len(dls.train.dataset)}, Valid data: {len(dls.valid.dataset)}")
    learn = vision_learner(dls, densenet121, metrics=error_rate)
    learn.fine_tune(epochs)
    
    print("Gathering loss info..")
    iwis = get_iw_info(learn,0), get_iw_info(learn,1)
    df = pd.DataFrame(iwis[0]).sort_values(by=2, ascending=False)
    df1 = pd.DataFrame(iwis[1]).sort_values(by=2, ascending=False)
    df = pd.concat([df,df1])
    df[2] = df[2].apply(lambda x: float(x))

    print(f"Data to be moved: {len(df[df[2]>1.][0].tolist())}")
    copy_files(df[df[2]>th_loss][0].tolist(), '/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review', move_files=True)

Train data: 73177, Valid data: 18294


epoch,train_loss,valid_loss,error_rate,time
0,0.111504,0.076258,0.028589,03:48


epoch,train_loss,valid_loss,error_rate,time
0,0.055076,0.037277,0.014486,05:22
1,0.043866,0.030981,0.011916,05:23
2,0.023726,0.02893,0.010222,05:23
3,0.010261,0.02832,0.009894,05:23
4,0.003067,0.019004,0.006286,05:23
5,0.002252,0.019054,0.005794,05:23


Gathering loss info..


Data to be moved: 87


Moving files..: 100%|█████████████████████████| 46/46 [00:00<00:00, 6378.75it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 73140, Valid data: 18285


epoch,train_loss,valid_loss,error_rate,time
0,0.09872,0.081112,0.029751,03:48


epoch,train_loss,valid_loss,error_rate,time
0,0.060915,0.040338,0.015149,05:23
1,0.034331,0.037058,0.014383,05:24
2,0.020135,0.022584,0.008039,05:24
3,0.009881,0.026348,0.009188,05:24
4,0.00307,0.021026,0.006453,05:23
5,0.00107,0.020906,0.005852,05:23


Gathering loss info..


Data to be moved: 91


Moving files..: 100%|█████████████████████████| 45/45 [00:00<00:00, 6358.00it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 73104, Valid data: 18276


epoch,train_loss,valid_loss,error_rate,time
0,0.110691,0.076541,0.028562,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.051838,0.033417,0.013023,05:23
1,0.039605,0.0349,0.013843,05:23
2,0.023561,0.024699,0.010013,05:23
3,0.009509,0.018882,0.006511,05:24
4,0.002578,0.01324,0.004651,05:23
5,0.001013,0.011762,0.00383,05:24


Gathering loss info..


Data to be moved: 60


Moving files..: 100%|█████████████████████████| 23/23 [00:00<00:00, 6449.76it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 73086, Valid data: 18271


epoch,train_loss,valid_loss,error_rate,time
0,0.108569,0.072514,0.029117,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.050091,0.034904,0.014559,05:23
1,0.045563,0.053903,0.021236,05:23
2,0.01975,0.030983,0.010892,05:23
3,0.008487,0.023867,0.007498,05:23
4,0.002147,0.018283,0.00509,05:23
5,0.003069,0.016543,0.004597,05:23


Gathering loss info..


Data to be moved: 73


Moving files..: 100%|█████████████████████████| 40/40 [00:00<00:00, 6025.00it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 73054, Valid data: 18263


epoch,train_loss,valid_loss,error_rate,time
0,0.098182,0.077696,0.029185,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.050823,0.038232,0.015167,05:23
1,0.041367,0.030858,0.013251,05:23
2,0.02611,0.033965,0.012156,05:23
3,0.010115,0.027307,0.009746,05:24
4,0.001812,0.021569,0.00564,05:24
5,0.000322,0.01813,0.005257,05:23


Gathering loss info..


Data to be moved: 85


Moving files..: 100%|█████████████████████████| 42/42 [00:00<00:00, 6289.43it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 73021, Valid data: 18255


epoch,train_loss,valid_loss,error_rate,time
0,0.100792,0.074478,0.029964,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.043491,0.033812,0.013914,05:23
1,0.036177,0.037586,0.013859,05:24
2,0.020563,0.027407,0.009477,05:23
3,0.008411,0.019114,0.006409,05:23
4,0.003446,0.019255,0.005204,05:23
5,0.00099,0.018046,0.00493,05:23


Gathering loss info..


Data to be moved: 79


Moving files..: 100%|█████████████████████████| 34/34 [00:00<00:00, 6549.99it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 72994, Valid data: 18248


epoch,train_loss,valid_loss,error_rate,time
0,0.110969,0.077367,0.029757,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.04654,0.042155,0.016221,05:26
1,0.033051,0.036235,0.013645,05:24
2,0.02262,0.028921,0.010302,05:23
3,0.00883,0.018055,0.007124,05:23
4,0.001121,0.014002,0.004768,05:24
5,0.00134,0.018118,0.005261,05:23


Gathering loss info..


Data to be moved: 80


Moving files..: 100%|█████████████████████████| 40/40 [00:00<00:00, 3686.33it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 72962, Valid data: 18240


epoch,train_loss,valid_loss,error_rate,time
0,0.11193,0.074993,0.029167,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.053366,0.034095,0.013268,05:23
1,0.038003,0.036245,0.013487,05:23
2,0.023583,0.020254,0.008224,05:23
3,0.013259,0.016973,0.006414,05:23
4,0.002128,0.015667,0.004879,05:23
5,0.000871,0.015284,0.00477,05:23


Gathering loss info..


Data to be moved: 67


Moving files..: 100%|█████████████████████████| 40/40 [00:00<00:00, 3592.63it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 72931, Valid data: 18232


epoch,train_loss,valid_loss,error_rate,time
0,0.104513,0.077138,0.030276,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.048337,0.036456,0.013767,05:22
1,0.031454,0.032434,0.013273,05:23
2,0.025091,0.025057,0.009544,05:23
3,0.013768,0.021105,0.00713,05:23
4,0.002245,0.016008,0.005485,05:23
5,0.000259,0.013273,0.004607,05:23


Gathering loss info..


Data to be moved: 69


Moving files..: 100%|█████████████████████████| 33/33 [00:00<00:00, 3786.30it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





Train data: 72904, Valid data: 18226


epoch,train_loss,valid_loss,error_rate,time
0,0.100131,0.078947,0.029024,03:49


epoch,train_loss,valid_loss,error_rate,time
0,0.053703,0.034692,0.014485,05:23
1,0.035329,0.022634,0.008998,05:24
2,0.021207,0.027412,0.010534,05:24
3,0.005842,0.012412,0.004664,05:24
4,0.002353,0.010071,0.002963,05:24
5,0.00122,0.010197,0.003182,05:23


Gathering loss info..


Data to be moved: 48


Moving files..: 100%|█████████████████████████| 23/23 [00:00<00:00, 3176.77it/s]

Finished copying files to:
	/mnt/EE7455C074558BE9/backups/data_backups/images/insect_vs_noinsect_review





In [24]:
# import matplotlib.pyplot as plt

# interp = ClassificationInterpretation.from_learner(learn)
# plt.figure(figsize=(20,12))
# interp.plot_confusion_matrix(figsize=(20,12))
# interp.plot_top_losses(10, nrows=4)

In [25]:
# from fastai.vision.widgets import *
# #hide_output
# cleaner = ImageClassifierCleaner(learn)

ImageClassifierCleaner doesn't actually do the deleting or changing of labels for you; it just returns the indices of items to change.


In [72]:
# cleaner

In [None]:
# ImageClassifierCleaner doesn't actually do the deleting or changing of labels for you; it just returns the indices of items to change.

In [None]:
# # To delete (unlink) all images selected for deletion, we would run:

# for idx in cleaner.delete(): cleaner.fns[idx].unlink()

# # To move images for which we've selected a different category, we would run:

# for idx,cat in cleaner.change(): shutil.move(str(cleaner.fns[idx]), path/cat)



In [None]:
# df_hiloss = df[df[2]>2.5]

# plt.figure(figsize=(16,12));
# for i, (idx,path,target,loss) in enumerate(df_hiloss.itertuples()):
#     plt.subplot(9,9,i+1);
#     img = Image.open(path)
#     plt.imshow(img);plt.axis('off');plt.title(f"{target}/{loss:.2f}");