In [1]:
# Put these at the top of every notebook to get automatic reloading and inline plotting.
# autoreload mode 2 re-imports modules before each cell runs, so edits to imported
# .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the cell output.
%matplotlib inline

In [11]:
import pathlib
from sklearn.model_selection import train_test_split
from shutil import copy
import os
import pandas as pd


# Source folder of the Waldo dataset (one sub-folder per class) and the destination
# folder that will receive the train/valid/test split built from it.
DATAPATH = pathlib.Path( "/home/john/data/Hey-Waldo/64/")
OUTPUTPATH = pathlib.Path("/home/john/data/WheresWally/")

def create_experiment_folders(datapath, outputpath, test_size=0.2, valid_size=0.2, random_state=None):
    """Split a folder-per-class dataset into train/valid/test folder trees.

    ``datapath`` is expected to contain one sub-directory per class. The files of
    each class are split with ``train_test_split`` and *copied* (the originals are
    left in place) to ``outputpath/{train,valid,test}/<class>/``.

    Args:
        datapath: Directory holding one sub-directory per class (str or Path).
        outputpath: Directory under which the split is written (str or Path).
            Created, including missing parents, if it does not exist.
        test_size: Forwarded to ``train_test_split`` to carve the test set out of
            all files of a class.
        valid_size: Forwarded to ``train_test_split`` to carve the validation set
            out of the files that remain after the test split, so it is a share
            of the remainder, not of the whole class.
        random_state: Optional seed forwarded to both splits. The default
            ``None`` keeps the original behaviour of a different split per run.

    Returns:
        None. Progress is reported with ``print``.
    """
    datapath = pathlib.Path(datapath)
    outputpath = pathlib.Path(outputpath)

    # Only sub-directories are classes; stray files next to them would otherwise
    # blow up in os.listdir(datapath/cl). Sorted because os.listdir order is
    # arbitrary and a seeded split needs a stable input order.
    class_names = sorted(cl for cl in os.listdir(datapath) if (datapath/cl).is_dir())
    print(f"Found {len(class_names)} classes: {class_names} ")

    trn = outputpath/"train"
    val = outputpath/"valid"
    tst = outputpath/"test"
    for split_dir in (trn, val, tst):
        # parents=True also creates outputpath itself, however deep it is.
        split_dir.mkdir(parents=True, exist_ok=True)

    for cl in class_names:
        # Regular files only: shutil.copy cannot copy a nested directory.
        elems = sorted(e for e in os.listdir(datapath/cl) if (datapath/cl/e).is_file())
        trainVal_set, test_set = train_test_split(
            elems, test_size=test_size, random_state=random_state)
        train_set, val_set = train_test_split(
            trainVal_set, test_size=valid_size, random_state=random_state)
        print(f'{cl}: train: {len(train_set)} val: {len(val_set)} test: {len(test_set)}')
        for set_path, s in zip([trn, val, tst], [train_set, val_set, test_set]):
            # Create the class folder once per split instead of once per file.
            (set_path/cl).mkdir(exist_ok=True)
            for i in s:
                copy(datapath/cl/i, set_path/cl/i)

    print(f'Finished building experiment folders in {outputpath}')
    
    

In [66]:
# Build the train/valid/test folders for the Waldo dataset using the default split sizes.
create_experiment_folders(DATAPATH, OUTPUTPATH)

Found 2 classes: ['notwaldo', 'waldo'] 
notwaldo: train: 3415 val: 854 test: 1068
waldo: train: 24 val: 7 test: 8
Finished building experiment folders in /home/john/data/WheresWally


# Unfortunately, neither of the other two datasets provides its images in folder-per-class form :(

In [39]:
# Folder holding the ship images; the cells below list and copy files directly from it.
ships_path = pathlib.Path("/home/john/data/ships-in-satellite-imagery/shipsnet/")

In [41]:
# Names of every entry in the ships folder (os.listdir returns them in arbitrary order).
image_list = os.listdir(ships_path)

In [46]:
# Peek at one file name: its first character is what the next cell uses as the
# class label ("0" or "1"). Bind `s` here so the check does not depend on a loop
# variable left over from a cell that has not run yet on a fresh kernel.
s = image_list[0]
s[0] == "0"

True

In [50]:
# Sort the file names into two lists by their leading character:
# "0" -> not a ship, "1" -> ship. Anything else is reported and left out.
not_ship, is_ship = [], []
buckets = {'0': not_ship, '1': is_ship}
for s in image_list:
    bucket = buckets.get(s[0])
    if bucket is not None:
        bucket.append(s)
    else:
        print(f"ERROR {s}")
    
        

In [55]:
# Lay the two file lists out as fulldata/<label>/ so the ship images end up in
# the folder-per-class form that create_experiment_folders expects.
full_data_path = ships_path/"fulldata"
full_data_path.mkdir(exist_ok=True)
labelled_files = {"is_ship": is_ship, "not_ship": not_ship}
for label, files in labelled_files.items():
    folder = full_data_path/label
    folder.mkdir(exist_ok=True)
    for f in files:
        # copy, not move: the flat originals stay where they are
        copy(ships_path/f, folder/f)
    print(f"copied {len(files)} files to {folder}")
        

copied 700 files to /home/john/data/ships-in-satellite-imagery/shipsnet/fulldata/is_ship
copied 2100 files to /home/john/data/ships-in-satellite-imagery/shipsnet/fulldata/not_ship


In [61]:
# Same split as for Waldo, now on the per-label ship folders; the output path is passed as a plain string.
create_experiment_folders(full_data_path, "/home/john/data/sat-ships/")

Found 2 classes: ['is_ship', 'not_ship'] 
is_ship: train: 448 val: 112 test: 140
not_ship: train: 1344 val: 336 test: 420
Finished building experiment folders in /home/john/data/sat-ships


# Ok, now let's do the same for the Arabic ones...

We are going to extract the images from the image CSVs and write them into a folder. Then we will use the labels CSV and the `from_csv` method of `ImageClassifierData` to load them.


In [52]:
from tqdm import tqdm
import shutil

In [7]:
# Root folder of the Arabic handwritten characters dataset; all later paths are built from it.
PATH = pathlib.Path("/home/john/data/arabicCharacters/")

In [9]:
# Show what the dataset folder contains; the CSV file names used below come from this listing.
files = os.listdir(PATH)
files

['ahcd1.zip',
 'Arabic Handwritten Characters Dataset CSV',
 'csvTrainImages 13440x1024.csv',
 'csvTestImages 3360x1024.csv',
 'Train+Test Images Matlab.mat',
 'csvTestLabel 3360x1.csv',
 'csvTrainLabel 13440x1.csv']

In [53]:
# header=None: the first line of the file is read as data rather than as column names.
# Each row is later reshaped to 32x32, i.e. treated as one flattened image.
trainImgs = pd.read_csv(PATH/'csvTrainImages 13440x1024.csv', header=None)

In [54]:
# Preview only the first rows instead of rendering the whole frame.
trainImgs.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
import matplotlib.pyplot as plt

In [61]:
# IPython help lookup for tqdm, left over from writing the loops below; it only displays documentation.
tqdm?

In [111]:
# Write every row of the training frame out as a 32x32 PNG named "<row number>.png",
# with row numbers starting at 1.
imageOutputFolder = PATH/"train"
imageOutputFolder.mkdir(exist_ok=True)
# iterrows() has no length, so tqdm needs the total spelled out; take it from the
# frame itself so the progress bar stays correct if the CSV changes.
for i, r in tqdm(trainImgs.iterrows(), total=len(trainImgs)):
    a = r.values
    a = a.reshape(32,32)
    plt.imsave(fname=str(imageOutputFolder/f"{i+1}.png"), arr=a)

100%|██████████| 13440/13440 [00:09<00:00, 1476.89it/s]


In [112]:
# Same export as for the training images, written to test/.
testImgs = pd.read_csv(PATH/"csvTestImages 3360x1024.csv", header=None)
imageOutputFolder = PATH/"test"
# The first test image becomes 13441.png: numbering continues after the training
# images, which were written as 1.png .. 13440.png.
offset = 13441
imageOutputFolder.mkdir(exist_ok=True)
# Pass the row count so tqdm shows a real progress bar, as the training export does.
for i, r in tqdm(testImgs.iterrows(), total=len(testImgs)):
    a = r.values
    a = a.reshape(32,32)
    plt.imsave(fname=str(imageOutputFolder/f"{i+offset}.png"), arr=a)

3360it [00:02, 1491.83it/s]


In [113]:
# Sort the exported training PNGs into one sub-folder per label.
trainLabels = pd.read_csv(PATH/ 'csvTrainLabel 13440x1.csv', header=None)
output_dir = PATH/"train"

# One folder for each label value 1..28.
for class_id in range(1, 29):
    class_dir = output_dir/str(class_id)
    class_dir.mkdir(exist_ok=True)

# Row i of the label file belongs to the image that was saved as "<i+1>.png".
for i, r in tqdm(trainLabels.iterrows()):
    image_name = f"{i+1}.png"
    shutil.move(output_dir/image_name, output_dir/str(r[0])/image_name)
    

13440it [00:01, 10038.60it/s]


In [114]:
# Sort the exported test PNGs into one sub-folder per label.
testLabels = pd.read_csv(PATH/ 'csvTestLabel 3360x1.csv', header=None)
output_dir = PATH/"test"

# One folder for each label value 1..28.
for class_id in range(1, 29):
    class_dir = output_dir/str(class_id)
    class_dir.mkdir(exist_ok=True)

# Test images were saved as "<row + offset>.png", so the same offset maps a label row to its file.
for i, r in tqdm(testLabels.iterrows()):
    image_name = f"{i+offset}.png"
    shutil.move(output_dir/image_name, output_dir/str(r[0])/image_name)

3360it [00:00, 10107.97it/s]


# Valid Set
Ok, we still need to construct a validation set. Let's adapt the function from before: here we only need a train/valid split, because the dataset already comes with its own test set.

In [115]:
# The classes are whatever entries train/ contains, i.e. the label folders created above.
datapath = PATH/"train"
class_names = os.listdir(datapath)
print(f"Found {len(class_names)} classes: {class_names} ")

Found 28 classes: ['10', '17', '16', '21', '20', '3', '18', '22', '5', '9', '24', '26', '14', '1', '4', '8', '7', '23', '13', '6', '28', '27', '12', '2', '19', '11', '25', '15'] 


In [116]:

# Carve the validation set out of train/: for every class, move a random 20% of
# its images into valid/<class>/.
# NOTE: this moves files, so running the cell again moves a further 20% of what is left.
val = PATH/"valid"
val.mkdir(exist_ok=True)

for cl in class_names:
    elems = os.listdir(datapath/cl)
    train_set, val_set = train_test_split(elems, test_size=0.2)

    # Create the class folder once, not once per file.
    (val/cl).mkdir(exist_ok=True)
    for i in tqdm(val_set):
        # The destination is `val`; `set_path` only exists inside
        # create_experiment_folders and is undefined at notebook scope.
        shutil.move(datapath/cl/i, val/cl/i)



100%|██████████| 96/96 [00:00<00:00, 7734.55it/s]
100%|██████████| 96/96 [00:00<00:00, 14152.02it/s]
100%|██████████| 96/96 [00:00<00:00, 14362.52it/s]
100%|██████████| 96/96 [00:00<00:00, 13714.81it/s]
100%|██████████| 96/96 [00:00<00:00, 14322.16it/s]
100%|██████████| 96/96 [00:00<00:00, 17797.61it/s]
100%|██████████| 96/96 [00:00<00:00, 16587.84it/s]
100%|██████████| 96/96 [00:00<00:00, 14089.13it/s]
100%|██████████| 96/96 [00:00<00:00, 14382.53it/s]
100%|██████████| 96/96 [00:00<00:00, 14085.19it/s]
100%|██████████| 96/96 [00:00<00:00, 14150.03it/s]
100%|██████████| 96/96 [00:00<00:00, 17341.54it/s]
100%|██████████| 96/96 [00:00<00:00, 13899.45it/s]
100%|██████████| 96/96 [00:00<00:00, 14003.87it/s]
100%|██████████| 96/96 [00:00<00:00, 14090.11it/s]
100%|██████████| 96/96 [00:00<00:00, 17271.62it/s]
100%|██████████| 96/96 [00:00<00:00, 15566.89it/s]
100%|██████████| 96/96 [00:00<00:00, 13779.11it/s]
100%|██████████| 96/96 [00:00<00:00, 17367.72it/s]
100%|██████████| 96/96 [00:00<00