In [4]:
from fastai.vision import *
import os
import shutil
import random
from sklearn.model_selection import train_test_split
import numpy
import pandas as pd

In [5]:
# root folder of the image data; the class sub-folders processed below
# ("Gut", "Mittel", "Schlecht", "Sehr Gut") are addressed relative to it
path = Path("./output_full")

In [6]:
# Run fastai's verify_images over the original images of every class folder,
# requesting a maximum size of 500 and keeping faulty originals (delete=False).
# The destination is passed relative to the class folder
# ("../scaled/<folder>"), i.e. it is meant to end up in path/"scaled"/<folder>.
for class_folder in ("Gut", "Mittel", "Schlecht", "Sehr Gut"):
    print(class_folder)
    scaled_dest = "../scaled/" + class_folder
    verify_images(path/class_folder, delete=False, max_size=500, dest=scaled_dest)

Gut


Mittel


Schlecht


Sehr Gut


In [7]:
# Locations used by the following cells:
#   scaled_path - folder the rescaled images are read from
#   data_dir    - working directory in which the data set is assembled
scaled_path = path/"scaled"
data_dir = Path("./data_dir_full")

# Always start from an empty data_dir: an existing one is removed together
# with everything in it before the folder is created again.
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir)

In [8]:
def copy_rename(old_file_name, new_file_name, src_dir, dst_dir):
    """Copy ``src_dir/old_file_name`` to ``dst_dir/new_file_name``.

    The file is written straight to its final name. Copying it under the old
    name first and renaming afterwards would overwrite (and then move away) an
    unrelated file in ``dst_dir`` that happens to share the old name, and
    would fail outright when ``src_dir`` and ``dst_dir`` are the same folder.

    Parameters
    ----------
    old_file_name : str
        Name of the file inside ``src_dir``.
    new_file_name : str
        Name the copy gets inside ``dst_dir``.
    src_dir, dst_dir : str or path-like
        Source and destination folders; ``dst_dir`` must already exist.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the source file is missing or the destination cannot be written
        (propagated from ``shutil.copy``).
    """
    src_file = os.path.join(src_dir, old_file_name)
    dst_file = os.path.join(dst_dir, new_file_name)
    # an existing file at dst_file is replaced, everything else is left alone
    shutil.copy(src_file, dst_file)

In [9]:
# Combine all rescaled images in the single folder data_dir and rename them to
# "Image_<running index>-<label><extension>"; spaces in the resulting name are
# replaced by underscores (the folder "Sehr Gut" becomes the label "Sehr_Gut").
lbl_index = 1
for folder in ("Gut", "Mittel", "Schlecht", "Sehr Gut"):
    # sorted() makes the index assignment independent of the listing order,
    # which os.listdir leaves unspecified
    for file in sorted(os.listdir(scaled_path/folder)):
        # skip directories (e.g. those created by ipython); the check needs the
        # full path, a bare name would be resolved against the working directory
        if os.path.isdir(scaled_path/folder/file):
            continue
        _, extension = os.path.splitext(file)  # grab file extension
        # parentheses keep the chained .replace() part of the same expression
        new_file = ("Image_{:02d}-{}{}".format(lbl_index, folder, extension)
                    .replace(" ", "_"))
        copy_rename(file, new_file, scaled_path/folder, data_dir)
        lbl_index += 1

In [10]:
# names (not full paths) of all entries currently in data_dir;
# os.listdir makes no promise about their order
all_files = os.listdir(data_dir) # grab all files currently in data_dir

In [11]:
# Order the file names by the running index embedded in them. The index is the
# leading part (up to the first "-") of the piece that follows the first "_";
# it is compared as an integer so that e.g. 10 sorts after 9.
def _running_index(file_name):
    """Return the integer index embedded in a combined file name."""
    index_and_rest = file_name.split("_")[1]
    return int(index_and_rest.split("-")[0])

sorted_files = sorted(all_files, key=_running_index)

In [12]:
# Shuffle all_files in place. sorted_files is a separate list that was built
# earlier, so its order - and everything derived from it below - is unaffected.
random.shuffle(all_files)

# Split every sorted file name at "-": the part in front is the image name,
# the part behind it (minus the file extension) is the label.
fnames, flabels = [], []
for combined_name in sorted_files:
    name_parts = combined_name.split("-")
    fnames.append(name_parts[0])
    label_stem, _ = os.path.splitext(name_parts[1])
    flabels.append(label_stem)

In [13]:
# Build the table of all images: one row per file, holding the image name and
# the label that belongs to it.
name_label_rows = list(zip(fnames, flabels))
df = pd.DataFrame(name_label_rows, columns=['image_name', 'label'])

In [14]:
# Seed numpy's global random generator. numpy.random.seed is the call that
# does this; merely constructing a numpy.random.RandomState object and
# discarding it leaves the global generator untouched.
numpy.random.seed(42)

# hold out 20 % of all rows as the test set
X_train_val, X_test = train_test_split(df, test_size=0.2, random_state=42)

# split 20 % of the remaining rows off as the validation set; the rest is the
# train set (the intermediate frame keeps its own name so that X_train is
# never derived from itself)
X_train, X_val = train_test_split(X_train_val, test_size=0.2, random_state=42)

In [15]:
# preview the first rows of the test split
X_test.head()

Unnamed: 0,image_name,label
889,Image_890,Sehr_Gut
468,Image_469,Sehr_Gut
168,Image_169,Gut
405,Image_406,Sehr_Gut
70,Image_71,Gut


In [16]:
# Collect every label of the complete data set to create the required folders.
# The full frame is used instead of a single split: a label that happens to be
# absent from one split would otherwise get no train/valid/test folder, and
# copying its images would fail.
labels = df["label"].unique()

In [17]:
# one folder per split inside the data directory
train_dir = data_dir/"train"
test_dir = data_dir/"test"
valid_dir = data_dir/"valid"

# every split folder gets one sub-folder per label; folders that already
# exist are left as they are
for label in labels:
    for split_dir in (train_dir, valid_dir, test_dir):
        label_dir = split_dir/label
        if not os.path.exists(label_dir):
            os.makedirs(label_dir)

In [20]:
# Copy every image of the train, validation and test set into
# <split folder>/<label>/ inside the data directory, dropping the "-<label>"
# part of the file name on the way.
# The columns are read by name: integer keys such as file[0] on a row Series
# with a string index rely on a positional fallback that pandas has deprecated.
# NOTE: the combined files are assumed to carry the ".jpeg" extension.
for split_df, split_dir in ((X_train, train_dir), (X_val, valid_dir), (X_test, test_dir)):
    for image_name, label in zip(split_df["image_name"], split_df["label"]):
        full_filename = image_name + "-" + label + ".jpeg"
        copy_rename(full_filename, image_name + ".jpeg", data_dir, split_dir/label)

In [21]:
# Clean up data_dir: the combined originals that sit directly in it are
# duplicates of the files copied into the split folders and are removed.
for entry in os.listdir(data_dir):
    entry_path = data_dir/entry
    if os.path.isfile(entry_path):  # only delete files and skip folders
        os.remove(entry_path)

In [22]:
# store the image_name/label table as labels.csv inside data_dir,
# without writing the row index as an extra column
df.to_csv(data_dir/"labels.csv", index=False)