# Data augmentation : Malimg2022

- Step 1 : split data into train and test data in separate folders
- Step 2 : Count missing data
- Step 3 : Generate missing values per class
- Step 4 : Remove extra data
- Step 5 : Check new balanced data

*Latest update : 04/03/2022*

## STEP 1 : Split data in separate folders

*split_folders python library*

Installation :  pip install split-folders

Documentation : https://pypi.org/project/split-folders/

This library allows us to split a database into train, validation and test folders, separately for each class in the database.
Split folders with files (e.g. images) into train, validation and test (dataset) folders.

Split with a ratio.
To only split into training and validation set, set a tuple to `ratio`, i.e, `(.8, .2)`.
splitfolders.ratio("input_folder", output="output", seed=1337, ratio=(.8, .1, .1), group_prefix=None) # default values

Split val/test with a fixed number of items e.g. 100 for each set.
To only split into training and validation set, use a single number to `fixed`, i.e., `10`.

In [1]:
import splitfolders

def split_data(original_data_folder, split_data_folder, test_value):
    '''
    Split a class-per-folder database into train and validation folders.

     - original_data_folder : path of the original database
     - split_data_folder : output folder handed to splitfolders as `output`
     - test_value : fixed number of items for the validation folder, should be an integer
    '''
    # Options forwarded unchanged to splitfolders.fixed
    split_options = {
        "output": split_data_folder,
        "seed": 1337,
        "fixed": test_value,
        "oversample": False,
        "group_prefix": None,
    }
    splitfolders.fixed(original_data_folder, **split_options)

In [2]:
# Source database and destination folder of the split
original_database_path, output_folder = "../malimg_paper_dataset_imgs", "new_output_folder_70"
# Fixed validation size handed to split_data
test_value = 70

split_data(
    original_data_folder=original_database_path,
    split_data_folder=output_folder,
    test_value=test_value,
)

## STEP 2 : Count missing data

For each family, we count the images in the train folder to know how many more images must be generated using the various augmentation techniques.

In [3]:
import os
import glob

def missing_data_counter(train_path, goal_value):
    '''
    Count the PNG images of every family folder and compare with a target.

    Every sub-directory of train_path is treated as one family (class). For
    each family the function prints a status line and records
    goal_value - <number of '*.png' files in the family folder>:
      - positive : number of images still to be generated,
      - zero     : the family already holds goal_value images,
      - negative : number of extra images to remove.

    The current working directory is not modified, and entries of train_path
    that are not directories are ignored.

     - train_path : folder containing one sub-folder per family
     - goal_value : number of samples wanted per class in the balanced database

    RETURN :: (to_be_augmented, list_fams), two lists in the same order: the
              missing value of each family and the family names.
    '''
    # Only directories are families; a stray file in train_path must not be
    # handled as a class.
    list_fams = [
        entry for entry in os.listdir(train_path)
        if os.path.isdir(os.path.join(train_path, entry))
    ]
    to_be_augmented = []  # Number of data to be augmented OR missing values

    for family in list_fams:
        family_dir = os.path.join(train_path, family)
        # glob.escape prevents characters such as '[' in the path from being
        # interpreted as wildcards; only the '*.png' part is a pattern.
        len1 = len(glob.glob(os.path.join(glob.escape(family_dir), '*.png')))
        # we are going to balance data with GOAL_VALUE samples per family
        missing = goal_value - len1
        if missing < 0:
            print(" ", family, " contains ", len1, " ======> xxxxxx remove :  ", missing)
        elif missing == 0:
            print(" ", family, " NO ACTION NEEDED")
        else:
            print(" ", family, " contains ", len1, " ======> to be augmented :  ", missing)

        to_be_augmented.append(missing)

    return to_be_augmented, list_fams

In [4]:
# Train part of the split database (absolute path on the author's machine)
train_path_folder = "C:/Users/Zbook/Desktop/IKRAM_docs/Data_augmentation_code/new_output_folder_70/train/"
# Target number of images per family; raise it to get more data per family
my_goal_value = 530

tobeaugmented_values, list_fams = missing_data_counter(
    train_path=train_path_folder,
    goal_value=my_goal_value,
)



## STEP 3 : Generate missing values per class

In [5]:
from keras.preprocessing.image import ImageDataGenerator

def _save_augmented(datagen, images, labels, n_images, out_dir, prefix):
    '''
    Write n_images augmented PNG files, drawn from `images`, into out_dir.

    A single batch can never hold more than len(images) items, so families
    with few images are sampled several times until n_images are written.
    '''
    remaining = n_images
    while remaining > 0:
        flow = datagen.flow(images, labels, batch_size=min(remaining, len(images)),
                            save_to_dir=out_dir, save_prefix=prefix, save_format='png')
        X_batch, _ = next(flow)  # producing the batch is what writes the files
        if len(X_batch) == 0:
            break  # nothing was produced; avoid looping forever
        remaining -= len(X_batch)


def generate_aug_data(fname, to_be_augmented, list_fams):
    '''
    Generate augmented data for each family depending on needed values.

    For every family whose missing value is positive, a quarter of that value
    (rounded up) is generated with each of four methods: feature
    standardization, random rotation, random shifts and random flips. The new
    images are built ONLY from images of that same family and are saved in the
    family folder with a specific prefix for each method
    ('aug_fs_', 'aug_rot_', 'aug_shift_', 'aug_flips_').

     - fname : train folder containing one sub-folder per family
     - to_be_augmented : missing value per family (values <= 0 are skipped)
     - list_fams : family names, in the same order as to_be_augmented

    Because of the rounding up, up to 3 extra images per family can be
    produced. Nothing is loaded or written when no family needs augmentation.

    NOTE: only the first batch of at most 8000 images, resized to 64x64, is
    loaded from fname and used as augmentation source.
    '''
    needed = [(family, missing)
              for family, missing in zip(list_fams, to_be_augmented)
              if missing > 0]
    if not needed:
        return

    batches = ImageDataGenerator().flow_from_directory(directory=fname, target_size=(64,64), batch_size=8000)
    imgs, labels = next(batches)

    X_train = imgs/255.
    y_train = labels
    # assumes the default class_mode ('categorical'), i.e. one-hot labels, so
    # argmax recovers the class index of each image
    class_ids = y_train.argmax(axis=1)

    # The generators do not depend on the family: build them once.
    # Only featurewise statistics need fit(); they are computed on all images.
    datagen_fs = ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True)
    datagen_fs.fit(X_train)
    shift = 0.2
    methods = [
        ('aug_fs_', datagen_fs),                                    # Method 1 : feature standardization
        ('aug_rot_', ImageDataGenerator(rotation_range=90)),        # Method 2 : random rotation
        ('aug_shift_', ImageDataGenerator(width_shift_range=shift,
                                          height_shift_range=shift)),  # Method 3 : random shifts
        ('aug_flips_', ImageDataGenerator(horizontal_flip=True,
                                          vertical_flip=True)),     # Method 4 : random flips
    ]

    # Generate augmented data and save them in each corresponding folder
    for family, missing in needed:
        class_id = batches.class_indices.get(family)
        if class_id is None:
            print(" ", family, " is not a class of ", fname, " ======> skipped")
            continue

        # Keep only the images of this family so that the files written in
        # its folder really belong to it.
        family_mask = class_ids == class_id
        if not family_mask.any():
            print(" ", family, " has no loaded image ======> skipped")
            continue
        X_family = X_train[family_mask]
        y_family = y_train[family_mask]

        # folder where the new augmented images are saved
        fname_aug = os.path.join(fname, family)
        # a quarter of the missing images per method, rounded up
        aug_size = (missing + 3) // 4

        for prefix, datagen in methods:
            _save_augmented(datagen, X_family, y_family, aug_size, fname_aug, prefix)

In [6]:
# Create the missing images of every family inside the train folder
generate_aug_data(
    fname=train_path_folder,
    to_be_augmented=tobeaugmented_values,
    list_fams=list_fams,
)

Found 7589 images belonging to 25 classes.


## STEP 4 : Remove extra data

In [7]:
import os
import random

def remove_extra_data(fname, missing, families=None):
    '''
    Remove random extra PNG images from every family that holds too many.

     - fname :: folder containing one sub-folder per family
     - missing :: missing value per family; a negative value -n means that
                  n images are deleted from that family
     - families :: family names in the same order as `missing`; when omitted,
                   the notebook-level `list_fams` is used (previous behaviour)

    Only '*.png' files are candidates for deletion, which matches what
    missing_data_counter counts; other files are never removed.

    RAISES :: ValueError (from random.sample) when a family holds fewer PNG
              files than the number to delete, e.g. if `missing` is out of date.
    '''
    if families is None:
        families = list_fams  # notebook global, kept for backward compatibility

    for family, gap in zip(families, missing):
        if gap >= 0:
            continue
        remove_n = abs(gap)
        print(remove_n)
        family_path = os.path.join(fname, family)  # family path
        # glob.escape: only '*.png' is a pattern, never the folder name itself
        candidates = glob.glob(os.path.join(glob.escape(family_path), '*.png'))
        # Pick remove_n random files and delete them
        for f in random.sample(candidates, remove_n):
            os.remove(f)
    print("DONE !")

In [8]:
# Recount the train folder to get up-to-date missing values
check_missing, list_fams_check = missing_data_counter(
    train_path=train_path_folder,
    goal_value=my_goal_value,
)
# Families with a negative missing value lose that many random images
remove_extra_data(fname=train_path_folder, missing=check_missing)

  Agent.FYI  NO ACTION NEEDED
  C2LOP.gen!g  NO ACTION NEEDED
  Lolyda.AA2  NO ACTION NEEDED
  Malex.gen!J  NO ACTION NEEDED
  Skintrim.N  NO ACTION NEEDED
  Swizzor.gen!E  NO ACTION NEEDED
  Swizzor.gen!I  NO ACTION NEEDED
  VB.AT  NO ACTION NEEDED
2
2349
991
2
2
2
1
2
1
3
1
3
3
2
2
1
200
DONE !


## STEP 5 : Check new balanced data

In [9]:
# Final recount: the status line of every family is printed again
check_missing, list_fams_check = missing_data_counter(
    train_path=train_path_folder,
    goal_value=my_goal_value,
)

  Adialer.C  NO ACTION NEEDED
  Agent.FYI  NO ACTION NEEDED
  Allaple.A  NO ACTION NEEDED
  Allaple.L  NO ACTION NEEDED
  Alueron.gen!J  NO ACTION NEEDED
  Autorun.K  NO ACTION NEEDED
  C2LOP.gen!g  NO ACTION NEEDED
  C2LOP.P  NO ACTION NEEDED
  Dialplatform.B  NO ACTION NEEDED
  Dontovo.A  NO ACTION NEEDED
  Fakerean  NO ACTION NEEDED
  Instantaccess  NO ACTION NEEDED
  Lolyda.AA1  NO ACTION NEEDED
  Lolyda.AA2  NO ACTION NEEDED
  Lolyda.AA3  NO ACTION NEEDED
  Lolyda.AT  NO ACTION NEEDED
  Malex.gen!J  NO ACTION NEEDED
  Obfuscator.AD  NO ACTION NEEDED
  Rbot!gen  NO ACTION NEEDED
  Skintrim.N  NO ACTION NEEDED
  Swizzor.gen!E  NO ACTION NEEDED
  Swizzor.gen!I  NO ACTION NEEDED
  VB.AT  NO ACTION NEEDED
  Wintrim.BX  NO ACTION NEEDED
  Yuner.A  NO ACTION NEEDED
