# Data Augmentation

Data augmentation in data analysis is a set of techniques used to increase the amount of data by adding slightly modified copies of existing data, or synthetic data newly created from existing data.

It acts as a regularizer and helps reduce overfitting when training a machine learning model.

### Step-1: Collect Data

In [1]:
# Get filenames in a list
from os import listdir
from os.path import isfile, join

- The **`listdir()`** function in Python returns a list of all files and directories in the specified directory. If no directory is specified, the files and directories of the current working directory are returned.

Syntax: 
```py
os.listdir(path)
```

- `os.path.isfile()` method in Python is used to check whether the specified path is an existing regular file or not.

In [2]:
mypath = "./data/datasets/images/"

# Keep only the regular files found directly inside mypath (sub-directories are dropped)
file_names = list(filter(lambda name: isfile(join(mypath, name)), listdir(mypath)))
len(file_names)

3002

In [3]:
# Preview the first five file names; slicing (rather than indexing 0..4)
# avoids an IndexError when the folder holds fewer than five files.
for name in file_names[:5]:
    print(name)

cat.0.jpg
cat.1.jpg
cat.10.jpg
cat.100.jpg
cat.1000.jpg


### Step-2: Split our images into train and test/validation datasets

1. We need to store their labels (e.g. y_train, y_test)
2. resize all of the images into `150 x 150`
3. For training, we're going to use 1000 images of dogs and 1000 images of cats
4. Dogs will be $label = 1$ and cats will be $label = 0$

In [4]:
import cv2
import numpy as np
import os
import sys
import shutil

In [10]:
# Running totals of dog / cat images processed so far
dog_count = cat_count = 0

# Number of images per class in the training split and in the test split
training_size, test_size = 1000, 500

# In-memory datasets: images and their labels are appended in lock-step
training_images, training_labels = [], []
test_images, test_labels = [], []

# Side length (in pixels) used when resizing every image to size x size
size = 150

# save directories
dog_dir_train = "./data/datasets/train/dogs/"
cat_dir_train = "./data/datasets/train/cats/"
dog_dir_val = "./data/datasets/validation/dogs/"
cat_dir_val = "./data/datasets/validation/cats/"

In [11]:
def make_dir(directory):
    """Create ``directory`` as a fresh, empty folder.

    Anything already present at that path is removed first (including all of
    its contents), then the folder and any missing parents are created.
    """
    path_is_taken = os.path.exists(directory)
    if path_is_taken:
        # Start from a clean slate so stale files from earlier runs never linger
        shutil.rmtree(directory)
    os.makedirs(directory)

In [14]:
# (Re)create every output folder; make_dir wipes a folder that already exists
for out_dir in (dog_dir_train, cat_dir_train, dog_dir_val, cat_dir_val):
    make_dir(out_dir)

In [8]:
def get_zeros(number):
    """Return the zero padding that makes ``number`` three digits wide.

    Args:
        number: the image counter that will be appended to a file name.

    Returns:
        "00" for values below 10, "0" for values from 10 to 99 (inclusive),
        and "" for anything from 100 upwards.
    """
    if number < 10:
        return "00"
    # 10 itself is a two-digit number and needs one zero; the previous
    # strict comparison (number > 10) let it fall through with no padding.
    if number < 100:
        return "0"
    return ""

In [15]:
%%time

def _store_image(img, index, label, prefix, train_dir, val_dir):
    """Add one resized image to the train or test split and save it to disk.

    ``index`` is the 1-based position of the image within its class: the first
    ``training_size`` images go to the training split, the rest to the test /
    validation split, where file numbering restarts at 1.
    """
    if index <= training_size:
        training_images.append(img)
        training_labels.append(label)
        number, target_dir = index, train_dir
    else:
        test_images.append(img)
        test_labels.append(label)
        number, target_dir = index - training_size, val_dir
    cv2.imwrite(target_dir + prefix + get_zeros(number) + str(number) + ".jpg", img)


# Images wanted per class across both splits
per_class_total = training_size + test_size

for file in file_names:
    # The class is encoded in the first letter of the file name (e.g. "cat.0.jpg")
    first_letter = file[:1]
    if first_letter == "d":
        if dog_count >= per_class_total:
            continue  # enough dogs already; don't read/resize more
    elif first_letter == "c":
        if cat_count >= per_class_total:
            continue  # enough cats already; don't read/resize more
    else:
        continue  # neither a dog nor a cat image

    img = cv2.imread(mypath + file)
    if img is None:
        # cv2.imread returns None for unreadable files; resizing None would crash
        print(f"Skipping unreadable image: {file}")
        continue
    img = cv2.resize(img, (size, size), interpolation=cv2.INTER_AREA)

    if first_letter == "d":
        dog_count += 1
        _store_image(img, dog_count, 1, "dog", dog_dir_train, dog_dir_val)
    else:
        cat_count += 1
        _store_image(img, cat_count, 0, "cat", cat_dir_train, cat_dir_val)

    # Stop as soon as both classes are complete
    if dog_count >= per_class_total and cat_count >= per_class_total:
        break

Wall time: 38.9 s


`cv2.resize()` 👉 **interpolation:** https://chadrick-kwag.net/cv2-resize-interpolation-methods/

`cv2.imwrite()` 👉 saves an image to the file path you pass in. The image format is chosen from the filename extension, and a relative path is resolved against the current working directory.

### Step-3: Save the datasets in `npz` format

`np.savez()` saves several arrays into a single file in uncompressed `.npz` format.

In [17]:
# Write each dataset to its own .npz archive; a positional array passed to
# np.savez is stored under the key "arr_0"
for archive_path, values in (
    ("./data/datasets/cats_vs_dogs_training_data.npz", training_images),
    ("./data/datasets/cats_vs_dogs_training_labels.npz", training_labels),
    ("./data/datasets/cats_vs_dogs_test_data.npz", test_images),
    ("./data/datasets/cats_vs_dogs_test_labels.npz", test_labels),
):
    np.savez(archive_path, np.array(values))

In [18]:
# loader function
def load_training_and_test_data(datasetName):
    """Load the four ``.npz`` archives saved for ``datasetName``.

    Reads ``./data/datasets/<datasetName>_{training,test}_{data,labels}.npz``
    and takes the array stored under the key ``"arr_0"`` from each.

    Args:
        datasetName: file-name prefix of the archives, e.g. "cats_vs_dogs".

    Returns:
        ``(train, train_labels), (test, test_labels)`` as NumPy arrays.
    """
    prefix = "./data/datasets/" + datasetName

    def _first_array(suffix):
        # np.load on an .npz returns an NpzFile that keeps the archive open;
        # the with-block closes it once the array has been read into memory.
        with np.load(prefix + suffix) as npzfile:
            return npzfile["arr_0"]

    train = _first_array("_training_data.npz")
    train_labels = _first_array("_training_labels.npz")
    test = _first_array("_test_data.npz")
    test_labels = _first_array("_test_labels.npz")

    return (train, train_labels), (test, test_labels)

In [19]:
# Show five randomly chosen training images, one window at a time, and print
# the class of each. The index is not called `random` so that it cannot
# shadow the standard-library module of that name in the notebook namespace.
try:
    for i in range(1, 6):
        idx = np.random.randint(0, len(training_images))
        cv2.imshow(f"image_{i}", training_images[idx])

        if training_labels[idx] == 0:
            print(f"{i} - Cat")
        else:
            print(f"{i} - Dog")

        # Block until a key is pressed before moving on to the next image
        cv2.waitKey(0)
finally:
    # Close the preview windows even if something above raised
    cv2.destroyAllWindows()

1 - Dog
2 - Cat
3 - Cat
4 - Dog
5 - Cat
