# Oversampling Data

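We balance the training set by oversampling: samples of under-represented classes are duplicated or horizontally mirrored until every class has as many samples as the largest class.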

In [1]:
import os
import sys
import cv2
import math
import random
import operator
import numpy as np

### Constants

In [2]:
# Fix the RNG seed so the random copy/mirror choices are reproducible.
random.seed(12)

# Text file with one "<image path> <class id>" entry per line.
TRAIN_PATH = '/home/aires/Documents/phd/dataset/DogCentric/train.txt'

# Class ids that are only ever mirrored into each other (never within
# themselves).
LEFT, RIGHT = 3, 4
ONLY_MIRROR_CLASSES = [LEFT, RIGHT]

# Augmentation operations; MIRROR is truthy and COPY is falsy, which the
# oversampling loop relies on when it tests the randomly chosen action.
COPY, MIRROR = 0, 1
OPERATION = [COPY, MIRROR]

### Development

In [11]:
# Map each integer class label to the list of sample paths that carry it.
classes = dict()

# Read file.
with open(TRAIN_PATH, 'r') as r:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in r:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        # Split on the LAST whitespace run so that paths containing spaces
        # are kept intact; the label is always the final token.
        path, y = line.rsplit(None, 1)
        y = int(y)

        classes.setdefault(y, []).append(path)

if not classes:
    raise ValueError("No samples found in %s" % TRAIN_PATH)

# Count classes samples.
# Pick the label itself (not its position in the sorted label list) so the
# result is correct even when labels are not exactly 0..N-1. Iterating the
# sorted labels keeps the tie-break deterministic: smallest label wins.
max_class = max(sorted(classes), key=lambda label: len(classes[label]))
max_elements = len(classes[max_class])
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print("Found class with most samples: %d\nNumber of samples of this class: %d" % (max_class, max_elements))

Found class with most samples: 0
Number of samples of this class: 1855


In [12]:
# Run over the remaining classes and copy or mirror their elements until each
# has as many samples as max_class.
# This first pass only handles classes that are neither the largest class nor
# one of the two classes with mirror problems (LookLeft and LookRight), which
# are processed separately.

# Per-class statistics: count[class_id] = {'mirror': n, 'copy': m}.
count = dict()

NOT_LISTED = ONLY_MIRROR_CLASSES + [max_class]

for clss in classes:
    # Run over remaining classes.
    if clss in NOT_LISTED:
        continue

    print("Processing class: %d" % clss)

    internal_count = 0  # Number of samples added to this class.
    # Paths of mirrored images generated in this pass; they are never picked
    # again as a source (neither for copying nor for mirroring).
    marked = set()

    # Run while the class has fewer elements than max_class.
    while len(classes[clss]) < max_elements:

        # Get a random element from the list.
        random_path = random.choice(classes[clss])
        # Choose the action (mirror or copy). The order of the random calls
        # is kept as-is so that seeded runs stay reproducible.
        random_action = random.choice(OPERATION)

        while random_path in marked:
            # Ensure getting a non-generated path.
            random_path = random.choice(classes[clss])

        # Initialize the counters of the class on first use.
        class_count = count.setdefault(clss, dict())

        if random_action:
            img = cv2.imread(random_path, 1)  # Read image as 3-channel colour.
            if img is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise IOError("Could not read image: %s" % random_path)
            rimg = cv2.flip(img, 1)  # flipCode=1: horizontal mirror (around the y-axis).

            # Generate a new name to the output image using the existing one.
            filename, file_extension = os.path.splitext(random_path)
            new_filename = filename + "_mirror" + file_extension
            if not cv2.imwrite(new_filename, rimg):
                # Never list a file in the train set that was not written.
                raise IOError("Could not write image: %s" % new_filename)

            # NOTE: the same original can be drawn more than once; in that
            # case the mirrored file is rewritten and its path listed again.
            marked.add(new_filename)

            # Add to the list of paths in the class.
            classes[clss].append(new_filename)

            # Increase the counter.
            class_count['mirror'] = class_count.get('mirror', 0) + 1

        else:
            # Just add a copy of the same path to the list of the class.
            classes[clss].append(random_path)

            class_count['copy'] = class_count.get('copy', 0) + 1

        internal_count += 1

    print("\tNumber of modification in this class: %d" % internal_count)

Processing class: 1
	Number of modification in this class: 585
Processing class: 2
	Number of modification in this class: 350
Processing class: 5
	Number of modification in this class: 300
Processing class: 6
	Number of modification in this class: 365
Processing class: 7
	Number of modification in this class: 1065
Processing class: 8
	Number of modification in this class: 210
Processing class: 9
	Number of modification in this class: 540


In [13]:
# For classes such as LookLeft and LookRight, mirroring an image turns it into
# a sample of the opposite class, so each mirrored image is stored in the
# other class. Once a source image has been mirrored, further samples are
# plain copies of paths already in the destination class.

for clss in ONLY_MIRROR_CLASSES:
    # Run over selected classes.

    print("Processing class: %d" % clss)

    # Move the destination according the left/right indication.
    destination = RIGHT if clss == LEFT else LEFT

    # Make sure the destination has a counter entry even if nothing is added.
    dest_count = count.setdefault(destination, dict())

    print("Saving in class: %d" % destination)

    # Source paths of this class that have already been mirrored.
    marked = set()

    while len(classes[destination]) < max_elements:

        # Get a random element from the class.
        random_path = random.choice(classes[clss])

        if random_path in marked or '_mirror' in random_path:
            # The source was already mirrored (or is itself a generated
            # mirror): copy an existing image of the destination instead.
            new_random_path = random.choice(classes[destination])
            classes[destination].append(new_random_path)

            dest_count['copy'] = dest_count.get('copy', 0) + 1
            continue

        marked.add(random_path)

        # Mirror.
        img = cv2.imread(random_path, 1)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError("Could not read image: %s" % random_path)
        rimg = cv2.flip(img, 1)  # flipCode=1: horizontal mirror (around the y-axis).

        filename, file_extension = os.path.splitext(random_path)
        # Change the filename according to the destination. The replacement
        # applies to the whole path, directory components included.
        if destination == RIGHT:
            new_filename = filename.replace('Left', 'Right') + "_mirror" + file_extension
        else:
            new_filename = filename.replace('Right', 'Left') + "_mirror" + file_extension

        if not cv2.imwrite(new_filename, rimg):
            # Never list a file in the train set that was not written
            # (e.g. because the target directory does not exist).
            raise IOError("Could not write image: %s" % new_filename)

        classes[destination].append(new_filename)

        dest_count['mirror'] = dest_count.get('mirror', 0) + 1

Processing class: 3
Saving in class: 4
Processing class: 4
Saving in class: 3


In [14]:
# Write the balanced sample list next to the original train file, one
# "<path> <class id>" entry per line.
new_train_file = TRAIN_PATH[:-4] + "_new.txt"

with open(new_train_file, 'w') as w:
    for clss, paths in classes.items():
        label = str(clss)
        for path in paths:
            w.write(' '.join((path, label)) + "\n")

In [15]:
# Save report: for every class that was touched, how many mirrored images and
# how many plain copies were added.

report = '/home/aires/Documents/phd/dataset/DogCentric/report.txt'

with open(report, 'w') as ww:

    for clss in count:
        class_count = count[clss]
        ww.write("Class: %d\n" % clss)
        # A class may have received only copies, or nothing at all, so the
        # 'mirror' key is not guaranteed to exist; report 0 in that case.
        ww.write('\tMirrors added: %d\n' % class_count.get('mirror', 0))
        if 'copy' in class_count:
            ww.write('\tCopies added: %d\n' % class_count['copy'])

302
155
619
497
170
181
498
109
279
