In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from scipy.misc import imread, imsave
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


# This class creates augmented dataset from train.csv
### Minority classes are first determined based on mean on ground truth labels of that class
### Augmentation applied are: Zoom, range: 0.50-0.58 as zoom to a larger range can result in valuable features being cropped out, and vertical/horizontal flip
### Each image is augmented several times till # of that label reaches the 'target'
### Only train images are augmented and no TTA is applied

In [2]:
class AugmentDataset:
    def __init__(self, dataframe, classes):
        self.df = dataframe
        self.label_names = classes
        self.threshold = 0.1
        self.minority_majority(self.df)
        self.multiplier = 100
        self.target = 200
        self.class_aug_df = self.df
        self.image_aug = ImageDataGenerator(rotation_range=0, 
                                                 zoom_range=[0.50, 0.58], 
                                                 horizontal_flip=True, 
                                                 vertical_flip=True)
        
    def minority_majority(self, df):
        self.minorities, self.majorities = [], []
        means = df.describe().loc['mean'].values
        for idx,val in self.label_names.items():
            if means[idx] <= self.threshold:
                self.minorities.append(label_names[idx])
                
        for idx,val in self.label_names.items():
            if means[idx] > 0.1:
                self.majorities.append(label_names[idx])
                
        print(self.minorities, self.majorities)
        return self.minorities
    
    
    def oversampler(self):
        def row_op(row):
            images = self.load_image('train/', row.Id)[0:4]
            for i in range(0, multiplier):
                Id = row.Id + '_aug_' + str(i)
                labels = [tuple(row.values[1:])]
                col1 = pd.DataFrame({'Id': str(Id)}, index=[0])
                col2 = pd.DataFrame.from_records(labels, columns=self.label_names.values())
                new_row = pd.concat([col1, col2], axis=1)

                self.class_aug_df = pd.concat([self.class_aug_df, new_row])

                self.save_image(images, Id)
        
        for name in self.minorities:
            ones = self.df.loc[df[name] == 1, 'Id':label_names[27]]
            counts = len(ones)
            
            if(counts < self.target):
                print(name, counts)
                multiplier = np.int(np.floor(self.target/counts))
                ones.apply(row_op, axis=1)

    def load_image(self, basepath, image_id):
        images = np.zeros(shape=(4,512,512))
        images[0,:,:] = imread(basepath + image_id + "_green" + ".png")
        images[1,:,:] = imread(basepath + image_id + "_red" + ".png")
        images[2,:,:] = imread(basepath + image_id + "_blue" + ".png")
        images[3,:,:] = imread(basepath + image_id + "_yellow" + ".png")
        return images
    
    def save_image(self, img, Id):
        
        image = np.stack((img[0],
                          img[1],
                          img[2],
                          img[3]), -1)
        image = np.expand_dims(image, axis=0)
        
        #Save original image and save after augment
        self.save_colors(img=image, Id=Id)
        
        for x in self.image_aug.flow(image):
            self.save_colors(img=x, Id=Id+'_aug')
            break
        
    def save_colors(self, img, Id):
        colors = ['_green', '_red', '_blue', '_yellow']
        i=0
        for color in colors:
            file = 'train/' + Id + color + '.png'
            print('\nSaving image: ' + file)
            imsave(file, img[0, :, :, i])
            i += 1

In [5]:
df = pd.read_csv('train.csv')
label_names = {
    0:  "Nucleoplasm",  
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center",   
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",   
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",   
    8:  "Peroxisomes",   
    9:  "Endosomes",   
    10:  "Lysosomes",   
    11:  "Intermediate filaments",   
    12:  "Actin filaments",   
    13:  "Focal adhesion sites",   
    14:  "Microtubules",   
    15:  "Microtubule ends",   
    16:  "Cytokinetic bridge",   
    17:  "Mitotic spindle",   
    18:  "Microtubule organizing center",   
    19:  "Centrosome",   
    20:  "Lipid droplets",   
    21:  "Plasma membrane",   
    22:  "Cell junctions",   
    23:  "Mitochondria",   
    24:  "Aggresome",   
    25:  "Cytosol",   
    26:  "Cytoplasmic bodies",   
    27:  "Rods & rings"
}

for k in label_names.keys():
    df[label_names[k]] = 0

def one_hot(row):
    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    
    for target in row.Target:
        row.loc[label_names[int(target)]] = 1
    return row
df = df.apply(one_hot, axis=1)
df = df.drop('Target', axis=1)
df.describe().loc['mean']

X_train, X_test, y_train, y_test = train_test_split(df.Id, df, test_size=0.1)

In [26]:
train_split = pd.read_csv('aug_train1.csv')
aug_data = AugmentDataset(dataframe=train_split, classes=label_names)

train_split.drop('Unnamed: 0', axis=1, inplace=True)
newdf = train_split.loc[train_split[label_names[0]] == 1][:1]

for idx,key in label_names.items():
    print(key)
    newdf = newdf.append(train_split.loc[train_split[key] == 1][:200])
newdf.to_csv('newdf.csv')

['Nucleoli', 'Nuclear speckles', 'Nuclear bodies', 'Endoplasmic reticulum', 'Golgi apparatus', 'Peroxisomes', 'Endosomes', 'Lysosomes', 'Intermediate filaments', 'Actin filaments', 'Focal adhesion sites', 'Microtubules', 'Microtubule ends', 'Cytokinetic bridge', 'Mitotic spindle', 'Microtubule organizing center', 'Centrosome', 'Lipid droplets', 'Plasma membrane', 'Mitochondria', 'Aggresome', 'Cytosol', 'Rods & rings'] ['Nucleoplasm', 'Nuclear membrane', 'Nucleoli fibrillar center', 'Cell junctions', 'Cytoplasmic bodies']
Nucleoplasm
Nuclear membrane
Nucleoli
Nucleoli fibrillar center
Nuclear speckles
Nuclear bodies
Endoplasmic reticulum
Golgi apparatus
Peroxisomes
Endosomes
Lysosomes
Intermediate filaments
Actin filaments
Focal adhesion sites
Microtubules
Microtubule ends
Cytokinetic bridge
Mitotic spindle
Microtubule organizing center
Centrosome
Lipid droplets
Plasma membrane
Cell junctions
Mitochondria
Aggresome
Cytosol
Cytoplasmic bodies
Rods & rings


In [27]:
newdf

Unnamed: 0,Id,Nucleoplasm,Nuclear membrane,Nucleoli,Nucleoli fibrillar center,Nuclear speckles,Nuclear bodies,Endoplasmic reticulum,Golgi apparatus,Peroxisomes,...,Microtubule organizing center,Centrosome,Lipid droplets,Plasma membrane,Cell junctions,Mitochondria,Aggresome,Cytosol,Cytoplasmic bodies,Rods & rings
7,8b40e98c-bba4-11e8-b2b9-ac1f6b6435d0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,8b40e98c-bba4-11e8-b2b9-ac1f6b6435d0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,4a651918-bb9c-11e8-b2b9-ac1f6b6435d0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10,a33aa3ee-bbb2-11e8-b2ba-ac1f6b6435d0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
11,2eef43d8-bbac-11e8-b2ba-ac1f6b6435d0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12,9dae4f52-bbb2-11e8-b2ba-ac1f6b6435d0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15,75d9db26-bbc2-11e8-b2bb-ac1f6b6435d0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
20,ce932558-bbb6-11e8-b2ba-ac1f6b6435d0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
21,68c881ec-bba6-11e8-b2ba-ac1f6b6435d0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,5cbe862e-bbbc-11e8-b2ba-ac1f6b6435d0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
aug_data.class_aug_df

Unnamed: 0,Id,Nucleoplasm,Nuclear membrane,Nucleoli,Nucleoli fibrillar center,Nuclear speckles,Nuclear bodies,Endoplasmic reticulum,Golgi apparatus,Peroxisomes,...,Microtubule organizing center,Centrosome,Lipid droplets,Plasma membrane,Cell junctions,Mitochondria,Aggresome,Cytosol,Cytoplasmic bodies,Rods & rings
23830,c4781282-bbb5-11e8-b2ba-ac1f6b6435d0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23874,c4d1c464-bbc7-11e8-b2bc-ac1f6b6435d0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7993,4283deac-bba4-11e8-b2b9-ac1f6b6435d0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23610,c2ad140e-bbb3-11e8-b2ba-ac1f6b6435d0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5008,29a794ca-bbb1-11e8-b2ba-ac1f6b6435d0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7083,3aae99ae-bbba-11e8-b2ba-ac1f6b6435d0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8703,4876bcc0-bbb4-11e8-b2ba-ac1f6b6435d0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23278,bfe5e5ca-bbbd-11e8-b2ba-ac1f6b6435d0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27679,e40836f2-bbbc-11e8-b2ba-ac1f6b6435d0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29176,f03dd1fc-bbbc-11e8-b2ba-ac1f6b6435d0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


array([0, 1])

{0: 1.0,
 1: 11.559930008748907,
 2: 3.572046499053798,
 3: 9.05620287868403,
 4: 7.435565559932471,
 5: 4.841700256504214,
 6: 10.595829991980754,
 7: 5.0354420731707314,
 8: 24.5139146567718,
 9: 13.387031408308005,
 10: 16.68308080808081,
 11: 13.134194831013916,
 12: 20.42194744976816,
 13: 13.678053830227743,
 14: 11.43982683982684,
 15: 25.9078431372549,
 16: 10.403937007874015,
 17: 20.01969696969697,
 18: 13.86463798530955,
 19: 9.29184247538678,
 20: 20.906645569620252,
 21: 3.62596048298573,
 22: 17.50066225165563,
 23: 4.7631578947368425,
 24: 22.244107744107744,
 25: 1.665154379332073,
 26: 22.859861591695502,
 27: 25.9078431372549}

[13213,
 1143,
 3699,
 1459,
 1777,
 2729,
 1247,
 2624,
 539,
 987,
 792,
 1006,
 647,
 966,
 1155,
 510,
 1270,
 660,
 953,
 1422,
 632,
 3644,
 755,
 2774,
 594,
 7935,
 578,
 510]

0.10714285714285714

NameError: name 'preds' is not defined