In [1]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split, KFold
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score

from keras.callbacks import TensorBoard
from time import time

from numpy.random import seed
seed(2)
from tensorflow import set_random_seed
set_random_seed(2)

import matplotlib.pyplot as plt
from scipy.misc import imread
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import h5py
import warnings
warnings.filterwarnings('ignore')

from keras import backend as K

Using TensorFlow backend.


In [2]:
df = pd.read_csv('aug_train.csv')
label_names = {
    0:  "Nucleoplasm",  
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center",   
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",   
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",   
    8:  "Peroxisomes",   
    9:  "Endosomes",   
    10:  "Lysosomes",   
    11:  "Intermediate filaments",   
    12:  "Actin filaments",   
    13:  "Focal adhesion sites",   
    14:  "Microtubules",   
    15:  "Microtubule ends",   
    16:  "Cytokinetic bridge",   
    17:  "Mitotic spindle",   
    18:  "Microtubule organizing center",   
    19:  "Centrosome",   
    20:  "Lipid droplets",   
    21:  "Plasma membrane",   
    22:  "Cell junctions",   
    23:  "Mitochondria",   
    24:  "Aggresome",   
    25:  "Cytosol",   
    26:  "Cytoplasmic bodies",   
    27:  "Rods & rings"
}
df.drop('Unnamed: 0', axis=1, inplace=True)
odf = pd.read_csv('train.csv')
for k in label_names.keys():
    odf[label_names[k]] = 0

def one_hot(row):
    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    
    for target in row.Target:
        row.loc[label_names[int(target)]] = 1
    return row
odf = odf.apply(one_hot, axis=1)
odf = odf.drop('Target', axis=1)

def class_weights(df):
    total = 0
    weights = dict()
    for idx,key in label_names.items():
        total += df[key].value_counts()[1]
    
    for idx,key in label_names.items():
        weights[idx] = total/df[key].value_counts()[1]
    
    return weights

aug1 = pd.read_csv('aug_train1.csv')
aug1 = aug1.drop('Unnamed: 0', axis=1)
_, X_test, _, y_test = train_test_split(odf.Id, odf, test_size=0.1, shuffle=True, random_state=2)
X_train, y_train = aug1.Id, aug1

X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
weights = class_weights(y_train)

sub_csv = pd.read_csv('test.csv')
sub_csv

(29220,) (29220, 29) (3108,) (3108, 29)


Unnamed: 0,Id,Predicted
0,00008af0-bad0-11e8-b2b8-ac1f6b6435d0,0
1,0000a892-bacf-11e8-b2b8-ac1f6b6435d0,0
2,0006faa6-bac7-11e8-b2b7-ac1f6b6435d0,0
3,0008baca-bad7-11e8-b2b9-ac1f6b6435d0,0
4,000cce7e-bad4-11e8-b2b8-ac1f6b6435d0,0
5,00109f6a-bac8-11e8-b2b7-ac1f6b6435d0,0
6,001765de-bacd-11e8-b2b8-ac1f6b6435d0,0
7,0018641a-bac9-11e8-b2b8-ac1f6b6435d0,0
8,00200f22-bad7-11e8-b2b9-ac1f6b6435d0,0
9,0026f154-bac6-11e8-b2b7-ac1f6b6435d0,0


In [3]:
#f1 accuracy and focal loss function
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


def fl(y_true, y_pred):#with tensorflow
    gamma=2
    alpha=1
    eps = 1e-16
    y_pred=K.clip(y_pred,eps,1-eps)#improve the stability of the focal loss and see issues 1 for more information
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

    pt_1 = K.clip(pt_1, 1e-6, 1.-1e-6)
    pt_0 = K.clip(pt_0, 1e-6, 1.-1e-6)

    return -(K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) + 
            K.sum(alpha * K.pow(1-(1-pt_0), gamma) * K.log(1. - pt_0)))

In [4]:
class ModelParameters():
    def __init__(self):
        self.n_classes = 28
        self.image_rows = 512
        self.image_cols = 512
        self.n_channels = 4
        self.batch_size = 16
        self.n_epochs = 50
        self.shuffle = False
        self.image_row_scale_factor = 1
        self.image_col_scale_factor = 1
        self.scaled_row_dim = np.int(self.image_rows/self.image_row_scale_factor)
        self.scaled_col_dim = np.int(self.image_cols/self.image_row_scale_factor)
        self.input_shape = (self.scaled_row_dim, self.scaled_col_dim, self.n_channels)

params = ModelParameters()

In [5]:
class DataLoader:
    def __init__(self, X_train, y_train, X_test, y_test, params, X_sub=None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.params = params
        self.X_sub = X_sub
        
    def build_iterators(self, num_threads=6, num_prefetch=8):
        def parse_function(filename, label):
            dims = (params.scaled_row_dim, params.scaled_col_dim)
            c_image = [None] * 4

            for idx,color in enumerate(['_green', '_red', '_blue', '_yellow']):
                fullname = 'test/' + filename + color + '.png'
                image_string = tf.read_file(fullname)
                c_image[idx] = tf.image.decode_png(image_string, channels=1)
                c_image[idx] = tf.image.convert_image_dtype(c_image[idx], tf.float32)
                c_image[idx] = tf.image.resize_images(c_image[idx], dims)
                c_image[idx] = tf.squeeze(c_image[idx], axis=-1)

            image = tf.stack([c_image[0],
                              c_image[1],
                              c_image[2],
                              c_image[3]], axis=-1)
            return image, label

        
        #Build graph - train_dataset
        filenames = self.X_train.as_matrix()
        labels = self.y_train.drop(['Id'], axis=1).as_matrix()
        self.set_name = 'train/'
        
        train_dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        train_dataset = train_dataset.shuffle(len(filenames))
        train_dataset = train_dataset.map(parse_function, num_parallel_calls=num_threads)
        train_dataset = train_dataset.batch(self.params.batch_size)
        train_dataset = train_dataset.prefetch(num_prefetch)
        
        #Build graph - valid_dataset
        filenames = self.X_test.as_matrix()
        labels = self.y_test.drop(['Id'], axis=1).as_matrix()
        self.set_name = 'test/'
        
        test_dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        test_dataset = test_dataset.shuffle(len(filenames))
        test_dataset = test_dataset.map(parse_function, num_parallel_calls=num_threads)
        test_dataset = test_dataset.batch(self.params.batch_size)
        test_dataset = test_dataset.prefetch(num_prefetch)
        
        #Build graph - test submission
        filenames = self.X_sub.as_matrix()
        labels = np.zeros(shape=(len(filenames), 28), dtype=np.int64)
        sub_dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
        sub_dataset = sub_dataset.map(parse_function, num_parallel_calls=num_threads).batch(self.params.batch_size).prefetch(num_prefetch)
        
        iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                                   train_dataset.output_shapes)
        next_element = iterator.get_next()
        
        self.train_init_op = iterator.make_initializer(train_dataset)
        self.test_init_op = iterator.make_initializer(test_dataset)
        self.sub_init_op = iterator.make_initializer(sub_dataset)
        return next_element


In [6]:
from models.densenet_121 import get_model
from tqdm import tqdm

class Model:
    def __init__(self, params):
        self.params = params
        

    def build(self, lr=0.01):
        self.data_loader = DataLoader(X_train, y_train, X_test, y_test, self.params, X_sub=sub_csv.Id)
        next_element = self.data_loader.build_iterators()

        #tf.reset_default_graph()

        with tf.name_scope('placeholders'):

            #self.y = tf.placeholder(tf.float32, shape=[None, params.n_classes], name='y')
            self.y = next_element[1]
            
        with tf.name_scope('convolutional'):
            out = get_model(next_element[0], 28, 'channels_last', True)

        with tf.name_scope('dense'):
            dense = tf.layers.dense(out, 64, activation=tf.nn.relu, name='dense')
            self.output = tf.layers.dense(dense, 28, activation=tf.nn.sigmoid, name='output')

        with tf.name_scope('loss'):
            self.entropy = fl(self.y, self.output)
            self.loss = tf.reduce_mean(self.entropy)

        with tf.name_scope('train'):
            self.optimizer = tf.train.GradientDescentOptimizer(lr)
            self.train_op = self.optimizer.minimize(self.loss)
            
    def learn(self, epochs=150):
        saver = tf.train.Saver()
        with tf.Session() as sess:
            
            sess.run(tf.global_variables_initializer())
            saver.restore(sess, 'dense_models/densenet_stage2_4.ckpt')
            
            for epoch in range(epochs):
                self.score(session=sess)
                train_len = int(np.floor(len(X_train)/self.params.batch_size))
                loss_all = []
                
                print('Epoch: ' + str(epoch))
                with tqdm(total=train_len) as pbar:
                    sess.run(self.data_loader.train_init_op)
                    
                    for steps in range(train_len):
                        train, loss = sess.run([self.train_op, self.loss])
                        loss_all.append(loss)
                        pbar.set_description('loss: {:.4f}'.format(loss))
                        pbar.update(1)
                    
                    print('train_loss: ' + str(np.mean(loss_all)))
                saver.save(sess, 'dense_models/densenet_stage2_5.ckpt')
    
    def score(self, session=None, gen='test'):
        with session.as_default():
            if gen == 'test':
                gen_len = int(np.floor(len(X_test)/self.params.batch_size))
                
                preds_all = np.zeros((gen_len * self.params.batch_size, 28))
                true = np.zeros((gen_len * self.params.batch_size, 28), dtype=np.int8)
                loss_all = []
                
                session.run(self.data_loader.test_init_op)
                with tqdm(total=gen_len) as pbar:
                    for steps in range(gen_len):

                        preds, loss, y_batch = session.run([self.output, self.loss, self.y])
                        
                        true[steps:steps+self.params.batch_size, :] = y_batch
                        preds_all[steps:steps+self.params.batch_size, :] = preds
                        loss_all.append(loss)
                        
                        pbar.set_description('loss: {:.4f}'.format(loss))
                        pbar.update(1)
                        

                    preds_arr = preds_all > 0.5

                    assert(preds_arr.shape == true.shape, 'Pred, truth shape mismatch')
                    f1 = f1_score(true, preds_arr, average='macro')
                    precision = precision_score(true, preds_arr, average='macro')
                    print('F1: ' + str(f1), 'Precision: ' + str(precision), 'val_loss: ' + str(np.mean(loss_all)))
                    return preds_arr
                
    def create_submission(self, session=None, X=None):
        saver = tf.train.Saver()
        p_all = []
        with tf.Session() as session:
            gen_len = int(np.floor(len(X)/self.params.batch_size))
            session.run(tf.global_variables_initializer())
            
            preds_all = np.zeros((gen_len * self.params.batch_size, 28))
            
            saver.restore(session, 'dense_models/densenet_stage2_5.ckpt')
            
            session.run(self.data_loader.sub_init_op)
            with tqdm(total=gen_len) as pbar:
                for steps in range(gen_len):
                    preds = session.run([self.output])
                    preds_all[steps:steps+self.params.batch_size] = preds[0]
                    p_all.append(preds[0])
                    pbar.update(1)
                    
                preds_ar = np.array(p_all)

                preds_ar = np.reshape(preds_ar, (preds_ar.shape[0]*preds_ar.shape[1],preds_ar.shape[2]))

                preds_arr0 = preds_all > 0.5
                
                return preds_ar

In [7]:
model = Model(params)
model.build(lr=1e-5)
#model.learn()

In [20]:
subm = model.create_submission(X=sub_csv)

INFO:tensorflow:Restoring parameters from dense_models/densenet_stage2_5.ckpt


100%|██████████| 731/731 [01:38<00:00,  9.40it/s]


In [21]:
list_preds = []
threshold = 0.1
sub = subm > threshold
for line in sub:
    i=0
    str_label = ''
    flag=False
    for c_pred in line:
        if c_pred:
            str_label += str(i) + ' '
            flag=True
        else:
            str_label += ''
        i += 1
        
    list_preds.append(str_label.strip())

    
for i in range(6):
    list_preds.append('')

In [22]:

submissions = pd.DataFrame({'Id': sub_csv.Id, 'Predicted': list_preds})

submissions.to_csv('submission.csv', index=False)
submissions

# Stage 2 Summary

### dense_models/densenet_stage2_1.ckpt -- Epochs: 14 -- lr: 1e-3
                                        loss: 15.1176: 100%|██████████| 1826/1826 [11:47<00:00,  2.77it/s]
                                        train_loss: 11.212003
                                        loss: 11.1562: 100%|██████████| 194/194 [00:23<00:00,  8.81it/s]
                                          0%|          | 0/1826 [00:00<?, ?it/s]
                                        F1: 0.41749941032695653 Precision: 0.5143140034299384 val_loss: 11.958441
                                        
### dense_models/densenet_stage2_2.ckpt -- Epochs: 4 -- lr:1e-4
                                        loss: 12.0911: 100%|██████████| 1826/1826 [11:49<00:00,  2.79it/s]
                                        train_loss: 9.5457945
                                        loss: 9.6333: 100%|██████████| 194/194 [00:23<00:00,  8.63it/s] 
                                          0%|          | 0/1826 [00:00<?, ?it/s]
                                        F1: 0.5170505314840885 Precision: 0.6042577958457712 val_loss: 10.7184
                                        
### dense_models/densenet_stage2_3.ckpt -- Epoch: 2 -- lr:1e-5
                                        loss: 11.7702: 100%|██████████| 1826/1826 [11:49<00:00,  2.81it/s]
                                        train_loss: 9.353686
                                        loss: 9.6262: 100%|██████████| 194/194 [00:23<00:00,  8.18it/s] 
                                          0%|          | 0/1826 [00:00<?, ?it/s]
                                        F1: 0.534295450029398 Precision: 0.6561235025549912 val_loss: 10.566189