In [1]:
import pandas as pd
import numpy as np
from os import listdir
import keras
# import cv2
from skimage.io import imread
from skimage.transform import resize

Using TensorFlow backend.


In [2]:
# Locations of the training images, the test images and the label table.
# The directory paths keep their trailing slash: image file names are appended
# to them by plain string concatenation when images are loaded.
train_path = "/projectnb/dl-course/jxtang/EC500_proj/train/"
test_path = "/projectnb/dl-course/jxtang/EC500_proj/test/"
label_path = '/projectnb/dl-course/jxtang/EC500_proj/train.csv'

# Directory listings of both image folders (train first, then test).
train_files, test_files = (listdir(folder) for folder in (train_path, test_path))


In [3]:
# Load the label table (one row per image id) and preview its first rows.
train_labels = pd.read_csv(filepath_or_buffer=label_path)
train_labels.head(5)

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [4]:
# Maps each integer class id (as found in the space-separated "Target" column)
# to its label name. Ids are the list positions, 0 through 27.
names_dict = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Peroxisomes",
    "Endosomes",
    "Lysosomes",
    "Intermediate filaments",
    "Actin filaments",
    "Focal adhesion sites",
    "Microtubules",
    "Microtubule ends",
    "Cytokinetic bridge",
    "Mitotic spindle",
    "Microtubule organizing center",
    "Centrosome",
    "Lipid droplets",
    "Plasma membrane",
    "Cell junctions",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Cytoplasmic bodies",
    "Rods & rings",
]))
# reverse_names_dict = dict((v,k) for k,v in names_dict.items())

In [5]:
def fill_targets(row, names=None):
    """Turn one label row's "Target" string into per-class indicator values.

    The whitespace-separated class ids in ``row.Target`` are parsed to
    integers, stored back into ``row.Target`` as an integer array, and the
    entry named after each class is set to 1.

    Parameters
    ----------
    row : pandas.Series
        One row of the label table. Must hold a ``Target`` string and an
        entry for every class name that can occur.
    names : mapping of int to str, optional
        Class id -> column name. Defaults to the notebook-level ``names_dict``.

    Returns
    -------
    pandas.Series
        The same row object, modified in place.
    """
    if names is None:
        names = names_dict
    # The builtin `int` replaces the `np.int` alias, which NumPy has removed.
    # `split()` without an argument also tolerates repeated or trailing blanks
    # and yields no ids at all for an empty string.
    class_ids = np.array([int(token) for token in row.Target.split()], dtype=int)
    row.Target = class_ids
    # Iterate over the parsed local array rather than re-reading row.Target, so
    # the loop does not depend on how pandas stores an array inside a cell.
    for class_id in class_ids:
        row.loc[names[int(class_id)]] = 1
    return row

# Add one indicator column per class, initialised to 0 for every image ...
for class_name in names_dict.values():
    train_labels[class_name] = 0

# ... then let fill_targets switch on the classes listed in each row's Target.
train_labels = train_labels.apply(fill_targets, axis=1)

# Row-wise sum over everything except Id/Target, i.e. over the indicator columns.
train_labels["number_of_targets"] = (
    train_labels.drop(["Id", "Target"], axis=1).sum(axis=1)
)

In [6]:
from sklearn.model_selection import KFold

# Split the image ids into 3 folds. KFold is used without shuffling, so the
# folds follow the row order of train_labels.
kf = KFold(n_splits=3)
image_ids = train_labels.Id.values

# Keep every fold instead of overwriting a single dict on each iteration, so
# the other splits remain available (e.g. for cross-validation).
partitions = [
    {"train": image_ids[train_idx], "validation": image_ids[test_idx]}
    for train_idx, test_idx in kf.split(train_labels.index.values)
]

# The rest of the notebook works with one split. The last fold is selected
# explicitly; it is the split the original loop ended up with.
partition = partitions[-1]

In [7]:
partition

{'train': array(['00070df0-bbc3-11e8-b2bc-ac1f6b6435d0',
        '000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0',
        '000a9596-bbc4-11e8-b2bc-ac1f6b6435d0', ...,
        'ab33dff8-bba7-11e8-b2ba-ac1f6b6435d0',
        'ab351f1c-bbb6-11e8-b2ba-ac1f6b6435d0',
        'ab385b2e-bbba-11e8-b2ba-ac1f6b6435d0'], dtype=object),
 'validation': array(['ab3978aa-bbbb-11e8-b2ba-ac1f6b6435d0',
        'ab3b9258-bbab-11e8-b2ba-ac1f6b6435d0',
        'ab3ca16e-bbc1-11e8-b2bb-ac1f6b6435d0', ...,
        'fff189d8-bbab-11e8-b2ba-ac1f6b6435d0',
        'fffdf7e0-bbc4-11e8-b2bc-ac1f6b6435d0',
        'fffe0ffe-bbc0-11e8-b2bb-ac1f6b6435d0'], dtype=object)}

In [8]:
class ModelParameters(object):
    """Plain container for the settings shared by preprocessing, generators and models."""

    # Directory the images are read from.
    path = train_path

    # Raw image dimensions, as named (rows, cols).
    image_rows = 512
    image_cols = 512

    # Size the images are resized to; also the spatial input size of the models.
    scaled_row_dim = 139
    scaled_col_dim = 139

    # Number of image channels fed to the model.
    n_channels = 3

    # Width of the model's output layer (one unit per class).
    num_classes = 28

    # Batching behaviour of the data generators.
    batch_size = 100
    shuffle = False

    # Number of epochs passed to training.
    n_epochs = 10


parameter = ModelParameters()

In [9]:
class ImagePreprocessor:
    """Loads the per-channel PNG files of a sample and prepares them as model input.

    Parameters
    ----------
    modelparameter : object
        Settings object providing ``path``, ``scaled_row_dim``,
        ``scaled_col_dim`` and ``n_channels``; ``image_rows`` / ``image_cols``
        are used when present (falling back to 512).
    """

    # File-name suffixes, in the order the channels are stacked.
    channel_suffixes = ("_green", "_blue", "_red", "_yellow")

    def __init__(self, modelparameter):
        self.parameter = modelparameter
        self.path = self.parameter.path
        # load_image() reads from `basepath`. Default it to the configured path
        # so the preprocessor works on its own; previously the attribute only
        # existed once a PredictGenerator had assigned it from outside.
        self.basepath = self.path
        self.scaled_row_dim = self.parameter.scaled_row_dim
        self.scaled_col_dim = self.parameter.scaled_col_dim
        self.n_channels = self.parameter.n_channels

    def preprocess(self, image):
        """Resize, reshape and normalize ``image``, in that order."""
        image = self.resize(image)
        image = self.reshape(image)
        image = self.normalize(image)
        return image

    def resize(self, image):
        """Resize ``image`` to (scaled_row_dim, scaled_col_dim)."""
        return resize(image, (self.scaled_row_dim, self.scaled_col_dim))

    def reshape(self, image):
        """Return ``image`` shaped (rows, cols, n_channels)."""
        return np.reshape(image, (image.shape[0], image.shape[1], self.n_channels))

    def normalize(self, image):
        """Map values from [0, 255] to [-1, 1]."""
        return (image / 255.0 - 0.5) / 0.5

    def load_image(self, image_id):
        """Read the channel files of ``image_id`` into one (rows, cols, channels) array.

        Files are looked up as ``basepath + image_id + suffix + ".png"``. Only
        the first ``parameter.n_channels`` channels are read and returned.
        """
        rows = getattr(self.parameter, "image_rows", 512)
        cols = getattr(self.parameter, "image_cols", 512)
        # Same selection as slicing a 4-channel stack with [0:n_channels], but
        # without reading files whose channel would be discarded anyway.
        suffixes = self.channel_suffixes[:self.parameter.n_channels]
        image = np.zeros(shape=(rows, cols, len(suffixes)))
        for channel, suffix in enumerate(suffixes):
            image[:, :, channel] = imread(self.basepath + image_id + suffix + ".png")
        return image

In [10]:
preprocessor = ImagePreprocessor(parameter)

In [11]:
class DataGenerator(keras.utils.Sequence):
    """Keras Sequence yielding (images, targets) batches for a list of image ids.

    Parameters
    ----------
    list_IDs : sequence of str
        Image ids this generator serves.
    labels : pandas.DataFrame
        Label table with an ``Id`` column, the ``Target`` and
        ``number_of_targets`` columns, and one indicator column per class.
    modelparameter : object
        Settings object (scaled dims, batch_size, n_channels, num_classes, shuffle).
    imagepreprocessor : object
        Provides ``load_image(image_id)`` and ``preprocess(image)``.
    """

    def __init__(self, list_IDs, labels, modelparameter, imagepreprocessor):
        self.params = modelparameter
        self.labels = labels
        self.list_IDs = list_IDs
        self.dim = (self.params.scaled_row_dim, self.params.scaled_col_dim)
        self.batch_size = self.params.batch_size
        self.n_channels = self.params.n_channels
        self.num_classes = self.params.num_classes
        self.preprocessor = imagepreprocessor
        self.shuffle = self.params.shuffle
        # Build the initial sample order.
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when shuffling is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def get_targets_per_image(self, identifier):
        """Return the class-indicator values of the label row(s) whose Id matches."""
        # Note: this scans the whole label frame on every call.
        return self.labels.loc[self.labels.Id==identifier].drop(
                ["Id", "Target", "number_of_targets"], axis=1).values

    def __data_generation(self, list_IDs_temp):
        """Load and preprocess the given ids; return (X, y) arrays for them."""
        # Size the buffers by the ids actually received, so a short batch can
        # never carry uninitialised np.empty rows.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        y = np.empty((n_samples, self.num_classes), dtype=int)
        for i, identifier in enumerate(list_IDs_temp):
            image = self.preprocessor.load_image(identifier)
            image = self.preprocessor.preprocess(image)
            X[i] = image
            y[i] = self.get_targets_per_image(identifier)
        return X, y

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number ``index`` as an (X, y) tuple."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

In [12]:
class PredictGenerator:
    """Runs a model over a list of image ids, one image at a time.

    Constructing it also points the given preprocessor's ``basepath`` at
    ``predict_path``, i.e. the preprocessor object itself is modified.
    """

    def __init__(self, predict_Ids, imagepreprocessor, predict_path):
        self.identifiers = predict_Ids
        self.preprocessor = imagepreprocessor
        self.preprocessor.basepath = predict_path

    def predict(self, model):
        """Return an array of shape (number of ids, num_classes) with the model outputs."""
        n_samples = len(self.identifiers)
        n_classes = self.preprocessor.parameter.num_classes
        predictions = np.empty(shape=(n_samples, n_classes))
        for position in range(n_samples):
            sample = self.preprocessor.load_image(self.identifiers[position])
            sample = self.preprocessor.preprocess(sample)
            # The model is called on a batch of one: prepend a batch axis.
            predictions[position] = model.predict(sample[np.newaxis, ...])
        return predictions

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.losses import binary_crossentropy
from keras.optimizers import Adadelta
from keras.models import load_model

class BaseLineModel:
    """Small convolutional baseline with a sigmoid output per class.

    Typical use: ``build_model()``, ``compile_model()``, ``set_generators(...)``,
    then ``learn()``; ``score()`` and ``predict()`` work on the trained model.
    """

    def __init__(self, modelparameter):
        params = modelparameter
        self.params = params
        self.num_classes = params.num_classes
        self.img_rows = params.scaled_row_dim
        self.img_cols = params.scaled_col_dim
        self.n_channels = params.n_channels
        self.input_shape = (self.img_rows, self.img_cols, self.n_channels)
        self.my_metrics = ['accuracy']

    def build_model(self):
        """Create the network and store it in ``self.model``."""
        self.model = Sequential()
        layer_stack = [
            Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=self.input_shape),
            Conv2D(32, (3, 3), activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),
            Dropout(0.25),
            Flatten(),
            Dense(64, activation='relu'),
            Dropout(0.5),
            # One sigmoid unit per class.
            Dense(self.num_classes, activation='sigmoid'),
        ]
        for layer in layer_stack:
            self.model.add(layer)

    def compile_model(self):
        """Compile with binary cross-entropy, Adadelta and ``self.my_metrics``."""
        self.model.compile(
            loss=binary_crossentropy,
            optimizer=Adadelta(),
            metrics=self.my_metrics,
        )

    def set_generators(self, train_generator, validation_generator):
        """Register the generators used by ``learn`` and ``score``."""
        self.training_generator = train_generator
        self.validation_generator = validation_generator

    def learn(self):
        """Train on the registered generators and return what ``fit_generator`` returns."""
        fit_options = dict(
            generator=self.training_generator,
            validation_data=self.validation_generator,
            epochs=self.params.n_epochs,
            steps_per_epoch=100,
            use_multiprocessing=True,
            validation_steps=50,
            workers=8,
        )
        return self.model.fit_generator(**fit_options)

    def score(self):
        """Evaluate the model on the validation generator."""
        return self.model.evaluate_generator(
            generator=self.validation_generator,
            use_multiprocessing=True,
            workers=8,
        )

    def predict(self, predict_generator):
        """Delegate prediction to ``predict_generator`` using this object's model."""
        return predict_generator.predict(self.model)

    def save(self, modeloutputpath):
        """Save the model to ``modeloutputpath``."""
        self.model.save(modeloutputpath)

    def load(self, modelinputpath):
        """Replace ``self.model`` with the model loaded from ``modelinputpath``."""
        self.model = load_model(modelinputpath)

In [14]:
labels = train_labels
print("Number of samples in train: {}".format(len(partition["train"])))
print("Number of samples in validation: {}".format(len(partition["validation"])))

Number of samples in train: 20715
Number of samples in validation: 10357


In [15]:
# Batch generators for the two halves of the split. Both share one preprocessor.
training_generator = DataGenerator(
    list_IDs=partition['train'],
    labels=labels,
    modelparameter=parameter,
    imagepreprocessor=preprocessor,
)
validation_generator = DataGenerator(
    list_IDs=partition['validation'],
    labels=labels,
    modelparameter=parameter,
    imagepreprocessor=preprocessor,
)
# Predicts on the validation ids; constructing it also points the shared
# preprocessor's basepath at train_path, the directory load_image reads from.
predict_generator = PredictGenerator(
    predict_Ids=partition['validation'],
    imagepreprocessor=preprocessor,
    predict_path=train_path,
)

In [21]:
import warnings
warnings.filterwarnings("ignore")

# Build, compile and train the baseline model.
model = BaseLineModel(parameter)
model.build_model()
model.compile_model()
model.set_generators(training_generator, validation_generator)
history = model.learn()
#model.save("baseline_model.h5")

# Predict on the validation ids and store the outputs under the class names,
# i.e. every label column except Target, number_of_targets and Id.
proba_predictions = model.predict(predict_generator)
class_columns = train_labels.drop(
    ["Target", "number_of_targets", "Id"], axis=1).columns
baseline_proba_predictions = pd.DataFrame(proba_predictions, columns=class_columns)
baseline_proba_predictions.to_csv("baseline_predictions.csv")

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch 1/100


InternalError: Failed to create session.

In [None]:
baseline_proba_predictions

In [16]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14510786567249704164
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11273211085
locality {
  bus_id: 1
  links {
    link {
      device_id: 1
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 6308853822187419784
physical_device_desc: "device: 0, name: Tesla K40m, pci bus id: 0000:03:00.0, compute capability: 3.5"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 11273211085
locality {
  bus_id: 1
  links {
    link {
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 4448833557162495234
physical_device_desc: "device: 1, name: Tesla K40m, pci bus id: 0000:04:00.0, compute capability: 3.5"
, name: "/device:GPU:2"
device_type: "GPU"
memory_limit: 11273211085
locality {
  bus_id: 2
  numa_node: 1
  links {
    link {
      device_id: 3
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 10515289147129957701
physical_dev

In [17]:
import keras.backend as K

def base_f1(y_true, y_pred):
    """Compute an F1 score per column of the batch.

    ``y_pred`` is rounded to 0/1 first; true positives, false positives and
    false negatives are then summed over axis 0. NaN entries of the result are
    replaced by 0.

    Parameters
    ----------
    y_true, y_pred : tensors
        Targets and predictions — presumably shaped (batch, classes); confirm
        against the generators feeding the model.

    Returns
    -------
    tensor
        One F1 value per column.
    """
    # Imported locally: the notebook's module-level tensorflow import lives in
    # a later cell, so this function must not rely on it having run already.
    import tensorflow as tf

    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    # K.epsilon() keeps the denominators away from zero.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # `tf.is_nan` only exists in older TensorFlow releases; newer ones expose
    # the same op as `tf.math.is_nan`.
    is_nan = getattr(tf, "is_nan", None) or tf.math.is_nan
    f1 = tf.where(is_nan(f1), tf.zeros_like(f1), f1)
    return f1

def f1_min(y_true, y_pred):
    """Smallest of the per-column F1 values returned by ``base_f1``."""
    return K.min(base_f1(y_true, y_pred))

def f1_max(y_true, y_pred):
    """Largest of the per-column F1 values returned by ``base_f1``."""
    return K.max(base_f1(y_true, y_pred))

def f1_mean(y_true, y_pred):
    """Mean of the per-column F1 values returned by ``base_f1``."""
    return K.mean(base_f1(y_true, y_pred))

def f1_std(y_true, y_pred):
    """Standard deviation of the per-column F1 values returned by ``base_f1``."""
    return K.std(base_f1(y_true, y_pred))

In [18]:
class TrackHistory(keras.callbacks.Callback):
    """Keras callback that records the loss reported after every batch.

    The values are collected in ``self.losses``, which is reset whenever
    training begins.
    """

    def on_train_begin(self, logs=None):
        # `logs` defaults to None rather than a shared mutable dict.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # Tolerate a missing logs dict; the recorded value is then None.
        self.losses.append((logs or {}).get('loss'))

In [19]:
class ImprovedDataGenerator(DataGenerator):
    """DataGenerator whose targets are limited to a chosen set of label columns."""

    def __init__(self, list_IDs, labels, modelparameter, imagepreprocessor, target_wishlist):
        # Same arguments as the base generator, plus the columns to use as targets.
        super().__init__(list_IDs, labels, modelparameter, imagepreprocessor)
        self.target_wishlist = target_wishlist

    def get_targets_per_image(self, identifier):
        """Return the wishlist columns of the label row(s) whose Id matches."""
        matching_rows = self.labels.loc[self.labels.Id == identifier]
        return matching_rows[self.target_wishlist].values


In [20]:
use_dropout = False
from keras.models import Model, Sequential,load_model
from keras.layers import Activation, Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import BatchNormalization
from keras.losses import binary_crossentropy
# from inception_resnet_v2 import InceptionResNetV2
from keras.optimizers import Adadelta,Adam
from keras.models import load_model
import tensorflow as tf
from keras.layers import Input

class ImprovedModel(BaseLineModel):
    """Baseline network with F1 metrics, per-batch loss tracking and optional dropout.

    Parameters
    ----------
    modelparameter : object
        Settings object, passed on to ``BaseLineModel``.
    use_dropout : bool
        Whether ``build_model`` inserts the two Dropout layers.
    my_metrics : list, optional
        Metrics handed to ``compile``. Defaults to
        ``[f1_mean, f1_std, f1_min, f1_max, 'accuracy']``.
    """

    def __init__(self, modelparameter,
                 use_dropout,
                 my_metrics=None):

        super().__init__(modelparameter)
        # Build the default per instance instead of sharing one mutable list
        # between all instances.
        if my_metrics is None:
            my_metrics = [f1_mean, f1_std, f1_min, f1_max, 'accuracy']
        self.my_metrics = my_metrics
        self.use_dropout = use_dropout

    def learn(self):
        """Train like the baseline, additionally recording per-batch losses.

        The ``TrackHistory`` callback is kept in ``self.history``; the return
        value is whatever ``fit_generator`` returns.
        """
        self.history = TrackHistory()
        return self.model.fit_generator(generator=self.training_generator,
                    validation_data=self.validation_generator,
                    epochs=self.params.n_epochs,
                    use_multiprocessing=True,
                    workers=8,
                    steps_per_epoch=100,
                    validation_steps=50,
                    callbacks = [self.history])

    def build_model(self):
        """Create the network in ``self.model``; dropout is added only if enabled."""
        self.model = Sequential()
        self.model.add(Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=self.input_shape))
        self.model.add(Conv2D(32, (3, 3), activation='relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))
        # Honour the use_dropout flag; it used to be stored but never read, so
        # dropout was applied even when the caller passed use_dropout=False.
        if self.use_dropout:
            self.model.add(Dropout(0.25))
        self.model.add(Flatten())
        self.model.add(Dense(64, activation='relu'))
        if self.use_dropout:
            self.model.add(Dropout(0.5))
        self.model.add(Dense(self.num_classes, activation='sigmoid'))
    

In [21]:
print(keras.__version__)

2.0.8


In [22]:
# Fresh settings, preprocessor and generators for the second training run.
parameter = ModelParameters()
preprocessor = ImagePreprocessor(parameter)
labels = train_labels

training_generator = DataGenerator(
    list_IDs=partition['train'],
    labels=labels,
    modelparameter=parameter,
    imagepreprocessor=preprocessor,
)
validation_generator = DataGenerator(
    list_IDs=partition['validation'],
    labels=labels,
    modelparameter=parameter,
    imagepreprocessor=preprocessor,
)
# Constructing the PredictGenerator also points the shared preprocessor's
# basepath at train_path, the directory load_image reads from.
predict_generator = PredictGenerator(
    predict_Ids=partition['validation'],
    imagepreprocessor=preprocessor,
    predict_path=train_path,
)

In [23]:
# Create a TensorFlow session with explicit thread and device-count settings
# and register it as the session Keras uses.
config = tf.ConfigProto(
    intra_op_parallelism_threads=2,
    inter_op_parallelism_threads=2,
    allow_soft_placement=True,
    device_count={'CPU': 2, 'GPU': 1},
)
session = tf.Session(config=config)
K.set_session(session)

In [24]:
import warnings
warnings.filterwarnings("ignore")

# Build, compile and train the improved model.
model = ImprovedModel(parameter, use_dropout=use_dropout)
model.build_model()
model.compile_model()
model.set_generators(training_generator, validation_generator)
epoch_history = model.learn()
proba_predictions = model.predict(predict_generator)
#model.save("improved_model.h5")

# Label the prediction columns with the class names, as the baseline cell
# does, instead of leaving them as bare positions 0..27.
class_columns = train_labels.drop(
    ["Target", "number_of_targets", "Id"], axis=1).columns
improved_proba_predictions = pd.DataFrame(proba_predictions, columns=class_columns)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
pd.DataFrame(proba_predictions)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.435248,0.056095,0.160077,0.068470,0.075845,0.121998,0.046047,0.131524,0.003802,0.007955,...,0.047340,0.076626,0.006314,0.161534,0.037209,0.139064,0.014042,0.308225,0.015129,0.003911
1,0.431858,0.050662,0.128042,0.046942,0.040974,0.093354,0.044119,0.106936,0.002390,0.005458,...,0.027585,0.060010,0.003468,0.160404,0.027471,0.105933,0.008125,0.316956,0.008367,0.002491
2,0.429936,0.039634,0.138280,0.052903,0.060935,0.100455,0.030663,0.108412,0.001686,0.003892,...,0.033643,0.056420,0.003088,0.136009,0.024523,0.117712,0.008126,0.286155,0.008878,0.001747
3,0.429860,0.039201,0.138427,0.052335,0.059751,0.099938,0.030040,0.106245,0.001659,0.003679,...,0.033047,0.055610,0.002973,0.135393,0.023953,0.116278,0.007722,0.285410,0.008611,0.001719
4,0.434904,0.061005,0.144354,0.058547,0.052167,0.108814,0.053142,0.121740,0.003687,0.007941,...,0.036010,0.071629,0.005306,0.174064,0.035118,0.121988,0.011695,0.324707,0.012027,0.003812
5,0.432274,0.050722,0.138208,0.053917,0.051736,0.102044,0.042046,0.113366,0.002470,0.005554,...,0.032809,0.062744,0.003880,0.157552,0.029179,0.116717,0.009260,0.309145,0.009851,0.002563
6,0.430631,0.044036,0.137183,0.052785,0.055083,0.100297,0.034738,0.109370,0.001935,0.004440,...,0.032656,0.058134,0.003299,0.144678,0.026004,0.116429,0.008203,0.295423,0.009056,0.002003
7,0.431717,0.041810,0.146072,0.055348,0.064299,0.105269,0.032730,0.109090,0.002084,0.004066,...,0.035917,0.060343,0.003503,0.140703,0.026173,0.119453,0.008742,0.290713,0.009673,0.002166
8,0.433062,0.046025,0.145592,0.055576,0.061132,0.106047,0.038009,0.112523,0.002564,0.005055,...,0.036042,0.064348,0.004086,0.149204,0.028629,0.119266,0.009934,0.301247,0.010377,0.002662
9,0.436490,0.060621,0.156116,0.064372,0.064286,0.117707,0.052771,0.126958,0.004366,0.008374,...,0.042293,0.077666,0.006347,0.172340,0.037800,0.130054,0.013914,0.321646,0.014152,0.004515


In [26]:
improved_proba_predictions.to_csv("Base_predictions.csv")