# Develop

Some basics of Keras and tensorflow I have learned with the help of these notebooks: [ImageGenerator](https://www.kaggle.com/sdelecourt/cnn-with-keras), [ROC curves](https://www.kaggle.com/fmarazzi/baseline-keras-cnn-roc-fast-10min-0-925-lb)

In [0]:
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow import keras
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import cross_val_score
import zipfile
from google.colab import files

Using TensorFlow backend.


In [0]:
import tensorflow as tf
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 56} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [0]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11346269693117955111
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 7354349205296020109
physical_device_desc: "device: XLA_CPU device"
]


In [0]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

[]

### Define Data class

In [0]:
class Data:
    def __init__(self, train_path, train_labels_path, test_path, unique_identifier, target_class_column):
        self.train_path = train_path
        self.train_labels_path = train_labels_path
        self.test_path = test_path
    
        self.train_data_gen = ImageDataGenerator(validation_split = 0.2, 
                                                 # Fraction of images reserved for validation
                                                rescale = 1./255 ) # Normalize
        self.test_data_gen = ImageDataGenerator(rescale = 1./255)

        self.image_size = (96, 96)
        self.n_channels = 3
        self.unique_identifier = 'id'
        self.target_class_column = 'label'
        
        self.train_df = self._create_train_df(self.train_labels_path)

        self.test_df = self._create_test_df(self.test_path)
        self._create_data_generators()
        
    def _create_train_df(self, train_labels_path):
        train_df = self._load_target_labels(train_labels_path)
        train_df['id'] = train_df['id'].apply(lambda x: x+".tif")
        return train_df.astype({'label': 'str'})

    def _create_test_df(self, test_path):
        filenames =[]
        for dirname, _, filename in os.walk(test_path):
            filenames.extend(filename)
        return pd.DataFrame({"id":filenames})

    def _create_data_generators(self):
        pars = {'dataframe': self.train_df, 
                'directory': self.train_path, 
                'x_col': self.unique_identifier, # filenames of images
                'y_col': self.target_class_column, # class
                'target_size': self.image_size,
                'class_mode':'binary', 
                'batch_size': 64}

        self.train_generator = self.train_data_gen.flow_from_dataframe(**pars, subset = 'training') # specify if validation_split is specified in ImageDataGenerator
        self.validation_generator = self.train_data_gen.flow_from_dataframe(**pars, subset = 'validation') # specify if validation_split is specified in ImageDataGenerator
                                                        
        self.test_generator = self.test_data_gen.flow_from_dataframe(dataframe = self.test_df,
                                                    directory = test_path,
                                                    x_col = self.unique_identifier, # filename 
                                                    class_mode = None,
                                                    target_size = self.image_size,
                                                    batch_size = 1)
    def _load_target_labels(self, filename):
        return pd.read_csv(filename)

### Define Model Container class:

From Textbook: "If the training set was very skewed, with some classes being overrepresented and others underrepresented, it would be useful to set the class_weight argument when calling the fit() method, which would give a larger weight to underrepresented classes and a lower weight to overrepresented classes."

In [0]:
class CheckpointDownloadCallback(keras.callbacks.Callback):    
  def on_epoch_end(self, epoch, logs):
    filename = self.model.name + ".h5"
    print(filename)
    files.download(filename)  #Download from colab to local.

class ModelContainer:
    
    def __init__(self, models=[]):
        self.models = {}
        [self.add_model(model) for model in models]

        self.best_model = None
        self.predictions = None
        self.roc_auc = {}
        self.val_roc_auc = {} # Validation set
    
    def add_model(self, model):
        self.models[model.name] = model
    
    def train_model(self, data, model_name):
        # Do this one at a time since training these models takes considerable time
       
        model = self.models[model_name]

        checkpoint_cb = keras.callbacks.ModelCheckpoint(model.name + ".h5")
        #download_model_cb = CheckpointDownloadCallback()
        early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

        history = model.fit(data.train_generator, 
                            epochs = 15, 
                            validation_data = (data.validation_generator), 
                            #use_multiprocessing=True,
                            callbacks = [checkpoint_cb, early_stopping_cb])
            
        self.roc_auc[model.name] = history.history['binary_accuracy'][-1]
        self.val_roc_auc[model.name] = history.history['val_binary_accuracy'][-1]

    def select_best_model(self):
        # Selects and saves the best model. 
        self.best_model = min(self.roc_auc, key=self.roc_auc.get)
        # Save to file:
        self.models[self.best_model].save(self.best_model.name + ".h5")
        
    def best_model_predict(self, data_gen):
        self.predictions = self.models[self.best_model].predict(data_gen)
    
    def print_summary(self):
        print('\nModel Summaries:\n')
        for model in self.models:
            print('\n', model.name, '- ROC AUC:', models.roc_auc[model])
            print('\n', model.name, '- Validation ROC AUC:', models.val_roc_auc[model])

        print('\nBest Model:\n', self.best_model)
        print('\nROC AUC of Best Model\n', models.roc_auc[self.best_model])


### Define parameters for this project & download the data from Kaggle:

In [0]:
# Use this to upload the kaggle.json from your local machine:
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
!cp kaggle.json '/root/.kaggle/'
!chmod 600 '/root/.kaggle/kaggle.json'
!kaggle config set -n path -v '/content/'
!kaggle competitions download histopathologic-cancer-detection

- path is now set to: /content/
Downloading sample_submission.csv.zip to /content/competitions/histopathologic-cancer-detection
  0% 0.00/1.33M [00:00<?, ?B/s]
100% 1.33M/1.33M [00:00<00:00, 93.1MB/s]
Downloading train_labels.csv.zip to /content/competitions/histopathologic-cancer-detection
 98% 5.00M/5.10M [00:00<00:00, 42.5MB/s]
100% 5.10M/5.10M [00:00<00:00, 32.6MB/s]
Downloading test.zip to /content/competitions/histopathologic-cancer-detection
100% 1.30G/1.30G [00:15<00:00, 129MB/s]
100% 1.30G/1.30G [00:15<00:00, 90.4MB/s]
Downloading train.zip to /content/competitions/histopathologic-cancer-detection
100% 4.97G/4.98G [01:11<00:00, 138MB/s]
100% 4.98G/4.98G [01:11<00:00, 74.6MB/s]


In [0]:
root_dir = '/content/competitions/histopathologic-cancer-detection/'
train_path = os.path.join(root_dir, 'train/')
test_path = os.path.join(root_dir, 'test/')
train_labels_path = os.path.join(root_dir, 'train_labels.csv/train_labels.csv')

unique_identifier = 'id'
target_class_column = 'label'

In [0]:
# Unzip files in all zipped directories:
os.chdir(root_dir)

for path_to_zip_file in os.listdir():
    new_directory = os.path.splitext(path_to_zip_file)[0]
    os.makedirs(new_directory, exist_ok=True)
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
      zip_ref.extractall(new_directory)
      zip_ref.close()

In [0]:
os.chdir(root_dir)
! ls -l | head -10

total 6617088
drwxr-xr-x 2 root root       4096 Oct 21 19:19 sample_submission.csv
-rw-r--r-- 1 root root    1394488 Oct 21 19:14 sample_submission.csv.zip
drwxr-xr-x 2 root root    4395008 Oct 21 19:19 test
-rw-r--r-- 1 root root 1401100547 Oct 21 19:15 test.zip
drwxr-xr-x 2 root root   16666624 Oct 21 19:18 train
drwxr-xr-x 2 root root       4096 Oct 21 19:18 train_labels.csv
-rw-r--r-- 1 root root    5352900 Oct 21 19:15 train_labels.csv.zip
-rw-r--r-- 1 root root 5346961539 Oct 21 19:16 train.zip


In [74]:
data = Data(train_path, train_labels_path, test_path, unique_identifier, target_class_column)

Found 176020 validated image filenames belonging to 2 classes.
Found 44005 validated image filenames belonging to 2 classes.
Found 57458 validated image filenames.


## Make some simple models:

In [0]:
# Baseline model:
architect1 = [
    keras.layers.Flatten(input_shape=[96, 96, 3]),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid") # Sigmoid, 1 output for binary classification
]

model1 = keras.models.Sequential(architect1, name = 'two_layer_MLP')

##model2 = keras.models.Sequential(architect1)
#model2.name = 
model1.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=0.001), metrics =['binary_accuracy'])
#model2.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=0.003), metrics =['binary_accuracy'])


In [0]:
baseline_layers = [
    keras.layers.Conv2D(filters = 64 , kernel_size = 7, activation = "relu", padding = "valid", input_shape = [data.image_size[0], data.image_size[1], data.n_channels]), 
    keras.layers.MaxPooling2D(pool_size = 2), 
    keras.layers.Conv2D(128, 3, activation = "relu", padding = "same"), 
    keras.layers.Conv2D(128, 3 , activation = "relu" , padding = "same"), 
    keras.layers.MaxPooling2D(2), 
    keras.layers.Conv2D(256, 3 , activation = "relu" , padding = "same"), 
    keras.layers.Conv2D(256, 3 , activation = "relu" , padding = "same"),   
    keras.layers.MaxPooling2D(2), 
    keras.layers.Flatten(), 
    keras.layers.Dense(128, activation = "relu"), 
    keras.layers.Dropout(0.5), 
    keras.layers.Dense(64, activation = "relu"), 
    keras.layers.Dropout(0.5), 
    keras.layers.Dense(1, activation = "sigmoid")
] 
baseline_model = keras.models.Sequential(baseline_layers, name = 'baseline')

baseline_model.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=0.001), metrics =['binary_accuracy'])

In [27]:
baseline_model.summary()

Model: "baseline"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 90, 90, 64)        9472      
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 45, 45, 64)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 45, 45, 128)       73856     
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 45, 45, 128)       147584    
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 22, 22, 128)       0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 22, 22, 256)       295168    
_________________________________________________________________
conv2d_24 (Conv2D)           (None, 22, 22, 256)       590

In [0]:
models = ModelContainer()
models.add_model(baseline_model)

In [44]:
models.models

{'baseline': <tensorflow.python.keras.engine.sequential.Sequential at 0x7fd53b4fcf60>}

In [0]:
models.train_model(data, baseline_model.name)

Epoch 1/15

In [72]:
#!chmod 777 "baseline.h5"
files.download("baseline.h5")

KeyboardInterrupt: ignored

In [70]:
!ls -l | head -10

total 6637016
-rwxrwxrwx 1 root root   20402752 Oct 22 00:52 baseline.h5
drwxr-xr-x 2 root root       4096 Oct 21 19:19 sample_submission.csv
-rw-r--r-- 1 root root    1394488 Oct 21 19:14 sample_submission.csv.zip
drwxr-xr-x 2 root root    4395008 Oct 21 19:19 test
-rw-r--r-- 1 root root 1401100547 Oct 21 19:15 test.zip
drwxr-xr-x 2 root root   16666624 Oct 21 19:18 train
drwxr-xr-x 2 root root       4096 Oct 21 19:18 train_labels.csv
-rw-r--r-- 1 root root    5352900 Oct 21 19:15 train_labels.csv.zip
-rw-r--r-- 1 root root 5346961539 Oct 21 19:16 train.zip


In [0]:
models.add_model(model2)
models.train_model(data)


In [52]:
models.select_best_model()
#models.best_model_predict(data.test_generator)

NameError: ignored

In [51]:
models.best_model.name

AttributeError: ignored

In [0]:
models.print_summary()



Model Summaries:


 two_layer_MLP - ROC AUC: 1.0

 two_layer_MLP - Validation ROC AUC: 0.75

Best Model:
 <tensorflow.python.keras.engine.sequential.Sequential object at 0x7f4871d7aa58>

ROC AUC of Best Model
 1.0


In [0]:
#!watch -n1 nvidia-smi
alpha_vals = [0.001,0.003,0.01,0.03,0.1,0.3]
alpha_tuning_df = pd.DataFrame({"alpha":alpha_vals})
alpha_tuning_df = alpha_tuning_df.set_index('alpha')
for alpha in alpha_vals:
    model1.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=alpha), metrics =['binary_accuracy'])
    history = model1.fit(train_generator, epochs = 1, validation_data = (validation_generator), use_multiprocessing=False)
    alpha_tuning_df['accuracy'].loc[alpha] = history.history['acc']
    alpha_tuning_df['validation_accuracy'].loc[alpha] = history.history['val_acc']
    
    

NameError: ignored

In [0]:
# Learning Curve
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1) # set the vertical range to [0-1]
plt.show()

In [0]:
y = model.predict(test_generator)