# Develop

I learned the basics of Keras and TensorFlow used in this notebook with the help of these notebooks: [ImageGenerator](https://www.kaggle.com/sdelecourt/cnn-with-keras), [ROC curves](https://www.kaggle.com/fmarazzi/baseline-keras-cnn-roc-fast-10min-0-925-lb)

In [0]:
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow import keras
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import cross_val_score
import zipfile


In [0]:
import tensorflow as tf

# Per-device-type counts handed to the session configuration.
device_count = {'GPU': 1, 'CPU': 56}
config = tf.ConfigProto(device_count=device_count)

# Create the session from that configuration and register it with Keras.
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [0]:
from tensorflow.python.client import device_lib

# Enumerate the devices TensorFlow reports for this runtime and print them.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11346269693117955111
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 7354349205296020109
physical_device_desc: "device: XLA_CPU device"
]


In [0]:
from keras import backend as K
# Ask the Keras backend which GPUs it reports; the result is shown as the cell output.
# NOTE(review): `_get_available_gpus` is a private (underscore-prefixed) helper of the
# standalone `keras` package, whereas the models in this notebook are built with
# `tensorflow.keras` — confirm this check reflects the backend used for training.
K.tensorflow_backend._get_available_gpus()

[]

### Define Data class

In [0]:
class Data:
    """Build the dataframes and Keras image generators for one image dataset.

    On construction the labels CSV is loaded, the test directory is scanned
    for image files, and three generators are created:

    * ``train_generator`` / ``validation_generator`` -- labelled batches taken
      from ``train_df`` (80% / 20% split via ``validation_split``),
    * ``test_generator`` -- unlabelled, unshuffled batches of size 1 built
      from ``test_df``.

    Args:
        train_path: Directory containing the training images.
        train_labels_path: CSV file with one row per training image.
        test_path: Directory containing the test images.
        unique_identifier: Name of the CSV column holding the image id
            (the file name without the ``.tif`` extension).
        target_class_column: Name of the CSV column holding the class label.
        max_train_rows: Keep only the first ``max_train_rows`` labelled rows.
            The default of 20 preserves the small development subset this
            notebook has been using; pass ``None`` to use every row.
    """

    def __init__(self, train_path, train_labels_path, test_path, unique_identifier,
                 target_class_column, max_train_rows=20):
        self.train_path = train_path
        self.train_labels_path = train_labels_path
        self.test_path = test_path

        self.train_data_gen = ImageDataGenerator(
            validation_split=0.2,  # Fraction of images reserved for validation
            rescale=1. / 255)      # Normalize
        self.test_data_gen = ImageDataGenerator(rescale=1. / 255)

        self.image_size = (96, 96)
        # Honour the caller-supplied column names instead of hard-coding them.
        self.unique_identifier = unique_identifier
        self.target_class_column = target_class_column

        self.train_df = self._create_train_df(self.train_labels_path)
        if max_train_rows is not None:
            # Development shortcut: train on a small subset only.
            self.train_df = self.train_df.head(max_train_rows)

        self.test_df = self._create_test_df(self.test_path)
        self._create_data_generators()

    def _create_train_df(self, train_labels_path):
        """Load the labels CSV, turn ids into ``.tif`` file names and cast labels to str."""
        train_df = self._load_data_labels(train_labels_path)
        id_col = self.unique_identifier
        train_df[id_col] = train_df[id_col].astype(str) + ".tif"
        # The labels are cast to strings before being used with class_mode='binary'.
        return train_df.astype({self.target_class_column: 'str'})

    def _create_test_df(self, test_path):
        """Return a one-column dataframe listing every file found under ``test_path``.

        Paths are stored relative to ``test_path`` (so files in sub-directories
        stay resolvable) and sorted, which keeps the row order reproducible.
        """
        filenames = []
        for dirpath, _, files in os.walk(test_path):
            for fname in files:
                filenames.append(os.path.relpath(os.path.join(dirpath, fname), test_path))
        return pd.DataFrame({self.unique_identifier: sorted(filenames)})

    def _create_data_generators(self):
        """Create the train, validation and test generators from the dataframes."""
        pars = {'dataframe': self.train_df,
                'directory': self.train_path,
                'x_col': self.unique_identifier,  # filenames of images
                'y_col': self.target_class_column,  # class
                'target_size': self.image_size,
                'class_mode': 'binary',
                'batch_size': 64}

        # `subset` must be given because validation_split is set on the ImageDataGenerator.
        self.train_generator = self.train_data_gen.flow_from_dataframe(**pars, subset='training')
        self.validation_generator = self.train_data_gen.flow_from_dataframe(**pars, subset='validation')

        self.test_generator = self.test_data_gen.flow_from_dataframe(
            dataframe=self.test_df,
            directory=self.test_path,  # instance attribute, not the notebook-level global
            x_col=self.unique_identifier,  # filename
            class_mode=None,
            target_size=self.image_size,
            batch_size=1,
            # Keep predictions in the same order as the rows of test_df.
            shuffle=False)

    def _load_data_labels(self, filename):
        """Read the labels CSV at ``filename`` into a dataframe."""
        return pd.read_csv(filename)

### Define Model Container class:

In [0]:
class ModelContainer:
    """Hold candidate Keras models, fit and score them, and track the best one.

    ``roc_auc`` and ``val_roc_auc`` keep their original attribute names for
    compatibility, but the values stored in them are the last-epoch
    ``binary_accuracy`` / ``val_binary_accuracy`` entries of the fit history,
    not ROC AUC values.

    Args:
        models: Optional iterable of compiled models to start with. It is
            copied, so every container owns its own list.
    """

    def __init__(self, models=None):
        # A `[]` default would be shared by every instance created without arguments.
        self.models = [] if models is None else list(models)
        self.best_model = None
        self.predictions = None
        self.roc_auc = {}      # model -> training binary accuracy
        self.val_roc_auc = {}  # model -> validation binary accuracy

    def add_model(self, model):
        """Register another compiled model."""
        self.models.append(model)

    def score_models(self, data, epochs=1):
        """Fit every registered model and record its last-epoch accuracies.

        Args:
            data: Object exposing ``train_generator`` and ``validation_generator``.
            epochs: Number of epochs to train each model for.
        """
        for model in self.models:
            # The fitted models are kept in self.models because retraining the
            # winner after selection would take too long.
            history = model.fit(data.train_generator,
                                epochs=epochs,
                                # Evaluated at the end of each epoch; the model
                                # is not trained on this data.
                                validation_data=data.validation_generator,
                                use_multiprocessing=True)
            self.roc_auc[model] = history.history['binary_accuracy'][-1]
            self.val_roc_auc[model] = history.history['val_binary_accuracy'][-1]

    def select_best_model(self):
        """Set ``best_model`` to the model with the highest validation score.

        Raises:
            ValueError: If no model has been scored yet.
        """
        if not self.val_roc_auc:
            raise ValueError('No scores recorded; call score_models() first.')
        # Higher accuracy is better, and the held-out score is the one to compare on.
        self.best_model = max(self.val_roc_auc, key=self.val_roc_auc.get)

    def best_model_predict(self, data_gen):
        """Store the best model's predictions for ``data_gen`` in ``predictions``."""
        if self.best_model is None:
            self.select_best_model()
        self.predictions = self.best_model.predict(data_gen)

    def print_summary(self):
        """Print the recorded scores of every scored model and of the best model."""
        print('\nModel Summaries:\n')
        for model in self.models:
            if model not in self.roc_auc:
                continue  # registered but never scored
            print('\n', model, '- Binary accuracy:', self.roc_auc[model])
            print('\n', model, '- Validation binary accuracy:', self.val_roc_auc[model])

        print('\nBest Model:\n', self.best_model)
        if self.best_model is not None:
            print('\nBinary accuracy of Best Model\n', self.roc_auc[self.best_model])
            print('\nValidation binary accuracy of Best Model\n',
                  self.val_roc_auc[self.best_model])


### Define parameters for this project & download the data from Kaggle:

In [1]:
# Use this to upload the kaggle.json from your local machine:
# `files.upload()` comes from the `google.colab` package and stores the uploaded
# file in the current working directory, where the next cell copies it from.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [6]:
!cp kaggle.json '/root/.kaggle/'
!chmod 600 '/root/.kaggle/kaggle.json'
!kaggle config set -n path -v '/content/'
!kaggle competitions download histopathologic-cancer-detection

- path is now set to: /content/
Downloading sample_submission.csv.zip to /content/competitions/histopathologic-cancer-detection
  0% 0.00/1.33M [00:00<?, ?B/s]
100% 1.33M/1.33M [00:00<00:00, 93.6MB/s]
Downloading train_labels.csv.zip to /content/competitions/histopathologic-cancer-detection
 98% 5.00M/5.10M [00:00<00:00, 26.9MB/s]
100% 5.10M/5.10M [00:00<00:00, 25.0MB/s]
Downloading test.zip to /content/competitions/histopathologic-cancer-detection
100% 1.30G/1.30G [00:12<00:00, 103MB/s]
100% 1.30G/1.30G [00:12<00:00, 110MB/s]
Downloading train.zip to /content/competitions/histopathologic-cancer-detection
100% 4.98G/4.98G [02:50<00:00, 22.2MB/s]
100% 4.98G/4.98G [02:50<00:00, 31.3MB/s]


In [0]:
# Everything downloaded from Kaggle lives under this directory.
root_dir = '/content/competitions/histopathologic-cancer-detection/'

# Image folders and the labels CSV, all located below root_dir.
train_path, test_path = (os.path.join(root_dir, folder) for folder in ('train/', 'test/'))
train_labels_path = os.path.join(root_dir, 'train_labels.csv/train_labels.csv')

# Column names passed to Data: image identifier and target class.
unique_identifier, target_class_column = 'id', 'label'

In [0]:
# Unzip every downloaded archive into a directory named after it
# (e.g. train.zip -> train/, train_labels.csv.zip -> train_labels.csv/).
# The working directory is changed on purpose: the relative paths below and
# the `ls` check in the next cell both depend on it.
os.chdir(root_dir)

for path_to_zip_file in os.listdir():
    # Skip everything that is not a zip archive, in particular the directories
    # created by a previous run of this cell; otherwise a re-run creates
    # spurious folders and then fails when it tries to open a directory.
    if not zipfile.is_zipfile(path_to_zip_file):
        continue
    new_directory = os.path.splitext(path_to_zip_file)[0]
    os.makedirs(new_directory, exist_ok=True)
    # The context manager closes the archive; no explicit close() is needed.
    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
        zip_ref.extractall(new_directory)

In [11]:
# Sanity check after extraction: show the first entries of the current working directory.
! ls -l | head -10

total 6616892
drwxr-xr-x 2 root root       4096 Oct 17 18:26 sample_submission
drwxr-xr-x 2 root root       4096 Oct 17 18:22 sample_submission.csv
-rw-r--r-- 1 root root    1394488 Oct 17 18:18 sample_submission.csv.zip
drwxr-xr-x 2 root root    4337664 Oct 17 18:26 test
-rw-r--r-- 1 root root 1401100547 Oct 17 18:18 test.zip
drwxr-xr-x 2 root root   16515072 Oct 17 18:25 train
drwxr-xr-x 2 root root       4096 Oct 17 18:25 train_labels.csv
-rw-r--r-- 1 root root    5352900 Oct 17 18:18 train_labels.csv.zip
-rw-r--r-- 1 root root 5346961539 Oct 17 18:21 train.zip


In [24]:
# Build the dataframes and the train/validation/test generators for this project.
data = Data(
    train_path=train_path,
    train_labels_path=train_labels_path,
    test_path=test_path,
    unique_identifier=unique_identifier,
    target_class_column=target_class_column,
)

Found 16 validated image filenames belonging to 2 classes.
Found 4 validated image filenames belonging to 2 classes.
Found 57458 validated image filenames.


## Make some simple models:

In [0]:
# Baseline model: a small fully connected network on the flattened 96x96x3 image.
def make_baseline_layers():
    """Return a fresh list of layer instances for the baseline architecture."""
    return [
        keras.layers.Flatten(input_shape=[96, 96, 3]),
        keras.layers.Dense(300, activation="relu"),
        keras.layers.Dense(100, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ]


architect1 = make_baseline_layers()
model1 = keras.models.Sequential(architect1)
# model2 must get its own layer instances: two models built from the same
# layer objects share their weights, so training one would also change the
# other and the learning-rate comparison would be meaningless.
model2 = keras.models.Sequential(make_baseline_layers())

# Same loss and metric for both; only the SGD learning rate differs.
model1.compile(loss=keras.losses.binary_crossentropy,
               optimizer=keras.optimizers.SGD(lr=0.001),
               metrics=['binary_accuracy'])
model2.compile(loss=keras.losses.binary_crossentropy,
               optimizer=keras.optimizers.SGD(lr=0.003),
               metrics=['binary_accuracy'])


In [0]:
# Show model1's configuration (its layers and their settings) as the cell output.
model1.get_config()

{'name': 'sequential_6',
 'layers': [{'class_name': 'Flatten',
   'config': {'name': 'flatten_6',
    'trainable': True,
    'batch_input_shape': (None, 96, 96, 3),
    'dtype': 'float32',
    'data_format': 'channels_last'}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_18',
    'trainable': True,
    'dtype': 'float32',
    'units': 300,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None, 'dtype': 'float32'}},
    'bias_initializer': {'class_name': 'Zeros',
     'config': {'dtype': 'float32'}},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_19',
    'trainable': True,
    'dtype': 'float32',
    'units': 100,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': 

In [0]:
# Hand the container an explicit, freshly built list so that re-running this
# cell always yields exactly these two models instead of appending them again
# to a list left over from an earlier run.
models = ModelContainer([model1, model2])

In [17]:
# Inspect the list of models currently registered in the container.
models.models

[<tensorflow.python.keras.engine.sequential.Sequential at 0x7efd6a1971d0>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x7efd6a197cf8>]

In [33]:
# Fit every registered model on the training generator and record its scores.
models.score_models(data)
# Let the container set `best_model` from the recorded scores.
models.select_best_model()
# Store the selected model's predictions for the test generator in `models.predictions`.
models.best_model_predict(data.test_generator)

dict_keys(['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy'])
dict_keys(['loss', 'binary_accuracy', 'val_loss', 'val_binary_accuracy'])


KeyboardInterrupt: ignored

In [0]:
# Print the recorded scores per model, followed by the selected best model.
models.print_summary()


In [0]:
#!watch -n1 nvidia-smi
# Learning-rate sweep: train a fresh baseline network for one epoch per
# candidate learning rate and tabulate training / validation accuracy.
alpha_vals = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3]
tuning_records = []
for alpha in alpha_vals:
    # Build a new model for every alpha so no weights carry over from the
    # previous learning rate.
    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=[96, 96, 3]),
        keras.layers.Dense(300, activation="relu"),
        keras.layers.Dense(100, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ])
    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=keras.optimizers.SGD(lr=alpha),
                  metrics=['binary_accuracy'])
    history = model.fit(data.train_generator,
                        epochs=1,
                        validation_data=data.validation_generator,
                        use_multiprocessing=True)
    # The history keys follow the compiled metric name ('binary_accuracy');
    # keep the last-epoch scalar rather than the whole per-epoch list.
    tuning_records.append({
        'alpha': alpha,
        'accuracy': history.history['binary_accuracy'][-1],
        'validation_accuracy': history.history['val_binary_accuracy'][-1],
    })

# Build the table once from the collected records, indexed by learning rate.
alpha_tuning_df = pd.DataFrame(tuning_records).set_index('alpha')
alpha_tuning_df
    
    

In [0]:
import tensorflow as tf

# Session created with log_device_placement enabled.
# NOTE(review): unlike the first session cell, this session is not registered
# through keras.backend.set_session — confirm whether that is intended.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))


In [0]:
# Train the selected model for longer, using the generators owned by `data`.
# NOTE(review): the original cell referred to `model`, `train_generator` and
# `validation_generator`, none of which is defined on a fresh kernel; binding
# `model` to the container's best model is an assumption — confirm.
model = models.best_model
history = model.fit(data.train_generator,
                    epochs=15,
                    validation_data=data.validation_generator)

In [0]:
# Learning curve: one line per entry recorded in the fit history.
fig, ax = plt.subplots(figsize=(8, 5))
pd.DataFrame(history.history).plot(ax=ax)
ax.grid(True)
ax.set_ylim(0, 1)  # set the vertical range to [0-1]
plt.show()

In [0]:
# Predict on the test images with the selected model.
# NOTE(review): the original cell used the undefined names `model` and
# `test_generator`; using the container's best model and the generator owned
# by `data` is an assumption — confirm.
y = models.best_model.predict(data.test_generator)