# Develop

Some basics of Keras and tensorflow I have learned with the help of these notebooks: [ImageGenerator](https://www.kaggle.com/sdelecourt/cnn-with-keras), [ROC curves](https://www.kaggle.com/fmarazzi/baseline-keras-cnn-roc-fast-10min-0-925-lb)

In [18]:
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow import keras
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import cross_val_score


In [19]:
import tensorflow as tf
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 56} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [20]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11346269693117955111
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 7354349205296020109
physical_device_desc: "device: XLA_CPU device"
]


In [21]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

[]

### Define Data class

In [22]:
class Data:
    def __init__(self, train_path, train_labels_path, test_path, unique_identifier, target_class_column):
        self.train_path = train_path
        self.train_labels_path = train_labels_path
        self.test_path = test_path
    
        self.train_data_gen = ImageDataGenerator(validation_split = 0.2, \
                                rescale = 1./255 ) # Normalize
        self.test_data_gen = ImageDataGenerator(rescale = 1./255)

        self.image_size = (96, 96)
        self.unique_identifier = 'id'
        self.target_class_column = 'label'
        
        self.train_df = self._create_train_df(self.train_labels_path)
        self.test_df = self._create_test_df(self.test_path)
        self._create_generators()
        
    def _create_train_df(self, train_labels_path):
        train_df = self._load_labels(train_labels_path)
        train_df['id'] = train_df['id'].apply(lambda x: x+".tif")
        return train_df.astype({'label': 'str'})

    def _create_test_df(self, test_path):
        filenames =[]
        for dirname, _, filename in os.walk(test_path):
            filenames.extend(filename)
        return pd.DataFrame({"id":filenames})

    def _create_generators(self):
        pars = {'dataframe': self.train_df, 
                'directory': self.train_path, 
                'x_col': self.unique_identifier, # filenames of images
                'y_col': self.target_class_column, # class
                'target_size': self.image_size,
                'class_mode':'binary', 
                'batch_size': 64}

        self.train_generator = self.train_data_gen.flow_from_dataframe(**pars, subset = 'training') # specify if validation_split is specified in ImageDataGenerator
        self.validation_generator = self.train_data_gen.flow_from_dataframe(**pars, subset = 'validation') # specify if validation_split is specified in ImageDataGenerator
                                                        
        self.test_generator = self.test_data_gen.flow_from_dataframe(dataframe = self.test_df,
                                                    directory = test_path,
                                                    x_col = self.unique_identifier, # filename 
                                                    class_mode = None,
                                                    target_size = self.image_size,
                                                    batch_size = 1)
    def _load_labels(self, filename):
        return pd.read_csv(filename)

### Define Model Container class:

In [23]:
class ModelContainer:
    
    def __init__(self, models=[]):
        self.models = models
        self.best_model = None
        self.predictions = None
        self.roc_auc = {}
        self.val_roc_auc = {} # Validation set
    
    def add_model(self, model):
        self.models.append(model)
    
    def score_models(self, data):

        for (model, idx) in enumerate(self.models):
            # Want to keep the fitted models because will take too long to retrain once decide on the best:
            history = self.models[idx].fit(data.train_generator, epochs = 1, validation_data = (data.validation_generator), use_multiprocessing=True)
            self.roc_auc[model] = history.history['acc'][-1]
            self.val_roc_auc[model] = history.history['val_acc'][-1]
       
    def select_best_model(self):
        self.best_model = min(self.roc_auc, key=self.roc_auc.get)
        
    def best_model_predict(self, data_gen):
        self.predictions = self.best_model.predict(data_gen)
    
    def print_summary(self):
        print('\nModel Summaries:\n')
        for model in models.mean_mse:
            print('\n', model, '- ROC AUC:', models.roc_auc[model])
            print('\n', model, '- Validation ROC AUC:', models.val_roc_auc[model])

        print('\nBest Model:\n', models.best_model)
        print('\nROC AUC of Best Model\n', models.roc_auc[models.best_model])


### Define parameters for this project:

In [24]:
train_path = '/kaggle/input/histopathologic-cancer-detection/train'
test_path = '/kaggle/input/histopathologic-cancer-detection/test'
train_labels_path = '/kaggle/input/histopathologic-cancer-detection/train_labels.csv'
unique_identifier = 'id'
target_class_column = 'label'


In [32]:
# Simple architecture
architect1 = [
    keras.layers.Flatten(input_shape=[96, 96, 3]),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
]
model1 = keras.models.Sequential(architect1)
model2 = keras.models.Sequential(architect1)
model1.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=0.001), metrics =['binary_accuracy'])
model2.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=0.003), metrics =['binary_accuracy'])


In [34]:
model1.get_config()

{'name': 'sequential_6',
 'layers': [{'class_name': 'Flatten',
   'config': {'name': 'flatten_6',
    'trainable': True,
    'batch_input_shape': (None, 96, 96, 3),
    'dtype': 'float32',
    'data_format': 'channels_last'}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_18',
    'trainable': True,
    'dtype': 'float32',
    'units': 300,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None, 'dtype': 'float32'}},
    'bias_initializer': {'class_name': 'Zeros',
     'config': {'dtype': 'float32'}},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_19',
    'trainable': True,
    'dtype': 'float32',
    'units': 100,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': 

In [37]:
data = Data(train_path, train_labels_path, test_path, unique_identifier, target_class_column)

Found 176020 validated image filenames belonging to 2 classes.
Found 44005 validated image filenames belonging to 2 classes.
Found 57458 validated image filenames.


In [None]:
#test_df = test_df.astype({'id': 'str'})

In [35]:
models = ModelContainer()
models.add_model(model1)
models.add_model(model2)

In [36]:
models.models

[<tensorflow.python.keras.engine.sequential.Sequential at 0x7f4df85c8da0>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x7f4df85b92b0>]

In [38]:
models.score_models(data)
models.select_best_model()
models.best_model_predict(data.test_generator)

TypeError: list indices must be integers or slices, not Sequential

In [None]:
models.print_summary()


In [None]:
#!watch -n1 nvidia-smi
alpha_vals = [0.001,0.003,0.01,0.03,0.1,0.3]
alpha_tuning_df = pd.DataFrame({"alpha":alpha_vals})
alpha_tuning_df = alpha_tuning_df.set_index('alpha')
for alpha in alpha_vals:
    model.compile(loss = keras.losses.binary_crossentropy, optimizer=keras.optimizers.SGD(lr=alpha), metrics =['binary_accuracy'])
    history = model.fit(train_generator, epochs = 1, validation_data = (validation_generator), use_multiprocessing=True)
    alpha_tuning_df['accuracy'].loc[alpha] = history.history['acc']
    alpha_tuning_df['validation_accuracy'].loc[alpha] = history.history['val_acc']
    
    

In [None]:
import
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))


In [None]:
history = model.fit(train_generator, epochs = 15, validation_data = (validation_generator))

In [None]:
# Learning Curve
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1) # set the vertical range to [0-1]
plt.show()

In [None]:
y = model.predict(test_generator)