In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os

import tensorflow as tf
from tensorflow.keras import layers 
from tensorflow import keras
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# Resolve the GCS location of the competition dataset. If the
# `kaggle_datasets` package cannot be imported, fall back to a fixed path.
try:
    from kaggle_datasets import KaggleDatasets
    dataset_gcs = KaggleDatasets().get_gcs_path('siim-isic-melanoma-classification')
except ModuleNotFoundError:
    # Hardcoded path used while testing locally.
    # NOTE(review): this bucket name was presumably copied from a Kaggle
    # session and may go stale -- confirm it still resolves.
    dataset_gcs = 'gs://kds-599205fd0d8963558ce1308147ba090f776d31b1662a67f2ddccfa38'


In [None]:
# Display the TensorFlow version this notebook is running against.
tf.__version__


In [None]:
# Run configuration shared by the input pipeline, the model and training.
params = dict(
    batch_size=3,         # records per batch produced by the datasets
    img_size=[512, 512],  # size every decoded image is resized to
    epochs=10,            # number of epochs passed to model.fit
)


In [None]:
# Pick a tf.distribute strategy: a TPU strategy when a TPU cluster resolver
# can be created, otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError raised above is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TF releases also expose this as
    # tf.distribute.TPUStrategy -- confirm against the TF version in use.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Number of replicas the chosen strategy keeps in sync.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Load the sample submission and show its first row to see the expected layout.
sub = pd.read_csv(
    '/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv'
)
sub.head(n=1)

In [None]:
# Load the training metadata and show, per target class, the non-null count
# of every column (i.e. the class balance).
train_df = pd.read_csv(
    '/kaggle/input/siim-isic-melanoma-classification/train.csv'
)
train_df.groupby(by='target').count()

In [None]:
def decode_image_label(tfrec):
    '''
    parse one serialized tfrecord example into an (image, label) pair

    args:
        tfrec: tfrecord, single serialized record of training/validation data

    returns:
        decoded_image: tensor, the record's jpeg decoded with 3 channels and
            resized to params['img_size']
        label: tensor, int64 value stored under the record's 'target' key

    '''
    # fields pulled out of every record: the jpeg bytes and the class label
    record_schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(tfrec, record_schema)

    raw_pixels = tf.io.decode_jpeg(example['image'], channels=3)
    decoded_image = tf.image.resize(raw_pixels, params['img_size'])

    return decoded_image, example['target']
    


In [None]:
def decode_image(tfrec):
    '''
    function to decode an image from an unlabeled tfrecord

    args:
        tfrec: tfrecord, single serialized record that has an 'image' field

    returns:
        decoded_image: tensor, the record's jpeg decoded with 3 channels and
            resized to params['img_size']

    '''

    features_dictionary = {
        'image': tf.io.FixedLenFeature([], tf.string)
        }
    features = tf.io.parse_single_example(tfrec, features_dictionary)
    decoded_image = tf.io.decode_jpeg(features['image'], 3)
    # the target size lives in the shared params dict (as in
    # decode_image_label); a bare `img_size` name is not defined anywhere
    decoded_image = tf.image.resize(decoded_image, params['img_size'])

    return decoded_image

In [None]:
def normalize_image_label(decoded_image, label):
    '''
    rescale an image tensor from the 0..255 range to the -1..1 range,
    passing the label through untouched
    (variant for tfrecords that carry labels)

    args:
        decoded_image: tensor that is an image with values from 0 to 255
        label: tensor, target label

    returns:
        image_tensor: float32 tensor, the image with values from -1 to 1
        label: same object that was passed in

    '''
    # wrap the image in a leading axis and immediately take element 0 again;
    # the values are unchanged -- per the original author's note this
    # round trip works around a TF restriction with filtered tensors
    with_leading_axis = tf.expand_dims(decoded_image, 0)
    single_image = tf.gather(with_leading_axis, 0)

    # (x - 127.5) / 127.5 sends 0 -> -1 and 255 -> 1
    midpoint = 127.5
    as_float = tf.cast(single_image, tf.float32)
    image_tensor = (as_float - midpoint) / midpoint

    return image_tensor, label

In [None]:
def normalize_image(decoded_image):
    '''
    rescale an image tensor from the 0..255 range to the -1..1 range

    args:
        decoded_image: tensor that is an image with values from 0 to 255

    returns:
        image_tensor: float32 tensor, the image with values from -1 to 1

    '''
    # wrap the image in a leading axis and immediately take element 0 again;
    # the values are unchanged -- per the original author's note this
    # round trip works around a TF restriction with filtered tensors
    with_leading_axis = tf.expand_dims(decoded_image, 0)
    single_image = tf.gather(with_leading_axis, 0)

    # (x - 127.5) / 127.5 sends 0 -> -1 and 255 -> 1
    midpoint = 127.5
    as_float = tf.cast(single_image, tf.float32)

    return (as_float - midpoint) / midpoint

In [None]:
def random_flip(image, label):
    '''
    augmentation step: apply a random left/right flip followed by a random
    up/down flip to the image

    args:
        image: tensor, an image
        label: tensor, target label

    returns:
        image: tensor, the input after both random flip ops
        label: tensor, same as input
    '''
    # left/right first, then up/down -- same order as the ops are listed
    flip_ops = (
        tf.image.random_flip_left_right,
        tf.image.random_flip_up_down,
    )
    for flip in flip_ops:
        image = flip(image)
    return image, label

In [None]:
def get_train_ds(tfrecords, batch_size):
    '''
    build the labelled input pipeline from tfrecord files

    args:
        tfrecords: list, tfrecord file paths
        batch_size: int, number of records per batch handed to the model
    returns:
        ds: tensorflow input pipeline yielding batches of (image, label);
            it repeats without end, so the consumer has to bound it
            with a step count
    '''
    # read the raw records once and keep them cached
    ds = tf.data.TFRecordDataset(filenames=[tfrecords]).cache()

    # per-record steps: decode -> scale to [-1, 1] -> random flips
    for transform in (decode_image_label, normalize_image_label, random_flip):
        ds = ds.map(transform)

    # endless stream, shuffled through a 256-element buffer
    ds = ds.repeat().shuffle(256)

    # fixed-size batches only (a short trailing batch is dropped)
    ds = ds.batch(batch_size, drop_remainder=True)

    return ds.prefetch(tf.data.experimental.AUTOTUNE)
    

In [None]:
def get_test_ds(tfrecords, batch_size=None, drop_remainder=False):
    '''
    function to create a dataset for test data (images only, no labels)

    args:
        tfrecords: list, tfrecord file paths
        batch_size: int, number of records per batch handed to the model;
            defaults to params['batch_size'] when not given
        drop_remainder: bool, whether to drop a final batch that is smaller
            than batch_size. Defaults to False so that every test record is
            kept and gets a prediction.
    returns:
        ds: tensorflow input pipeline yielding batches of normalized images,
            in file order (no shuffling, no repeat, no augmentation)

    '''
    if batch_size is None:
        batch_size = params['batch_size']

    ds = (tf.data.TFRecordDataset(filenames=[tfrecords]).
          cache().
          map(decode_image).
          map(normalize_image).
          batch(batch_size,
               drop_remainder=drop_remainder).
          prefetch(tf.data.experimental.AUTOTUNE)
         )

    return ds

In [None]:
def set_of_layers(model_, filters_, kernal, strides_, dropout=0):
    '''
    function to add the following layers to a model:
    Conv2D, MaxPooling2D, BatchNormalization, LeakyReLU and, only when
    dropout is non-zero, Dropout

    args:
      model_ : tf.keras.Sequential model, modified in place
      filters_: int, number of filters in Conv2D layer
      kernal: int, kernel size in Conv2D layer (a kernal x kernal window)
      strides_: int, used as both the pool size and the stride of the
        MaxPooling2D layer
      dropout: float, dropout rate for the Dropout layer, default is 0
        (no Dropout layer is added); must be less than 1.0

    returns:
      model_: the same model object that was passed in, with the above
        layers appended
    '''
    model_.add(layers.Conv2D(filters_, (kernal, kernal), padding='same'))
    model_.add(layers.MaxPooling2D(strides_, strides_))
    model_.add(layers.BatchNormalization())
    model_.add(layers.LeakyReLU())
    # honour the documented dropout argument; the default of 0 adds nothing,
    # so existing calls build exactly the same stack as before
    if dropout:
        model_.add(layers.Dropout(dropout))

    return model_

In [None]:
def create_model(input_shape=None, bias_output=None):
    '''
    function to create and compile the model that will be trained on train DS

    the network is a Conv2D/MaxPooling2D/BatchNormalization/LeakyReLU stem
    with 32 filters, four set_of_layers blocks (64, 128, 256, 512 filters),
    then Flatten, Dense(512) and a single sigmoid output unit

    args:
        input_shape: array, shape of the input tensor that will be fed into
            the model; default: [*params['img_size'], 3], looked up when the
            function is called
        bias_output: initial value for the bias of the output unit; when
            None the bias starts at zeros

    returns:
        model: compiled tf.keras.Sequential model (Adam optimizer, binary
            crossentropy with label_smoothing=0.01, confusion-matrix counts
            plus accuracy/precision/recall/AUC metrics)
    '''
    if input_shape is None:
        # resolved at call time so that later changes to params are picked up
        # (a default in the signature would be frozen when the cell is run)
        input_shape = [*params['img_size'], 3]

    model = tf.keras.Sequential()

    model.add(layers.Conv2D(32, (5, 5), padding='same',
                            input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    set_of_layers(model, 64, 5, 2)
    set_of_layers(model, 128, 5, 2)
    set_of_layers(model, 256, 5, 2)
    set_of_layers(model, 512, 5, 2)

    model.add(layers.Flatten())
    # NOTE(review): this Dense layer has no activation, so it is linear --
    # confirm that is intended
    model.add(layers.Dense(512))

    # Passing bias_initializer=None does not mean "zeros": Keras then falls
    # back to its generic weight initializer. Use an explicit 'zeros' when no
    # starting bias is requested.
    if bias_output is None:
        output_bias_init = 'zeros'
    else:
        output_bias_init = tf.keras.initializers.Constant(bias_output)
    model.add(layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias_init))

    metrics = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ]

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.01),
        metrics=metrics
    )

    return model

# Build a throwaway model with the default arguments just to print its layer summary.
create_model().summary()

In [None]:
def get_ds_size(files):
    '''
    function to get size of tfrecord Dataset, based on file name

    the file name ends with the number of records in the file, for example:
    train09-2071.tfrec has 2071 records

    only the final path component is inspected, so the files may live in any
    directory (or bucket)

    args:
        files: list of str file names, each item should be the path to a tfrecord file

    returns:
        size: int, size of dataset (0 for an empty list)

    raises:
        ValueError: if a file name does not follow the <name>-<count>.tfrec pattern
    '''
    size = 0
    for file in files:
        # 'gs://.../tfrecords/train09-2071.tfrec' -> 'train09-2071'
        stem = os.path.basename(file).split('.tfrec')[0]
        # the record count is whatever follows the last dash
        _, dash, count = stem.rpartition('-')
        try:
            if not dash:
                raise ValueError(stem)
            size += int(count)
        except ValueError:
            raise ValueError(
                'cannot read a record count from tfrecord file name: {!r}'.format(file)
            ) from None
    return size

In [None]:


#get test file paths
test_files = tf.io.gfile.glob(dataset_gcs + '/tfrecords/test*.tfrec')

#get train and validation file paths
# sorted() pins the order of the file list so that random_state=1 always
# produces the same train/validation split, whatever order glob returns
train_files, valid_files = train_test_split(
    sorted(tf.io.gfile.glob(dataset_gcs + '/tfrecords/train*.tfrec')),
    test_size=.1, random_state=1)

train_ds = get_train_ds(train_files, params['batch_size'])
# NOTE(review): validation reuses the training pipeline, so validation
# batches are also shuffled, repeated and randomly flipped -- confirm intended
valid_ds = get_train_ds(valid_files, params['batch_size'])

In [None]:
# Record counts per split, read from the tfrecord file names.
train_size = get_ds_size(train_files)
valid_size = get_ds_size(valid_files)
test_size = get_ds_size(test_files)

print(
    'the dataset consists of: {} training images, {} validation images, and {} test images'
    .format(train_size, valid_size, test_size)
)

In [None]:
# Whole batches per pass over each split (any partial last batch is not counted).
epoch_steps, valid_steps = (
    train_size // params['batch_size'],
    valid_size // params['batch_size'],
)

In [None]:
#calculate class weights

# count the records of each outcome straight from train_df instead of
# copying the numbers by hand from another cell's output
target_0 = int((train_df['target'] == 0).sum())
target_1 = int((train_df['target'] == 1).sum())
assert target_0 > 0 and target_1 > 0, 'both classes must be present in train_df'
total = target_0 + target_1

# weight each class by total / (2 * class count), so the rarer class
# gets the larger weight
class_weight_0 = (1 / target_0) * (total) / 2.0
class_weight_1 = (1 / target_1) * (total) / 2.0

class_weights = {0: class_weight_0, 1: class_weight_1}

# log-odds of the positive class, used as the starting bias of the output unit
initial_bias = np.log([target_1 / target_0])



In [None]:
# Build and compile the model inside the distribution strategy's scope;
# the output unit's bias starts at initial_bias.
with strategy.scope():
    model = create_model(bias_output=initial_bias)

In [None]:
# Train with per-class weights. Both datasets come from get_train_ds, which
# repeats without end, so the step counts below are what bound each epoch
# and each validation pass.
history = model.fit(
    train_ds,
    # batch size is not passed here: train_ds is already batched by get_train_ds
    epochs= params['epochs'], 
    steps_per_epoch=epoch_steps,
    validation_data=valid_ds,
    validation_steps=valid_steps,
    class_weight=class_weights
)

In [None]:
# Smoke-test inference on two validation batches. `steps` has to be a whole
# number of batches; valid_ds repeats without end, so it also bounds the call.
model.predict(valid_ds, steps=2)