In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# tensorflow imports
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt, cv2
import tensorflow_io as tfio
from tqdm import tqdm

# get dataset path
#from kaggle_datasets import KaggleDatasets
#GCS_PATH = KaggleDatasets().get_gcs_path()

# regex
import re

# EXTRA RUN INSTRUCTIONS
# change Settings -> Accelerator -> TPU v3-8 to use TPU (avoid burning through TPU hours when not in session)

In [None]:
"""
REFERENCE
# This notebook combines the Kaggle and Google Codelabs TPU tutorials (for TPU data loading)
# with the TensorFlow tutorial for a basic CNN.
# The code has been adapted to the competition dataset and adds pre-processing to analyze the data.

TPU Tutorials:
https://www.kaggle.com/docs/tpu
https://www.kaggle.com/code/mgornergoogle/five-flowers-with-keras-and-xception-on-tpu/notebook
https://codelabs.developers.google.com/codelabs/keras-flowers-tpu/#4

Along with, for CNN:
https://www.tensorflow.org/tutorials/images/cnn
"""

In [None]:
# DATA VISUALIZATION AND EXPLORATION

# explore data

In [None]:
# ENABLE TPU

# Detect the TPU attached to this session, connect to it and initialise the TPU system.
# This raises if no TPU accelerator is attached (Settings -> Accelerator).
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# Instantiate the distribution strategy used by `tpu_strategy.scope()` further down.
# `tf.distribute.TPUStrategy` is the stable name; the `experimental` alias is deprecated.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

In [23]:
# SET HYPERPARAMETERS

# Ideal batch size is 128 per TPU core; a TPU v3-8 has 8 cores (128 * 8 = 1024).
# BATCH_SIZE = 16 * tpu_strategy.num_replicas_in_sync
BATCH_SIZE = 16

# Learning rate for the Adam optimizer. 1e-3 is Adam's default and a sane starting point;
# the previous value of 1 is ~1000x larger and makes Adam diverge.
# Scale it up together with the batch size.
LEARNING_RATE = 1e-3

# number of epochs
NUMBER_OF_EPOCHS = 1

# activation function used by the hidden layers
ACTIVATION_FUNCTION = "relu"

# send multiple batches to the TPU at once
STEPS_PER_EXECUTION = 32

# fraction of the data used for training
TRAINING_DATA_SPLIT = .7

# image size
# ideal image size for TPUs is 512 x 512 or 256 x 256 for imagenet
IMAGE_WIDTH = 512
IMAGE_HEIGHT = 512
IMAGE_CHANNELS = 3

# let tf.data pick parallelism / prefetch buffer sizes automatically
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# DATA PRE-PROCESSING - LABEL DATA

# get training data csv into a pandas DataFrame
train_csv = pd.read_csv("../input/prostate-cancer-grade-assessment/train.csv")

# replace "negative" with 0+0 since they are equivalent
train_csv['gleason_score'] = train_csv['gleason_score'].replace("negative", "0+0")

# get categories/labels, should have 10, determine categories from unique labels of gleason_scores
CATEGORIES = train_csv.gleason_score.unique()
CATEGORIES_LENGTH = len(CATEGORIES)

# lower-case aliases are kept because the model cell reads `categories_length`
categories = CATEGORIES
categories_length = CATEGORIES_LENGTH

# Gleason score -> integer class id. 'negative' is listed for completeness even though it
# has already been rewritten to '0+0' above.
GLEASON_SCORE_TO_LABEL = {
    '0+0': 0,
    'negative': 0,
    '3+3': 1,
    '3+4': 2,
    '4+3': 3,
    '4+4': 4,
    '3+5': 5,
    '5+3': 6,
    '4+5': 7,
    '5+4': 8,
    '5+5': 9,
}

# relabel gleason score (vectorised; one lookup per row instead of a Python loop over iloc)
imagePath = []
encoded_scores = train_csv['gleason_score'].map(GLEASON_SCORE_TO_LABEL)

# Fail loudly on a score the mapping does not know about, instead of silently producing a
# label list that is shorter than the DataFrame.
unknown_scores = train_csv.loc[encoded_scores.isna(), 'gleason_score'].unique()
if len(unknown_scores) > 0:
    raise ValueError("unmapped gleason_score value(s): " + repr(list(unknown_scores)))

label = encoded_scores.astype(int).tolist()
train_csv['label'] = label

# remove the suspicious row (positional index in the CSV as read above)
SUSPICIOUS_ROW_POSITION = 7273
train_csv = train_csv.drop([train_csv.index[SUSPICIOUS_ROW_POSITION]])

# drop lab (data provider), ISUP grade and raw gleason score columns; only image_id and
# label are needed from here on
train_csv = train_csv.drop(columns=['data_provider','isup_grade', 'gleason_score'])

# this should be our list of images and labels
data_image_labels_array = pd.DataFrame(train_csv)

print(data_image_labels_array)

In [24]:
### DATA PRE-PROCESSING - TRAINING SPLIT AND SET-UP FOR MODEL INTAKE

# -HELPER FUNCTIONS
# convert tiff image, enter filename, return array of image as rgb and label
def decode_tiff_full(filename):
    """Load one TIFF image as RGB and look up its integer class label.

    Args:
        filename: path to a ``<image_id>.tiff`` file as a Python ``str``. The image id is
            extracted with ``re``, so a tensor is not accepted here.

    Returns:
        ``(image_as_rgb, label)``: the decoded image converted from RGBA to RGB, and the
        value of the ``label`` column of ``train_csv`` for this image id.

    Raises:
        ValueError: if ``filename`` does not end in ``<image_id>.tiff`` or the image id has
            no row in ``train_csv``.
    """
    # read the raw file contents
    bits = tf.io.read_file(filename)
    # decode into tiff, array shape is [height, width, 4], 4 is RGBA
    image_as_rgba = tfio.experimental.image.decode_tiff(bits)
    # convert rgba to rgb
    image_as_rgb = tfio.experimental.color.rgba_to_rgb(image_as_rgba)

    # get image id from filename (raw string, escaped dot, anchored at the end of the path)
    match = re.search(r"(\w+)\.tiff$", filename)
    if match is None:
        raise ValueError("not a '<image_id>.tiff' path: " + repr(filename))
    image_id_from_filename = match[1]

    # find row in train_csv data where image id matches filename
    entry = train_csv.loc[train_csv['image_id'] == image_id_from_filename]
    if entry.empty:
        raise ValueError("image id not found in train_csv: " + image_id_from_filename)

    # 'gleason_score' is dropped from train_csv during label pre-processing; the encoded
    # class id lives in the 'label' column.
    label = entry['label'].to_numpy()[0]
    return image_as_rgb, label

# helper functions for TFRecord
def _bytes_feature(value):
  """Wrap a string / bytes value in a ``tf.train.Feature`` holding a one-element BytesList."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack a string held in an EagerTensor, so pull out the raw value first.
  if isinstance(value, eager_tensor_type):
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a float / double in a ``tf.train.Feature`` holding a one-element FloatList."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a bool / enum / int / uint in a ``tf.train.Feature`` holding a one-element Int64List."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

def serialize_example(feature0, feature1, feature2):
  """Serialize one record to a ``tf.train.Example`` byte string.

  Args:
    feature0: value stored under the 'image' key via ``_bytes_feature``.
    feature1: value stored under the 'image_id' key via ``_bytes_feature``.
    feature2: value stored under the 'label' key via ``_int64_feature``.

  Returns:
    The serialized ``tf.train.Example`` as returned by ``SerializeToString()``.
  """
  record_features = tf.train.Features(feature={
      'image': _bytes_feature(feature0),
      'image_id': _bytes_feature(feature1),
      'label': _int64_feature(feature2),
  })
  return tf.train.Example(features=record_features).SerializeToString()

# process images into TFRecords
# (the former first line, `lenames = image_filenames[:split]`, referenced two undefined
#  names and its result was never used, so it has been removed)

# directory holding the pre-resized 512x512 PNG copies of the training images
RESIZED_IMAGE_DIR = '../input/panda-resized-train-data-512x512/train_images/train_images/'

image_id = train_csv.image_id.values
# labels in the same row order as image_id, so each lookup is positional instead of a
# full-DataFrame scan per image (equivalent as long as image ids are unique)
labels_for_records = train_csv.label.values

# number of images per TFRecord file
SIZE = 885
total_image = len(train_csv)
# ceil(total_image / SIZE): enough files to hold every image, the last one may be partial
number_of_shards = -(-total_image // SIZE)

for i in range(number_of_shards):
    print('writing TFRecord')
    first_index = SIZE * i
    # images in this file: SIZE for every file except possibly the last one
    OneFile = min(SIZE, total_image - first_index)
    with tf.io.TFRecordWriter('TFRecord'+str(i)+'_'+str(OneFile)+'.tfrec') as writer:
        for k in tqdm(range(OneFile)):
            current_id = image_id[first_index + k]
            image_path = RESIZED_IMAGE_DIR + str(current_id) + '.png'
            img = cv2.imread(image_path)
            # cv2.imread signals a missing / unreadable file by returning None
            if img is None:
                raise FileNotFoundError('could not read image: ' + image_path)
            # NOTE(review): imread already returns BGR and imencode expects BGR, so this
            # swap stores channel-swapped pixels unless the PNGs themselves were written
            # swapped -- confirm against a decoded sample. Kept as in the original.
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
            img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 100))[1].tobytes()
            TFRd = serialize_example(
                img,
                str.encode(current_id),
                int(labels_for_records[first_index + k])
            )
            writer.write(TFRd)

# NOTE(review): everything from here to the end of this cell is unfinished scratch code and
# cannot run on a fresh kernel:
#   * `dataset` is the int 1, so the first `.shuffle(...)` call raises AttributeError;
#   * GCS_PATH is only assigned in a commented-out line of the imports cell;
#   * read_TFRecord, force_images_sizes and decode_tiff are not defined in any executable
#     cell (read_TFRecord appears only inside the triple-quoted scratch cell below, and the
#     helper defined above is named decode_tiff_full).
# TODO: load the TFRecord files written above into `dataset` and derive the train/test
# inputs from it before the model cell runs.
dataset = 1 # placeholder -- nothing is loaded yet
dataset = dataset.shuffle(1000) # shuffle the dataset with a buffer of 1000
dataset = dataset.cache() # cache the dataset in RAM or on disk
dataset = dataset.repeat() # repeat the dataset indefinitely
dataset = dataset.batch(128) # batch data elements together in batches of 128
dataset = dataset.prefetch(AUTOTUNE) # prefetch next batch(es) while training

# dataset of the original TIFF file paths (needs GCS_PATH, see note above)
filenames_dataset = tf.data.Dataset.list_files(GCS_PATH + "/train_images/*.tiff")

# configure streaming options
# NOTE(review): this repeats the shuffle/cache/repeat/batch chain above on the same
# `dataset`, this time batching with BATCH_SIZE instead of 128.
dataset = dataset.shuffle(1000)
dataset = dataset.cache()
dataset = dataset.repeat()
dataset = dataset.batch(BATCH_SIZE)
# rebinds the AUTOTUNE constant from the hyperparameter cell to the experimental alias
AUTOTUNE = tf.data.experimental.AUTOTUNE
# options object with deterministic ordering switched off; applied via with_options below
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False
dataset = dataset.prefetch(AUTOTUNE)

# list of TIFF paths matching the glob (needs GCS_PATH, see note above)
filenames = tf.io.gfile.glob(GCS_PATH + "/train_images/*.tiff")
#dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
dataset = dataset.with_options(ignore_order)
dataset = dataset.map(read_TFRecord, num_parallel_calls=AUTOTUNE)
dataset = force_images_sizes(dataset, (IMAGE_WIDTH, IMAGE_HEIGHT))

# debug output: first matched path and its decoded form
print(filenames[0])
print(decode_tiff(filenames[0]))

# images should be converted to an ndarray with shape (# of images, 512 x 512, 3)
# labels should be converted to an ndarray with shape (# of images, 1)
# NOTE(review): the four names below are scalar placeholders, not data; the model cell
# passes them straight to model.fit.
train_images = 1;
train_labels = 1;

test_images = 1;
test_labels = 1;

In [None]:
"""

# configure streaming options
dataset = dataset.shuffle(1000)
dataset = dataset.cache()
dataset = dataset.repeat()
dataset = dataset.batch(BATCH_SIZE)
AUTOTUNE = tf.data.experimental.AUTOTUNE
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False
dataset = dataset.prefetch(AUTOTUNE)

filenames = tf.io.gfile.glob(GCS_PATH + "/train_images/*.tiff")
#dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
dataset = dataset.with_options(ignore_order)
dataset = dataset.map(read_TFRecord, num_parallel_calls=AUTOTUNE)
dataset = force_images_sizes(dataset, (IMAGE_WIDTH, IMAGE_HEIGHT))

print(filenames[0])
print(decode_tiff(filenames[0]))

####



# convert to numpy for iteration
# train_csv_array = train_csv.to_numpy()
# print(train_csv_array)

# build TFRecord reader
def read_TFRecord(example):
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "class": tf.io.FixedLenFeature([], tf.string)
    }
    example = tf.io.parse_single_example(example, features)
    image = tfio.experimental.image.decode_tiff(example['image'], index=0, name=None)
    class_label = tf.cast(example['class'], tf.int32)
    return image
    
# set options
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False

# build training dataset
training_dataset = tf.data.TFRecordDataset(training_filenames, num_parallel_reads=AUTO)
training_dataset = dataset.with_options(ignore_order)
training_dataset = dataset.map(read_TFRecord, num_parallel_calls=AUTO)
training_dataset = force_image_sizes(dataset, (IMAGE_WIDTH, IMAGE_HEIGHT))

training_dataset = load_dataset(training_filenames)
print(training_dataset) 

# training_dataset = tf.data.Dataset


def get_training_dataset():
    dataset = load_dataset(training_filenames)
    dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

def input_

train_labels = np.empty((IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS), int)
test_labels = np.empty_like((IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS))


print(train_labels)

# for row in train_csv_array

# train_images = 
# train_labels =

# test_images =
# test_labels =

# image data loading and construct arrays
# print(image_dataset)


# images should be converted to an ndarray with shape (# of images, 512 x 512, 3)
# labels should be converted to an ndarray with shape (# of images, 1)
# train_images, train_labels and test_images, test_labels should be constructed

# normalize pixel values between 0 and 1
# train_images =
# test_images =

###########################

# image_dataset = tf.data.TFRecordDataset(image_filenames)
"""

In [None]:
# DATA VISUALIZATION AND EXPLORATION

# explore data

In [None]:
# SET AND RUN MODEL

# run model design and model compile within TPU strategy scope, to prepare for TPU computation
with tpu_strategy.scope():
    # MODEL DESIGN (PART 1)
    model = tf.keras.models.Sequential()
    # Convolutional layers
    # 64 filters, (3,3) feature kernel, input image 512x512 w/ 3 channels
    # Keras image tensors are (height, width, channels)
    model.add(tf.keras.layers.Conv2D(
        64, (3, 3),
        activation=ACTIVATION_FUNCTION,
        input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS),
    ))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(128, (3, 3), activation=ACTIVATION_FUNCTION))
    model.add(tf.keras.layers.MaxPooling2D((2, 2)))
    model.add(tf.keras.layers.Conv2D(128, (3, 3), activation=ACTIVATION_FUNCTION))
    # Dense Layers
    model.add(tf.keras.layers.Flatten())
    # use model.summary() to inspect the output shape of the last Conv2D layer
    model.add(tf.keras.layers.Dense(64, activation=ACTIVATION_FUNCTION))
    # final output should be 10, since we have 10 classes (these are the gleason_scores);
    # no activation here because the loss below is configured with from_logits=True
    model.add(tf.keras.layers.Dense(categories_length))

    # COMPILE MODEL (PART 2)
    # Adam optimizer with the configured learning rate. Adam's default is 1e-3; values
    # close to 1 will almost certainly make training diverge.
    model_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    # compile model (pass the optimizer object so LEARNING_RATE is actually applied;
    # the string 'adam' would silently use the default learning rate)
    model.compile(
        optimizer=model_optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
        steps_per_execution=STEPS_PER_EXECUTION,
    )

# build history
# TODO: train_images / train_labels / test_images / test_labels are still scalar
# placeholders in the pre-processing cell; fit() needs real arrays or datasets.
history = model.fit(train_images, train_labels, epochs=NUMBER_OF_EPOCHS, validation_data=(test_images, test_labels))

In [None]:
# MODEL EVALUATION

# training vs. validation accuracy per epoch, as recorded by model.fit
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# full [0, 1] range: with 10 classes the accuracy can sit well below 0.5
plt.ylim([0, 1])
# matplotlib location strings use a space ('lower right'), not an underscore
plt.legend(loc='lower right')

# loss and accuracy on the held-out data; evaluate() returns [loss, accuracy] because the
# model was compiled with metrics=['accuracy']
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print(test_acc)