## Imports

In [0]:
%tensorflow_version 2.x

#data
import numpy as np
from numpy import asarray
import pandas as pd
import statistics
import math
import dask.array as da

#model
import keras
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.applications.vgg16 import VGG16
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.losses import mean_squared_error

#images
import skimage
from PIL import Image, ImageEnhance
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, load_img

#visualization
import matplotlib.pyplot as plt

In [0]:
#mount Google Drive at /content/drive to access the drive files
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

## Images

In [0]:
def determine_letter(l):
    '''determines the correct letter for the filename
          l: the number that corresponds to a letter that will be returned;
             0-6 give "A"-"G", every other value gives "H"
    '''
    #lookup table replaces the chain of comparisons: {0: "A", ..., 6: "G"}
    letters = dict(enumerate("ABCDEFG"))

    #anything that is not 0-6 falls back to "H"
    return letters.get(l, "H")

In [0]:
def determine_word(p, d, l, n):
    '''determines whether the filename includes the word "Phase" or "Default0"
          p: passage of the mouse of the current sample ("1" or "7")
          d: diet of the mouse of the current sample ("STD" or "MD")
          l: letter of the mouse of the current sample's well
          n: number of the mouse of the current sample's well

          returns "Default0" for the combinations listed below, "Phase" otherwise
    '''
    #the decision depends only on the arguments; no global state is read
    row_is_a_or_e = l in ("A", "E")
    returned = "Phase"

    #passage 1, rows other than A and E
    if p == "1" and not row_is_a_or_e:
        if d == "MD":
            if n in (9, 10, 11, 12):
                returned = "Default0"
        else:
            returned = "Default0"

    #passage 7, rows other than A and E, MD diet
    if p == "7" and not row_is_a_or_e:
        if d == "MD" and n in (1, 2, 3, 4):
            returned = "Default0"

    #passage 7, STD diet
    if p == "7" and d == "STD":
        if row_is_a_or_e:
            returned = "Default0"
        elif n not in (5, 6, 7, 8):
            returned = "Default0"

    return returned

In [0]:
#collect the filename of every image in one flat list
#(2 passages x 2 diets x 8 letters x 12 numbers x 25 images)
X = []

for p in range(2): #passage
    passage = "1" if p == 0 else "7"

    for d in range(2): #diet
        diet = "STD" if d == 0 else "MD"

        for l in range(8): #letter
            letter = determine_letter(l)

            for n in range(1, 13): #number
                image_set = [] #reset for every well; nothing is stored in it here

                for i in range(25): #image number
                    im = ('''data not publicly available''')

                    #add the filename to the flat list
                    X.append(im)

## Diversity indices

In [0]:
#read the diversity spreadsheet
diversity_sheet = pd.read_excel('''data not publicly available''')

#keep only the diets, wells, and shannon diversity indices
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4", "Unnamed: 5",
                  "T7", "Unnamed: 8", "T1", "Unnamed: 11"]
diversity_sheet = diversity_sheet.drop(columns = unused_columns)

#skip the first row, then rename the four remaining columns
diversity_sheet = diversity_sheet[1:]
diversity_sheet.columns = ["diet", "well", "p7 diversity", "p1 diversity"]

In [0]:
#data frame without diversities (only the columns shared by both passages)
sheet = diversity_sheet.drop(["p1 diversity", "p7 diversity"], axis = 1)

#data frame for p1 diversities
diversity_sheet_p1 = sheet.assign(passage = "1",
                                  diversity = diversity_sheet["p1 diversity"])

#data frame for p7 diversities
diversity_sheet_p7 = sheet.assign(passage = "7",
                                  diversity = diversity_sheet["p7 diversity"])

#combine the data frames: all p1 rows first, then all p7 rows
#(pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
diversity_sheet = pd.concat([diversity_sheet_p1, diversity_sheet_p7],
                            ignore_index = True)

In [0]:
#every sample has 25 images, so each diversity index is repeated 25 times in
#a row to line up with the image filenames
IMAGES_PER_SAMPLE = 25

#np.repeat yields the same array as inserting 24 copies after every element,
#without rebuilding the array once per sample
Y = np.repeat(np.array(diversity_sheet["diversity"]), IMAGES_PER_SAMPLE)

In [0]:
#pair every image filename (X) with its diversity index (Y) in one data frame
data = pd.DataFrame(dict(X = X, Y = Y))

In [0]:
#keep only the rows whose shannon diversity index is not 0
has_diversity = data["Y"] != 0
data = data.loc[has_diversity]

## Data Prep

In [0]:
#turn the filtered data frame back into an input array and an output array
X, Y = (np.array(data[column].tolist()) for column in ("X", "Y"))

In [0]:
#sizes of the three subsets; every size is a multiple of 25 so that all
#25 images of a sample end up in the same subset
samples = len(X)/25 #each sample has 25 images
len_tr = (int(samples*0.6))*25
len_val = (int((samples-(len_tr/25))/2))*25
len_test = int((samples*25)-len_tr-len_val)

#consecutive slices of X: training first, then validation, then testing
val_end = len_tr + len_val
test_end = val_end + len_test

train = list(X[:len_tr])
valid = list(X[len_tr:val_end])
test = list(X[val_end:test_end])

partition = {'train': train, 'valid': valid, 'test': test}

#lookup table from each filename to its diversity index
values = dict(zip(X, Y))

In [0]:
class DataGenerator(Sequence):
    '''generates data for the model, one batch of standardized images at a time'''

    def __init__(self, list_IDs, values, batch_size=32, dim=(640, 540), n_channels=3, shuffle=True):
        '''initializes all the variables
              list_IDs: the list of all the IDs in the dataset (filenames)
              values: lookup of the diversity index for each ID (indexed by filename)
              batch_size: number of images per batch
              dim: (height, width) of the image arrays fed to the model
              n_channels: number of channels of the image arrays
              shuffle: whether the order of the IDs is reshuffled for every epoch
        '''
        #harmless for a plain Sequence; lets the Keras base class set itself up
        super().__init__()
        self.dim = dim
        self.batch_size = batch_size
        self.values = values
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        '''determines the number of batches per epoch

        only complete batches are counted, so up to batch_size - 1 IDs are
        left out of every epoch
        '''
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        '''generates one batch of data
              index: the batch's index
        '''
        #generate the batch indices
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]

        #get the IDs with the specified indices
        list_IDs_temp = [self.list_IDs[k] for k in indices]

        #generate the data from the specified IDs
        X, Y = self.__data_generation(list_IDs_temp)

        return X, Y

    def on_epoch_end(self):
        '''updates indices after each epoch'''
        self.indices = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, list_IDs_temp):
        '''generates the set of data containing batch_size samples
              list_IDs_temp: the list of IDs for the current batch

              returns the image array X and the diversity index array Y
        '''
        #initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        Y = np.empty((self.batch_size), dtype=float)

        #PIL expects (width, height) while dim is (height, width)
        height, width = self.dim

        #generate data
        for i, ID in enumerate(list_IDs_temp):

            #load the image as RGB (color_mode replaces the deprecated grayscale flag)
            img = image.load_img(ID, color_mode = "rgb")

            #resize to the dimensions the generator was configured with
            img = img.resize((width, height))

            #standardize
            pixels = asarray(img)
            pixels = pixels.astype('float32')
            mean, std = pixels.mean(), pixels.std()
            if std == 0:
                #a constant image would otherwise turn into NaNs
                std = 1.0
            X[i,] = (pixels - mean) / std

            #store diversity index (Y)
            Y[i] = self.values[ID]

        return X, Y

## Training The Model

In [0]:
#one generator per subset, all looking up their diversity indices in `values`
training_generator, validation_generator, testing_generator = (
    DataGenerator(partition[subset], values)
    for subset in ('train', 'valid', 'test')
)

In [0]:
#VGG16 with imagenet weights and without its top layers, for 640x540 RGB input
base_model = VGG16(weights = 'imagenet', include_top = False, input_shape = (640, 540, 3))

#freeze every layer of the base model so its weights are not trained
for layer in base_model.layers:
    layer.trainable = False

#stack the base model, a flatten layer and one linear output unit for regression
model = Sequential([
    base_model,
    Flatten(),
    Dense(1, activation = 'linear'),
])

#model summary
model.summary()

In [0]:
def c_index_metric(y_true, y_pred):
  '''concordance of the predictions, computed on tensors while training
        y_true: true values
        y_pred: predicted values

        returns concordant pairings / total pairings, or 0.0 when the
        concordant sum is 0
  '''
  #an extra trailing axis lets broadcasting compare the values with each other
  pred_expanded = tf.expand_dims(y_pred, -1)
  true_expanded = tf.expand_dims(y_true, -1)

  #predicted ordering: 1 where strictly less, 0.5 where equal
  pred_less = tf.cast(tf.less(pred_expanded, y_pred), tf.float32)
  pred_equal = tf.cast(tf.equal(pred_expanded, y_pred), tf.float32)
  pair_scores = pred_less + 0.5 * pred_equal

  #true ordering: only pairings whose true values differ in this direction count
  counted_pairs = tf.cast(tf.less(true_expanded, y_true), tf.float32)

  #concordant pairings and total pairings
  concordant = tf.reduce_sum(pair_scores * counted_pairs)
  total = tf.reduce_sum(counted_pairs)

  #concordant pairings/total pairings
  return tf.where(tf.equal(concordant, 0), 0.0, concordant / total)

In [0]:
#define the optimizer
#learning_rate replaces the deprecated lr argument; decay is left out because
#0.0 was the default and recent Keras optimizers reject the argument
sgd = SGD(learning_rate = 0.00001, momentum = 0.9, nesterov = True)

#compile the model
model.compile(loss = mean_squared_error, optimizer = sgd, metrics = [c_index_metric])

#train model on dataset
#Model.fit accepts Sequence objects directly; fit_generator is deprecated
#NOTE(review): workers is a tf.keras 2.x argument of fit - confirm it is
#still accepted by the installed version
model.fit(training_generator,
          validation_data = validation_generator,
          workers = 6, epochs = 20)

## Evaluating The Model

In [0]:
#determine the predictions' loss & concordance
#Model.evaluate accepts Sequence objects directly; evaluate_generator is deprecated
evaluation = model.evaluate(testing_generator, steps = len(testing_generator), verbose = 0)
loss = evaluation[0]
print("loss: " + str(loss) + "\n" + "concordance: " + str(evaluation[1]))

In [0]:
#get array of predicted values
#a dedicated generator without shuffling keeps prediction i aligned with
#filename i of the test partition (the default generator shuffles its IDs)
ordered_test_generator = DataGenerator(partition["test"], values, shuffle = False)
y_pred = model.predict(ordered_test_generator).ravel()

#get array of true values
#the generator only produces complete batches, so the filenames at the end
#of the partition that received no prediction are left out here as well
y_true = [values[name] for name in partition["test"][:len(y_pred)]]

In [0]:
def c_index(gt, pred, **kwargs):
    '''determines the concordance of the testing data.
          gt: ground truth
          pred: predictions
          throw_error: optional keyword; when present, a ValueError is raised
                       instead of returning 0 if no pairing can be compared

          returns concordant pairings / comparable pairings, where a pairing
          (i, j) is comparable when gt[i] < gt[j]; it scores 1 when
          pred[i] < pred[j] and 0.5 when the two predictions are equal
    '''
    assert len(gt) == len(pred), "ground truth and predictions must have same size"
    s = 0
    n = 0

    #every ordered pairing is visited; only those with gt[i] < gt[j] count
    for i, y1 in enumerate(gt):
        for j, y2 in enumerate(gt):
            if y1 < y2:
                s += (pred[i] < pred[j]) + 0.5 * (pred[i] == pred[j])
                n += 1

    if n == 0:
        if 'throw_error' in kwargs:
            if len(gt) == 0:
                raise ValueError('ground truth is empty')
            raise ValueError(f'all ground truth values are equal to {gt[0]}')
        return 0

    return s / n

#concordance index over the complete testing data
concordance = c_index(y_true, y_pred)
print(f"concordance index for testing data: {concordance!s}")