In [5]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
import datetime
import time
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, Flatten, Activation, MaxPooling2D

from sklearn.metrics import recall_score, confusion_matrix, precision_score
from scripts.model_functions import create_model
import matplotlib.pylab as plt
import seaborn as sns


In [6]:
# Create new variables to be used in Keras and the CNN

# number of items to use for training
BATCH_SIZE = 400 

# Number of identifying classes 
#   WE have two, Bloom and no bloom 1/0
NUM_CLASSES = 2 

# number of times to repeat process
EPOCHS = 100

In [7]:

# Load the data
df_train = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
df_test = pd.read_csv('../../data/cleaned/site2_bird.csv')


In [8]:
target = df_train['BGA-Phycocyanin RFU'].apply(lambda x : x/0.2334)
df_train = df_train.drop(columns=['Chlorophyll (ug/L)', 'Chlorophyll RFU'])
df_train['BGA (ug/L)'] = target


target = df_test['BGA-Phycocyanin RFU'].apply(lambda x : x/0.2334)
df_test = df_test.drop(columns=['Chlorophyll (ug/L)', 'Chlorophyll RFU'])
df_test['BGA (ug/L)'] = target


In [9]:
from datetime import datetime
timestamp = df_train['Date (mm.dd.yyyy)'] + ' '+ df_train['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df_train['Timestamp'] = timestamp

timestamp = df_test['Date (mm.dd.yyyy)'] + ' '+ df_test['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df_test['Timestamp'] = timestamp


In [10]:
# dont need data and time now that we have Timestamp. Lets remove them

df_train = df_train.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
df_test = df_test.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])



In [11]:
train_target = df_train['BGA (ug/L)'].apply(lambda x: 1 if x > 20 else 0)
df_train['Bloom'] = train_target

test_target = df_test['BGA (ug/L)'].apply(lambda x: 1 if x > 20 else 0)
df_test['Bloom'] = test_target

In [8]:
from sklearn.preprocessing import MinMaxScaler

dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)', 'Bloom']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(df_train[dataset_columns])
df_train = pd.DataFrame(ds_scaled,columns=dataset_columns)

ds_scaled = scaler.fit_transform(df_test[dataset_columns])
df_test = pd.DataFrame(ds_scaled,columns=dataset_columns)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


# Functions to take a moving window of the data of 10 time stamps

In [13]:
'''
determines the window size for the daata set
@param dataset - The dataset to get windows for
@param window_size - the size of the window  
@param shift - the amout to shift the window
'''
def windows(dataset, window_size, shift):
    start = 0
    while start+window_size < dataset.shape[0]: 
        yield (int(start), int(start+window_size))
        # shift the window five blocks of time
        start += shift
        if start % 300 == 0:
            print('Window Segmentation {0:.2f}% done'.format(((start+window_size) / dataset.shape[0]) * 100 ))


'''

Segments the dataset based on the parameters that are passed in.
@param dataset - the dataset to segment into window
@param columns - the array of columns from the dataset to be looked at
@param window_size - the size of the window you would like to be looked at. Defualt is 10

'''
def segment_dataset(dataset, columns, target, window_size=10):    
    print('WINDOW SIZE',window_size)
    print('NUMBER OF COULUMNS',len(columns))
    segments = np.empty((0, window_size, len(columns)))
    labels = np.empty((0))
    count = 0
    for (start, end) in windows(dataset, window_size, 1):
        count+=1
        values = dataset[columns][start:end]
        if(values.shape[0] == window_size):
            segments = np.vstack([segments, np.stack([values])])
            # Takes the larger of the two variables if there are more than one. 
            # This makes it more likly to predict a bloom. Can be changed to iloc[0] to
            # be less likly to predict a bloom (more 0s in the label array)
            
            labels = np.append(labels, dataset[target][start:end].mode().iloc[-1])
        else:
            print("No more Windows available... Exiting")
            break
    return (segments, labels)

In [14]:
feature_columns = dataset_columns[:-1]
(x_train, y_train) = segment_dataset(df_train, feature_columns, 'Bloom', 9)
(x_test, y_test) = segment_dataset(df_test, feature_columns, 'Bloom', 9)

print('done')

WINDOW SIZE 9
NUMBER OF COULUMNS 7
Window Segmentation 1.63% done
Window Segmentation 3.21% done
Window Segmentation 4.80% done
Window Segmentation 6.38% done
Window Segmentation 7.96% done
Window Segmentation 9.55% done
Window Segmentation 11.13% done
Window Segmentation 12.71% done
Window Segmentation 14.30% done
Window Segmentation 15.88% done
Window Segmentation 17.46% done
Window Segmentation 19.05% done
Window Segmentation 20.63% done
Window Segmentation 22.21% done
Window Segmentation 23.80% done
Window Segmentation 25.38% done
Window Segmentation 26.96% done
Window Segmentation 28.55% done
Window Segmentation 30.13% done
Window Segmentation 31.71% done
Window Segmentation 33.30% done
Window Segmentation 34.88% done
Window Segmentation 36.46% done
Window Segmentation 38.05% done
Window Segmentation 39.63% done
Window Segmentation 41.21% done
Window Segmentation 42.80% done
Window Segmentation 44.38% done
Window Segmentation 45.97% done
Window Segmentation 47.55% done
Window Segm

In [15]:
print(x_train.shape)
print(x_test.shape)

(18938, 9, 7)
(17086, 9, 7)


In [16]:
print(y_train.shape)
print(y_test.shape)

(18938,)
(17086,)


# Shaping the data to be used in the model.

In [13]:
x_train = x_train.reshape(len(x_train),9,7,1)
x_test = x_test.reshape(len(x_test),9,7,1)

In [14]:
y_train = y_train.reshape(y_train.shape[0],1)
y_test = y_test.reshape(y_test.shape[0],1)

# Breaking apart training and test data

In [15]:
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

x_train shape: (18938, 9, 7, 1)
x_test shape: (17086, 9, 7, 1)
y_train shape: (18938, 1)
y_test shape: (17086, 1)


In [16]:
y_train_mod = ks.utils.to_categorical(y_train, NUM_CLASSES)
y_test_mod = ks.utils.to_categorical(y_test, NUM_CLASSES)
input_shape = (9,7,1)


In [17]:
# Gets the precision of the different metrics
def create_class_predictions(pred):
    retval = np.array([])
    for row in pred:
        max_value = (-1,-1)
        for index, value in enumerate(row):
            if value > max_value[1]:
                max_value = (index, value)
        retval = np.append(retval, max_value[0])
    return retval


def create_layers(num_layers):
    layers = [Flatten(), Dropout(0.2), Dense(NUM_CLASSES, activation='softmax', input_dim=2)]
    for i in range(0, num_layers):
        layers.insert(0, Conv2D(44, 7, input_shape=input_shape, activation='relu', padding='same'))
    return layers

# Come on, let's create the model already!

In [19]:
values = []

print("...And Here we go....")
st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Started at",st)
max_layers = 10
for i in range(2, max_layers+1):
    layers = create_layers(i)
    model = create_model(44, 7, input_shape, NUM_CLASSES, 0.0001, layers=layers)
    model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
    # What is our score?
    score = model.evaluate(x_train, y_train_mod, verbose=0)
    predictions = model.predict(x_test)
    predict = create_class_predictions(predictions)
    recall = recall_score(y_test.reshape(-1,), predict)
    precision = precision_score(y_test.reshape(-1,), predict)
    cm = confusion_matrix(y_test.reshape(-1,), predict)
    value = (i, recall, precision, cm)
    values.append(value)
    print("Layers:{}, Recall:{}, Precision {}\nCN {}".format(i, recall, precision, cm))
    print("Current Time: ", st)

st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Finished at",st)
print(values)

...And Here we go....
Started at 2019-02-09 11:19:23


KeyboardInterrupt: 

In [32]:
# Lets try iterating over neurons and and window size
values = []


print("...And Here we go....")
st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Started at",st)
min_neurons = 12
max_neurons = 44
for num_neurons in range(min_neurons, max_neurons+1):
    for window_size in range(2, 10):
        window = (window_size, window_size)
        layers = [Conv2D(num_neurons, window, input_shape=input_shape, activation='relu', padding='same'),
                  Conv2D(num_neurons * 2, window_size, activation='relu', padding='same'),
                  Dense(num_neurons), Flatten(), Dropout(0.2), Dense(NUM_CLASSES, activation='softmax')]
        model = create_model(num_neurons, window, input_shape, NUM_CLASSES, 0.1, layers=layers)
        model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
        # What is our score?
        score = model.evaluate(x_train, y_train_mod, verbose=0)
        predictions = model.predict(x_test)
        predict = create_class_predictions(predictions)
        recall = recall_score(y_test.reshape(-1,), predict)
        precision = precision_score(y_test.reshape(-1,), predict)
        cm = confusion_matrix(y_test.reshape(-1,), predict)
        value = (num_neurons, window_size, recall, cm)
        values.append(value)
        print('Number of Neurons: {}\nWindow Size: {}\nRecall:{}\nPrecision:{}\nCM:{}'.format(num_neurons, window_size, recall, precision, cm))
    print(values[-10])

st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Finished at",st)
print(values)
for value in values:
    if value[2] > 0:
        print(value)

...And Here we go....
Started at 2019-02-05 19:37:47


  'precision', 'predicted', average, warn_for)


Number of Neurons: 12
Window Size: 2
Recall:0.0
Precision:0.0
CM:[[1888    0]
 [   6    0]]


  'precision', 'predicted', average, warn_for)


Number of Neurons: 12
Window Size: 3
Recall:0.0
Precision:0.0
CM:[[1888    0]
 [   6    0]]
Number of Neurons: 12
Window Size: 4
Recall:1.0
Precision:0.0031678986272439284
CM:[[   0 1888]
 [   0    6]]


  'precision', 'predicted', average, warn_for)


Number of Neurons: 12
Window Size: 5
Recall:0.0
Precision:0.0
CM:[[1888    0]
 [   6    0]]


  'precision', 'predicted', average, warn_for)


Number of Neurons: 12
Window Size: 6
Recall:0.0
Precision:0.0
CM:[[1888    0]
 [   6    0]]


  'precision', 'predicted', average, warn_for)


Number of Neurons: 12
Window Size: 7
Recall:0.0
Precision:0.0
CM:[[1888    0]
 [   6    0]]


KeyboardInterrupt: 

In [33]:
# Lets try iterating over multiple layers and types of layers


layers = []
layer_to_keep = (None,0,0,None)
choices = [
            MaxPooling2D(pool_size=(3,3)), 
            Conv2D(44, 4, activation='relu', padding='same'),
            Conv2D(26, 7, activation='relu', padding='same'),
            Conv2D(44, 7, activation='relu', padding='same'),
            Conv2D(26, 4, activation='relu', padding='same'),
           ]

def create_model_with_layer(model, layers=[]):
    if layers:
        for layer in layers:
            model.add(layer)
    return model

print("...And Here we go....")
st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Started at",st)

for i in range(8):
    model = Sequential()
    model.add(Conv2D(44, 7, input_shape=input_shape, activation='relu', padding='same'))
    model = create_model_with_layer(model, layers)
    for layer in choices:
        model.add(layer)
        model.add(Flatten())
        model.add(Dropout(0.2)) 
        model.add(Dense(44))
        model.add(Dense(NUM_CLASSES, activation='softmax'))
        model.compile(loss=ks.losses.categorical_crossentropy,
                optimizer=ks.optimizers.Adam(lr=0.0001),
                metrics=['accuracy'])
        model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
        # What is our score?
        score = model.evaluate(x_train, y_train_mod, verbose=1)
        predictions = model.predict(x_test)
        predict = create_class_predictions(predictions)
        recall = recall_score(y_test.reshape(-1,), predict)
        precision = precision_score(y_test.reshape(-1,), predict)
        cm = confusion_matrix(y_test.reshape(-1,), predict)

        if recall > layer_to_keep[1] and precision > layer_to_keep[2]:
            layer_to_keep = (layer, recall, precision,cm)
        model = Sequential()
        model.add(Conv2D(44, 7, input_shape=input_shape, activation='relu', padding='same'))
        model = create_model_with_layer(model, layers)
        
        print("Layer {} done and produced a recall score of {}".format(layer, recall))
    layers.append(layer_to_keep[0])
    layer_to_keep = (None,0,0,None)
    print("Iteration {} done!".format(i))
st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Finished at",st)

...And Here we go....
Started at 2019-02-05 19:41:16


  'precision', 'predicted', average, warn_for)


Layer <keras.layers.pooling.MaxPooling2D object at 0x000001D1B490EB00> done and produced a recall score of 0.0


KeyboardInterrupt: 

In [34]:
for values in layers:
    print(values)

In [35]:
# From the above cell it was found to be the following layers to be the best
# Conv2D 44,4
# Conv2D 44, 7
# Conv2D 44, 4
# Conv2D 44 7
# Flatten()
# Dropout(0.2)
# Dense(44)
# Dense(2)

# let's train a model to see if we get similar results with that
model = Sequential()
model.add(Conv2D(44, 7, input_shape=input_shape, activation='relu', padding='same'))
model.add(Conv2D(44, 4, activation='relu', padding='same'))
model.add(Conv2D(44, 4, activation='relu', padding='same'))
model.add(Conv2D(44, 7, activation='relu', padding='same'))
model.add(Flatten())
model.add(Dropout(0.2)) 
model.add(Dense(44))
model.add(Dense(NUM_CLASSES, activation='softmax'))
model.compile(loss=ks.losses.categorical_crossentropy,
        optimizer=ks.optimizers.Adam(lr=0.0001),
        metrics=['accuracy'])
model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
score = model.evaluate(x_train, y_train_mod, verbose=1)
predictions = model.predict(x_test)
predict = create_class_predictions(predictions)
recall = recall_score(y_test.reshape(-1,), predict)
precision = precision_score(y_test.reshape(-1,),predict)
cm = confusion_matrix(y_test.reshape(-1,),predict)
print("RECALL:",recall)
print("PRECISION:", precision)
print("CONFUSTION MATRIX", cm)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Save the model for deployment

In [None]:
save_model(model, cnn_model, 1)
# ignoring dropout for deployment
K.set_learning_phase(0)
 
# Set a file path to save the model in.
model_name = "cnn_model"
model_version = "1"
tf_path = "./../../saved_models/{}/{}".format(model_name, model_version)
 
# Get the session from the Keras back-end to save the model in TF format.
with K.get_session() as sess:
    tf.saved_model.simple_save(sess, tf_path, inputs={'input': model.input}, outputs={t.name: t for t in model.outputs})
