In [1]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from scripts.model_functions import create_model_mult, create_model
import matplotlib.pylab as plt
import datetime
import time

Using TensorFlow backend.


In [4]:
# Create new variables to be used in Keras and the CNN

# number of items to use for training
BATCH_SIZE = 100 

# Number of identifying classes 
#   WE have two, Bloom and no bloom 1/0
NUM_CLASSES = 2 

# number of times to repeat process
EPOCHS = 30

In [5]:

# Load the data
df_train = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
df_test = pd.read_csv('../../data/cleaned/site2_bird.csv')

In [6]:
target = df_train['BGA-Phycocyanin RFU'].apply(lambda x : x/0.2334)
df_train = df_train.drop(columns=['Chlorophyll (ug/L)', 'Chlorophyll RFU'])
df_train['BGA (ug/L)'] = target
df_train.head(5)


target = df_test['BGA-Phycocyanin RFU'].apply(lambda x : x/0.2334)
df_test = df_test.drop(columns=['Chlorophyll (ug/L)', 'Chlorophyll RFU'])
df_test['BGA (ug/L)'] = target
df_test.head(5)

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L)
0,5/5/2017,0:00,15.37,2184,-100.0,8.41,10.7,92.2,9.16,0.1,0.428449
1,5/5/2017,0:15,15.45,2139,-101.0,8.43,9.92,93.3,9.25,0.1,0.428449
2,5/5/2017,0:30,15.49,2057,-102.3,8.45,8.9,94.8,9.4,0.2,0.856898
3,5/5/2017,0:45,15.67,1978,-102.6,8.45,8.62,96.0,9.49,0.2,0.856898
4,5/5/2017,1:00,15.34,2136,-100.2,8.41,9.88,92.7,9.22,0.1,0.428449


In [7]:
from datetime import datetime

timestamp = df_train['Date (mm.dd.yyyy)'] + ' '+ df_train['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df_train['Timestamp'] = timestamp

timestamp = df_test['Date (mm.dd.yyyy)'] + ' '+ df_test['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df_test['Timestamp'] = timestamp

In [8]:
# dont need data and time now that we have Timestamp. Lets remove them
df_train = df_train.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
df_test = df_test.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])


In [9]:
train_target = df_train['BGA (ug/L)'].apply(lambda x: 1 if x > 20 else 0)
df_train['Bloom'] = train_target

test_target = df_test['BGA (ug/L)'].apply(lambda x: 1 if x > 20 else 0)
df_test['Bloom'] = test_target

In [10]:
# lets try to normalize this now....
from sklearn.preprocessing import MinMaxScaler

dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)', 'Bloom']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(df_train[dataset_columns])
df_train = pd.DataFrame(ds_scaled,columns=dataset_columns)

ds_scaled = scaler.fit_transform(df_test[dataset_columns])
df_test = pd.DataFrame(ds_scaled,columns=dataset_columns)


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


# Functions to take a moving window of the data of 10 time stamps

In [11]:
'''
determines the window size for the daata set
@param dataset - The dataset to get windows for
@param window_size - the size of the window  
@param shift - the amout to shift the window
'''
def windows(dataset, window_size, shift):
    start = 0
    while start+window_size < dataset.shape[0]: 
        yield (int(start), int(start+window_size))
        # shift the window five blocks of time
        start += shift
        if start % 300 == 0:
            print('Window Segmentation {0:.2f}% done'.format(((start+window_size) / dataset.shape[0]) * 100 ))


'''

Segments the dataset based on the parameters that are passed in.
@param dataset - the dataset to segment into window
@param columns - the array of columns from the dataset to be looked at
@param window_size - the size of the window you would like to be looked at. Defualt is 10

'''
def segment_dataset(dataset, columns, target, window_size=10):    
    print('WINDOW SIZE',window_size)
    print('NUMBER OF COULUMNS',len(columns))
    segments = np.empty((0, window_size, len(columns)))
    labels = np.empty((0))
    count = 0
    for (start, end) in windows(dataset, window_size, 1):
        count+=1
        values = dataset[columns][start:end]
        if(values.shape[0] == window_size):
            segments = np.vstack([segments, np.stack([values])])
            # Takes the larger of the two variables if there are more than one. 
            # This makes it more likly to predict a bloom. Can be changed to iloc[0] to
            # be less likly to predict a bloom (more 0s in the label array)
            
            labels = np.append(labels, dataset[target][start:end].mode().iloc[-1])
        else:
            print("No more Windows available... Exiting")
            break
    return (segments, labels)

In [12]:
feature_columns = dataset_columns[:-1]
(x_train, y_train) = segment_dataset(df_train, feature_columns, 'Bloom', 9)
(x_test, y_test) = segment_dataset(df_test, feature_columns, 'Bloom', 9)

print('done')

WINDOW SIZE 9
NUMBER OF COULUMNS 7
Window Segmentation 1.63% done
Window Segmentation 3.21% done
Window Segmentation 4.80% done
Window Segmentation 6.38% done
Window Segmentation 7.96% done
Window Segmentation 9.55% done
Window Segmentation 11.13% done
Window Segmentation 12.71% done
Window Segmentation 14.30% done
Window Segmentation 15.88% done
Window Segmentation 17.46% done
Window Segmentation 19.05% done
Window Segmentation 20.63% done
Window Segmentation 22.21% done
Window Segmentation 23.80% done
Window Segmentation 25.38% done
Window Segmentation 26.96% done
Window Segmentation 28.55% done
Window Segmentation 30.13% done
Window Segmentation 31.71% done
Window Segmentation 33.30% done
Window Segmentation 34.88% done
Window Segmentation 36.46% done
Window Segmentation 38.05% done
Window Segmentation 39.63% done
Window Segmentation 41.21% done
Window Segmentation 42.80% done
Window Segmentation 44.38% done
Window Segmentation 45.97% done
Window Segmentation 47.55% done
Window Segm

In [13]:
print(x_train.shape)
print(x_test.shape)


(18938, 9, 7)
(17086, 9, 7)


In [14]:
print(y_train.shape)
print(y_test.shape)

(18938,)
(17086,)


# Shaping the data to be used in the model.

In [15]:
x_train = x_train.reshape(len(x_train),9,7,1)
x_test = x_test.reshape(len(x_test),9,7,1)

In [16]:
y_train = y_train.reshape(y_train.shape[0],1)
y_test = y_test.reshape(y_test.shape[0],1)

# Breaking apart training and test data

In [17]:
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

x_train shape: (18938, 9, 7, 1)
x_test shape: (17086, 9, 7, 1)
y_train shape: (18938, 1)
y_test shape: (17086, 1)


In [18]:
y_train_mod = ks.utils.to_categorical(y_train, NUM_CLASSES)
y_test_mod = ks.utils.to_categorical(y_test, NUM_CLASSES)
input_shape = (9,7,1)

In [19]:
# Gets the recall of the different metrics
def create_class_predictions(pred):
    retval = np.array([])
    for row in pred:
        max_value = (-1,-1)
        for index, value in enumerate(row):
            if value > max_value[1]:
                max_value = (index, value)
        retval = np.append(retval, max_value[0])
    return retval

# Come on, let's create the model already!

In [25]:
values = []


print("...And Here we go....")
st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Started at",st)
for i in range(2, 101, 2):
    model = create_model(i, 7, input_shape, NUM_CLASSES,0.0001)
    model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=0)
    # What is our score?
    score = model.evaluate(x_train, y_train_mod, verbose=0)
    predictions = model.predict(x_test)
    predict = create_class_predictions(predictions)
    recall = recall_score(y_test.reshape(-1,), predict)
    precision = precision_score(y_test.reshape(-1,), predict)
    value = (i, recall, precision)
    values.append(value)
    print('{0:.2f}% complete'.format(i))

st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Finished at",st)

#This created the values that are listed in the following cell. but took 3 hours to run

...And Here we go....
Started at 2019-02-09 11:28:39


  'precision', 'predicted', average, warn_for)


2.00% complete


KeyboardInterrupt: 

In [23]:
for value in values:
    print(value)

(2, 0.0, 0.0, array([[1844,    0],
       [  50,    0]], dtype=int64))
(4, 0.0, 0.0, array([[1844,    0],
       [  50,    0]], dtype=int64))
(6, 0.0, 0.0, array([[1844,    0],
       [  50,    0]], dtype=int64))
(8, 0.0, 0.0, array([[1844,    0],
       [  50,    0]], dtype=int64))
(10, 0.06, 1.0, array([[1844,    0],
       [  47,    3]], dtype=int64))
(12, 0.42, 0.6774193548387096, array([[1834,   10],
       [  29,   21]], dtype=int64))
(14, 0.18, 0.6428571428571429, array([[1839,    5],
       [  41,    9]], dtype=int64))
(16, 0.26, 0.65, array([[1837,    7],
       [  37,   13]], dtype=int64))
(18, 0.24, 0.9230769230769231, array([[1843,    1],
       [  38,   12]], dtype=int64))
(20, 0.3, 0.6818181818181818, array([[1837,    7],
       [  35,   15]], dtype=int64))
(22, 0.26, 0.6842105263157895, array([[1838,    6],
       [  37,   13]], dtype=int64))
(24, 0.6, 0.7317073170731707, array([[1833,   11],
       [  20,   30]], dtype=int64))
(26, 0.3, 0.7142857142857143, array([[1838,

values

[(2, 0.0), (4, 0.0), (6, 0.0), (8, 0.0), (10, 0.0), (12, 0.3333333333333333), (14, 0.16666666666666666), (16, 0.3333333333333333), (18, 0.3333333333333333), (20, 0.6666666666666666), (22, 0.5), (24, 0.5), (26, 0.6666666666666666), (28, 0.6666666666666666), (30, 0.3333333333333333), (32, 0.3333333333333333), (34, 0.0), (36, 0.3333333333333333), (38, 0.3333333333333333), (40, 0.3333333333333333), (42, 0.3333333333333333), (44, 1.0), (46, 0.3333333333333333), (48, 0.3333333333333333), (50, 0.3333333333333333), (52, 0.3333333333333333), (54, 0.3333333333333333), (56, 0.3333333333333333), (58, 0.3333333333333333), (60, 0.6666666666666666), (62, 0.6666666666666666), (64, 0.3333333333333333), (66, 0.3333333333333333), (68, 0.6666666666666666), (70, 0.3333333333333333), (72, 0.3333333333333333), (74, 0.5), (76, 0.3333333333333333), (78, 1.0), (80, 0.3333333333333333), (82, 0.3333333333333333), (84, 0.3333333333333333), (86, 0.3333333333333333), (88, 0.3333333333333333), (90, 0.6666666666666666), (92, 0.5), (94, 0.5), (96, 0.3333333333333333), (98, 0.3333333333333333), (100, 0.3333333333333333)]

Lets take the top performing ones and see if the n +/- 1 of the output layers will get better /similar performance

In [None]:
number_array = [19,20,21,25,26,27,27,28,29,43,44,45,59,60,61,62,63,67,68,69,77,78,79,89,90,91]
values = []

print("...And Here we go....")
st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Started at",st)
count = 0
for i in number_array:
    count +=1
    model = create_model(i, 7, input_shape, NUM_CLASSES,0.0001)
    model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=0)
    # What is our score?
    score = model.evaluate(x_train, y_train_mod, verbose=0)
    predictions = model.predict(x_test)
    predict = create_class_predictions(predictions)
    recall = recall_score(y_test.reshape(-1,), predict)
    precision = precision_score(y_test.reshape(-1,), predict)
    cm = confusion_matrix(y_test.reshape(-1,), predict)
    value = (i, recall, precision, cm)
    values.append(value)
    print('{0:.2f}% complete'.format((count / len(number_array)) * 100))

st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Finished at",st)

model = Sequential()
model.add(Conv2D(20, 7, input_shape=input_shape, activation='relu',padding='same'))
model.add(MaxPooling2D(pool_size=(4,4)))
model.add(Conv2D(32, (3,3), activation='relu',padding='same'))
model.add(MaxPooling2D(pool_size=(1,1)))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(15))
model.add(Dense(NUM_CLASSES, activation='softmax'))

model.compile(loss=ks.losses.categorical_crossentropy,
              optimizer=ks.optimizers.Adam(lr=0.0001),
              metrics=[precision, recall])

In [None]:
for entry in values:
    print(entry)

In [None]:
# This is to see if the increasing or decreasing of the output array for the second conv layer will help with predicion.

mult_array = [0.33,0.5,0.66,0.75,1,1.33,1.5,1.66,1.75,2,2.33,2.5,2.66,2.75,3,3.33,3.5,3.66,3.75,4,4.33,4.5,4.66,4.75]
values = []

print("...And Here we go....")
st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Started at",st)
count = 0
for i in mult_array:
    count +=1
    model = create_model_mult(i, 7, input_shape, NUM_CLASSES,0.0001)
    model.fit(x=x_train, y=y_train_mod, batch_size=BATCH_SIZE, epochs=EPOCHS,verbose=0)
    # What is our score?
    score = model.evaluate(x_train, y_train_mod, verbose=0)
    predictions = model.predict(x_test)
    predict = create_class_predictions(predictions)
    recall = recall_score(y_test.reshape(-1,), predict)
    precision = precision_score(y_test.reshape(-1,), predict)
    cm = confusion_matrix(y_test.reshape(-1,), predict)
    value = (i, recall, precision, cm)
    print('{0:.2f}% complete'.format((count / len(mult_array))* 100)

st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print("Finished at",st)

In [None]:
for entry in values:
    print(entry)

## Save the model for deployment

In [None]:
# ignoring dropout for deployment
K.set_learning_phase(0)
 
# Set a file path to save the model in.
model_name = "cnn_model"
model_version = "1"
tf_path = "./../../saved_models/{}/{}".format(model_name, model_version)
 
# Get the session from the Keras back-end to save the model in TF format.
with K.get_session() as sess:
    tf.saved_model.simple_save(sess, tf_path, inputs={'input': model.input}, outputs={t.name: t for t in model.outputs})
