This notebook runs the image generator version of the CNN (based on E. Culhane cnn codes) and uses both the microsoft azure VM for data storage/access, and for running the code. This CNN-v1 is for a basic CNN, and without use of tensorboard image callbacks.
The CNN is trained using labeled NAAMES data.
A. Chase, Jan 2023

In [2]:
# import libraries


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import imageio
import getpass
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
import tensorflow.keras as keras
import tensorflow as tf
from keras.callbacks import Callback
import datetime
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import sys
sys.path.append('../')
import imp
import cv2

import data_utils as du
imp.reload(du)
import ml_models
imp.reload(ml_models)

#pip install -r ../requirements.txt

# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [3]:
# read in the csv file of all image lables and filenames

csv_path = '/home/azureuser/data/image-file-directory.csv'

inventory_df = pd.read_csv(csv_path)
inventory_df

Unnamed: 0,file_name,class_raw,high_group,missing_high_group,id,lat,long,Area,Biovolume,ConvexArea,...,ScatInt,FluoInt,ScatPeak,FluoPeak,NumberOfROIinTrigger,missing_meta_data,ESDA_exclude,excluded_1,set,binary_label
0,IFCB107D20151104T112022P00433_Unicellular.png,Unicellular,Other,False,IFCB107D20151104T112022P00433,,,,,,...,,,,,,True,False,1,train,plankton
1,IFCB107D20151104T114135P00194_Unicellular.png,Unicellular,Other,False,IFCB107D20151104T114135P00194,,,,,,...,,,,,,True,False,1,train,plankton
2,IFCB107D20151104T114135P00246_Unicellular.png,Unicellular,Other,False,IFCB107D20151104T114135P00246,,,,,,...,,,,,,True,False,1,train,plankton
3,IFCB107D20151104T124515P00027_Unicellular.png,Unicellular,Other,False,IFCB107D20151104T124515P00027,,,,,,...,,,,,,True,False,1,train,plankton
4,IFCB107D20151105T174918P00074_Dinophyceae.png,Dinophyceae,Dinoflagellate,False,IFCB107D20151105T174918P00074,,,,,,...,,,,,,True,False,1,train,plankton
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998895,IFCB107D20180412T010113P00020_Ceratium.png,Ceratium,Dinoflagellate,False,IFCB107D20180412T010113P00020,40.348,-68.296,1507.093426,20001.839825,2501.816609,...,1.09682,0.36801,3.33541,1.37789,1.0,False,False,0,train,plankton
1998896,IFCB107D20180412T012434P00137_Ditylum.png,Ditylum,Diatom,False,IFCB107D20180412T012434P00137,40.324,-68.387,4811.591696,91203.058213,6707.612457,...,3.49742,1.77298,3.35693,3.31523,1.0,False,False,0,train,plankton
1998897,IFCB107D20180412T012434P00348_Ditylum.png,Ditylum,Diatom,False,IFCB107D20180412T012434P00348,40.324,-68.387,2446.626298,30443.529921,3317.560554,...,2.46922,0.90272,3.35900,2.21282,1.0,False,False,0,train,plankton
1998898,IFCB107D20180412T021117P00330_Ceratium.png,Ceratium,Dinoflagellate,False,IFCB107D20180412T021117P00330,40.273,-68.571,1500.951557,19808.704267,2787.802768,...,0.61201,0.04541,2.85538,0.37142,1.0,False,False,0,train,plankton


#### Remove rows in the dataframe without metadata (132,781 out of ~2 million)

In [4]:
inventory_df.missing_meta_data == True
inventory_df[inventory_df.missing_meta_data == True].shape

(132781, 28)

In [5]:
df_metadata = inventory_df[inventory_df.missing_meta_data == False]
df_metadata.shape

(1866119, 28)

In [10]:
# optional: subsample the full dataframe for workflow testing
#df_subsample = inventory_df.iloc[:10000,]
#df_subsample = df_metadata.iloc[:1000,]
df_subsample = df_metadata

In [11]:
df_subsample.shape

(1866119, 28)

#### Create png_path column and 3-column dataframe

In [12]:
# add a column of the png path to the df dataframe
df_subsample['png_path'] = df_subsample['id'].apply(lambda row : du.buildPNGsName(row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subsample['png_path'] = df_subsample['id'].apply(lambda row : du.buildPNGsName(row))


In [13]:
# create a dataframe with just the columns of png_path and high_group
df_files_labels = df_subsample[['png_path', 'high_group']]

In [14]:
# add a column called full_path
df_files_labels['full_path'] = df_files_labels['png_path'].apply(lambda row : os.path.join('/home/azureuser/data/NAAMES_ml/', row))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_files_labels['full_path'] = df_files_labels['png_path'].apply(lambda row : os.path.join('/home/azureuser/data/NAAMES_ml/', row))


In [15]:
df_files_labels

Unnamed: 0,png_path,high_group,full_path
3348,D20151106T121918_IFCB107/IFCB107D20151106T1219...,Other,/home/azureuser/data/NAAMES_ml/D20151106T12191...
3349,D20151106T121918_IFCB107/IFCB107D20151106T1219...,Other,/home/azureuser/data/NAAMES_ml/D20151106T12191...
3350,D20151106T121918_IFCB107/IFCB107D20151106T1219...,Other,/home/azureuser/data/NAAMES_ml/D20151106T12191...
3351,D20151106T121918_IFCB107/IFCB107D20151106T1219...,Other,/home/azureuser/data/NAAMES_ml/D20151106T12191...
3352,D20151106T121918_IFCB107/IFCB107D20151106T1219...,Other,/home/azureuser/data/NAAMES_ml/D20151106T12191...
...,...,...,...
1998895,D20180412T010113_IFCB107/IFCB107D20180412T0101...,Dinoflagellate,/home/azureuser/data/NAAMES_ml/D20180412T01011...
1998896,D20180412T012434_IFCB107/IFCB107D20180412T0124...,Diatom,/home/azureuser/data/NAAMES_ml/D20180412T01243...
1998897,D20180412T012434_IFCB107/IFCB107D20180412T0124...,Diatom,/home/azureuser/data/NAAMES_ml/D20180412T01243...
1998898,D20180412T021117_IFCB107/IFCB107D20180412T0211...,Dinoflagellate,/home/azureuser/data/NAAMES_ml/D20180412T02111...


In [16]:
# restructure lables to handle very small groups

summary = df_files_labels.groupby('high_group').agg({'png_path' : 'count'})
summary 

Unnamed: 0_level_0,png_path
high_group,Unnamed: 1_level_1
Artefact,179
Chloro,11990
Chryso,13
Cilliate,3386
Corrupt,113552
Crypto,42171
Cyanobacteria,48
Diatom,200173
Dictyo,11228
Dinoflagellate,135342


In [17]:

# Chrysophyte --> other
# Clumps --> invalid
# Cyanobacterium --> other
# NOT-CLASSIFIED --> exclude actually 
# Not-living --> invalid
# Rhizaria --> other
# Zoo --> other

class_dict = {'Artefact' : 'Corrupt',
            'Chryso' : 'Other', 
            'Cyanobacteria' : 'Corrupt',
            'Multiple' : 'Other',
            'Not living' : 'Corrupt', 
            'Rhizaria' : 'Other',
             'Zoo' : 'Other'}


df_files_labels['high_group'] = df_files_labels.apply(lambda row: du.change_class(row, class_dict), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_files_labels['high_group'] = df_files_labels.apply(lambda row: du.change_class(row, class_dict), axis=1)


In [18]:
summary = df_files_labels.groupby('high_group').agg({'png_path' : 'count'})
summary 

Unnamed: 0_level_0,png_path
high_group,Unnamed: 1_level_1
Chloro,11990
Cilliate,3386
Corrupt,194595
Crypto,42171
Diatom,200173
Dictyo,11228
Dinoflagellate,135342
Eugleno,61572
Other,1185486
Prymnesio,20176


In [19]:
# remove 'Corrupt' examples (194k total) 

df_reduced = df_files_labels.loc[df_files_labels['high_group'] != 'Corrupt']

# look at new class distributions 

summary = df_reduced.groupby('high_group').agg({'png_path' : 'count'})
summary 

Unnamed: 0_level_0,png_path
high_group,Unnamed: 1_level_1
Chloro,11990
Cilliate,3386
Crypto,42171
Diatom,200173
Dictyo,11228
Dinoflagellate,135342
Eugleno,61572
Other,1185486
Prymnesio,20176


#### Split and prepare data

In [20]:
# split the data for testing and training
train, validation = train_test_split(df_reduced, test_size=0.2)

In [21]:
# look at training distribution 

train_summary = train.groupby('high_group').agg({'png_path' : 'count'})
train_summary

Unnamed: 0_level_0,png_path
high_group,Unnamed: 1_level_1
Chloro,9523
Cilliate,2708
Crypto,33676
Diatom,160237
Dictyo,9043
Dinoflagellate,107990
Eugleno,49272
Other,948688
Prymnesio,16082


In [22]:
# randomly downsample 'Other' images keeping 200k of >900k 

other = train.loc[train['high_group'] == 'Other']
not_other = train.loc[train['high_group'] != 'Other']
other_keep = other.sample(n=200000)
train = pd.concat([not_other, other_keep])

In [23]:
# look at new distribution of training 

train_summary = train.groupby('high_group').agg({'png_path' : 'count'})
train_summary

Unnamed: 0_level_0,png_path
high_group,Unnamed: 1_level_1
Chloro,9523
Cilliate,2708
Crypto,33676
Diatom,160237
Dictyo,9043
Dinoflagellate,107990
Eugleno,49272
Other,200000
Prymnesio,16082


In [20]:
# upsample the low classes in the training example set 

# classes = list(set(v.label))
# upsample = [] 
# for c in classes: 
#     if len(train.loc[train['label'] == c]) < 20000: 
#         upsample.append(c)

# for c in upsample: 
#     new_rows = pd.concat(add_training_examples(c, 20000, train))
#     train = pd.concat([train, new_rows])

# train.groupby('label').agg({'image_path' : 'count'})

In [21]:
# -- 
# one hot encode the labels 

lb = LabelBinarizer()
labels = set(df_reduced.high_group)
lb.fit(list(labels))
batch_size = 100
#batch_size = 10

In [22]:
# instantiate generators 

trainGen = du.image_generator(train, batch_size, lb)
validationGen = du.image_generator(validation, batch_size, lb)

#### Define CNN model

In [23]:
ml_models.create_cnn_model_A1

<function ml_models.create_cnn_model_A1(width, height, depth, filters=(32, 16, 64, 32, 128, 128, 64, 256, 256, 128), regress=False)>

In [24]:
# define and compile cnn with function 
# should do layer by layer going forward for legibility

cnn = ml_models.create_cnn_model_A1(128, 128, 1, regress=False)
x = Dense(1000, activation="relu")(cnn.output)
x = Dropout(rate=0.1)(x)
x = Dense(len(labels), activation="softmax")(x)
model = Model(inputs=cnn.input, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

2023-03-31 20:41:45.952345: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-03-31 20:41:45.987755: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0001:00:00.0 name: Tesla V100-PCIE-16GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2023-03-31 20:41:45.988066: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2023-03-31 20:41:45.990005: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2023-03-31 20:41:45.993584: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2023-03-31 20:41:45.994030: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so

#### Train model

In [25]:
totalTrain = len(train)
totalVal = len(validation)

In [26]:
# When training with Keras's Model.fit(), adding the tf.keras.callbacks.
# TensorBoard callback ensures that logs are created and stored. 
# Additionally, enable histogram computation every epoch with histogram_freq=1 (this is off by default)

#Place the logs in a timestamped subdirectory to allow easy selection of different training runs.

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

2023-03-31 20:41:51.725451: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session started.
2023-03-31 20:41:51.725525: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1363] Profiler found 1 GPUs
2023-03-31 20:41:51.725750: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libcupti.so.10.1'; dlerror: libcupti.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /anaconda/envs/azureml_py38/lib/python3.8/site-packages/cv2/../../lib64:
2023-03-31 20:41:51.725770: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1408] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.
2023-03-31 20:41:51.725781: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1447] function cupti_interface_->ActivityRegisterCallbacks( AllocCuptiActivityBuffer, FreeCuptiActivityBuffer)failed with

In [None]:
df_reduced.shape

In [29]:
%%time
# train model and save history object 

' training in smaller increments to monitor progress '

print("[INFO] training simple network...")
H = model.fit(
    trainGen,
    steps_per_epoch=totalTrain // batch_size,
    validation_data=validationGen,
    validation_steps=totalVal // batch_size,
    #epochs=10,
    epochs=16)#,
#    callbacks=[tensorboard_callback])

[INFO] training simple network...


StopIteration: 

In [87]:
# save model weights and structure from history object 

''' save model and weights '''
model_json = model.to_json()
with open("model-test-cnn-v1.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("model-test-cnn-v1.h5")
print("Saved model to disk")

Saved model to disk


In [111]:
# predicting with the trained model before exporting & reloading etc

test_split = np.array_split(validation, 10)
test_preds = []
n = 1
for df in test_split: 
    labels = df['high_group'].values
    test_labels = lb.fit_transform(labels)
    v_dat = df.drop(['full_path', 'high_group'], axis=1).values
    image_data = []
    for i in range(len(df)): 
        row = df.iloc[i]
        input_path = row['full_path'] 
        image_data.append(du.preprocess_input(cv2.imread(input_path)))
        #
    test_input = np.array(image_data)
    predictions = model.predict(test_input)
    pred_frame = pd.DataFrame(predictions)
    pred_frame['full_path'] = df['full_path'].values.tolist()
    top_1 = [np.argmax(i) for i in predictions]
    df['pred_label'] = top_1
    df['true_label'] = [np.argmax(i) for i in test_labels]
    df['is_correct'] = df.apply(lambda row: du.is_correct(row), axis=1)
    df_out = pd.merge(df, pred_frame, on = 'full_path', how='left')
    test_preds.append(df_out)
    print('completed ' + str(n) + ' of 10 testing subsets')
    n +=1 
    del image_data



test_eval = pd.concat(test_preds)

if len(test_eval) == len(test):
    print('generated predictions for all examples in testing dataset')

completed 1 of 10 testing subsets
completed 2 of 10 testing subsets


[ WARN:0@220737.610] global /io/opencv/modules/imgcodecs/src/loadsave.cpp (239) findDecoder imread_('/home/azureuser/data/NAAMES_ml/D20160513T121618_IFCB107/IFCB107D20160513T121618P02005.png'): can't open/read file: check file path/integrity


AttributeError: 'NoneType' object has no attribute 'shape'

In [105]:
# look at history objct for training hueristics

print(H.history.keys())
H.history['val_accuracy']

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])


[0.891753613948822,
 0.8972428441047668,
 0.9097256064414978,
 0.8927979469299316,
 0.8848884105682373,
 0.9043503999710083,
 0.9142099618911743,
 0.8593658804893494,
 0.8864887952804565,
 0.9143206477165222,
 0.8925104737281799,
 0.8375950455665588,
 0.8563655018806458,
 0.8182103633880615,
 0.8913558125495911,
 0.9075840711593628]

In [106]:
H.history['accuracy']

[0.8396170735359192,
 0.8903592228889465,
 0.9028232097625732,
 0.9102937579154968,
 0.9148202538490295,
 0.9192448854446411,
 0.9226541519165039,
 0.926394522190094,
 0.9293216466903687,
 0.932408332824707,
 0.9357650279998779,
 0.9390724301338196,
 0.9415462017059326,
 0.9449062347412109,
 0.9473868012428284,
 0.9500592350959778]

In [110]:
# --
# obtain accruacy for final model prediction 

float(sum(test_eval.is_correct))/float(len(test_eval))


# # -- 
# # make reference for reviewing by class model accuracy 

test_summ = test_eval[[ u'image_path', u'label', u'pred_label', u'true_label', u'is_correct',
                               0,                        1,
                               2,                        3,
                               4,                        5,
                               6,                        7,
                               8,                        9,
                              10]]


NameError: name 'test_eval' is not defined

In [112]:
# add top 5 accuracy score to output & build by class summary frame

test_summ['top_5'] = test_summ.apply(lambda row: get_top_5(row), axis=1)

test_agg = test_summ.groupby('high_group').agg({'png_path' : 'count', 'is_correct' : 'sum', 'top_5' : 'sum'})
test_agg.reset_index(inplace=True)
test_agg.columns = ['high_group', 'top_1', 'top_5', 'n_obs']
test_agg['top_1_acc'] = test_agg.apply(lambda row: get_percent(row, 'top_1'), axis=1)
test_agg['top_5_acc'] = test_agg.apply(lambda row: get_percent(row, 'top_5'), axis=1)


test_summ.to_csv('./model-summary-cnn-v1.csv', index=False)
test_agg.sort_values('top_1_acc', ascending=False)



NameError: name 'test_summ' is not defined

In [113]:
model

<tensorflow.python.keras.engine.training.Model at 0x7f37b505f190>