In [1]:

import pandas as pd
import numpy as np
from numpy import array
from numpy import argmax
import os, re
import cv2
import locale
import zipfile

import tensorflow.keras as keras
from keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelBinarizer
from keras.models import model_from_json



In [14]:
# user defined functions 

def preprocess_input(image):
    fixed_size = 128
    image_size = image.shape[:2] 
    ratio = float(fixed_size)/max(image_size)
    new_size = tuple([int(x*ratio) for x in image_size])
    img = cv2.resize(image, (new_size[1], new_size[0]))
    delta_w = fixed_size - new_size[1]
    delta_h = fixed_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    ri = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    gray_image = cv2.cvtColor(ri, cv2.COLOR_BGR2GRAY)
    gimg = np.array(gray_image).reshape(128,128,1)
    img_n = cv2.normalize(gimg, gimg, 0, 255, cv2.NORM_MINMAX)
    return(img_n)

def get_full_path(row, data_path): 
    _id = str(row['ImageId']).zfill(5)
    comps = row['bucket_id'].split('_')
    png = comps[1] + comps[0] + 'P' + _id +'.png'
    base = data_path
    out = base + row['bucket_id'] + '/' + png
    return out

def get_top_2(row):
    preds = [row[str(i)] for i in range(11)]
    top_2 = sorted(zip(preds, range(11)), reverse=True)[:2]
    index = top_2[1][1]
    return index

# def get_percent(row, df): 
#     n = float(row['n_examples'])
#     d = float(sum(df.n_examples))
#     return n / d

def get_percent(row, c): 
    val = row[c]
    return float(val) / float(row['n_obs'])

def change_class(row, cd): 
    l = row['label_group']
    if l in cd.keys(): 
        out = cd[l]
    else: 
        out = l 
    return out

In [3]:
#data_path = '/Users/alisonchase/Documents/IFCB/EXPORTS_ml/'
DATA_PATH = '/Users/alisonchase/Dropbox/UTOPIA/test/ml/'
MODEL_PATH = '/Users/alisonchase/Dropbox/UTOPIA/ml-workflow/model_ckpt/'
MODEL = 'model-cnn-v1-b3'
MODEL_SUMMARY = 'model-summary-cnn-v1-b3.csv'

In [4]:
# get image paths by traversing directory 

buckets = os.listdir(DATA_PATH)
image_paths = []
i = 0
for b in buckets: 
    i += 1
    base = DATA_PATH + b +'/'
    if 'DS_Store' in base: 
        continue 
    else : 
        for p in os.listdir(base): 
            if '.png' in p:
                image_paths.append(base + p)
            # - 
    print('completed ' + str(i) + ' of ' + str(len(buckets)))


len(image_paths)

image_paths = pd.DataFrame(image_paths)
image_paths.columns = ['image_path']

completed 1 of 4
completed 3 of 4
completed 4 of 4


In [5]:
image_paths
#image_paths.to_csv('test.csv')

Unnamed: 0,image_path
0,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
1,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
2,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
3,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
4,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
...,...
204,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
205,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
206,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...
207,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...


In [6]:
# make test input csv from bucket id csv files

# image_dirs = []
# i = 0
# for b in buckets: 
#     i += 1
#     base = DATA_PATH + b +'/'
#     if 'DS_Store' in base: 
#         continue 
#     else : 
#         csv_path = base + b + '_ml.csv'
#         dat = pd.read_csv(csv_path)
#         df = pd.DataFrame(dat)
#         df['bucket_id'] = b
#         image_dirs.append(df)
#         # - 
#     print('completed ' + str(i) + ' of ' + str(len(buckets)))


# image_dir = pd.concat(image_dirs) 


# if len(image_dir) == len(image_paths): 
#     print(' Record counts match for metadata and directory ')


In [7]:
#image_dir

In [8]:
# add image_path to csv image directory  

# image_dir['image_path'] = image_dir.apply(lambda row: get_full_path(row, data_path), axis=1)
# data_all = image_dir[['EquivalentDiameter', 'image_path']]

In [9]:
#image_paths

In [10]:
# load saved model architecture and weights 

json_file = open(MODEL_PATH + MODEL + '.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights(MODEL_PATH + MODEL + '.h5')
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [11]:
# make predictions on testing data using saved model 

n_splits = 50
test_split = np.array_split(image_paths, n_splits)
test_preds = []

In [12]:
# make predcitions 

n = 1
for df in test_split: 
    image_data = []
    for i in range(len(df)): 
        row = df.iloc[i]
        input_path = row['image_path']
        image_data.append(preprocess_input(cv2.imread(input_path)))
        # - 
    test_input = np.array(image_data)
    predictions = loaded_model.predict(test_input)
    pred_frame = pd.DataFrame(predictions)
    pred_frame['image_path'] = df['image_path'].values.tolist()
    top_1 = [np.argmax(i) for i in predictions]
    pred_frame['pred_label'] = top_1
    test_preds.append(pred_frame)
    print('completed ' + str(n) + ' of ' + str(n_splits) + ' testing subsets')
    n +=1 
    del image_data

completed 1 of 50 testing subsets
completed 2 of 50 testing subsets
completed 3 of 50 testing subsets
completed 4 of 50 testing subsets
completed 5 of 50 testing subsets
completed 6 of 50 testing subsets
completed 7 of 50 testing subsets
completed 8 of 50 testing subsets
completed 9 of 50 testing subsets
completed 10 of 50 testing subsets
completed 11 of 50 testing subsets
completed 12 of 50 testing subsets
completed 13 of 50 testing subsets
completed 14 of 50 testing subsets
completed 15 of 50 testing subsets
completed 16 of 50 testing subsets
completed 17 of 50 testing subsets
completed 18 of 50 testing subsets
completed 19 of 50 testing subsets
completed 20 of 50 testing subsets
completed 21 of 50 testing subsets
completed 22 of 50 testing subsets
completed 23 of 50 testing subsets
completed 24 of 50 testing subsets
completed 25 of 50 testing subsets
completed 26 of 50 testing subsets
completed 27 of 50 testing subsets
completed 28 of 50 testing subsets
completed 29 of 50 testing su

In [13]:
test_eval = pd.concat(test_preds)

if len(test_eval) == len(image_paths):
    print('generated predictions for all valid examples in exports dataset')


generated predictions for all valid examples in exports dataset


In [14]:
test_eval

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,image_path,pred_label
0,9.658441e-05,2.027927e-04,4.488858e-04,2.454126e-02,3.490195e-04,2.548410e-03,3.143696e-04,0.048315,1.021233e-01,0.821060,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,9
1,1.369881e-04,3.082993e-12,7.449825e-05,8.155113e-04,2.146445e-06,4.473814e-04,1.472076e-06,0.771073,6.799227e-04,0.226769,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
2,7.800916e-05,4.832570e-16,5.617907e-06,4.102923e-03,1.205301e-09,2.971140e-04,2.909469e-08,0.949049,1.149731e-04,0.046353,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
3,5.635270e-05,7.529041e-16,3.527660e-05,1.126263e-04,1.765839e-08,8.017465e-05,2.097754e-08,0.913373,1.381593e-04,0.086204,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
4,1.580189e-03,9.604798e-08,4.610286e-05,1.132822e-02,4.269811e-05,3.694359e-03,1.940514e-06,0.857420,6.688284e-04,0.125217,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
...,...,...,...,...,...,...,...,...,...,...,...,...
3,2.954215e-02,7.650062e-05,2.646709e-04,2.482498e-01,5.101819e-04,6.246306e-03,1.145376e-05,0.586966,7.381122e-04,0.127395,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
0,7.727576e-04,1.742599e-14,4.534035e-04,9.041741e-03,2.656839e-09,9.829971e-04,1.700756e-06,0.979095,5.191551e-04,0.009133,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
1,5.350925e-04,3.338311e-13,1.682205e-05,5.189911e-02,1.817417e-08,1.394335e-03,4.186197e-06,0.885423,2.194658e-04,0.060508,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7
2,8.013196e-07,5.041963e-28,6.407052e-08,8.748604e-08,1.172570e-17,6.595614e-07,4.625755e-15,0.999730,3.558087e-07,0.000268,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7


In [15]:
# add argmax2 for top 2 accuracy / may be helpful for manual review of performance 

# test_eval.columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', u'image_path', u'pred_label']
# test_eval['top_2_label'] = test_eval.apply(lambda row: get_top_2(row), axis=1)

In [16]:
summary = test_eval.groupby('pred_label').agg({'image_path' : 'count'})
summary 

Unnamed: 0_level_0,image_path
pred_label,Unnamed: 1_level_1
0,1
3,13
7,135
9,60


#### next: convert prediction numbers into strings, and then save a dataframe that has the image_path and the predicted label, with one row per image

In [9]:
# get string labels for top 1 and top 2 probabilites 

test_naames = pd.read_csv(MODEL_PATH + MODEL_SUMMARY)
test_naames

Unnamed: 0,full_path,high_group,pred_label,true_label,is_correct,0,1,2,3,4,5,6,7,8,9,top_5
0,/home/azureuser/data/NAAMES_ml/D20160513T22082...,Other,7,7,1,1.788756e-05,1.146114e-18,2.892286e-07,9.871857e-06,4.914940e-13,3.955169e-03,1.792351e-06,0.995379,6.035990e-04,3.274584e-05,1
1,/home/azureuser/data/NAAMES_ml/D20180402T13445...,Other,0,7,0,5.395499e-01,1.456711e-04,4.600495e-09,2.238769e-04,1.425122e-01,6.822130e-03,3.214117e-11,0.083037,2.268619e-01,8.470257e-04,1
2,/home/azureuser/data/NAAMES_ml/D20160528T22330...,Other,7,7,1,1.197600e-04,5.510693e-15,2.841731e-05,7.511244e-05,6.837943e-09,4.465281e-04,6.920837e-07,0.994058,8.560426e-05,5.186173e-03,1
3,/home/azureuser/data/NAAMES_ml/D20160531T08382...,Other,7,7,1,2.408427e-04,2.925888e-11,4.873408e-06,3.933519e-03,2.065363e-09,2.730137e-01,1.542221e-04,0.718308,5.967846e-04,3.748273e-03,1
4,/home/azureuser/data/NAAMES_ml/D20160526T21432...,Diatom,3,3,1,3.128773e-04,4.062050e-14,6.139785e-05,7.101523e-01,2.181512e-10,2.108470e-04,4.933558e-06,0.287664,2.219727e-04,1.371984e-03,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399775,/home/azureuser/data/NAAMES_ml/D20160530T23394...,Eugleno,6,6,1,6.013484e-13,3.751894e-26,3.330312e-09,7.386912e-06,3.777496e-13,6.410740e-06,9.999207e-01,0.000065,1.452780e-07,5.057208e-07,1
399776,/home/azureuser/data/NAAMES_ml/D20160529T01183...,Dinoflagellate,5,5,1,7.562776e-08,1.580177e-10,2.543378e-09,1.973915e-06,4.075483e-10,9.998869e-01,1.908313e-09,0.000110,5.005263e-08,3.868345e-07,1
399777,/home/azureuser/data/NAAMES_ml/D20160528T18364...,,9,9,1,5.158942e-07,7.589195e-09,5.717600e-09,2.155094e-05,9.907630e-07,1.271514e-05,1.036836e-10,0.015101,7.108635e-05,9.847918e-01,1
399778,/home/azureuser/data/NAAMES_ml/D20180410T12230...,Other,7,7,1,4.261079e-08,9.153698e-30,4.726195e-11,1.692143e-08,7.329000e-19,2.060179e-07,3.326508e-17,0.999673,1.107582e-07,3.268361e-04,1


In [18]:
class_ref = test_naames.loc[test_naames['is_correct'] == 1]
class_ref = class_ref.groupby('true_label').agg({'high_group' : 'max'})
class_ref.reset_index(inplace=True)
class_ref.columns = ['pred_label', 'pred_class']
class_ref

# class_ref.columns = ['top_2_label', 'top_2_class']
# exports = pd.merge(exports, class_ref, on='top_2_label', how='left')

Unnamed: 0,pred_label,pred_class
0,0,Chloro
1,1,Cilliate
2,2,Crypto
3,3,Diatom
4,4,Dictyo
5,5,Dinoflagellate
6,6,Eugleno
7,7,Other
8,8,Prymnesio
9,9,


In [25]:
# calculate the accuracy for each group based on labeled images (this is all for the naames images)

test_agg = test_naames.groupby('pred_label').agg({'full_path' : 'count', 'is_correct' : 'sum', 'top_5' : 'sum'})
test_agg.reset_index(inplace=True)
test_agg.columns = ['pred_label', 'n_obs', 'top_1', 'top_5']
test_agg['top_1_acc'] = test_agg.apply(lambda row: get_percent(row, 'top_1'), axis=1)
test_agg['top_5_acc'] = test_agg.apply(lambda row: get_percent(row, 'top_5'), axis=1)

test_agg.sort_values('top_1_acc', ascending=False)

# THIS FROM EMMETT's MODEL: 
# full dataset model predictions summary from 

#                label   top_1   top_5   n_obs  top_1_acc  top_5_acc
# 6          Euglenoid   12644   13200   13249   0.954336   0.996302
# 4             Diatom   45930   48837   48850   0.940225   0.999734
# 3        Cryptophyte    8271    8883    8948   0.924341   0.992736
# 10  Silicoflagellate    2445    2604    2649   0.922990   0.983012
# 5     Dinoflagellate   24047   27564   27570   0.872216   0.999782
# 11           invalid   22104       0   25447   0.868629   0.000000
# 8          Prymnesio     239     264     283   0.844523   0.932862
# 7              Other  143133  177750  177750   0.805249   1.000000
# 1        Chlorophyte    1762    2313    2398   0.734779   0.964554
# 2           Cilliate     531     635     724   0.733425   0.877072
# 9     Prymnesiophyte    2771    4312    4325   0.640694   0.996994
# 0           Artefact    1855    2943    2960   0.626689   0.994257


Unnamed: 0,pred_label,n_obs,top_1,top_5,top_1_acc,top_5_acc
7,7,209450,201368,209262,0.961413,0.999102
6,6,13057,11820,13039,0.905262,0.998621
4,4,2854,2503,2831,0.877015,0.991941
3,3,51399,42684,51302,0.830444,0.998113
2,2,10617,8329,10594,0.784497,0.997834
5,5,37084,23755,37019,0.640573,0.998247
9,9,60745,36272,60611,0.597119,0.997794
1,1,1197,615,1169,0.513784,0.976608
8,8,7173,3294,7142,0.459222,0.995678
0,0,6204,1846,6181,0.29755,0.996293


In [19]:
exports = test_eval[['image_path', 'pred_label']]#, 'top_2_label']]
exports = pd.merge(exports, class_ref, on='pred_label', how='left')

In [20]:
exports

Unnamed: 0,image_path,pred_label,pred_class
0,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,9,
1,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
2,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
3,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
4,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
...,...,...,...
204,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
205,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
206,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other
207,/Users/alisonchase/Dropbox/UTOPIA/test/ml/D202...,7,Other


In [71]:
exports.to_csv('test.csv')