# Set Up

In [8]:
import off_sample_utils as utils

In [9]:
# Sanity-check the TensorFlow setup before training: judging by the output
# below, this prints the TensorFlow version and the devices visible to it.
utils.check_tf_dev()

1.12.0
name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14135432450729868661

name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 12094878319910897147
physical_device_desc: "device: XLA_GPU device"

name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 7892318722688490866
physical_device_desc: "device: XLA_CPU device"

name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11281553818
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15270162979022582801
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7"



In [10]:
import keras
import numpy as np
import pandas as pd
import sklearn
import matplotlib
# Record the versions of the main libraries this notebook was run with.
tuple(lib.__version__ for lib in (keras, pd, np, matplotlib, sklearn))

('2.2.4', '0.23.4', '1.12.1', '2.0.2', '0.20.0')

In [18]:
from pathlib import Path
# Location of the raw GS images, relative to the working directory.
gs_path = Path('./GS/raw_images')
# Every ion image is reshaped to this size.
IMAGE_SHAPE = (64, 64)

# Load Data

In [14]:
# Paths to the rectangular datasets found under the GS folder.
all_paths = utils.get_ds_paths(gs_path)
n_datasets = len(all_paths)
# Each dataset is one group; groups are identified by their position in all_paths.
u_groups = list(range(n_datasets))
n_datasets

66

In [17]:
# Load every image of every dataset together with its label and group id.
print('Loading iso image dataset...')
X, y, groups = utils.load_img_X_y_groups(
    all_paths,
    image_shape=IMAGE_SHAPE,
)
# Quick check of how much data was loaded.
print(X.shape)

Loading iso image dataset...
(18038, 64, 64, 1)


# Cross Validation

In [28]:
import pickle

import sklearn
import sklearn.model_selection  # `import sklearn` alone does not load this submodule

In [41]:
# Grouped cross-validation of the CNN.
#
# Whole datasets, not single images, are assigned to folds. In every fold the
# CNN is trained `attempts` times on the training datasets; each attempt is
# scored on the held-out datasets, and the predictions of all attempts are
# averaged and scored once more (rows with attempt == 'avg').

break_after_first_fold = False  # True: stop after the first fold (quick check)

folds = 5  # number of CV folds
attempts = 5  # to train a CNN on the same data multiple times
epochs = 15  # training epochs per attempt; identical for every fold and attempt

cv_metrics = []  # metrics of every single attempt plus one 'avg' entry per fold
history_list = []  # `.history` of every model fit, in training order

cv = sklearn.model_selection.GroupKFold(n_splits=folds)
# u_groups is 0..len(all_paths)-1, so the indices yielded by the splitter are
# the group ids themselves and can be handed to make_subset unchanged.
for fold_i, (train_inds, test_inds) in enumerate(cv.split(u_groups, groups=u_groups)):
    print(f'Fold: {fold_i}', train_inds.shape, test_inds.shape)
    train_u_groups, test_u_groups = train_inds, test_inds

    # Presumably selects the samples whose group id is among the given ids --
    # TODO confirm in utils.make_subset.
    X_train, y_train, groups_train = utils.make_subset(train_u_groups, X, y, groups)
    X_test, y_test, groups_test = utils.make_subset(test_u_groups, X, y, groups)

    # Running sum of the test predictions over the attempts; divided by
    # `attempts` after the inner loop to obtain the averaged prediction.
    y_test_pred_avg = np.zeros_like(y_test, dtype=np.float32)
    for attempt_i in range(attempts):
        print(f'Attempt: {attempt_i}')

        # CNN architecture is defined in off_sample_utils.model module (create_cnn)
        model_cnn = utils.KerasCNN(image_shape=IMAGE_SHAPE)
        # NOTE(review): every attempt passes the same seed. If the seed also
        # fixes the weight initialisation, all attempts would be identical --
        # confirm in utils.KerasCNN.fit.
        history = model_cnn.fit(X_train, y_train,
                                epochs=epochs, batch_size=32, seed=13)
        history_list.append(history.history)
        # NOTE(review): load_best=False presumably predicts with the weights of
        # the last epoch rather than a saved best checkpoint -- confirm.
        y_test_pred = model_cnn.predict(X_test, load_best=False)
        y_test_pred_avg += y_test_pred

        # Score this single attempt.
        metrics = utils.calc_metrics(y_test, y_test_pred)
        metrics['fold'] = fold_i
        metrics['attempt'] = attempt_i
        cv_metrics.append(metrics)
        print(metrics, end='\n\n')

    # Score the prediction averaged over all attempts of this fold.
    y_test_pred_avg /= attempts
    metrics = utils.calc_metrics(y_test, y_test_pred_avg)
    metrics['fold'] = fold_i
    metrics['attempt'] = 'avg'
    cv_metrics.append(metrics)
    print(metrics, end='\n\n')

    if break_after_first_fold:
        break

# Persist the results so they can be inspected without re-running the training.
metrics_df = pd.concat(cv_metrics)
# NOTE: DataFrame.to_msgpack was deprecated in pandas 0.25 and removed in 1.0;
# it is kept because this notebook runs on pandas 0.23.4.
metrics_df.to_msgpack('metrics_df.msgpack')
# The context manager guarantees the file is flushed and closed after dumping.
with open('history_list.pkl', 'wb') as history_file:
    pickle.dump(history_list, history_file)

In [39]:
# Keep only the per-fold rows computed from the attempt-averaged predictions.
is_avg_row = metrics_df['attempt'] == 'avg'
metrics_avg_df = metrics_df[is_avg_row]
metrics_avg_df

Unnamed: 0,f1,prec,recall,acc,fold,attempt
on,0.972973,0.985075,0.961165,0.968766,0,avg
off,0.963008,0.947084,0.979478,0.968766,0,avg
on,0.957615,0.92752,0.989728,0.952937,1,avg
off,0.9471,0.987072,0.910238,0.952937,1,avg
on,0.941823,0.967876,0.917136,0.931188,2,avg
off,0.915794,0.881452,0.952921,0.931188,2,avg
on,0.976007,0.992784,0.959787,0.973834,3,avg
off,0.971228,0.951937,0.991318,0.973834,3,avg
on,0.99128,0.989464,0.993103,0.989912,4,avg
off,0.988035,0.99053,0.985553,0.989912,4,avg


In [40]:
# Mean and standard deviation of every metric over the folds, computed
# separately for each index label ('on' / 'off').
# The 'attempt' column only holds the string 'avg' here, so it is dropped
# explicitly instead of relying on pandas silently discarding non-numeric
# columns during aggregation (newer pandas versions raise instead).
# The 'mean' / 'std' names select the pandas implementations directly.
metrics_avg_df.drop(columns='attempt').groupby(level=0).agg(['mean', 'std'])

Unnamed: 0_level_0,f1,f1,prec,prec,recall,recall,acc,acc,fold,fold
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
off,0.957033,0.027376,0.951615,0.043918,0.963901,0.033404,0.963328,0.022291,2,1.581139
on,0.96794,0.018867,0.972544,0.026935,0.964184,0.03054,0.963328,0.022291,2,1.581139
