In [1]:
%matplotlib qt
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import train_test_split
import os, glob
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

## Set up parameter variables

In [4]:
epochs = 1
batch_size = 64
random_seed = 10
# Select between 'large_filter_cnn' or 'binary_then_multiclass'
cnn_to_use = 'large_filter_cnn' # 'binary_then_multiclass'

# Load simulation data
#sim_data_path = glob.glob('*PX*.npz')[0]
sim_data_path = glob.glob('*K*.npz')[0]
print(sim_data_path)

1D_simulated_data_cal0p00588_cropK_randrelrod0p02-0p2-0p4-0p6-1-2-5-10_randsigma2d3-4-5-6-7-8-9_3classes_orix30000neuler_cubo.npz


## Load data and reshape

In [11]:
npz_simulated_data = np.load(sim_data_path)

sim_data = npz_simulated_data['x']
phase_names = npz_simulated_data['phases']
phase_names = [s for s in phase_names]

# Reshape to just (data, label)
sim_data = sim_data.reshape(-1, sim_data.shape[-1])

# Create labels
n_phases = len(phase_names)
sim_labels = np.zeros((n_phases, int(sim_data.shape[0]/n_phases)))
for i in range(n_phases):
    sim_labels[i,:] = i
sim_labels = sim_labels.flatten()
print(phase_names)

['p4mbm_scaled_mixed_halide', 'gratia_2h', 'pbi2_2h'] (90000,)


## Create training and testing datasets

In [17]:
train_data, test_data, train_labels, test_labels = train_test_split(sim_data, sim_labels,
                                                                    test_size=0.25,
                                                                    random_state=random_seed)
print('train_data & train_labels:')
print(np.shape(train_data), np.shape(train_labels))
print('test_data & test_labels:')
print(np.shape(test_data), np.shape(test_labels))
print(test_labels[:5])

train_data & train_labels:
(67500, 147) (67500,)
test_data & test_labels:
(22500, 147) (22500,)
[1. 1. 2. 2. 0.]


## Connect to `neptune.io` and start experiment

In [None]:

PARAMS = {
'n_classes': len(phase_names),
'epochs': epochs,
'batch_size': batch_size,
'cnn_to_use': cnn_to_use,
}

tags = [cnn_to_use] + phase_names

project_name = 'ml_sed_cambridge/mini2-anish'

# Connect your script to Neptune
import neptune.new as neptune
from neptune.new.types import File

run = neptune.init(api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJlYWE5ZGEyNy1kZWU0LTQzN2MtOTE4YS03Y2VlMDcyMTRmYTMifQ==',
                   project=project_name, tags=tags)

run['source_code/model/my_params'].log(PARAMS)
run['source_code/simulation_dataset_name'].log(os.path.basename(sim_data_path))


## Create Neural Network

In [13]:
# Reshape to create categorical labels (instead of value from 0-n, get an n-array with 0s and 1)
from tensorflow.keras.utils import to_categorical
train_labels = to_categorical(train_labels)
test_labels = to_categorical(test_labels)
test_labels[:5]


array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [8]:
def create_large_filter_model(input_shape, output_classes):
    conv1d_filters = 64
    conv1d_kernel_size = 6
    deep_layer_input = 128
    max_pooling = 2
    dropout_rate = 0.5
    if output_classes == 2:
        final_activation_func = 'sigmoid'
        loss_function = 'binary_crossentropy'
    else:
        final_activation_func = 'softmax'
        loss_function = 'categorical_crossentropy'

    model = Sequential()
    model.add(tf.keras.layers.Conv1D(conv1d_filters, conv1d_kernel_size,
                                     input_shape=input_shape,
                                     data_format='channels_last',
                                     activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(max_pooling))
    model.add(tf.keras.layers.Conv1D(conv1d_filters, conv1d_kernel_size,
                                     activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(max_pooling))
    model.add(Dropout(dropout_rate))
    model.add(tf.keras.layers.Conv1D(conv1d_filters, conv1d_kernel_size,
                                     activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(max_pooling))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(deep_layer_input, activation = 'relu'))
    model.add(tf.keras.layers.Dense(output_classes,  activation=final_activation_func))
    model.compile(loss=loss_function,
                  optimizer = 'adam',
                  metrics=['accuracy'])
    return model

In [10]:
input_shape = (train_data[0].size, 1)
output_classes = len(phase_names)

if cnn_to_use == 'large_filter_cnn':
    model = create_large_filter_model(input_shape, output_classes)
elif cnn_to_use == 'binary_then_multiclass':
    raise NotImplementedError("Please use the other notebook template called 'binary_then_cat'")
else:
    raise NotImplementedError("The model asked is not implemented yet.")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 142, 64)           448       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 71, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            24640     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 33, 64)           0         
 1D)                                                             
                                                                 
 dropout (Dropout)           (None, 33, 64)            0         
                                                                 
 conv1d_2 (Conv1D)           (None, 28, 64)            2

In [11]:
batch_size = PARAMS['batch_size']
epochs =  PARAMS['epochs']

from neptune.new.integrations.tensorflow_keras import NeptuneCallback
neptune_cbk = NeptuneCallback(run=run, base_namespace='metrics')

history = model.fit(train_data, train_labels,
          batch_size=batch_size, epochs=epochs,
          callbacks=[neptune_cbk])

Epoch 1/2
Epoch 2/2


In [12]:
accuracy_train = model.evaluate(train_data,train_labels,)
accuracy_test = model.evaluate(test_data, test_labels,)



In [13]:
data = np.vstack([accuracy_train, accuracy_test])
accuracies = pd.DataFrame(data, index=['train', 'test'], columns=['loss', 'accuracy'])
run['metrics/train/accuracies_df'].upload(File.as_html(accuracies))
accuracies

Unnamed: 0,loss,accuracy
train,0.084727,0.973244
test,0.095624,0.969067


In [14]:
import datetime
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d")

train_ac_str = f"{accuracy_train[1]:.4f}".replace(".", "p")
test_ac_str = f"{accuracy_test[1]:.4f}".replace(".", "p")

In [15]:
name = f'{timestamp}_CNN_{cnn_to_use}_{output_classes}nclasses_' \
       f'{epochs}epochs_{batch_size}batchsize__train_{train_data.shape[0]/output_classes:.0f}n_' \
       f'{train_ac_str}ac__test_{test_data.shape[0]/output_classes:.0f}n_' \
       f'{accuracy_test[1]:.4f}ac.h5'

model.save(name)
run['source_code/model/model_name'].log(name)

phases_txt_name = f"phase_names_{name.split('.')[0]}"
np.save(phases_txt_name, np.array(phase_names))
run['source_code/model/phase_names'].log(phase_names)

model.summary(print_fn=lambda x: run['source_code/model/model_summary'].log(x))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 142, 64)           448       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 71, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            24640     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 33, 64)           0         
 1D)                                                             
                                                                 
 dropout (Dropout)           (None, 33, 64)            0         
                                                                 
 conv1d_2 (Conv1D)           (None, 28, 64)            2

## Undestanding misclassification in test data

In [16]:
# Get the metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import pandas as pd

preds = np.round(model.predict(test_data), 0)
classification_metrics = metrics.classification_report(test_labels, preds,
                                                       target_names=phase_names, output_dict=True)
classification_metrics = pd.DataFrame(classification_metrics).T

cf_matrix = confusion_matrix(test_labels.argmax(1), preds.argmax(1),)

  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
classification_metrics.iloc[:-4,:-1].plot()
fig = plt.gcf()
run['metrics/test/classification_metrics'].upload(File.as_image(fig))
plt.close()


I found a path object that I don't think is part of a bar chart. Ignoring.



In [18]:
run['metrics/test/classification_metrics'].upload(File.as_html(classification_metrics))
print(classification_metrics)

                           precision    recall  f1-score  support
p4mbm_scaled_mixed_halide   0.978901  0.965536  0.972172   7544.0
gratia_2h                   0.945438  0.970183  0.957651   7412.0
pbi2_2h                     0.988498  0.968319  0.978305   7544.0
micro avg                   0.970718  0.968000  0.969357  22500.0
macro avg                   0.970945  0.968013  0.969376  22500.0
weighted avg                0.971095  0.968000  0.969445  22500.0
samples avg                 0.968000  0.968000  0.968000  22500.0


In [19]:
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    import itertools
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment='center', color='white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt.gcf()

In [20]:
fig = plot_confusion_matrix(cf_matrix, phase_names, normalize=False)
run['metrics/test/confusion_matrix'].upload(File.as_image(fig))
plt.close()

In [21]:
fig_norm = plot_confusion_matrix(cf_matrix, phase_names, normalize=True)
run['metrics/test/confusion_matrix_norm'].upload(File.as_image(fig_norm))
plt.close()

In [22]:
%matplotlib inline
# Plot some of the misclassified data:
bool_predictions = test_labels.argmax(1) == preds.argmax(1)

n_bad = np.count_nonzero(~bool_predictions)
n_max = min(25, n_bad)

fig, axs = plt.subplots(nrows=n_max, figsize = (10, n_max * 1.5), sharex=True)
ax = 0
for i, bool_pred in enumerate(bool_predictions):
    if ax >= n_max:
        break
    if bool_pred == False:
        true_phase = phase_names[test_labels.argmax(1)[i]]
        pred_phase = phase_names[preds.argmax(1)[i]]
        lab = 'True: {}, Pred: {}'.format(true_phase, pred_phase)
        axs[ax].plot(test_data[i,:], label=lab, color=f'C{ax}')
        axs[ax].legend()
        ax += 1

run['metrics/test/bad_predictions'].upload(File.as_image(fig))
plt.close()

In [23]:
run.stop()

Shutting down background jobs, please wait a moment...
Done!
All 2 operations synced, thanks for waiting!


Waiting for the remaining 2 operations to synchronize with Neptune. Do not kill this process.
