# Experimenting With 1D CNN on Baumhofer Data
All models and their variants should be stored in `./ml_utils/models.py`.<br>
Opening a new notebook for working with the model, separate from the data pre-processing, helps keep things clean and easily accessible in the future.

In [57]:
import numpy as np
import pickle
import pandas as pd

import tensorflow as tf
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from ml_utils import models, tools
from baumhofer_utils import *

In [8]:
# Load the data we will need
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


X = _load_pickle("./data/X_array_first_100.pkl")
index = _load_pickle("./data/index_first_100.pkl")
cap_df = _load_pickle("./data/capacity_target_df_first_100.pkl")
y_arr = _load_pickle("./data/y_array_first_100.pkl")

# Change the path for this one so it points to your local copy - too big for GitHub
data = _load_pickle("E:/new_german_data/code/processed_data/baumhofer_first_100_cycles.pkl")

# The keys of `data` give the list of cells; every entry of `index` is cut
# down to the text in front of its first underscore
cells = list(data.keys())
index_cells = np.array([name.partition("_")[0] for name in index])

In [9]:
# A function to reshape the data so it's the appropriate shape for 1D CNN / LSTM models.
# This will accept arbitrary numbers of features

def reshape_for_model(X_arr, to_plot=False, plot_feature=None):
    """Rearrange a feature-major array into the sample-major layout for the models.

    Parameters
    ----------
    X_arr : array-like
        Three-dimensional data indexed as ``[feature, sample, timestep]``.
        Any number of features is accepted.
    to_plot : bool, default False
        If True, draw a 5x5 grid showing one feature for 25 randomly drawn
        samples (with replacement) as a visual sanity check. Skipped when
        there are no samples.
    plot_feature : int or None, default None
        Index of the feature drawn when ``to_plot`` is True. ``None`` picks
        feature 1 when at least two features exist, otherwise feature 0.

    Returns
    -------
    numpy.ndarray
        A new array indexed as ``[sample, timestep, feature]``.

    Raises
    ------
    ValueError
        If ``X_arr`` is not three-dimensional.
    """
    X_arr = np.asarray(X_arr)
    # Unpacking doubles as the dimensionality check
    features, samples, timesteps = X_arr.shape

    # A single axis permutation does what stacking every feature row per
    # sample did; the copy keeps the result independent of the input buffer
    X_reshaped = np.transpose(X_arr, (1, 2, 0)).copy()

    if to_plot and samples > 0:
        if plot_feature is None:
            # Index 1 was previously hard-coded, which fails for one feature
            plot_feature = 1 if features > 1 else 0

        # NOTE(review): `plt` is not imported in this notebook; presumably it
        # arrives through `from baumhofer_utils import *` - confirm.
        # Plot a random selection of instances to check they look OK
        indices = np.random.randint(0, samples, size=25)
        fig, ax = plt.subplots(5, 5)
        for axis, sample in zip(ax.flatten(), indices):
            axis.plot(X_reshaped[sample, :, plot_feature])

        plt.show()

    return X_reshaped


# Rebind X to the model-ready layout; plotting stays off (the default)
X = reshape_for_model(X)

In [33]:
# Keep a test set back and use the rest for k-fold.
# The split is made on the list of cells, so every instance of a given cell
# lands on the same side of the split.
train_cells, test_cells = train_test_split(cells, train_size=0.9, random_state=31)
train_cells = np.array(train_cells)
test_cells = np.array(test_cells)

# Boolean masks over the instances: True where the instance's cell is in the
# respective set. np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
index_train = np.isin(index_cells, train_cells)
index_test = np.isin(index_cells, test_cells)

X_train, X_test = X[index_train], X[index_test]
y_train, y_test = y_arr[index_train], y_arr[index_test]

# Get the index arrays for k-fold: the cell name of every instance that was
# kept, in the original order. Reusing the masks avoids re-scanning the cell
# arrays once per instance.
X_train_index = index_cells[index_train]
X_test_index = index_cells[index_test]

### Train and evaluate using k-fold cross validation with splits on the cell level

In [59]:
# Callback that reports progress only every few epochs, so long runs can be
# monitored without loads of output
class MyCallback(tf.keras.callbacks.Callback):
    """Keras callback that prints sparse progress messages during training.

    Parameters
    ----------
    print_every : int, default 20
        A message is printed at the end of every epoch whose index is a
        multiple of this value.

    Raises
    ------
    ValueError
        If ``print_every`` is smaller than 1.
    """

    def __init__(self, print_every=20):
        super().__init__()
        if print_every < 1:
            raise ValueError("print_every must be a positive integer")
        self.print_every = print_every

    def on_epoch_end(self, epoch, logs=None):
        """Print the epoch index when it is a multiple of ``print_every``."""
        # NOTE(review): Keras documents `epoch` as the epoch index, so index 0
        # (the first epoch) always produces a message - confirm this is wanted.
        if epoch % self.print_every == 0:
            print(f"Finished epoch {epoch}")

    def on_train_end(self, logs=None):
        """Announce that training has completed."""
        print("Training finished")

In [70]:
# Specify which target you want to train for (refer to cap_df)
target_idx = 0

# Create some lists to store the scores from model evaluation
kf_train_scores = []
kf_test_scores = []


# Instantiate a custom k-fold generator
n_folds = 5
k_fold_generator = tools.kfold_gen(train_cells, X_train_index, n_folds=n_folds)

# Train and evaluate one fresh model per fold - should give you n_folds outputs
for i in range(n_folds):
    print(f"#### Fold {i+1} ####")
    kf_train_indices, kf_test_indices = next(k_fold_generator)
    # Print the number of "instances" associated with the train and test indices
    print(f"{len(kf_train_indices)} training instances")
    print(f"{len(kf_test_indices)} test instances")
    print()

    # Extract the k-fold data from the train arrays (the held-back test set
    # is never touched inside this loop)
    X_train_kf, y_train_kf = X_train[kf_train_indices], y_train[kf_train_indices]
    X_test_kf, y_test_kf = X_train[kf_test_indices], y_train[kf_test_indices]

    # Scaling. The fold's held-out data is passed in twice and the middle
    # return value is discarded.
    # NOTE(review): presumably scaler_3d takes (train, val, test) and fits on
    # the first argument only - confirm in baumhofer_utils.
    X_train_kf, _, X_test_kf = scaler_3d(X_train_kf, X_test_kf, X_test_kf, scaler_type='robust', return_scaler=False)

    # Reset the global state Keras keeps for previously built models, so that
    # memory use does not grow with every fold
    tf.keras.backend.clear_session()

    # Create a model instance
    model = models.build_convnet_model(X_in=X_train_kf, loss='mse', n_outputs=1)
    progress_callback = MyCallback()
    callbacks = [LearningRateScheduler(tools.lr_scheduler, verbose=0), progress_callback]
    # Train the model on the selected target column only
    history = model.fit(X_train_kf,
                        y_train_kf[:, target_idx],
                        callbacks=callbacks,
                        batch_size=512,
                        epochs=100,
                        shuffle=True,
                        verbose=0)

    # You could save the model here if you wanted
    # TODO

    # Evaluate the model on the k-fold train and test set. Add scores to lists.
    kf_train_scores.append(model.evaluate(X_train_kf, y_train_kf[:, target_idx]))
    kf_test_scores.append(model.evaluate(X_test_kf, y_test_kf[:, target_idx]))
    
    
    
    

## Template code for running without k-fold

In [12]:
# Build a single-output convnet trained with an MSE loss.
# NOTE(review): X_train_sc is not defined in the cells shown here - presumably
# the scaled training array; confirm before running this template.
model = models.build_convnet_model(
    X_in=X_train_sc,
    loss='mse',
    n_outputs=1,
)
# Uncomment to inspect the architecture:
#model.summary()

In [19]:
verbose = 1
callbacks = [LearningRateScheduler(tools.lr_scheduler, verbose=0)]

# The model above is built with n_outputs=1, so fit against a single target
# column (as every other fit/evaluate call in this notebook does) rather than
# the whole y_train array.
# NOTE(review): X_train_sc is not defined in the cells shown here - confirm.
history = model.fit(X_train_sc,
                    y_train[:, target_idx],
                    callbacks=callbacks,
                    batch_size=512,
                    epochs=200,
                    shuffle=True,
                    verbose=verbose)

### Train the model

In [None]:
# Build the model and scheduler through the ml_utils modules imported at the
# top of the notebook, with the same arguments as the other cells.
# NOTE(review): X_train_sc is not defined in the cells shown here - confirm.
model = models.build_convnet_model(X_in=X_train_sc, loss='mse', n_outputs=1)

# Column of y_train to train for (refer to cap_df)
target_idx = 2
verbose = 0
callbacks = [LearningRateScheduler(tools.lr_scheduler, verbose=verbose)]
history = model.fit(X_train_sc,
                    y_train[:, target_idx],
                    callbacks=callbacks,
                    batch_size=512,
                    epochs=200,
                    shuffle=True,
                    verbose=verbose)

### Evaluate the trained model on train, val and test

In [None]:
# Score the trained model on the training data for the selected target
model.evaluate(x=X_train_sc, y=y_train[:, target_idx])

In [None]:
# Score the trained model on the held-back test data for the selected target
model.evaluate(x=X_test_sc, y=y_test[:, target_idx])

In [None]:
# Score the trained model on the validation data for the selected target.
# NOTE(review): X_val_sc and y_val are not defined in the cells shown here -
# confirm where the validation split is created.
model.evaluate(x=X_val_sc, y=y_val[:, target_idx])

### Train predictions

In [None]:
# Overlay the model's predictions (drawn first) and the true values of the
# selected target for the training set
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(model.predict(X_train_sc))
ax.plot(y_train[:, target_idx])

### Val predictions

In [None]:
# Overlay the model's predictions (drawn first) and the true values of the
# selected target for the validation set.
# NOTE(review): X_val_sc and y_val are not defined in the cells shown here.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(model.predict(X_val_sc))
ax.plot(y_val[:, target_idx])

### Test predictions

In [None]:
# Overlay the model's predictions (drawn first) and the true values of the
# selected target for the held-back test set
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(model.predict(X_test_sc))
ax.plot(y_test[:, target_idx])