# Example modeling process

### Imports

In [1]:
import os
import sys
# Put the parent directory on the import path (only once) so that the local
# `helpers` module imported just below can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from helpers import (get_training_observations, 
                     get_training_labels)
import pandas as pd
import numpy as np
from joblib import dump
from joblib import load

# Model specific imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import metrics
# Set the level of TensorFlow's Python logger to INFO.
tf.get_logger().setLevel('INFO')

### Load training data and labels

In [2]:
# Load the training observations and their labels through the project helpers.
x_train = get_training_observations()
y_train = get_training_labels()

# Load a previously saved label encoder rather than fitting a new one here.
# NOTE(review): presumably this keeps class indices consistent with other
# saved artifacts — confirm. joblib.load unpickles, so only load trusted files.
le = load('../model_joblibs/final_labelEncoder.joblib')

# Integer-encode the EMPO_3 label column.
y_train_transformed = le.transform(y_train['EMPO_3'])

# Hold out 20% for validation, stratified on the encoded label so class
# proportions are preserved; fixed seed for a reproducible split.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train_transformed,
    test_size=0.2,
    random_state=0,
    stratify=y_train_transformed,
)

Getting all training observations from 'metagenome_classification.db'...
Getting all training labels from 'metagenome_classification.db'...


In [3]:
# Preview the first ten rows of the training observations.
x_train.head(10)

index,PF00001.19,PF00002.22,PF00003.20,PF00004.27,PF00005.25,PF00006.23,PF00007.20,PF00008.25,PF00009.25,PF00010.24,...,PF17216.1,PF17217.1,PF17218.1,PF17219.1,PF17220.1,PF17221.1,PF17222.1,PF17223.1,PF17224.1,PF17225.1
0,4e-06,6.092434e-07,0.0,0.001285,0.021986,0.000809,0.0,1e-05,0.001546,6.092434e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1e-05,0.0,3e-06,0.001523,0.01532,0.001126,0.0,2.4e-05,0.002005,3.392429e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.001595,0.01164,0.001008,0.0,0.0,0.00165,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1e-05,0.002188,0.018117,0.001597,0.0,0.0,0.002543,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2e-06,0.0,0.0,0.001786,0.014518,0.000924,0.0,0.0,0.001915,3.916983e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.2e-05,1.197289e-05,0.0,0.002706,0.008824,0.001257,0.0,0.0,0.002155,0.0,...,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.002233,0.013982,0.001488,0.0,5e-06,0.002459,5.383928e-06,...,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.001661,0.018014,0.000828,0.0,0.0,0.001876,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.005429,9.319447e-05,4.7e-05,0.001631,0.012302,0.000815,0.0,0.000396,0.001375,0.0003494793,...,0.0,4.7e-05,0.0,0.0,0.0,2.3e-05,0.0,0.0,0.0,0.0
9,8e-06,0.0,0.0,0.002739,0.016584,0.00123,0.0,0.0,0.002286,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preview the first rows of the label table (EMPO_1 / EMPO_2 / EMPO_3 columns).
y_train.head()

index,EMPO_1,EMPO_2,EMPO_3
0,Host-associated,Plant,Plant rhizosphere
1,Free-living,Non-saline,Soil (non-saline)
2,Free-living,Saline,Water (saline)
3,Free-living,Non-saline,Water (non-saline)
4,Free-living,Non-saline,Soil (non-saline)


### Build the model


In [5]:
# Short alias for the Keras backend module (used later for K.cast / K.one_hot).
K = tf.keras.backend

def build_model(
    activation='relu',
    hidden_layer_sizes=[],
    final_layer_activation='softmax',
    dropout=0.0,
    optimizer='Adam',
    learning_rate=0.01,
    kernel_regularizer=1e-5,
    bias_regularizer=1e-5,
    activity_regularizer=1e-5,
    n_classes=15
):
    """Build and compile a feed-forward multi-class classifier using Keras.

    With no hidden layers this is a multi-class logistic regression; each
    accepted entry of `hidden_layer_sizes` adds one fully connected layer
    (optionally followed by dropout) in front of the output layer.

    The Keras session is cleared and the NumPy / TensorFlow seeds are reset
    to 0 on every call, so repeated builds start from the same state.

    Args:
      activation: The activation function to use for the hidden layers.
      hidden_layer_sizes: A list with the number of units in each hidden
        layer. Entries that are not strictly greater than `n_classes` are
        skipped (no layer is added for them). The default list is never
        mutated.
      final_layer_activation: The activation function of the output layer.
      dropout: Dropout rate applied after each hidden layer; no dropout
        layer is added when this is 0 or less.
      optimizer: The optimizer to use ("SGD" or "Adam").
      learning_rate: The desired learning rate for the optimizer.
      kernel_regularizer: L2 factor applied to every Dense layer's kernel.
      bias_regularizer: L2 factor applied to every Dense layer's bias.
      activity_regularizer: L2 factor applied to every Dense layer's output.
      n_classes: Number of output classes in the dataset.

    Returns:
      model: A compiled tf.keras Sequential model using categorical
        cross-entropy loss and a weighted F1 metric.

    Raises:
      ValueError: If `optimizer` is neither "SGD" nor "Adam".
    """
    # Validate up front so a bad optimizer name fails before any model or
    # session state is touched. (Raising a plain string is itself a TypeError
    # in Python 3, so a real exception type is required here.)
    if optimizer not in ("SGD", "Adam"):
        raise ValueError(f"Unsupported optimizer, {optimizer}")

    tf.keras.backend.clear_session()
    np.random.seed(0)
    tf.random.set_seed(0)

    def _l2_penalties():
        # Fresh regularizer objects for each layer, as keyword arguments.
        return dict(
            kernel_regularizer=tf.keras.regularizers.L2(kernel_regularizer),
            bias_regularizer=tf.keras.regularizers.L2(bias_regularizer),
            activity_regularizer=tf.keras.regularizers.L2(activity_regularizer),
        )

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Flatten())

    for hidden_layer_size in hidden_layer_sizes:
        # Hidden layers no wider than the output layer are skipped.
        if hidden_layer_size <= n_classes:
            continue
        model.add(tf.keras.layers.Dense(
            hidden_layer_size,
            activation=activation,
            **_l2_penalties()
        ))
        if dropout > 0:
            model.add(tf.keras.layers.Dropout(dropout))

    model.add(tf.keras.layers.Dense(
        n_classes,
        activation=final_layer_activation,
        **_l2_penalties()
    ))

    if optimizer == "SGD":
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:  # "Adam" — the only other value accepted by the check above
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(
        loss="categorical_crossentropy", optimizer=opt,
        # The metric must agree with the width of the output layer.
        metrics=[tfa.metrics.F1Score(num_classes=n_classes, average='weighted')]
    )
    return model

### Train Model

#### Use hyperparameters for 2-layer model as determined by Optuna run:

With stratified split: Trial 22 finished with value: 0.936034083366394 and parameters: {'learning_rate': 0.008898464308222692, 'hidden_layer_size1': 495, 'hidden_layer_size2': 765, 'kernel_regularizer': 8.856221026715601e-07, 'bias_regularizer': 7.737459161815707e-05, 'activity_regularizer': 1.561737020593232e-05, 'dropout': 0.018195406988695262}. Best is trial 22 with value: 0.936034083366394.

In [6]:
def train_model(x_tr=x_tr, x_val=x_val, y_tr=y_tr, y_val=y_val,
                epochs=50, batch_size=64):
    """Train the two-hidden-layer network and report validation metrics.

    Uses fixed hyperparameters (rounded from the Optuna trial quoted above
    this cell), fits on the training split, evaluates once on the validation
    split and prints the resulting F1 score and loss.

    Args:
      x_tr: Training observations.
      x_val: Validation observations.
      y_tr: Integer-encoded training labels.
      y_val: Integer-encoded validation labels.
      epochs: Number of training epochs.
      batch_size: Mini-batch size used during fitting.

    Returns:
      The fitted tf.keras model.
    """
    n_classes = 15  # one-hot depth and output width must agree

    params = {
        'learning_rate': 0.009,
        'kernel_regularizer': 8.9e-07,
        'bias_regularizer': 7.7e-05,
        'activity_regularizer': 1.6e-05,
        'dropout': 0.018
    }
    hidden_layer_sizes = [495, 765]

    # The model is compiled with categorical cross-entropy, so the integer
    # labels are expanded to one-hot vectors.
    y_tr_one_hot = K.one_hot(K.cast(y_tr, dtype='int64'), n_classes)
    y_val_one_hot = K.one_hot(K.cast(y_val, dtype='int64'), n_classes)

    # Create model
    model = build_model(
        **params,
        n_classes=n_classes,
        optimizer='Adam',
        final_layer_activation='softmax',
        hidden_layer_sizes=hidden_layer_sizes
    )

    # Fit model
    model.fit(x=x_tr,
              y=y_tr_one_hot,
              epochs=epochs,
              batch_size=batch_size,
              verbose=0)

    evaluation = model.evaluate(x=x_val, y=y_val_one_hot, verbose=0, return_dict=True)
    # Local names chosen so they do not shadow sklearn's imported f1_score.
    val_f1 = evaluation['f1_score']
    val_loss = evaluation['loss']

    print(f'F1 Score: {val_f1}; Loss={val_loss}')

    return model

# Train on the stratified split created earlier and keep the fitted model.
nn = train_model(x_tr, x_val, y_tr, y_val)

2022-08-04 02:44:55.772282: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


F1 Score: 0.9188954830169678; Loss=0.4476182758808136


### Save fitted model

In [7]:
# Save the fitted model as a joblib file in the 'model_joblibs' folder.
# (`dump` is already imported in the imports cell at the top of the notebook.)
# NOTE(review): joblib pickles the Keras model; Keras' own `model.save` is the
# documented persistence route — confirm downstream loaders expect this format
# before changing it.
dump(nn, '../model_joblibs/neural_network_EMPO3.joblib')

INFO:tensorflow:Assets written to: ram://27c4b29f-8e13-4109-b1b0-a8112dbddedb/assets


['../model_joblibs/neural_network_EMPO3.joblib']

## SHAP

In [None]:
import shap

# KernelExplainer re-evaluates the model against every background row for each
# perturbed sample, so passing the whole training frame makes the explanation
# cost scale with len(x_tr). Use a small, seeded random subset as background.
shap_background = shap.sample(x_tr, 100, random_state=0)
explainer = shap.KernelExplainer(nn.predict, shap_background)

In [None]:
# Compute SHAP values for every row of the validation split.
# NOTE(review): the reason for nsamples=1983 is not stated anywhere in this
# notebook — confirm the intended value and consider naming it as a constant.
shap_values = explainer.shap_values(x_val,nsamples=1983)

In [None]:
# Label the plot with the column names of the validation frame (no separate
# feature-name list is defined in this notebook).
shap.summary_plot(shap_values, x_val, feature_names=list(x_val.columns))

## LIME (keras_explain)

In [11]:
# NOTE(review): the output recorded for this cell is a ValueError — the model
# is called with a tensor of shape (None, 999, 16306, 3) while its first Dense
# layer expects 16306 features on the last axis. Presumably keras_explain's
# Lime is designed for image inputs rather than tabular rows — confirm, then
# replace it with a tabular explainer or remove this cell so the notebook
# survives Restart & Run All.
from keras_explain.lime_ribeiro import Lime

# One-hot encode the integer labels with depth 15.
y_tr_one_hot = K.one_hot(K.cast(y_tr, dtype='int64'), 15)
# NOTE(review): y_val_one_hot is not used anywhere in this cell.
y_val_one_hot = K.one_hot(K.cast(y_val, dtype='int64'), 15)

# Rebinds `explainer`, which the SHAP section also uses for its KernelExplainer.
explainer = Lime(nn)
exp_pos, exp_neg = explainer.explain(x_tr, y_tr_one_hot)

  0%|          | 0/1000 [00:00<?, ?it/s]



ValueError: in user code:

    File "/Users/ekirton/Documents/ischool/207 Intro Machine Learning/final-project/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1845, in predict_function  *
        return step_function(self, iterator)
    File "/Users/ekirton/Documents/ischool/207 Intro Machine Learning/final-project/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1834, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ekirton/Documents/ischool/207 Intro Machine Learning/final-project/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1823, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/ekirton/Documents/ischool/207 Intro Machine Learning/final-project/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1791, in predict_step
        return self(x, training=False)
    File "/Users/ekirton/Documents/ischool/207 Intro Machine Learning/final-project/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/ekirton/Documents/ischool/207 Intro Machine Learning/final-project/venv/lib/python3.9/site-packages/keras/engine/input_spec.py", line 248, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential" (type Sequential).
    
    Input 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 16306, but received input with shape (None, 48869082)
    
    Call arguments received by layer "sequential" (type Sequential):
      • inputs=tf.Tensor(shape=(None, 999, 16306, 3), dtype=float32)
      • training=False
      • mask=None
