# Script configuration

In [65]:
!pip install pandas
!pip install matplotlib
!pip install sklearn
!pip install tensorflow
!pip install pydot
!pip install graphviz
!pip install hunga-bunga
!pip install tabulate

import random
import pandas as pd
import numpy as np
from datetime import datetime

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)
from matplotlib import pyplot as plt
from IPython.display import clear_output

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Seed every random number generator in scope, not only TensorFlow's: the
# trace features are shuffled with Python's `random` module further down, so
# leaving it unseeded makes the train/test split differ on every run.
# NumPy's global generator is seeded as well in case a third-party estimator
# draws from it (assumption — harmless if it does not).
RANDOM_SEED = 123
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

class CSV_FORMATS:
  """Identifiers of the dataset layouts the notebook knows how to read."""

  # Dataset format where every entry is an entity operation.
  OPERATIONS = "operations"

  # Dataset format with extracted metrics.
  METRICS = "metrics"

class TraceFeature():
  """A named span of dataset row positions, from `first_idx` to `last_idx`.

  `last_idx` defaults to None so that a span can be opened before its end
  is known and completed later by assigning the attribute.
  """

  def __init__(self, name: str, first_idx: int, last_idx: int = None):
    self.name, self.first_idx, self.last_idx = name, first_idx, last_idx

Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/


You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m
Looking in indexes: https://pypi.org/simple, https://pip:****@pypi.infra.unbabel.com/simple/
You should consider upgrading via the '/usr/local/bin/python3.8 -m pip install --upgrade pip' command.[0m


# Read dataset

In [78]:
def read_dataset(file, row_names, rows_to_exclude):
  """Load the CSV dataset into a DataFrame.

  The file's own header row is skipped and replaced by `row_names`.

  Args:
    file: path or file-like object accepted by `pd.read_csv`.
    row_names: column names to assign, in file order.
    rows_to_exclude: column names (from `row_names`) that must not be loaded.

  Returns:
    A DataFrame holding every column of `row_names` that is not excluded.
  """
  # Use the arguments rather than the notebook-level globals so the function
  # reads whatever the caller asks for.
  columns_to_load = [name for name in row_names if name not in rows_to_exclude]
  return pd.read_csv(file, names=row_names, skiprows=1, usecols=columns_to_load)


def split_dataset_by_trace_features(trace_dataset):
  """Group consecutive rows sharing a "Feature" value into TraceFeature spans.

  Args:
    trace_dataset: mapping/DataFrame whose "Feature" entry is an iterable of
      feature names, one per row.

  Returns:
    A list of TraceFeature objects in row order. Each one carries the feature
    name plus the positions of its first and last row (both inclusive). An
    empty dataset yields an empty list.
  """
  features = []
  last_row = None

  for row, name in enumerate(trace_dataset["Feature"]):
    if not features or name != features[-1].name:
      # A new feature starts here: close the span that was open until now.
      if features:
        features[-1].last_idx = row - 1
      features.append(TraceFeature(name, row))
    last_row = row

  # Close the span that is still open. It must be the most recently appended
  # one, even when the very last row is the row that opened it.
  if features:
    features[-1].last_idx = last_row
  return features


def create_batch(dataset_features, dataset_labels, trace_controllers, string_to_id):
  """Assemble the feature matrix and label vector for a set of row spans.

  Args:
    dataset_features: mapping/DataFrame of column name -> per-row values.
    dataset_labels: per-row labels, sliceable by row position.
    trace_controllers: objects exposing `first_idx` and `last_idx`
      (inclusive row positions); their rows are emitted in the given order.
    string_to_id: shared vocabulary for the "Feature" column. It must hold the
      next free integer id under the key "next" and is updated in place, so
      the same string maps to the same id across successive calls.
      NOTE(review): a feature literally named "next" would collide with that
      counter key.

  Returns:
    `(features, labels)`: a NumPy array with one row per dataset row and one
    column per entry of `dataset_features`, and a float32 NumPy array of the
    matching labels. Both are empty when `trace_controllers` is empty.
  """
  batch_rows = []
  label_chunks = []

  for controller in trace_controllers:
    rows = slice(controller.first_idx, controller.last_idx + 1)
    span_rows = [[] for _ in range(controller.first_idx, controller.last_idx + 1)]

    for feature, values in dataset_features.items():
      for offset, value in enumerate(values[rows]):
        if feature == "Feature":
          feature_id = string_to_id.get(value)
          # Compare against None: id 0 is a valid id and must not be treated
          # as "unknown", otherwise the first string gets a new id each time.
          if feature_id is None:
            feature_id = string_to_id["next"]
            string_to_id[value] = feature_id
            string_to_id["next"] += 1

          value = feature_id

        span_rows[offset].append(value)

    batch_rows.extend(span_rows)
    label_chunks.append(np.asarray(dataset_labels[rows]).astype('float32'))

  clusters_features = np.array(batch_rows)
  if label_chunks:
    clusters_labels = np.concatenate(label_chunks)
  else:
    clusters_labels = np.asarray([], dtype='float32')
  return clusters_features, clusters_labels


def get_kfold_iteration_batches(
    iteration,
    dataset_features,
    dataset_labels,
    trace_features,
    training_features_size,
    validation_features_size,
    testing_features_size
):
  """Build the training, testing and validation batches for one fold.

  The testing window is the `iteration`-th slice of `testing_features_size`
  entries of `trace_features`; the validation window (used only when the
  global APPLY_FIT_VALIDATION is truthy) follows it directly. Training uses
  the entries outside those windows, chosen according to the position of the
  fold relative to the global K_FOLD_VALUE. `training_features_size` is
  accepted but not read.

  Returns:
    Three `(features, labels)` pairs in the order training, testing,
    validation. The validation pair is `(None, None)` when validation is off.
  """
  test_lo = iteration * testing_features_size
  test_hi = test_lo + testing_features_size
  held_out_end = test_hi + validation_features_size

  before_test = trace_features[:test_lo]
  after_held_out = trace_features[held_out_end:]

  if iteration == 0:
    # First fold: nothing precedes the testing window.
    training_spans = after_held_out
  elif iteration < (K_FOLD_VALUE - 1):
    # Middle fold: train on both sides of the held-out windows.
    training_spans = before_test + after_held_out
  else:
    # Last fold: train only on what precedes the testing window.
    training_spans = before_test

  testing_spans = trace_features[test_lo:test_hi]

  # One vocabulary is shared by all batches of the fold; training is encoded
  # first, then testing, then validation.
  string_to_id = {"next": 0}
  training_pair = create_batch(dataset_features, dataset_labels, training_spans, string_to_id)
  testing_pair = create_batch(dataset_features, dataset_labels, testing_spans, string_to_id)

  validation_pair = (None, None)
  if APPLY_FIT_VALIDATION:
    validation_spans = trace_features[test_hi:held_out_end]
    validation_pair = create_batch(dataset_features, dataset_labels, validation_spans, string_to_id)

  (training_batch_features, training_batch_labels) = training_pair
  (testing_batch_features, testing_batch_labels) = testing_pair
  (validation_batch_features, validation_batch_labels) = validation_pair
  return (training_batch_features, training_batch_labels), (testing_batch_features, testing_batch_labels), (validation_batch_features, validation_batch_labels)

# Preprocessing data

To build the preprocessing model, start by building a set of symbolic keras.Input objects, matching the names and data-types of the CSV columns.

In [34]:
def create_input_objects(dataset_features):
  """Create one symbolic `tf.keras.Input` per feature column.

  Columns whose dtype is `object` become `tf.string` inputs; every other
  column becomes a `tf.float32` input. Each input has shape `(1,)` and is
  named after its column.

  Returns:
    A dict mapping column name -> `tf.keras.Input`, in column order.
  """
  def _symbolic_input(column_name, column):
    input_dtype = tf.string if column.dtype == object else tf.float32
    return tf.keras.Input(shape=(1,), name=column_name, dtype=input_dtype)

  return {
      column_name: _symbolic_input(column_name, column)
      for column_name, column in dataset_features.items()
  }

The first step in the preprocessing logic is to concatenate the numeric inputs together, and run them through a normalization layer:

In [35]:
def create_preprocessing_logic(dataset, dataset_features, inputs):
  """Build the Keras model that turns raw inputs into one float feature vector.

  Numeric (`tf.float32`) inputs are concatenated and passed through a
  `Normalization` layer adapted on the matching columns of `dataset`. Every
  other input is mapped through a `StringLookup` built from the unique values
  of its column in `dataset_features` and then through `CategoryEncoding`.
  All results are concatenated into a single output.

  Args:
    dataset: DataFrame used to adapt the normalization layer.
    dataset_features: DataFrame/mapping providing the vocabulary of each
      string column.
    inputs: dict of column name -> symbolic `tf.keras.Input`.

  Returns:
    A `tf.keras.Model` mapping `inputs` to the concatenated preprocessed
    features.
  """
  numeric_inputs = {name: tensor for name, tensor in inputs.items()
                    if tensor.dtype == tf.float32}

  # Collect all the symbolic preprocessing results, to concatenate them later.
  preprocessed_inputs = []
  if numeric_inputs:
    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(dataset[list(numeric_inputs.keys())]))
    preprocessed_inputs.append(norm(x))

  # For the string inputs use the preprocessing.StringLookup function to map
  # from strings to integer indices in a vocabulary. Next, use
  # preprocessing.CategoryEncoding to convert the indexes into float32 data
  # appropriate for the model.
  # This loop runs whether or not numeric inputs exist, so a dataset made only
  # of string columns is still preprocessed.
  for name, tensor in inputs.items():
    if tensor.dtype == tf.float32:
      continue

    lookup = preprocessing.StringLookup(vocabulary=np.unique(dataset_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    x = lookup(tensor)
    x = one_hot(x)
    preprocessed_inputs.append(x)

  preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
  preprocessing_model = tf.keras.Model(inputs, preprocessed_inputs_cat)
  tf.keras.utils.plot_model(model = preprocessing_model , rankdir="LR", dpi=126, show_shapes=True)

  return preprocessing_model

# Design Neural Network model

Now build the model on top of this:

In [36]:
def build_neural_network_model(body, preprocessing_head, inputs, loss, optimizer):
  """Stack `body` on top of the preprocessing model and compile the result.

  Args:
    body: list of Keras layers, wrapped in a `tf.keras.Sequential`.
    preprocessing_head: callable/model applied to `inputs` first.
    inputs: symbolic inputs of the final model.
    loss: the quantity the model should minimise during training.
    optimizer: the optimizer handed to `compile`.

  Returns:
    The compiled `tf.keras.Model`, tracking the "accuracy" metric.
  """
  features = preprocessing_head(inputs)
  network = tf.keras.Sequential(body)
  outputs = network(features)

  model = tf.keras.Model(inputs, outputs)

  # Notes on the usual choices for the two compile arguments:
  #  - loss: a binary classification loss fits problems with just two
  #    classes (1 or 0).
  #  - optimizer: Adam is a stochastic gradient descent method based on
  #    adaptive estimation of first-order and second-order moments. It is
  #    computationally efficient, has little memory requirement, and is well
  #    suited for problems that are large in terms of data/parameters.
  model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
  return model

# Training

In [37]:
def fit_neural_network(model, training_features, training_labels, validation_features, validation_labels, epochs, shuffle, weights):
  """Train `model` with early stopping and return the fit history.

  Validation data is forwarded to `model.fit` only when the global
  APPLY_FIT_VALIDATION is truthy. `weights` is passed as `class_weight`: a
  dictionary mapping class integer values to the importance given to each
  class.
  """
  # Stop training once the training `loss` has not improved by at least
  # `min_delta` (1e-4) for `patience` (2) epochs.
  early_stopping = tf.keras.callbacks.EarlyStopping(
      monitor="loss",
      min_delta=1e-4,
      patience=2,
      verbose=1,
  )

  if APPLY_FIT_VALIDATION:
    validation_data = (validation_features, validation_labels)
  else:
    validation_data = None

  return model.fit(
      x=training_features,
      y=training_labels,
      callbacks=[early_stopping],
      shuffle=shuffle,
      epochs=epochs,
      validation_data=validation_data,
      class_weight=weights,
      verbose=0,
  )

In [38]:
def plot_training_results(history):
  """Plot the loss and accuracy curves of a fit and print their final values.

  `history.history` must contain the 'loss' and 'accuracy' series. When the
  global APPLY_FIT_VALIDATION is truthy, the 'val_loss' and 'val_accuracy'
  series are drawn too (under the legend label 'test').
  """
  def _draw_curves(figure_id, title, train_key, validation_key):
    plt.figure(figure_id)
    plt.title(title)
    plt.plot(history.history[train_key], label='train')

    if APPLY_FIT_VALIDATION:
      plt.plot(history.history[validation_key], label='test')

    plt.legend()

  # Figure 1: loss during training. Figure 2: accuracy during training.
  _draw_curves(1, 'Loss', 'loss', 'val_loss')
  _draw_curves(2, 'Accuracy', 'accuracy', 'val_accuracy')
  plt.show()

  final_loss = history.history['loss'][-1]
  print(f"\nTraining results:\nFinal loss: {final_loss}")
  final_accuracy = history.history['accuracy'][-1]
  print(f"Final accuracy: {final_accuracy}\n")

# Testing

In [39]:
from sklearn.metrics import roc_curve, auc

def test_model(model, testing_features, testing_labels, verbose=True):
  print(f"Results for {testing_labels.size} test samples\n")

  results = model.evaluate(testing_features, testing_labels, batch_size=testing_labels.size, verbose=0)
  print(f"Loss {results[0]} | Recall: {results[1]}\n")

  predictions = model.predict(testing_features)

  if verbose:
    for idx, prediction in enumerate(predictions):
      label = 0 if prediction[0] > 0.500 else 1
      percentage = prediction[0] if label == 0 else prediction[1]
      percentage = int(percentage * 100)
      correct_label = testing_labels[idx]
      feature = testing_features["Feature"][idx]
      
      print(f"Prediction: {label} ({percentage} %) | Correct: {correct_label} | Feature: {feature}")
  
  return predictions, results[0], results[1]

# TODO: if one feature has multiple clusters predicted as the orchestrator,
#  we should select the cluster with the highest probability.

# evaluate the ROC AUC of the predictions
def plot_testing_results(predictions, testing_labels):
  """Plot the ROC curve of the predictions and return the area under it.

  Args:
    predictions: iterable of per-sample model outputs; assumed to be
      two-column rows `[score of class 0, score of class 1]`, as implied by
      the 0.5 threshold on column 0 used elsewhere in the notebook.
    testing_labels: the true 0/1 labels, aligned with `predictions`.

  Returns:
    The ROC AUC as computed by `auc(fpr, tpr)`.
  """
  # Feed the continuous score of the positive class to `roc_curve`. Using
  # thresholded 0/1 labels instead would collapse the curve to a single
  # operating point and distort the AUC.
  positive_scores = [prediction[1] for prediction in predictions]

  fpr_keras, tpr_keras, thresholds_keras = roc_curve(testing_labels, positive_scores)

  auc_keras = auc(fpr_keras, tpr_keras)

  print("\n")
  plt.figure(1)
  plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
  plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
  plt.xlabel('False positive rate')
  plt.ylabel('True positive rate')
  plt.title('ROC curve')
  plt.legend(loc='best')
  plt.show()

  print(f"AUC: {auc_keras}")
  return auc_keras

# Main script execution

In [80]:
# -------------------------------------------------------------
# SCRIPT CONFIGURATION
# -------------------------------------------------------------
# Which dataset layout to load; selects the file and column list below.
CSV_FORMAT = CSV_FORMATS.METRICS

# Layers for a Keras network ending in a 2-unit softmax.
# NOTE(review): not referenced by the execution section visible below.
NETWORK_ARCHITECTURE = [
    layers.Dense(9, activation="relu"),
    layers.Dense(16, activation="relu"),
    layers.Dense(2, activation="softmax")
]

# Training hyper-parameters.
# NOTE(review): these four are not read anywhere in the visible code.
TRAINING_EPOCHS = 100
LEARNING_RATE = 0.0001
SHUFFLE_ON_TRAINING = False
CLASS_WEIGHTS = {0:1, 1:3}

# Read as globals by the batching, fitting and plotting helpers.
APPLY_FIT_VALIDATION = False
# Number of folds; 1 means a single split with 10% of the trace features
# left out of training (see the size computation further down).
K_FOLD_VALUE = 1

# When True, the final section tries to save the trained model to disk.
EXPORT_MODEL = False

# -------------------------------------------------------------
# EXECUTION
# -------------------------------------------------------------
# Pick the input file, the names assigned to its columns (in file order) and
# the columns that are dropped at load time, according to CSV_FORMAT.
# NOTE(review): no `else` branch — an unknown CSV_FORMAT leaves CSV_FILE,
# CSV_ROWS and CSV_ROWS_TO_EXCLUDE undefined.
if CSV_FORMAT == CSV_FORMATS.METRICS:
  CSV_FILE = "../output/ml-dataset-23-03.csv"
  CSV_ROWS = ["Codebase", "Feature", "Cluster", "CLIP", "CRIP", "CROP", "CWOP", "CIP", "COP", "CPIF", "CIOF", "Orchestrator"]
  CSV_ROWS_TO_EXCLUDE = ["Cluster", "Codebase", "CROP", "CWOP", "CLIP", "CRIP"]
  #CSV_ROWS_TO_EXCLUDE = []

elif CSV_FORMAT == CSV_FORMATS.OPERATIONS:
  CSV_FILE = "2021-03-16 23:41:19.csv"
  CSV_ROWS = ["Codebase", "Feature", "Cluster", "Entity", "Operation", "Orchestrator"]
  #CSV_ROWS_TO_EXCLUDE = ["Codebase"]
  CSV_ROWS_TO_EXCLUDE = ["Cluster"]


# Load the selected CSV.
dataset = read_dataset(CSV_FILE, CSV_ROWS, CSV_ROWS_TO_EXCLUDE)
# print(dataset.head())

# Separate the label column ('Orchestrator') from the feature columns;
# `dataset` itself keeps every loaded column.
dataset_features = dataset.copy()
dataset_labels = dataset_features.pop('Orchestrator')

# generate a trace_features array to make the splitting of the batches easier
trace_features = split_dataset_by_trace_features(dataset)
# Shuffle at feature level so that all rows of one feature stay together.
random.shuffle(trace_features)

# preprocessing
# NOTE(review): `inputs` and `trace_preprocessing` are built here but are not
# used by the training loop visible below.
inputs = create_input_objects(dataset_features)
trace_preprocessing = create_preprocessing_logic(dataset, dataset_features, inputs)

# Split sizes are counted in trace features (row spans), not in rows.
number_trace_features = len(trace_features)
if K_FOLD_VALUE == 1:
    # Single split: train on 90% of the trace features.
    training_features_size = int(number_trace_features - (number_trace_features*0.1))
else:
    # K folds: each fold leaves 1/K of the trace features out of training.
    training_features_size = int(number_trace_features - (number_trace_features/K_FOLD_VALUE))
# With validation enabled, half of the left-out features are used for validation.
validation_features_size = int((number_trace_features - training_features_size) / 2) if APPLY_FIT_VALIDATION else 0
testing_features_size = number_trace_features - training_features_size - validation_features_size

print(f"\n\nBatch size: {dataset_labels.size} | Number of trace features: {number_trace_features}")
print(f"Training size: {training_features_size} | Validation size: {validation_features_size} | Testing size: {testing_features_size}\n\n")


# Per-fold result accumulators.
# NOTE(review): only `labels` is appended to in the loop below; the other
# lists stay empty in the visible code.
histories = []
labels = []
predictions = []
aucs = []
losses = []
recalls = []

# NOTE(review): HungaBungaRegressor is imported but not used below.
from hunga_bunga import HungaBungaClassifier, HungaBungaRegressor

# One iteration per fold: build the fold's batches, then fit a
# HungaBungaClassifier on the training batch.
for iteration in range(K_FOLD_VALUE):
  (training_batch_features, training_batch_labels), (testing_batch_features, testing_batch_labels), (validation_batch_features, validation_batch_labels) = get_kfold_iteration_batches(
      iteration,
      dataset_features,
      dataset_labels,
      trace_features,
      training_features_size,
      validation_features_size,
      testing_features_size,
  )
  labels.append(testing_batch_labels)

  # The testing and validation batches are produced above but are not
  # passed to the classifier here.
  clf = HungaBungaClassifier(brain=False)
  clf.fit(
      training_batch_features,
      training_batch_labels,
  )

  print("\n\n--------------------------------------------------------------------------------------\n\n")


# Optionally persist the trained model under a timestamped name
# (day_month_year_hour_minute_second).
# NOTE(review): `model` is never assigned in the visible code (the loop above
# creates `clf`), so enabling EXPORT_MODEL would raise NameError — confirm
# which object is meant to be saved.
if EXPORT_MODEL:
  filename = f'trace_trained_model-{datetime.now().strftime("%d_%m_%Y_%H_%M_%S")}'
  model.save(filename)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


Batch size: 2325 | Number of trace features: 717
Training size: 645 | Validation size: 0 | Testing size: 72




 60%|██████    | 9/15 [03:48<02:32, 25.40s/it]


KeyboardInterrupt: 