In [1]:
import tensorflow as tf
from keras.layers import Embedding, GRU, Dense, Dropout, Input, Concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy

# Number of leading columns of each data row that are fed to the model: it sets the
# model's Input shape and the column slice applied to rows read from the HDF5 datasets.
MAX_SEQUENCE_LENGTH = 16750

def buildModel():
    """Build and compile the multi-output GRU classifier.

    A shared stem (Embedding -> GRU) feeds ``NUM_BRANCHES`` independent branches,
    each ending in a single sigmoid unit named ``output_<i>``.

    Returns:
        A compiled Keras ``Model`` with one input of length ``MAX_SEQUENCE_LENGTH``
        and a list of ``NUM_BRANCHES`` outputs. It is compiled with Adam, binary
        cross-entropy, and an ``'accuracy'`` metric registered for every output.
    """
    # Model hyperparameters
    VOCAB_SIZE = 82
    OUTPUT_DIM = 5
    HIDDEN_UNITS_GRU = 64
    HIDDEN_UNITS_DENSE = [128, 64, 1]
    DROPOUT_RATE = 0.2
    LEARNING_RATE = 0.001
    NUM_LABELS = 5  # excluding label 4: 0, 1, 2, 3, 5
    NUM_BRANCHES = NUM_LABELS  # one branch per retained label

    # Stem layers (shared feature extractor). The sequence length is already fixed by
    # the Input shape, so the Embedding layer does not need `input_length`
    # (deprecated in Keras 3).
    inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=OUTPUT_DIM)(inputs)
    gru_output = GRU(HIDDEN_UNITS_GRU)(embedding)

    # One branch per label; the per-output metric is registered in the same pass so
    # that the output names and the metric keys cannot drift apart.
    branches = []
    label_metrics = {}
    for i in range(NUM_BRANCHES):
        output_name = f'output_{i}'
        # NOTE(review): dense1 and dense2 use the default (linear) activation, so
        # the only non-linearity in a branch is the final sigmoid -- confirm this
        # is intended before changing it, since it alters the trained model.
        dense1 = Dense(HIDDEN_UNITS_DENSE[0])(gru_output)
        dropout = Dropout(DROPOUT_RATE)(dense1)
        dense2 = Dense(HIDDEN_UNITS_DENSE[1])(dropout)
        dense3 = Dense(HIDDEN_UNITS_DENSE[2], activation='sigmoid', name=output_name)(dense2)
        branches.append(dense3)
        label_metrics[output_name] = 'accuracy'

    # The branches are kept as separate outputs (not concatenated).
    model = Model(inputs=inputs, outputs=branches)

    # Compile the model; the single loss instance is applied to every output.
    model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                  loss=BinaryCrossentropy(from_logits=False),
                  metrics=label_metrics)

    return model

# Instantiate the network and report how many output heads it exposes.
model = buildModel()
num_outputs = len(model.outputs)
print("Number of outputs:", num_outputs)
# Header and the list of output names, one per line.
print('model output names:', model.output_names, sep='\n')
# Uncomment for a layer-by-layer summary:
# model.summary()

Number of outputs: 5
model output names:
['output_0', 'output_1', 'output_2', 'output_3', 'output_4']


In [2]:
from keras.callbacks import Callback

class MetricsCallback(Callback):
    """Collect per-output loss and accuracy at the end of every epoch.

    After each epoch one dict ``{'loss': [...], 'accuracy': [...]}`` is appended to
    ``chunk_metrics``. Each list holds the values found in ``logs`` under
    ``<output_name>_loss`` / ``<output_name>_accuracy``, in the order of
    ``self.model.output_names``. Keys missing from ``logs`` are skipped.
    """

    def __init__(self):
        super().__init__()
        # One entry per completed epoch, in call order.
        self.chunk_metrics = []

    def on_epoch_end(self, epoch, logs=None):
        """Record the per-output metrics reported for this epoch.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric name -> value mapping for the epoch; ``None`` is treated
                as an empty mapping.
        """
        # `logs` defaults to None; membership tests on None would raise TypeError.
        logs = logs or {}

        chunk_loss = []
        chunk_accuracy = []

        # Pick up the loss and accuracy entries of each model output, if present.
        for output_name in self.model.output_names:
            loss_key = f'{output_name}_loss'
            accuracy_key = f'{output_name}_accuracy'

            if loss_key in logs:
                chunk_loss.append(logs[loss_key])
            if accuracy_key in logs:
                chunk_accuracy.append(logs[accuracy_key])

        self.chunk_metrics.append({'loss': chunk_loss, 'accuracy': chunk_accuracy})

In [3]:
pip install scikit-learn



In [5]:
# Training with HDF5 datasets

import tensorflow as tf
import pandas as pd
import h5py
from google.colab import drive
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Mount Google Drive so the HDF5 datasets and the model output folder are reachable
drive.mount('/content/drive')

# Dataset locations
training_h5_path = '/content/drive/MyDrive/Practicum/HDF5Data/training.h5'
validation_h5_path = '/content/drive/MyDrive/Practicum/HDF5Data/validation.h5'

# NOTE(review): this value is not used below -- the training loop derives the number
# of chunks from the dataset size and `chunk_size`. Kept only in case other cells
# read it.
num_train_chunks = 70

# Training hyperparameters
num_epochs = 1
batch_size = 32

# Number of dataset rows read from disk and passed to one `model.fit` call
chunk_size = 1024

# Columns of the label matrix that are used, in model-output order: column
# labelsColumns[k] is the target of 'output_<k>' (column 4 is skipped).
labelsColumns = [0, 1, 2, 3, 5]


def _targets_by_output(label_matrix, label_columns):
    """Map each model output name ('output_<k>') to its column of `label_matrix`."""
    return {f'output_{k}': label_matrix[:, column]
            for k, column in enumerate(label_columns)}


# Open both HDF5 files in a `with` block so they are closed even if training fails
with h5py.File(training_h5_path, 'r') as hdf5_file_training, \
        h5py.File(validation_h5_path, 'r') as hdf5_file_validation:

    # Handles to the on-disk datasets (rows are only read when sliced)
    training_data = hdf5_file_training['data']
    training_labels = hdf5_file_training['labels']
    print(f'training data shape: {training_data.shape}')
    print(f'training labels shape: {training_labels.shape}')

    validation_data = hdf5_file_validation['data']
    validation_labels = hdf5_file_validation['labels']
    print(f'validation data shape: {validation_data.shape}')
    print(f'validation labels shape: {validation_labels.shape}')

    # Build a fresh, untrained model
    model = buildModel()

    print('model output names:')
    print(model.output_names)

    # Total number of rows in the training dataset
    total_rows_training = training_data.shape[0]

    # Training phase
    for epoch in range(num_epochs):
        trainingCtr = 0
        metrics_callback_2 = MetricsCallback()

        for start_index in range(0, total_rows_training, chunk_size):
            # The last chunk may be shorter than `chunk_size`
            end_index = min(start_index + chunk_size, total_rows_training)

            # Slice rows and columns in the HDF5 read itself so that only the
            # first MAX_SEQUENCE_LENGTH columns are loaded into memory.
            train_X = training_data[start_index:end_index, :MAX_SEQUENCE_LENGTH]
            labels = training_labels[start_index:end_index]

            # Train the model on the current chunk (one pass over it)
            model.fit(train_X, _targets_by_output(labels, labelsColumns),
                      callbacks=[metrics_callback_2], batch_size=batch_size,
                      epochs=1, verbose=1)

            trainingCtr += 1
            print(f'trained chunk: {trainingCtr}')

        # Evaluate on the whole validation set once the epoch is finished
        val_X = validation_data[:, :MAX_SEQUENCE_LENGTH]
        val_labels = validation_labels[:]
        evaluation_results = model.evaluate(val_X, _targets_by_output(val_labels, labelsColumns), verbose=0)
        # Print the values returned by `evaluate` on a single line
        for result in evaluation_results:
            print(result, end=' ')
        print()  # end the line so later output starts on a new one

        # Predict on the validation set; stacking the per-output predictions puts
        # the model output index on the first axis.
        predictions = model.predict(val_X)
        predictions_array = np.array(predictions)

        # Convert the predictions to binary format using a threshold of 0.5
        binary_predictions = (predictions_array >= 0.5).astype(int)

        # Precision, recall and F1-score for each output, reported per class
        # (average=None). zero_division=0 yields the same 0.0 as the default when a
        # class is never predicted, without emitting the warning.
        for i, column in enumerate(labelsColumns):
            precision, recall, f1_score, _ = precision_recall_fscore_support(
                val_labels[:, column],
                binary_predictions[i].ravel(),  # flatten (rows, 1) -> (rows,)
                average=None,
                zero_division=0,
            )
            print(f"Label {i}: Precision={precision}, Recall={recall}, F1-score={f1_score}")

# Save the trained model
model.save("/content/drive/MyDrive/Practicum/TrainedModel/model_escort_design_1.h5")

# TODO: evaluation on a held-out test set is not implemented in this cell.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
training data shape: (79414, 24600)
training labels shape: (79414, 6)
validation data shape: (10831, 24600)
validation labels shape: (10831, 6)
model output names:
['output_0', 'output_1', 'output_2', 'output_3', 'output_4']
trained chunk: 1
trained chunk: 2
trained chunk: 3
trained chunk: 4
trained chunk: 5
trained chunk: 6
trained chunk: 7
trained chunk: 8
trained chunk: 9
trained chunk: 10
trained chunk: 11
trained chunk: 12
trained chunk: 13
trained chunk: 14
trained chunk: 15
trained chunk: 16
trained chunk: 17
trained chunk: 18
trained chunk: 19
trained chunk: 20
trained chunk: 21
trained chunk: 22
trained chunk: 23
trained chunk: 24
trained chunk: 25
trained chunk: 26
trained chunk: 27
trained chunk: 28
trained chunk: 29
trained chunk: 30
trained chunk: 31
trained chunk: 32
trained chunk: 33
trained chunk: 34
trained chunk: 35
trained chunk: 36
trained

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Label 0: Precision=[0.85338381 0.        ], Recall=[1. 0.], F1-score=[0.9208927 0.       ]
Label 1: Precision=[0.8315916  0.77777778], Recall=[0.99955536 0.00762943], F1-score=[0.90787016 0.01511063]
Label 2: Precision=[0.75484121 0.64015905], Recall=[0.97730977 0.11282411], F1-score=[0.85178913 0.19183795]
Label 3: Precision=[0.72007475 0.66716867], Recall=[0.97069743 0.1346914 ], F1-score=[0.82681123 0.22413357]
Label 4: Precision=[0.57542863 0.78644764], Recall=[0.96462585 0.15471622], F1-score=[0.72084895 0.2585654 ]
