# Easily export Jupyter cells to a Python module
https://github.com/fastai/course-v3/blob/master/nbs/dl2/notebook2script.py

In [4]:
! python /tf/src/scripts/notebook2script.py finetuning_gpt2.ipynb

Converted finetuning_gpt2.ipynb to exp/nb_finetuning.py


In [None]:
#export
from exp.nb_embedding import generate_embeddings_from_files, generate_embeddings_from_list, generate_embeddings_from_text_files
from tensorflow.keras import layers
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf


tf.__version__

In [None]:
cd /tf/src/data/gpt-2

# Read in training data

In [None]:
# Load the training CSV; later cells read its 'code' and 'label' columns.
df = pd.read_csv("/tf/src/data/datasets/security-training.csv")
df.head()
# Debug helper (disabled): print every entry of the 'code' column.
# for method in df['code']:
#     print(method)

In [None]:
# Count how many rows carry each distinct label value (class balance check).
a = np.asarray(df["label"])
unique, counts = np.unique(a, return_counts=True)
dict(zip(unique, counts))  # label value -> number of occurrences

# Save Features

In [None]:
# Generate features from the files under the given path, using the checkpoint
# in `run1` (see exp.nb_embedding for the exact semantics).
# NOTE(review): `samples = 100` presumably caps how many inputs are processed — confirm.
# This cell and the two feature cells after it each rebind `features`;
# whichever ran last determines what gets pickled.
features = generate_embeddings_from_files(
    "/tf/src/data/gpt-2/checkpoint/run1",
    "/tf/src/data/methods/DATA00M_[god-r]/test",
    samples = 100
)

len(features)

In [None]:
# Generate features from the 'code' column of the training dataframe, using
# the checkpoint in `run1` (see exp.nb_embedding for the exact semantics).
# NOTE(review): `samples = 1000` presumably caps how many entries are processed;
# the security-label cell slices the first 1000 labels to match — confirm.
features = generate_embeddings_from_list(
    "/tf/src/data/gpt-2/checkpoint/run1",
    df['code'],
    samples = 1000
)

len(features)

In [None]:
# Generate features from two text files (fixed.txt first, buggy.txt second),
# using the checkpoint in `run1` (see exp.nb_embedding for the exact semantics).
# NOTE(review): `samples = 1000` is presumably applied per file, which would
# match the 1000 zeros + 1000 ones built in the buggy/non-buggy label cell — confirm.
features = generate_embeddings_from_text_files(
    "/tf/src/data/gpt-2/checkpoint/run1",
    ["/tf/src/data/datasets/train/fixed.txt", "/tf/src/data/datasets/train/buggy.txt"],
    samples = 1000
)

len(features)

In [None]:
# Cache the current `features` on disk; the same path is read back in the
# "Read and Prepare Features and Labels" section.
with open('/tf/src/data/embeddings/vulnerability_gpt-2.pickle', 'wb') as f:
    pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)

# Read and Prepare Features and Labels

In [None]:
#export
def prepare_dataset(pickle_path, MAX_LEN = 1024):
    """Load pickled features and pad/truncate them to a fixed length.

    Args:
        pickle_path: Path of the pickle file holding the features.
        MAX_LEN: Target sequence length handed to ``pad_sequences``.

    Returns:
        The array produced by ``pad_sequences``.
    """
    # pickle can execute arbitrary code while loading: only use trusted files.
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)

    as_array = np.asarray(loaded)
    # NOTE(review): pad_sequences defaults to dtype='int32'; if the pickled
    # features are float embeddings they would be truncated — confirm.
    return tf.keras.preprocessing.sequence.pad_sequences(as_array, maxlen=MAX_LEN)

In [None]:
MAX_LEN = 1024

# Reuse the exported helper instead of repeating its load/convert/pad steps
# inline, so the notebook and the exported module cannot drift apart.
features = prepare_dataset('/tf/src/data/embeddings/vulnerability_gpt-2.pickle', MAX_LEN)
features.shape

## Security vulnerability labels

In [None]:
# Labels for the first 1000 dataframe rows.
# NOTE(review): the hard-coded 1000 assumes `features` was generated from the
# first 1000 entries of df['code'] (the `samples = 1000` cell) — confirm they stay aligned.
y = np.array(df['label'][:1000])
y.shape, features.shape  # leading dimensions should agree

## Buggy vs. Non Buggy labels

In [None]:
# Integer labels: 1000 zeros followed by 1000 ones.
# NOTE(review): presumably 0 = fixed.txt samples and 1 = buggy.txt samples,
# following the file order given to generate_embeddings_from_text_files — confirm.
a = np.zeros(1000, dtype=int)
b = np.ones(1000, dtype=int)
y = np.concatenate((a, b))

y.shape, y

# Define Model

In [None]:
#export
def get_model(input_shape):
    """Build and compile a minimal two-class classifier.

    The network is a Dropout(0.1) layer followed by a 2-unit softmax Dense
    layer, compiled with Adam, categorical cross-entropy and accuracy.

    Args:
        input_shape: Currently unused: the Flatten input layer that consumed
            it is disabled.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # NOTE(review): 'categorical_crossentropy' expects one-hot labels, while
    # the notebook's inline model uses the sparse variant — confirm callers
    # encode their labels accordingly.
    classifier = tf.keras.Sequential([
        layers.Dropout(0.1),
        layers.Dense(2, activation="softmax"),
    ])
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
#export
def get_gru(input_shape):
    """Build and compile a Conv1D + GRU two-class classifier.

    Two Conv1D/MaxPooling1D stages feed a 64-unit GRU, followed by dropout
    and a 2-unit softmax layer. The model is compiled with Adam,
    categorical cross-entropy and the accuracy metric.

    Args:
        input_shape: Currently unused: no layer in the stack declares an
            input shape.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    # 1D convolution with 64 output channels (filters) and a kernel size of 5
    model.add(layers.Conv1D(64, 5))
    # MaxPool divides the length of the sequence by 5
    model.add(layers.MaxPooling1D(5))
    model.add(layers.Conv1D(64, 5))
    model.add(layers.MaxPooling1D(5))
    # GRU layer with a hidden size of 64
    model.add(layers.GRU(64))

    # Regularization
    model.add(layers.Dropout(0.5))
    # Only `layers` is imported in this file, so Dense must be reached
    # through it; a bare `Dense` raises NameError.
    model.add(layers.Dense(2, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [None]:
data_shape = features[0].shape

# Baseline: flatten each sample and feed it to a 2-unit softmax layer.
model = tf.keras.Sequential([
    layers.Flatten(input_shape=data_shape),
    layers.Dense(2, activation="softmax"),
])
model.summary()

In [None]:
# The sparse variant of categorical cross-entropy takes integer class ids as
# labels, which is the form `y` has in the label cells above.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy']
             )

In [None]:
#export
def finetune_model(x, y, val_x, val_y, model, callbacks, class_weight, epochs = 100, bs = 128):
    """Fit ``model`` on ``(x, y)`` and return whatever ``model.fit`` returns.

    Args:
        x, y: Training inputs and labels, passed positionally to ``fit``.
        val_x, val_y: Validation inputs and labels, passed as a tuple.
        model: Object exposing a Keras-style ``fit`` method.
        callbacks: Forwarded as the ``callbacks`` argument.
        class_weight: Forwarded as the ``class_weight`` argument.
        epochs: Number of epochs to train for.
        bs: Batch size.

    Returns:
        The result of ``model.fit``.
    """
    fit_options = {
        'epochs': epochs,
        'batch_size': bs,
        'validation_data': (val_x, val_y),
        'callbacks': callbacks,
        'class_weight': class_weight,
    }
    return model.fit(x, y, **fit_options)

In [None]:
#export
def evaluate_model(history):
    """Plot training vs. validation accuracy and loss, one figure each.

    Args:
        history: Per-epoch metrics, either as a mapping with the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss', or as an
            object exposing that mapping through a ``history`` attribute
            (e.g. the Keras ``History`` returned by ``model.fit`` and
            therefore by ``finetune_model``).
    """
    # A Keras History object is not subscriptable: its metrics live in
    # `.history`. Plain mappings have no such attribute and are used as-is.
    metrics = getattr(history, 'history', history)

    acc = metrics['accuracy']
    val_acc = metrics['val_accuracy']
    loss = metrics['loss']
    val_loss = metrics['val_loss']

    epochs2 = range(len(acc))

    # Accuracy curves go on the current figure.
    plt.plot(epochs2, acc, 'b', label='Training')
    plt.plot(epochs2, val_acc, 'r', label='Validation')
    plt.title('Training and validation accuracy')
    plt.ylabel('acc')
    plt.xlabel('epoch')
    plt.legend()

    # Start a second figure so the loss curves do not overlay the accuracy.
    plt.figure()

    plt.plot(epochs2, loss, 'b', label='Training')
    plt.plot(epochs2, val_loss, 'r', label='Validation')
    plt.title('Training and validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend()

    plt.show()
    

In [None]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor='val_loss',
        # "no longer improving" being defined as "no better than 1e-2 less"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for at least 2 epochs"
        patience=2,
        verbose=1
    ),
    tf.keras.callbacks.ModelCheckpoint(
        # Path where to save the model; `{epoch}` is filled in with the epoch
        # number, so each saved checkpoint gets its own file.
        filepath='/tf/src/data/checkpoints/finetuning_gpt2_{epoch}.h5',
        # The two parameters below mean that a checkpoint is written
        # if and only if the `val_loss` score has improved.
        save_best_only=True,
        monitor='val_loss',
        verbose=1
    )
]

In [None]:
# Train the exported dense classifier, weighting class 1 nine times heavier
# than class 0.
# NOTE(review): the same (features, y) pair is passed as training and
# validation data, so the validation metrics do not measure generalization.
# NOTE(review): get_model compiles with 'categorical_crossentropy' while `y`
# is built as integer class ids in the label cells — confirm the label format.
model = get_model(features[0].shape)
finetune_model(features, y, features, y, model, callbacks = callbacks, class_weight = {0: 0.1, 1: 0.9})

In [None]:
EPOCHS = 50
BS = 64

# Fit the current `model` with equal class weights.
# NOTE(review): validation again reuses the training data.
model.fit(
    features, y, epochs = EPOCHS,
    batch_size = BS,
    validation_data = (features, y),
    callbacks = callbacks,
    class_weight = {0: 1.0, 1: 1.0}
)

# Metrics on the same data the model was trained on.
model.evaluate(features, y)

In [None]:
model.fit?