In [1]:
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer, Embedding, Input, GlobalAveragePooling1D, Dense, Concatenate
from tensorflow.keras.models import Sequential, Model
import pandas as pd
import tensorflow as tf
import random
import os
from sklearn.metrics import confusion_matrix
import numpy as np
import warnings

# ---- Reproducibility and splitting ----
RANDOM_SEED = 42   # handed to random.seed and to the train/test split
TEST_FRA = 0.2     # test_size used when splitting each domain

# ---- Label encoding ----
MACHINE_IND = 0
HUMAN_IND = 1

# ---- Sequence / vocabulary limits ----
NUM_TOKEN = 5000     # vocab_size given to the token embedding
MAX_PRO_LEN = 64     # padded length of the "prompt" sequences
MAX_TXT_LEN = 256    # padded length of the "txt" sequences

# ---- Training defaults ----
NO_EPO = 60
NO_BAT = 128

# ---- Data files ----
HUMAN_1_P = "./data/set1_human.json"
MACHINE_1_P = "./data/set1_machine.json"
HUMAN_2_P = "./data/set2_human.json"
MACHINE_2_P = "./data/set2_machine.json"
TEST_P = "./data/test.json"


class DomainData:
    """
    Domain dataset holding the features and labels used for training.

    Offers a stratified train/test split, sequence padding, down-sampling and
    over-sampling of the training split, and re-balancing of the test split.
    The resampling methods read the module-level MACHINE_IND / HUMAN_IND labels
    and draw from Python's ``random`` module, so seed it for repeatable results.
    """

    def __init__(self, x, y):
        # inputs are (pd.DataFrame, pd.Series); add_padding reads the
        # "prompt" and "txt" columns of x
        self.x = x
        self.y = y

    @staticmethod
    def _class_positions(labels, cls):
        ## positional (iloc-compatible) row numbers of the rows labelled <cls>;
        ## unlike index labels these stay valid whatever the Series index is
        return np.flatnonzero(labels.to_numpy() == cls).tolist()

    def t_t_spli(self, test_size, random_state):
        ## stratified train test split according to test fraction <test_size>
        ## and random state <random_state>
        ## generated train_x / test_x are pd.DataFrame, train_y / test_y are pd.Series,
        ## all re-indexed from 0
        self.random_state = random_state
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
            self.x, self.y, test_size=test_size, stratify=self.y, random_state=random_state)
        self.train_x = self.train_x.reset_index(drop=True)
        self.train_y = self.train_y.reset_index(drop=True)
        self.test_x = self.test_x.reset_index(drop=True)
        self.test_y = self.test_y.reset_index(drop=True)

    def add_padding(self, padding, prompt_len, txt_len):
        ## pad/truncate the "prompt" and "txt" sequences to the given lengths
        ## (<padding> is forwarded to pad_sequences) and expose the labels as
        ## np arrays; also stores balanced class weights of the training labels
        ## in self.class_weights
        self.train_prompt = self.train_x["prompt"]
        self.train_txt = self.train_x["txt"]
        self.train_label = self.train_y.to_numpy()
        self.test_prompt = self.test_x["prompt"]
        self.test_txt = self.test_x["txt"]
        self.test_label = self.test_y.to_numpy()
        unique_classes = np.unique(self.train_label)
        class_weights = class_weight.compute_class_weight(
            "balanced", classes=unique_classes, y=self.train_y)
        self.class_weights = dict(zip(unique_classes, class_weights))

        self.prompt_len = prompt_len
        self.txt_len = txt_len

        self.train_prompt = pad_sequences(
            self.train_prompt, padding=padding, maxlen=prompt_len)
        self.train_txt = pad_sequences(
            self.train_txt, padding=padding, maxlen=txt_len)
        self.test_prompt = pad_sequences(
            self.test_prompt, padding=padding, maxlen=prompt_len)
        self.test_txt = pad_sequences(
            self.test_txt, padding=padding, maxlen=txt_len)

    def down_sampling(self):
        ## down sample the majority class of the training split so both classes
        ## have as many records as the minority class, in shuffled order
        mac_ind = self._class_positions(self.train_y, MACHINE_IND)
        hum_ind = self._class_positions(self.train_y, HUMAN_IND)
        lower = min(len(mac_ind), len(hum_ind))
        sel_lit = mac_ind[:lower] + hum_ind[:lower]
        # shuffle BEFORE selecting, otherwise the rows stay grouped by class
        random.shuffle(sel_lit)
        self.train_x = self.train_x.iloc[sel_lit].reset_index(drop=True)
        self.train_y = self.train_y.iloc[sel_lit].reset_index(drop=True)

    def over_sampling(self, upper_fra):
        ## cap the majority class at <upper_fra> times the minority size (never
        ## more than the rows available), then over sample the minority class
        ## with replacement until both classes have the same number of records

        # find positions of each class and decide which one is the minority
        # (ties treat the machine class as the minority)
        mac_ind = self._class_positions(self.train_y, MACHINE_IND)
        hum_ind = self._class_positions(self.train_y, HUMAN_IND)
        if len(mac_ind) <= len(hum_ind):
            minor, major_all = mac_ind, hum_ind
        else:
            minor, major_all = hum_ind, mac_ind
        lower = len(minor)
        upper = min(int(lower * upper_fra), len(major_all))
        major = major_all[:upper]

        # resampling: the extra rows are drawn from the MINORITY class,
        # whichever label that is
        add_n = upper - lower
        oversampled = [random.choice(minor) for _ in range(add_n)]
        sel_lit = major + minor + oversampled
        random.shuffle(sel_lit)

        # reset the index so later resampling calls see a clean 0..n-1 index
        self.train_x = self.train_x.iloc[sel_lit].reset_index(drop=True)
        self.train_y = self.train_y.iloc[sel_lit].reset_index(drop=True)

    def test_down(self, frac=1):
        ## down sample the test split relative to the minority class size <lower>:
        ##   frac > 1  -> <lower> machine rows and int(lower / frac) human rows
        ##   otherwise -> int(lower * frac) machine rows and <lower> human rows
        ## (the default frac=1 gives both classes <lower> rows), in shuffled order
        mac_ind = self._class_positions(self.test_y, MACHINE_IND)
        hum_ind = self._class_positions(self.test_y, HUMAN_IND)
        lower = min(len(mac_ind), len(hum_ind))
        if frac > 1:
            sel_lit = mac_ind[:lower] + hum_ind[:int(lower/frac)]
        else:
            sel_lit = mac_ind[:int(lower*frac)] + hum_ind[:lower]
        random.shuffle(sel_lit)
        self.test_x = self.test_x.iloc[sel_lit].reset_index(drop=True)
        self.test_y = self.test_y.iloc[sel_lit].reset_index(drop=True)


def f1_loss(y_true, y_pred):
    """Return the negated F1 score of ``y_pred`` against ``y_true``.

    True positives, false positives and false negatives are counted by
    clipping the relevant product/difference to [0, 1], rounding it and
    summing over all elements. Precision, recall and their combination each
    carry ``backend.epsilon()`` in the denominator to avoid division by zero.
    The sign is flipped so that a smaller value means a better F1.
    """
    def _rounded_count(values):
        # number of elements whose clipped value rounds to 1
        return backend.sum(backend.round(backend.clip(values, 0, 1)))

    eps = backend.epsilon()
    true_pos = _rounded_count(y_true * y_pred)
    false_pos = _rounded_count(y_pred - y_true)
    false_neg = _rounded_count(y_true - y_pred)

    prec = true_pos / (true_pos + false_pos + eps)
    rec = true_pos / (true_pos + false_neg + eps)

    harmonic = 2 * prec * rec / (prec + rec + eps)
    return -harmonic


# transformer / embedding block design/code from Bharath K
# https://blog.paperspace.com/transformers-text-classification/

class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is layer-normalised.

    Args:
        embed_dim: size of the last axis of the input; also used as the
            per-head ``key_dim`` of the attention layer and as the output
            size of the feed-forward net.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward net.
        drop_rate: dropout rate applied after both sub-layers.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``, ...),
            needed so the layer can be rebuilt from ``get_config()``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, drop_rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # keep the constructor arguments so get_config() can report them
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.drop_rate = drop_rate

        self.attention = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.dropout1 = Dropout(drop_rate)
        self.dropout2 = Dropout(drop_rate)
        self.lay_nor_1 = LayerNormalization(epsilon=1e-6)
        self.lay_nor_2 = LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None):
        # self-attention: the sequence attends to itself (query = value = inputs)
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.lay_nor_1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.lay_nor_2(out1 + ffn_output)

    def get_config(self):
        # Explicit config so saving/loading does not rely on Keras inferring
        # the constructor arguments of a custom layer.
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "drop_rate": self.drop_rate,
        })
        return config


class EmbeddingLayer(Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: number of positions the position embedding can represent;
            inputs must not be longer than this along their last axis.
        vocab_size: number of distinct token ids (rows of the token embedding).
        embed_dim: size of each embedding vector.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``, ...),
            needed so the layer can be rebuilt from ``get_config()``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(EmbeddingLayer, self).__init__(**kwargs)
        # keep the constructor arguments so get_config() can report them
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # positions 0 .. seq_len-1, taken from the actual length of the input
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # the position embeddings broadcast over the leading axes of x
        return x + positions

    def get_config(self):
        # Explicit config so saving/loading does not rely on Keras inferring
        # the constructor arguments of a custom layer.
        config = super(EmbeddingLayer, self).get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config


# Silence NumPy's VisibleDeprecationWarning. The class is no longer exposed as
# np.VisibleDeprecationWarning in NumPy 2, so fall back to numpy.exceptions.
try:
    _VisibleDeprecationWarning = np.VisibleDeprecationWarning
except AttributeError:
    from numpy.exceptions import VisibleDeprecationWarning as _VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=_VisibleDeprecationWarning)

# Stop training once the TRAINING loss has not improved for 3 epochs.
# NOTE(review): the checkpoints in the training cells monitor 'val_loss';
# confirm that monitoring 'loss' here is intentional.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Seed every random number generator the notebook relies on (Python for the
# resampling/shuffling, NumPy and TensorFlow for model training).
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

## using GPU
# expose only CUDA device 0 to this process
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if gpus:
    print("Using GPU")


2023-04-27 13:43:46.519529: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-27 13:43:46.695769: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  1
Using GPU


2023-04-27 13:43:48.651450: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-27 13:43:48.758954: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-27 13:43:48.759502: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


## Domain 1

In [2]:
# _______________ Read data from domain 1 _______________
# Human samples get the human label; machine samples lose their machine_id
# column and get the machine label.
man_1_df = pd.read_json(HUMAN_1_P).assign(label=HUMAN_IND)
mac_1_df = (
    pd.read_json(MACHINE_1_P)
    .drop(columns="machine_id")
    .assign(label=MACHINE_IND)
)
domain_1_df = pd.concat([man_1_df, mac_1_df])

# Split, resample the training split, balance the test split, then pad.
domain_1 = DomainData(domain_1_df[["prompt", "txt"]], domain_1_df["label"])
domain_1.t_t_spli(test_size=TEST_FRA, random_state=RANDOM_SEED)
domain_1.over_sampling(upper_fra=1.6)
domain_1.test_down()
domain_1.add_padding(padding='post', prompt_len=MAX_PRO_LEN, txt_len=MAX_TXT_LEN)


In [3]:
# ---------- hyper-parameters (domain 1 model) ----------
embed_dim, num_heads, ff_dim = 128, 2, 64
epo_size = NO_EPO
batch_size = 128
# A single TransformerBlock instance is applied to both branches below,
# so the prompt and txt branches share its weights.
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)

# ---------- prompt branch ----------
inputs_p = Input(shape=(MAX_PRO_LEN,))
embedding_layer = EmbeddingLayer(MAX_PRO_LEN, NUM_TOKEN, embed_dim)
x = transformer_block(embedding_layer(inputs_p))
for head_layer in (GlobalAveragePooling1D(), Dropout(0.3),
                   Dense(32, activation="relu"), Dropout(0.3)):
    x = head_layer(x)

# ---------- txt branch ----------
inputs_t = Input(shape=(MAX_TXT_LEN,))
embedding_layer = EmbeddingLayer(MAX_TXT_LEN, NUM_TOKEN, embed_dim)
y = transformer_block(embedding_layer(inputs_t))
for head_layer in (GlobalAveragePooling1D(), Dropout(0.3),
                   Dense(64, activation="relu"), Dropout(0.3)):
    y = head_layer(y)

# ---------- merge both branches into one sigmoid output ----------
merged = Concatenate()([x, y])
for head_layer in (Dense(units=64, activation='relu'),
                   Dense(32, activation="relu")):
    merged = head_layer(merged)
outputs = Dense(units=1, activation='sigmoid')(merged)
trans_model_2 = Model(inputs=[inputs_p, inputs_t], outputs=outputs)

# ---------- compile and train ----------
# The checkpoint keeps only the epoch with the best validation loss; the
# validation data is the 20% split Keras takes from the training arrays.
trans_model_2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", f1_loss],
)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'trans_model.h5', monitor='val_loss', save_best_only=True)
trans_model_2.fit(
    [domain_1.train_prompt, domain_1.train_txt],
    domain_1.train_label,
    epochs=epo_size,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[callback, model_checkpoint],
)
print("Model Saved: trans_model.h5")

# ---------- evaluate the best checkpoint on the domain-1 test split ----------
saved_objects = {
    'f1_loss': f1_loss,
    'TransformerBlock': TransformerBlock,
    'EmbeddingLayer': EmbeddingLayer,
}
trans_model_2 = tf.keras.models.load_model(
    "trans_model.h5", custom_objects=saved_objects)
print("Model Loaded: trans_model.h5")

test_inputs = [domain_1.test_prompt, domain_1.test_txt]
# evaluate() returns the loss followed by the compiled metrics; the third
# value is the f1_loss metric and is replaced by sklearn's F1 further down.
loss, accuracy, f1 = trans_model_2.evaluate(
    test_inputs, domain_1.test_label, verbose=False)
print("loss: ", loss)
print("accuracy", accuracy)

# round the sigmoid outputs to hard 0/1 labels
trans_1_pre_rnn = np.round(trans_model_2.predict(test_inputs)).flatten()
confusion = confusion_matrix(domain_1.test_label, trans_1_pre_rnn)
print(confusion)
f1 = f1_score(domain_1.test_label, trans_1_pre_rnn)
print("f1-score: ", f1)

# tuning notes:
# 609/652 dropout -> 0.3 || 645/610
# 645/610 dense 20 -> 32 || 642/624


2023-04-27 13:43:51.939711: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-27 13:43:51.940468: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-27 13:43:51.941316: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-27 13:43:53.120370: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-27 13:43:53.121153: I tensorflow/compile

Epoch 1/60


2023-04-27 13:43:57.063020: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-04-27 13:43:57.106815: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f9ee00a6c00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-27 13:43:57.106854: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-04-27 13:43:57.132916: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-04-27 13:43:57.362734: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-04-27 13:43:57.570920: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifeti

Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Model Saved: trans_model.h5
Model Loaded: trans_model.h5
loss:  0.25016483664512634
accuracy 0.897857129573822
[[639  61]
 [ 82 618]]
f1-score:  0.8963016678752719


## Domain 2 weighted

In [4]:
over_fra = 1.6     # only referenced by the commented-out over_sampling call below
weight_fra = 300   # sample weight of every domain-2 training row (domain-1 rows get 1)


def _read_domain_frame(human_path, machine_path):
    """Read one domain's human and machine JSON files into a single labelled frame.

    Human rows are labelled HUMAN_IND; machine rows lose their ``machine_id``
    column and are labelled MACHINE_IND. Human rows come first in the result.
    """
    human_df = pd.read_json(human_path)
    human_df["label"] = HUMAN_IND
    machine_df = pd.read_json(machine_path).drop("machine_id", axis=1)
    machine_df["label"] = MACHINE_IND
    return pd.concat([human_df, machine_df])


# _______________ Read data from domain 1 _______________
# Fresh split of domain 1 with no resampling of either split.
domain_1_df = _read_domain_frame(HUMAN_1_P, MACHINE_1_P)

domain_1 = DomainData(domain_1_df[["prompt", "txt"]], domain_1_df["label"])
domain_1.t_t_spli(TEST_FRA, RANDOM_SEED)
# domain_1.down_sampling()
domain_1.add_padding('post', MAX_PRO_LEN, MAX_TXT_LEN)

# _______________ Read data from domain 2 _______________
domain_2_df = _read_domain_frame(HUMAN_2_P, MACHINE_2_P)

domain_2 = DomainData(domain_2_df[["prompt", "txt"]], domain_2_df["label"])
domain_2.t_t_spli(TEST_FRA, RANDOM_SEED)
# domain_2.over_sampling(over_fra)
domain_2.test_down()
domain_2.add_padding('post', MAX_PRO_LEN, MAX_TXT_LEN)


# _______________ weight data _______________
# Domain-1 rows keep weight 1; every domain-2 row is weighted by weight_fra.
sample_weight_1 = np.ones(len(domain_1.train_label))
sample_weight_2 = np.full(len(domain_2.train_label), float(weight_fra))
sample_weight = np.concatenate([sample_weight_1, sample_weight_2])

train_prompt = np.concatenate([domain_1.train_prompt, domain_2.train_prompt])
train_txt = np.concatenate([domain_1.train_txt, domain_2.train_txt])
train_label = np.concatenate([domain_1.train_label, domain_2.train_label])
assert len(train_prompt) == len(train_txt) == len(train_label) == len(sample_weight)

# Shuffle all four arrays with ONE shared permutation so rows stay aligned.
# Shuffling a list of row numbers yields the same ordering as shuffling the
# zipped rows would, without turning every row into a Python tuple.
shuffled_rows = list(range(len(train_label)))
random.shuffle(shuffled_rows)

train_prompt = train_prompt[shuffled_rows]
train_txt = train_txt[shuffled_rows]
train_label = train_label[shuffled_rows]
sample_weight = sample_weight[shuffled_rows]


In [5]:
# ---------- hyper-parameters (weighted two-domain model) ----------
embed_dim, num_heads, ff_dim = 128, 10, 32
epo_size = 10
batch_size = 128
# A single TransformerBlock instance is applied to both branches below,
# so the prompt and txt branches share its weights.
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)

# ---------- prompt branch ----------
inputs_p = Input(shape=(MAX_PRO_LEN,))
embedding_layer = EmbeddingLayer(MAX_PRO_LEN, NUM_TOKEN, embed_dim)
x = transformer_block(embedding_layer(inputs_p))
for head_layer in (GlobalAveragePooling1D(), Dropout(0.1),
                   Dense(20, activation="relu"), Dropout(0.1)):
    x = head_layer(x)

# ---------- txt branch ----------
inputs_t = Input(shape=(MAX_TXT_LEN,))
embedding_layer = EmbeddingLayer(MAX_TXT_LEN, NUM_TOKEN, embed_dim)
y = transformer_block(embedding_layer(inputs_t))
for head_layer in (GlobalAveragePooling1D(), Dropout(0.1),
                   Dense(20, activation="relu"), Dropout(0.1)):
    y = head_layer(y)

# ---------- merge both branches into one sigmoid output ----------
merged = Concatenate()([x, y])
for head_layer in (Dense(units=64, activation='relu'),
                   Dense(20, activation="relu")):
    merged = head_layer(merged)
outputs = Dense(units=1, activation='sigmoid')(merged)
trans_model_2 = Model(inputs=[inputs_p, inputs_t], outputs=outputs)

# ---------- compile and train on the weighted, shuffled two-domain set ----------
# The checkpoint keeps only the epoch with the best validation loss; the
# validation data is the 20% split Keras takes from the training arrays.
trans_model_2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", f1_loss],
)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'trans_model_weighted.h5', monitor='val_loss', save_best_only=True)
trans_model_2.fit(
    [train_prompt, train_txt],
    train_label,
    epochs=epo_size,
    batch_size=batch_size,
    sample_weight=sample_weight,
    validation_split=0.2,
    callbacks=[callback, model_checkpoint],
)
print("Model Saved: trans_model_weighted.h5")


## evaluate the best checkpoint on the domain-2 test split
saved_objects = {
    'f1_loss': f1_loss,
    'TransformerBlock': TransformerBlock,
    'EmbeddingLayer': EmbeddingLayer,
}
trans_model_2 = tf.keras.models.load_model(
    "trans_model_weighted.h5", custom_objects=saved_objects)
print("Model Loaded: trans_model_weighted.h5")

test_inputs = [domain_2.test_prompt, domain_2.test_txt]
# evaluate() returns the loss followed by the compiled metrics; the third
# value is the f1_loss metric and is replaced by sklearn's F1 further down.
loss, accuracy, f1 = trans_model_2.evaluate(
    test_inputs, domain_2.test_label, verbose=False)
print("loss: ", loss)
print("accuracy", accuracy)

# round the sigmoid outputs to hard 0/1 labels
trans_2_pre_rnn = np.round(trans_model_2.predict(test_inputs)).flatten()
confusion = confusion_matrix(domain_2.test_label, trans_2_pre_rnn)
print(confusion)
f1 = f1_score(domain_2.test_label, trans_2_pre_rnn)
print("f1-score: ", f1)

# tuning notes:
# 20/18 epoch: NO_EPO -> 10 ||  20/17


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model Saved: trans_model_weighted.h5
Model Loaded: trans_model_weighted.h5
loss:  0.1168220266699791
accuracy 0.949999988079071
[[20  0]
 [ 2 18]]
f1-score:  0.9473684210526316


### Predict on the held-out test set

In [6]:
# Row at which the test file switches models: rows before it are scored by
# the domain-1 model, rows from it onwards by the weighted model.
DOMAIN_SPL = 600

## prepare data: pad the test sequences exactly like the training data
test_df = pd.read_json(TEST_P)
test_prompt = pad_sequences(
    test_df["prompt"], padding="post", maxlen=MAX_PRO_LEN)
test_txt = pad_sequences(
    test_df["txt"], padding="post", maxlen=MAX_TXT_LEN)

## load the two trained models from their checkpoints
saved_objects = {
    'f1_loss': f1_loss,
    'TransformerBlock': TransformerBlock,
    'EmbeddingLayer': EmbeddingLayer,
}
model_1 = tf.keras.models.load_model(
    "trans_model.h5", custom_objects=saved_objects)
model_2 = tf.keras.models.load_model(
    "trans_model_weighted.h5", custom_objects=saved_objects)

## predict each slice of the test set with its model, in file order
pred = []
for model, rows in ((model_1, slice(None, DOMAIN_SPL)),
                    (model_2, slice(DOMAIN_SPL, None))):
    pred.extend(model.predict([test_prompt[rows], test_txt[rows]]).tolist())
# round the sigmoid outputs to hard 0/1 labels
pred = [int(p) for p in np.round(pred).flatten()]


## save as a two-column CSV: Id (row number), Predicted (0/1 label)
pred_df = pd.DataFrame(pred, columns=["Predicted"])
pred_df.index.name = 'Id'
pred_df.to_csv("./data/result3.csv")


