In [24]:
import json

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight, resample
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Bidirectional, Embedding, Dense, Dropout, LSTM, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [46]:
# Load data
def json_to_dataset(file_path):
    """Read a JSON file and return its content as a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file. Its parsed content is handed directly to
        ``pd.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        Frame built from the parsed JSON document.
    """
    # JSON text is UTF-8; decode explicitly rather than relying on the
    # platform default encoding, which differs between operating systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Sequence length for pad_sequences. It is assigned here because this cell pads
# the mock test sets before the hyper-parameter cell assigns `max_len`; keep
# the two values equal.
max_len = 250

# Load both domains; the machine files carry a 'machine_id' column that is dropped.
set1_human = json_to_dataset("./data/set1_human.json")
set1_machine = json_to_dataset("./data/set1_machine.json")
set1_machine = set1_machine.drop('machine_id', axis=1)

set2_human = json_to_dataset("./data/set2_human.json")
set2_machine = json_to_dataset("./data/set2_machine.json")
set2_machine = set2_machine.drop('machine_id', axis=1)

print(f"set1 - human/machine : {len(set1_human)} / {len(set1_machine)}")
print(f"set2 - human/machine : {len(set2_human)} / {len(set2_machine)}")

# Label data and combine: 1 = human, 0 = machine
set1_human["label"] = 1
set1_machine["label"] = 0

print("\nMock test sets")
# Domain 1 - 600 records: hold out 300 rows per class and remove them from the
# pool that is later used for training.
sample_human = set1_human.sample(300, random_state=42)
set1_human = set1_human.drop(sample_human.index)
sample_machine = set1_machine.sample(300, random_state=42)
set1_machine = set1_machine.drop(sample_machine.index)
print(f"extract set1 - human/machine : {len(set1_human)} / {len(set1_machine)}")
# Combine the samples into a mock test dataset
mock_test_data1 = pd.concat([sample_human, sample_machine], ignore_index=True)
X_mock_test = pad_sequences(mock_test_data1['txt'], maxlen=max_len)
y_mock_test = mock_test_data1["label"].values

# Balance domain 1 by downsampling the human class to the machine class size
set1_human = set1_human.sample(len(set1_machine), random_state=42)
dataset1 = pd.concat([set1_human, set1_machine], ignore_index=True)
print(f"downsample set1 - human/machine : {len(set1_human)} / {len(set1_machine)}")

set2_human["label"] = 1
set2_machine["label"] = 0
dataset2 = pd.concat([set2_human, set2_machine], ignore_index=True)
# Domain 2 - 400 records
# NOTE(review): unlike domain 1, these rows are NOT removed from dataset2, so the
# domain-2 mock test overlaps the data that dataset2 later contributes to
# training. Its accuracy is therefore not a held-out estimate.
# Take every human record of domain 2 ...
sample_human = dataset2[dataset2["label"] == 1]
# ... and oversample them (with replacement) to reach 200 records
sample_human_oversampled = resample(sample_human, replace=True, n_samples=200, random_state=42)
# Sample 200 records from the machine class
sample_machine = dataset2[dataset2["label"] == 0].sample(200, random_state=42)
# Combine the samples into a mock test dataset
mock_test_data2 = pd.concat([sample_human_oversampled, sample_machine], ignore_index=True)

# Combine the two mock test sets (domain 1 rows first, then domain 2 rows)
combined_mock_test = pd.concat([mock_test_data1, mock_test_data2], ignore_index=True)
X_combined_mock_test = pad_sequences(combined_mock_test['txt'], maxlen=max_len)
y_combined_mock_test = combined_mock_test["label"].values
print(f"generate {len(y_combined_mock_test)} mock tests.")

set1 - human/machine : 122584 / 3500
set2 - human/machine : 100 / 400

Mock test sets
extract set1 - human/machine : 122284 / 3200
downsample set1 - human/machine : 3200 / 3200
generate 1000 mock tests.


In [47]:
# Stratified 80/20 train/validation split, with the same settings for both domains
split_options = dict(test_size=0.2, random_state=42)
train_data1, val_data1 = train_test_split(dataset1, stratify=dataset1["label"], **split_options)
train_data2, val_data2 = train_test_split(dataset2, stratify=dataset2["label"], **split_options)

In [48]:
# Fixed: vocabulary size handed to the Embedding layer
vocab_size = 5000

# Model: padded sequence length, embedding width, recurrent units
max_len, embedding_dim, lstm_unit = 250, 64, 64

# Fit: maximum number of epochs and mini-batch size
num_epoch, num_batch = 50, 64

In [49]:
strategy = "txt"  # alternative input column: "prompt"


def _features_and_labels(frame):
    """Return (padded `strategy` sequences, label array) for one data split."""
    return pad_sequences(frame[strategy], maxlen=max_len), frame["label"].values


def _best_only_checkpoint(path):
    """Build a ModelCheckpoint that keeps only the lowest-val_loss model at `path`."""
    return ModelCheckpoint(filepath=path, save_best_only=True,
                           monitor='val_loss', mode='min')


# Domain 1 inputs
X_train1, y_train1 = _features_and_labels(train_data1)
X_val1, y_val1 = _features_and_labels(val_data1)

# Domain 2 inputs
X_train2, y_train2 = _features_and_labels(train_data2)
X_val2, y_val2 = _features_and_labels(val_data2)

# One checkpoint file per domain, plus an early-stopping callback on val_loss
checkpoint_filepath1 = './best_model1.h5'
checkpoint_filepath2 = './best_model2.h5'
model_checkpoint_callback1 = _best_only_checkpoint(checkpoint_filepath1)
model_checkpoint_callback2 = _best_only_checkpoint(checkpoint_filepath2)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')
print("Preparation done.")

Preparation done.


In [50]:
# Define model - TRANSFER LEARNING
def create_model():
    """Build and compile the binary classifier.

    The stack is: Embedding -> two stacked bidirectional GRU layers (the
    second with half the units) -> Dropout(0.5) -> single sigmoid unit.
    Sizes come from the notebook-level `vocab_size`, `embedding_dim`,
    `max_len` and `lstm_unit` settings.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy, Adam, accuracy metric).
    """
    stack = [
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        # First recurrent layer keeps the full sequence so the second can consume it
        Bidirectional(GRU(lstm_unit, return_sequences=True)),
        Bidirectional(GRU(lstm_unit // 2)),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(stack)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Train model for domain 1; the best (lowest val_loss) weights are written to
# checkpoint_filepath1 by the checkpoint callback.
print("Domain 1 model:")
model1 = create_model()
model1.fit(X_train1, y_train1, epochs=num_epoch, batch_size=num_batch, validation_data=(X_val1, y_val1), 
           verbose=1, 
#            class_weight=class_weights_dict, 
           callbacks=[model_checkpoint_callback1, early_stopping])

print("-"*40)
print("Domain 2 model:")
# K-fold settings; they are only consumed by the cross-validation loop that is
# currently commented out after this block.
num_folds = 3
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
best_val_loss = float('inf')
fold_accuracies = []

# Domain 2 is trained on the train split built in the preparation cell
# (X_train2 / y_train2). Re-padding the whole of dataset2 here would put the
# validation rows into the training data, so val_loss - which drives both the
# checkpoint and early stopping - would be measured on seen samples.

# Transfer learning: start from the best domain-1 model and drop its output layer
model2 = load_model(checkpoint_filepath1)
model2.pop()

# Freeze everything that was carried over from domain 1
for layer in model2.layers:
    layer.trainable = False

# Fresh, trainable classification head for domain 2
model2.add(Dense(1, activation='sigmoid'))

# Changes to `trainable` and to the layer stack only take effect once the
# model is compiled again.
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model2.fit(X_train2, y_train2, epochs=num_epoch, batch_size=num_batch, validation_data=(X_val2, y_val2), 
           verbose=1, 
#            class_weight=class_weights_dict, 
           callbacks=[model_checkpoint_callback2, early_stopping])

# for i, (train_index, val_index) in enumerate(kf.split(X_train2)):
#     print(f"\nFold {i + 1}/{num_folds}")
#     X_train_fold, X_val_fold = X_train2[train_index], X_train2[val_index]
#     y_train_fold, y_val_fold = y_train2[train_index], y_train2[val_index]

#     model2.fit(X_train_fold, y_train_fold, epochs=num_epoch, batch_size=num_batch,
#                validation_data=(X_val_fold, y_val_fold), verbose=1,
#                callbacks=[model_checkpoint_callback2, early_stopping])
#     model2 = load_model(checkpoint_filepath2)
#     val_loss, _ = model2.evaluate(X_val_fold, y_val_fold, verbose=0)
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         best_model2 = model2
#         print("best model updated!")

Domain 1 model:
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
----------------------------------------
Domain 2 model:
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1cecd54d2b0>

In [51]:
# Load the best models saved by the checkpoint callbacks
best_model1 = load_model(checkpoint_filepath1)
best_model2 = load_model(checkpoint_filepath2)

# Make predictions and evaluate on the validation splits.
# Domain 2 is scored on X_val2 / y_val2: scoring it on X_train2 would report
# training accuracy under a "validation" label.
y_pred_val1 = (best_model1.predict(X_val1) > 0.5).astype("int32")
y_pred_val2 = (best_model2.predict(X_val2) > 0.5).astype("int32")
print("\nValidation")
print("Domain1 Validation Accuracy:", accuracy_score(y_val1, y_pred_val1))
print("Domain2 Validation Accuracy:", accuracy_score(y_val2, y_pred_val2))

# mock test
print("-"*40)
print("Mock Test")
# The combined mock test lists the domain-1 rows first, followed by the
# domain-2 rows; derive the boundary from the domain-1 frame instead of
# hard-coding its size.
n_domain1 = len(mock_test_data1)
X_mock_test1 = X_combined_mock_test[:n_domain1]
y_mock_test1 = y_combined_mock_test[:n_domain1]

X_mock_test2 = X_combined_mock_test[n_domain1:]
y_mock_test2 = y_combined_mock_test[n_domain1:]

# Make predictions using the respective models
y_pred_mock_test1 = (best_model1.predict(X_mock_test1) > 0.5).astype("int32")
y_pred_mock_test2 = (best_model2.predict(X_mock_test2) > 0.5).astype("int32")

# Calculate the accuracy for each part
accuracy_mock_test1 = accuracy_score(y_mock_test1, y_pred_mock_test1)
accuracy_mock_test2 = accuracy_score(y_mock_test2, y_pred_mock_test2)
print("Mock Test1 Accuracy:", accuracy_mock_test1)
print("Mock Test2 Accuracy:", accuracy_mock_test2)

# Combine the predictions and true labels
y_pred_combined_mock_test = np.concatenate([y_pred_mock_test1, y_pred_mock_test2])
y_true_combined_mock_test = np.concatenate([y_mock_test1, y_mock_test2])

# Calculate the overall accuracy
accuracy_combined_mock_test = accuracy_score(y_true_combined_mock_test, y_pred_combined_mock_test)
print("Combined Mock Test Accuracy:", accuracy_combined_mock_test)


Validation
Domain1 Validation Accuracy: 0.89140625
Domain2 Validation Accuracy: 1.0
----------------------------------------
Mock Test
Mock Test1 Accuracy: 0.885
Mock Test2 Accuracy: 1.0
Combined Mock Test Accuracy: 0.931


In [52]:
# Read the test set and pad its 'txt' sequences to the model input length
test_df = json_to_dataset("./data/test.json")
X_test = pad_sequences(test_df['txt'], maxlen=max_len)

# Route rows to the per-domain models.
# NOTE(review): assumes test.json lists 600 domain-1 rows first and the
# domain-2 rows after them - TODO confirm against the data description.
X_test_domain1, X_test_domain2 = X_test[:600], X_test[600:]
predictions_domain1 = best_model1.predict(X_test_domain1)
predictions_domain2 = best_model2.predict(X_test_domain2)

# Stack both domains back into the original row order
predictions = np.concatenate([predictions_domain1, predictions_domain2])

# Threshold the sigmoid outputs at 0.5 to obtain flat 0/1 labels
above_threshold = predictions > 0.5
binary_predictions = above_threshold.astype(int).flatten()

# Submission table: sequential Id alongside the predicted label
row_ids = np.arange(len(binary_predictions))
submission_df = pd.DataFrame({"Id": row_ids, "Predicted": binary_predictions})

# Write the submission file without the DataFrame index
submission_df.to_csv("bidirection_transfer_cv.csv", index=False)
print("Done csv output.")

Done csv output.
