In [18]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

# Load data
def json_to_dataset(file_path):
    """Load a JSON file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed JSON document handed to the ``pd.DataFrame`` constructor.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Name the encoding explicitly: without it open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII JSON text.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Read the human- and machine-written samples of both domains, in the order listed.
set1_human, set1_machine, set2_human, set2_machine = map(
    json_to_dataset,
    (
        "./data/set1_human.json",
        "./data/set1_machine.json",
        "./data/set2_human.json",
        "./data/set2_machine.json",
    ),
)

# Tag every row with its class: 1 for the human frames, 0 for the machine frames.
set1_human["label"], set1_machine["label"] = 1, 0
set2_human["label"], set2_machine["label"] = 1, 0

# Stack both classes of each domain into a single frame with a fresh 0..n-1 index.
dataset1 = pd.concat([set1_human, set1_machine], ignore_index=True)
dataset2 = pd.concat([set2_human, set2_machine], ignore_index=True)

# Hold out 20% of each domain for validation. Stratifying on the label keeps
# the class proportions the same in the train and validation parts, and the
# fixed random_state makes the split repeatable.
train_data1, val_data1 = train_test_split(
    dataset1,
    stratify=dataset1["label"],
    test_size=0.2,
    random_state=42,
)
train_data2, val_data2 = train_test_split(
    dataset2,
    stratify=dataset2["label"],
    test_size=0.2,
    random_state=42,
)

# Inputs for the model based on the 'txt' column
max_len = 250      # target length given to pad_sequences (and to the Embedding layer)
vocab_size = 5000  # input dimension given to the Embedding layer

# Domain 1: padded 'txt' sequences and their labels.
# NOTE(review): assumes 'txt' already holds integer token-id sequences — confirm against the data files.
X_train1, X_val1 = (
    pad_sequences(part['txt'], maxlen=max_len) for part in (train_data1, val_data1)
)
y_train1, y_val1 = train_data1["label"].to_numpy(), val_data1["label"].to_numpy()

# Domain 2: same preparation.
X_train2, X_val2 = (
    pad_sequences(part['txt'], maxlen=max_len) for part in (train_data2, val_data2)
)
y_train2, y_val2 = train_data2["label"].to_numpy(), val_data2["label"].to_numpy()

# Define model
def create_model():
    """Build and compile the binary sequence classifier.

    The network is an Embedding layer (``vocab_size`` inputs, 64-dimensional
    vectors, ``max_len`` input length), followed by GlobalAveragePooling1D and
    a single sigmoid output unit. It is compiled with binary cross-entropy
    loss, the Adam optimizer and the accuracy metric.

    ``vocab_size`` and ``max_len`` are read from module-level variables.

    Returns
    -------
    The compiled ``Sequential`` model (untrained).
    """
    layers = [
        Embedding(vocab_size, 64, input_length=max_len),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layers)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Train one freshly initialised model per domain
model1 = create_model()
model1.fit(X_train1, y_train1, epochs=5, batch_size=32, validation_data=(X_val1, y_val1), verbose=1)

model2 = create_model()
model2.fit(X_train2, y_train2, epochs=5, batch_size=32, validation_data=(X_val2, y_val2), verbose=1)

# Make predictions and evaluate: threshold the sigmoid output at 0.5 to get class ids
y_pred_val1 = (model1.predict(X_val1) > 0.5).astype("int32")
y_pred_val2 = (model2.predict(X_val2) > 0.5).astype("int32")

print("Domain1 Validation Accuracy:", accuracy_score(y_val1, y_pred_val1))
print("Domain2 Validation Accuracy:", accuracy_score(y_val2, y_pred_val2))

# Accuracy alone cannot tell a useful classifier from one that always predicts
# the majority class, so also report per-class precision / recall / F1.
# zero_division=0 avoids warnings when a class is never predicted.
print("Domain1 Validation Report:")
print(classification_report(y_val1, y_pred_val1, zero_division=0))
print("Domain2 Validation Report:")
print(classification_report(y_val2, y_pred_val2, zero_division=0))

2023-04-09 18:22:21.539863: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Domain1 Validation Accuracy: 0.9838600943807748
Domain2 Validation Accuracy: 0.8


In [26]:
# Inputs for the model based on the 'prompt' column.
# NOTE: these assignments replace the X_*/y_* arrays built from 'txt' earlier
# in the notebook; the labels are the same, only the feature column changes.

# Domain 1: padded 'prompt' sequences and their labels.
X_train1, X_val1 = (
    pad_sequences(part['prompt'], maxlen=max_len) for part in (train_data1, val_data1)
)
y_train1, y_val1 = train_data1["label"].to_numpy(), val_data1["label"].to_numpy()

# Domain 2: same preparation.
X_train2, X_val2 = (
    pad_sequences(part['prompt'], maxlen=max_len) for part in (train_data2, val_data2)
)
y_train2, y_val2 = train_data2["label"].to_numpy(), val_data2["label"].to_numpy()

# Define model
def create_model():
    """Build and compile the binary sequence classifier.

    NOTE(review): this repeats the ``create_model`` definition from the
    'txt' model cell earlier in the notebook; the two should be kept in sync
    or reduced to a single definition.

    The network is an Embedding layer (``vocab_size`` inputs, 64-dimensional
    vectors, ``max_len`` input length), followed by GlobalAveragePooling1D and
    a single sigmoid output unit. It is compiled with binary cross-entropy
    loss, the Adam optimizer and the accuracy metric.

    ``vocab_size`` and ``max_len`` are read from module-level variables.

    Returns
    -------
    The compiled ``Sequential`` model (untrained).
    """
    layers = [
        Embedding(vocab_size, 64, input_length=max_len),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layers)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

# Train one freshly initialised model per domain (this rebinds model1 / model2)
model1 = create_model()
model1.fit(X_train1, y_train1, epochs=5, batch_size=32, validation_data=(X_val1, y_val1), verbose=1)

model2 = create_model()
model2.fit(X_train2, y_train2, epochs=5, batch_size=32, validation_data=(X_val2, y_val2), verbose=1)

# Make predictions and evaluate: threshold the sigmoid output at 0.5 to get class ids
y_pred_val1 = (model1.predict(X_val1) > 0.5).astype("int32")
y_pred_val2 = (model2.predict(X_val2) > 0.5).astype("int32")

print("Domain1 Validation Accuracy:", accuracy_score(y_val1, y_pred_val1))
print("Domain2 Validation Accuracy:", accuracy_score(y_val2, y_pred_val2))

# Accuracy alone cannot tell a useful classifier from one that always predicts
# the majority class, so also report per-class precision / recall / F1.
# zero_division=0 avoids warnings when a class is never predicted.
print("Domain1 Validation Report:")
print(classification_report(y_val1, y_pred_val1, zero_division=0))
print("Domain2 Validation Report:")
print(classification_report(y_val2, y_pred_val2, zero_division=0))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Domain1 Validation Accuracy: 0.9723202601419677
Domain2 Validation Accuracy: 0.8
