## Read files

In [None]:
import json
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
# MACHINE_1_P = "./data/set2_machine.json"
# HUMAN_1_P = "./data/set2_human.json"
MACHINE_1_P = "./data/set1_machine.json"
HUMAN_1_P = "./data/set1_human.json"
MACHINE_2_P = "./data/set2_machine.json"
HUMAN_2_P = "./data/set2_human.json"
MACHINE_IND = 1
HUMAND_IND = 0

TEST_FRA = 0.3
RANDOM_SEED = 42

def sparse_transf(info_li, vectorizer, fit=True):
    data_dicts = []
    for record in info_li:
        word_dict = defaultdict(int)
        for token in record:
            word_dict[token] += 1
        data_dicts.append(word_dict)
    if fit == True:
        return vectorizer.fit_transform(data_dicts)
    return vectorizer.transform(data_dicts)


prompt_1 = []
text_1 = []
label_1 = []
# read machine_set_1 data
with open(MACHINE_1_P, 'r') as file:
    
    f_data = json.load(file)
    prompt_1 += [i["prompt"] for i in f_data]
    text_1 += [i["txt"] for i in f_data]
    label_1 += [MACHINE_IND for i in range(len(f_data))]

# read human_set_1 data
with open(HUMAN_1_P, 'r') as file:
    f_data = json.load(file)
    prompt_1 += [i["prompt"] for i in f_data]
    text_1 += [i["txt"] for i in f_data]
    label_1 += [HUMAND_IND for i in range(len(f_data))]


### Preprocessing - shuffle / train-test split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import resample
N_SAMPLE = 100


# # StratifiedShuffleSplit
# sss = StratifiedShuffleSplit(n_splits=1, test_size=N_SAMPLE, random_state=42)
# for train_index, test_index in sss.split(data_1, label_1):
#     data_1_train, data_1_test = [data_1[i] for i in train_index], [data_1[i] for i in test_index]
#     label_1_train, label_1_test = [label_1[i] for i in train_index], [label_1[i] for i in test_index]
# print(len(label_1_train), sum(label_1_train))

# _______________ train_test_split _______________
data_1 = list(zip(prompt_1, text_1))
data_1_train, data_1_test, label_1_train, label_1_test = train_test_split(data_1, label_1, test_size=TEST_FRA, stratify=label_1, random_state=RANDOM_SEED)


# _______________ Resampling / sub-sampling _______________
h_1 = []
m_1 = []
for i in range(len(data_1_train)):
    if label_1_train[i] == MACHINE_IND:
        m_1.append(data_1_train[i])
    else:
        h_1.append(data_1_train[i])
n_samples = min(len(h_1), len(m_1))

h_1 = resample(h_1, n_samples = n_samples, random_state=0)
m_1 = resample(m_1, n_samples = n_samples, random_state=0)
data_1_train = h_1+m_1
label_1_train = [HUMAND_IND for i in range(n_samples)]+[MACHINE_IND for i in range(n_samples)]
print(len(data_1_train))
print(sum(label_1_train))
data_1_train, d_t_1, label_1_train, l_t_1 = train_test_split(data_1_train, label_1_train, test_size=TEST_FRA, random_state=RANDOM_SEED)
data_1_train += d_t_1
label_1_train += l_t_1



# unzip
prompt_1_train, text_1_train = zip(*data_1_train)
prompt_1_train, text_1_train = list(prompt_1_train), list(text_1_train)

prompt_1_test, text_1_test = zip(*data_1_test)
prompt_1_test, text_1_test = list(prompt_1_test), list(text_1_test)



4900
2450


## Benchmark -- Naive Bayes / Logistic Regression

In [None]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
CV_NUM = 5

def vali_model(clf, d_train, label_train, cv_num = CV_NUM):
    print(len(label_train))
    print(sum(label_train))
    label_pred = cross_val_predict(clf, d_train, label_train, cv=cv_num)
    acc_score = accuracy_score(label_train, label_pred)
    print("Accuracy Score: ", acc_score)
    print("Marco F1 score: ", f1_score(label_train, label_pred, average="macro"))
    print("Precision: ", precision_score(label_train, label_pred, average="macro"))
    print("Recall: ", recall_score(label_train, label_pred, average="macro"))
    print("Confusion matrix: ")
    print(confusion_matrix(label_train, label_pred))
    print()


# _______________ vectorize _______________
p_dv = DictVectorizer()
t_dv = DictVectorizer()
prompt_1_train_v = sparse_transf(prompt_1_train, p_dv)
prompt_1_test_v = sparse_transf(prompt_1_test, p_dv, False)
text_1_train_v = sparse_transf(text_1_train, t_dv)
text_1_test_v = sparse_transf(text_1_test, t_dv, False)

# _______________ Promp _______________
# Naive bayes on promp
print("Naive Bayes' on prompt:")
p_nb_clf = MultinomialNB()
vali_model(p_nb_clf, prompt_1_train_v, label_1_train)
# Logistic Regression on promp
print("Logistic Regression on prompt: ")
p_lr_clf = LogisticRegression()
vali_model(p_lr_clf, prompt_1_train_v, label_1_train)

# _______________ text _______________
# Naive bayes on text
print("Naive Bayes' on text:")
t_nb_clf = MultinomialNB()
vali_model(t_nb_clf, text_1_train_v, label_1_train)
# Logistic Regression on text
print("Logistic Regression on text: ")
t_lr_clf = LogisticRegression()
vali_model(t_lr_clf, text_1_train_v, label_1_train)

Naive Bayes' on prompt:
4900
2450
Accuracy Score:  0.7542857142857143
Marco F1 score:  0.7539754836210791
Precision:  0.7555748076428479
Recall:  0.7542857142857142
Confusion matrix: 
[[1761  689]
 [ 515 1935]]

Logistic Regression on prompt: 
4900
2450


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy Score:  0.8295918367346938
Marco F1 score:  0.828477224445821
Precision:  0.8383876609409944
Recall:  0.8295918367346939
Confusion matrix: 
[[1835  615]
 [ 220 2230]]

Naive Bayes' on text:
4900
2450
Accuracy Score:  0.7687755102040816
Marco F1 score:  0.7664495284335089
Precision:  0.7799269317530566
Recall:  0.7687755102040816
Confusion matrix: 
[[2128  322]
 [ 811 1639]]

Logistic Regression on text: 
4900
2450


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy Score:  0.9175510204081633
Marco F1 score:  0.9175124185562225
Precision:  0.9183340944415068
Recall:  0.9175510204081633
Confusion matrix: 
[[2195  255]
 [ 149 2301]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## RNN


In [None]:
from keras.models import Sequential
from keras import layers
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

NUM_TOKEN = 5000
MAX_PRO_LEN = 50
MAX_TEXT_LEN = 200
NO_EPO = 4
NO_BAT = 10


In [None]:
## _______________ RNN on Prompts _______________

p_model_rnn= tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=NUM_TOKEN,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
p_model_rnn.compile(optimizer="Adam", loss="binary_crossentropy", metrics=['accuracy'])

prompt_1_train_np = pad_sequences(prompt_1_train, padding='post', maxlen=MAX_PRO_LEN)
label_1_train_np = np.array(label_1_train)

p_model_rnn.fit(prompt_1_train_np, label_1_train_np, epochs=NO_EPO, verbose=True, batch_size=NO_BAT)

prompt_1_test_np = pad_sequences(prompt_1_test, padding='post', maxlen=MAX_PRO_LEN)
label_1_test_np = np.array(label_1_test)
loss, accuracy = p_model_rnn.evaluate(prompt_1_test_np, label_1_test_np, verbose=False)

prompt_1_pre_rnn = p_model_rnn.predict(prompt_1_test_np)
confusion = tf.math.confusion_matrix(labels=label_1_test_np, predictions=prompt_1_pre_rnn, num_classes=2)
print(confusion)

print("loss: {}, accuracy: {}".format(loss, accuracy))

Epoch 1/5


2023-04-09 13:04:04.269798: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-04-09 13:04:06.881071: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


AttributeError: module 'tensorflow' has no attribute 'confusion_matrix'

In [None]:
# _______________ RNN on Text _______________
t_model_rnn= tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=NUM_TOKEN,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
t_model_rnn.compile(optimizer="Adam", loss="binary_crossentropy", metrics=['accuracy'])

text_1_train_np = pad_sequences(text_1_train, padding='post', maxlen=MAX_TEXT_LEN)
label_1_train_np = np.array(label_1_train)
t_model_rnn.fit(text_1_train_np, label_1_train_np, epochs=NO_EPO, verbose=True, batch_size=NO_BAT)

text_1_test_np = pad_sequences(text_1_train, padding='post', maxlen=MAX_TEXT_LEN)
label_1_test_np = np.array(label_1_test)
loss, accuracy = t_model_rnn.evaluate(prompt_1_test_np, label_1_test_np, verbose=False)

text_1_pre_rnn = t_model_rnn.predict(prompt_1_test_np)
confusion = tf.math.confusion_matrix(labels=label_1_test, predictions=text_1_pre_rnn, num_classes=2)
print(confusion)

print("loss: {}, accuracy: {}".format(loss, accuracy))


Epoch 1/4


2023-04-09 13:28:49.741935: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-04-09 13:28:51.846716: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]


Epoch 2/4
Epoch 3/4
Epoch 4/4
tf.Tensor(
[[36776     0]
 [ 1050     0]], shape=(2, 2), dtype=int32)
loss: 2.1054835319519043, accuracy: 0.57317715883255
