In [1]:
import json
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
# MACHINE_1_P = "./data/set2_machine.json"
# HUMAN_1_P = "./data/set2_human.json"
MACHINE_1_P = "./data/set1_machine.json"
HUMAN_1_P = "./data/set1_human.json"
MACHINE_2_P = "./data/set2_machine.json"
HUMAN_2_P = "./data/set2_human.json"
MACHINE_IND = 1
HUMAND_IND = 0

TEST_FRA = 0.3
RANDOM_SEED = 42

def sparse_transf(info_li, vectorizer, fit=True):
    data_dicts = []
    for record in info_li:
        word_dict = defaultdict(int)
        for token in record:
            word_dict[token] += 1
        data_dicts.append(word_dict)
    if fit == True:
        return vectorizer.fit_transform(data_dicts)
    return vectorizer.transform(data_dicts)


prompt_1 = []
txt_1 = []
label_1 = []
# read machine_set_1 data
with open(MACHINE_1_P, 'r') as file:
    
    f_data = json.load(file)
    prompt_1 += [i["prompt"] for i in f_data]
    txt_1 += [i["txt"] for i in f_data]
    label_1 += [MACHINE_IND for i in range(len(f_data))]

# read human_set_1 data
with open(HUMAN_1_P, 'r') as file:
    f_data = json.load(file)
    prompt_1 += [i["prompt"] for i in f_data]
    txt_1 += [i["txt"] for i in f_data]
    label_1 += [HUMAND_IND for i in range(len(f_data))]


In [2]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import resample
N_SAMPLE = 100


# # StratifiedShuffleSplit
# sss = StratifiedShuffleSplit(n_splits=1, test_size=N_SAMPLE, random_state=42)
# for train_index, test_index in sss.split(data_1, label_1):
#     data_1_train, data_1_test = [data_1[i] for i in train_index], [data_1[i] for i in test_index]
#     label_1_train, label_1_test = [label_1[i] for i in train_index], [label_1[i] for i in test_index]
# print(len(label_1_train), sum(label_1_train))

# train_test_split
data_1 = list(zip(prompt_1, txt_1))
data_1_train, data_1_test, label_1_train, label_1_test = train_test_split(
    data_1, label_1, test_size=TEST_FRA, stratify=label_1, random_state=RANDOM_SEED
)


# Resampling / sub-sampling
h_1 = []
m_1 = []
for i in range(len(data_1_train)):
    if label_1_train[i] == MACHINE_IND:
        m_1.append(data_1_train[i])
    else:
        h_1.append(data_1_train[i])
n_samples = min(len(h_1), len(m_1))

h_1 = resample(h_1, n_samples = n_samples, random_state=0)
m_1 = resample(m_1, n_samples = n_samples, random_state=0)
data_1_train = h_1+m_1
label_1_train = [HUMAND_IND for i in range(n_samples)]+[MACHINE_IND for i in range(n_samples)]
print(len(data_1_train))
print(sum(label_1_train))
data_1_train, d_t_1, label_1_train, l_t_1 = train_test_split(
    data_1_train, label_1_train, test_size=TEST_FRA, random_state=RANDOM_SEED
)
data_1_train += d_t_1
label_1_train += l_t_1


# unzip
prompt_1_train, txt_1_train = zip(*data_1_train)
prompt_1_train, txt_1_train = list(prompt_1_train), list(txt_1_train)

prompt_1_test, txt_1_test = zip(*data_1_test)
prompt_1_test, txt_1_test = list(prompt_1_test), list(txt_1_test)

# vectorize
p_dv = DictVectorizer()
t_dv = DictVectorizer()
prompt_1_train = sparse_transf(prompt_1_train, p_dv)
prompt_1_test = sparse_transf(prompt_1_test, p_dv, False)
txt_1_train = sparse_transf(txt_1_train, t_dv)
txt_1_test = sparse_transf(txt_1_test, t_dv, False)

4900
2450


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
CV_NUM = 5

def vali_model(clf, d_train, label_train, cv_num = CV_NUM):
    print(len(label_train))
    print(sum(label_train))
    label_pred = cross_val_predict(clf, d_train, label_train, cv=cv_num)
    acc_score = accuracy_score(label_train, label_pred)
    print("Accuracy Score: ", acc_score)
    print("Marco F1 score: ", f1_score(label_train, label_pred, average="macro"))
    print("Precision: ", precision_score(label_train, label_pred, average="macro"))
    print("Recall: ", recall_score(label_train, label_pred, average="macro"))
    print("Confusion matrix: ")
    print(confusion_matrix(label_train, label_pred))
    print()


# Naive bayes on promp
print("Naive Bayes' on prompt:")
p_nb_clf = MultinomialNB()
vali_model(p_nb_clf, prompt_1_train, label_1_train)

print("Logistic Regression on prompt: ")
p_lr_clf = LogisticRegression(max_iter=1000)
vali_model(p_lr_clf, prompt_1_train, label_1_train)


# Naive bayes on txt
print("---------------------")
print("Naive Bayes' on text:")
t_nb_clf = MultinomialNB()
vali_model(t_nb_clf, txt_1_train, label_1_train)

print("Logistic Regression on text: ")
t_lr_clf = LogisticRegression(max_iter=1000)
vali_model(t_lr_clf, txt_1_train, label_1_train)

Naive Bayes' on prompt:
4900
2450
Accuracy Score:  0.7542857142857143
Marco F1 score:  0.7539754836210791
Precision:  0.7555748076428479
Recall:  0.7542857142857142
Confusion matrix: 
[[1761  689]
 [ 515 1935]]

Logistic Regression on prompt: 
4900
2450
Accuracy Score:  0.8293877551020408
Marco F1 score:  0.828288803444083
Precision:  0.8380416205891223
Recall:  0.8293877551020408
Confusion matrix: 
[[1836  614]
 [ 222 2228]]

---------------------
Naive Bayes' on text:
4900
2450
Accuracy Score:  0.7687755102040816
Marco F1 score:  0.7664495284335089
Precision:  0.7799269317530566
Recall:  0.7687755102040816
Confusion matrix: 
[[2128  322]
 [ 811 1639]]

Logistic Regression on text: 
4900
2450
Accuracy Score:  0.9169387755102041
Marco F1 score:  0.9168811669929167
Precision:  0.9180978870367644
Recall:  0.9169387755102041
Confusion matrix: 
[[2182  268]
 [ 139 2311]]

