In [14]:
import numpy as np
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend
import pandas as pd
import tensorflow as tf
import random
import os
from gensim.models import Word2Vec
from keras import layers
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from sklearn.utils import class_weight
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense, Concatenate
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
import pickle

NUM_TOKEN = 5000
MAX_PRO_LEN = 64
MAX_TXT_LEN = 256
NO_EPO = 60
NO_BAT = 128
CLS = 5001
SEP = 5002
PAD_ID = 0

MACHINE_1_P = "./data/set1_machine.json"
HUMAN_1_P = "./data/set1_human.json"
MACHINE_2_P = "./data/set2_machine.json"
HUMAN_2_P = "./data/set2_human.json"
TEST_P = "./data/test.json"
RANDOM_SEED = 42
MACHINE_IND = 0
HUMAN_IND = 1
TEST_FRA = 0.2

class DomainData:
    """
    train_test_split, pad_sequence, class_weight
    """
    
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def t_t_spli(self, test_size, random_state):
        self.random_state = random_state
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(self.x, self.y, test_size=test_size, stratify = self.y, random_state = random_state)
        self.train_x = self.train_x.reset_index(drop=True)
        self.train_y = self.train_y.reset_index(drop=True)
        self.test_x = self.test_x.reset_index(drop=True)
        self.test_y = self.test_y.reset_index(drop=True)
        
    def add_sep(self):
        self.x["prompt"] = self.x.apply(lambda x: [i+1 for i in x["prompt"]] , axis = 1)
        self.x["prompt"] = self.x.apply(lambda x: [CLS] + x["prompt"] , axis = 1)
        self.x["prompt"] = self.x.apply(lambda x: x["prompt"] + [SEP] if len(x["prompt"])<MAX_PRO_LEN else x["prompt"][:MAX_PRO_LEN-1] + [SEP], axis = 1)

        self.x["txt"] = self.x.apply(lambda x: [i+1 for i in x["txt"]] , axis = 1)
        self.x["txt"] = self.x.apply(lambda x: [CLS] + x["txt"] , axis = 1)
        self.x["txt"] = self.x.apply(lambda x: x["txt"] + [SEP] if len(x["prompt"])<MAX_TXT_LEN else x["txt"][:MAX_TXT_LEN-1] + [SEP], axis = 1)
        
    
    def add_padding(self, padding, prompt_len, txt_len):
        self.train_prompt = self.train_x["prompt"]
        self.train_txt = self.train_x["txt"]
        self.train_label = self.train_y.to_numpy()
        self.test_prompt = self.test_x["prompt"]
        self.test_txt = self.test_x["txt"]
        self.test_label = self.test_y.to_numpy()
        unique_classes = np.unique(self.train_label)
        class_weights = class_weight.compute_class_weight("balanced", classes=unique_classes, y=self.train_y)
        self.class_weights = dict(zip(unique_classes, class_weights))
        
        self.prompt_len = prompt_len
        self.txt_len = txt_len
        
        self.train_prompt = pad_sequences(self.train_prompt, padding=padding, maxlen=prompt_len, value=PAD_ID)
        self.train_txt = pad_sequences(self.train_txt, padding=padding, maxlen=txt_len, value=PAD_ID)
        self.test_prompt = pad_sequences(self.test_prompt, padding=padding, maxlen=prompt_len, value=PAD_ID)
        self.test_txt = pad_sequences(self.test_txt, padding=padding, maxlen=txt_len, value=PAD_ID)
        
        
    def down_sampling(self):
        mac_ind = self.train_y[self.train_y == MACHINE_IND].index.to_list()
        hum_ind = self.train_y[self.train_y == HUMAN_IND].index.to_list()
        lower = min(len(mac_ind), len(hum_ind))
        sel_lit = mac_ind[:lower] + hum_ind[:lower]
        self.train_x = self.train_x.iloc[sel_lit]
        self.train_y = self.train_y.iloc[sel_lit]
        random.shuffle(sel_lit)

    def over_sampling(self, upper_fra):
        mac_ind = self.train_y[self.train_y == MACHINE_IND].index.to_list()
        hum_ind = self.train_y[self.train_y == HUMAN_IND].index.to_list()
        lower = min(len(mac_ind), len(hum_ind))
        if lower == len(mac_ind):
            upper = int(lower*upper_fra) if lower*upper_fra < len(hum_ind) else len(hum_ind)
            major = hum_ind[:upper]
            minor = mac_ind[:lower]
    
        else:
            upper = int(lower*upper_fra) if lower*upper_fra < len(mac_ind) else len(mac_ind)
            major = mac_ind[:upper]
            minor = hum_ind[:lower]
        
        add_n = upper - lower
        oversampled = []
        while(len(oversampled) < add_n):
            oversampled.append(random.choice(mac_ind))
        sel_lit = major + minor + oversampled
        random.shuffle(sel_lit)
        
        self.train_x = self.train_x.iloc[sel_lit]
        self.train_y = self.train_y.iloc[sel_lit]
    
    def test_down(self, frac = 1):
        mac_ind = self.test_y[self.test_y == MACHINE_IND].index.to_list()
        hum_ind = self.test_y[self.test_y == HUMAN_IND].index.to_list()
        lower = min(len(mac_ind), len(hum_ind))
        if frac > 1:
            sel_lit = mac_ind[:lower] + hum_ind[:int(lower/frac)]
        else:
            sel_lit = mac_ind[:int(lower*frac)] + hum_ind[:lower]
        self.test_x = self.test_x.iloc[sel_lit]
        self.test_y = self.test_y.iloc[sel_lit]
        random.shuffle(sel_lit)
    
    def word2vec(self, vector_size = 100, min_count = 1, file_name = None, model = None):
        if model:
            self.w2v_model = model
        elif file_name:
            self.w2v_model = Word2Vec.load(file_name)
        else:
            w2v_prompt = []
            for i in self.train_prompt:
                w2v_prompt.append(i[np.nonzero(i)])

            w2v_txt = []
            for i in self.train_txt:
                w2v_txt.append(i[np.nonzero(i)])

            w2v_train = [list(w2v_prompt[i]) + list(w2v_txt[i]) for i in range(len(self.train_prompt))]
            print("text merged")
            sentences = [list(map(str, doc)) for doc in w2v_train]
            sentences.append(range(NUM_TOKEN))
            self.w2v_model = Word2Vec(sentences, vector_size=100, min_count = 1)
        self.train_prompt = np.array([np.mean([self.w2v_model.wv[i] for i in j], axis=0) for j in self.train_prompt])
        self.train_txt = np.array([np.mean([self.w2v_model.wv[i] for i in j], axis=0) for j in self.train_txt])
        self.test_prompt = np.array([np.mean([self.w2v_model.wv[i] for i in j], axis=0) for j in self.test_prompt])
        self.test_txt = np.array([np.mean([self.w2v_model.wv[i] for i in j], axis=0) for j in self.test_txt])
        






def f1_loss(y_true, y_pred):
    # Calculate precision and recall
    tp = backend.sum(backend.round(backend.clip(y_true * y_pred, 0, 1)))
    fp = backend.sum(backend.round(backend.clip(y_pred - y_true, 0, 1)))
    fn = backend.sum(backend.round(backend.clip(y_true - y_pred, 0, 1)))
    precision = tp / (tp + fp + backend.epsilon())
    recall = tp / (tp + fn + backend.epsilon())
    
    # Calculate F1 score
    f1_score = 2 * precision * recall / (precision + recall + backend.epsilon())
    
    # Return negative F1 score as the loss (to minimize it)
    return -f1_score


callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience = 3)
random.seed(RANDOM_SEED)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
if len(tf.config.list_physical_devices('GPU')):
    tf.config.list_physical_devices('GPU')
    print("Using GPU")


Num GPUs Available:  1
Using GPU


## Domain 1

In [45]:
vector_size = 64

## _______________ Read data from domain 1 _______________
man_1_df = pd.read_json(HUMAN_1_P)
man_1_df["label"] = HUMAN_IND
mac_1_df = pd.read_json(MACHINE_1_P).drop("machine_id", axis = 1)
mac_1_df["label"] = MACHINE_IND
domain_1_df = pd.concat([man_1_df, mac_1_df])

domain_1 = DomainData(domain_1_df[["prompt", "txt"]], domain_1_df["label"])
domain_1.t_t_spli(TEST_FRA, RANDOM_SEED)
# domain_1.over_sampling(1.6)
domain_1.test_down()
domain_1.add_padding('post', MAX_PRO_LEN, MAX_TXT_LEN)
# domain_1.word2vec(vector_size, 1, "Word2Vec_1.model")
domain_1.word2vec(vector_size, 1)
domain_1.w2v_model.save("Word2Vec_1.model")



text merged


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np
c = 22


# Train a logistic regression model on the training data
clf = LogisticRegression(max_iter = 1000, C=c, class_weight=domain_1.class_weights)
clf.fit(domain_1.train_txt, domain_1.train_label)
with open("lr_d1_v1.pkl", "wb") as f:
    pickle.dump(clf, f)


y_pred = clf.predict(domain_1.test_txt)

# Calculate the F1 score for the predictions
f1 = f1_score(domain_1.test_label, y_pred)

print(domain_1.test_label, y_pred)

print("Domain 1 -- F1 score on the test data: ", f1)
print("confusion matrix: \n", confusion_matrix(domain_1.test_label, y_pred))

# 32/29: 100, 1
# vs64 c22-> 609/609
# vs32 c22-> 605/605
# vs80 c22-> 606/609

[0 0 0 ... 1 1 1] [0 0 0 ... 1 1 1]
Domain 1 -- F1 score on the test data:  0.8681397006414826
confusion matrix: 
 [[606  94]
 [ 91 609]]


## Domain 2

In [4]:
vector_size = 100

## _______________ Read data from domain 2 _______________
man_2_df = pd.read_json(HUMAN_2_P)
man_2_df["label"] = HUMAN_IND
mac_2_df = pd.read_json(MACHINE_2_P).drop("machine_id", axis = 1)
mac_2_df["label"] = MACHINE_IND
domain_2_df = pd.concat([man_2_df, mac_2_df])

domain_2 = DomainData(domain_2_df[["prompt", "txt"]], domain_2_df["label"])
domain_2.t_t_spli(TEST_FRA, RANDOM_SEED)
# domain_2.over_sampling(1.6)
domain_2.test_down()
domain_2.add_padding('post', MAX_PRO_LEN, MAX_TXT_LEN)
# domain_2.word2vec(vector_size, 1, "Word2Vec_2.model")
domain_2.word2vec(vector_size, 1)
# domain_2.w2v_model.save("Word2Vec_2.model")
domain_2.w2v_model.save("Word2Vec_2.model")


text merged


In [5]:
print(len(domain_2.test_label))
print(domain_2.test_label)

40
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np



# Train a logistic regression model on the training data
clf = LogisticRegression(class_weight=domain_2.class_weights)
clf.fit(domain_2.train_txt, domain_2.train_label)
with open("lr_d2_v1.pkl", "wb") as f:
    pickle.dump(clf, f)

y_pred = clf.predict(domain_2.test_txt)

# Calculate the F1 score for the predictions
f1 = f1_score(domain_2.test_label, y_pred)

print(domain_2.test_label, y_pred)

print("Domain 2 -- F1 score on the test data: ", f1)
print("confusion matrix: \n", confusion_matrix(domain_2.test_label, y_pred))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1] [1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
Domain 2 -- F1 score on the test data:  0.7407407407407407
confusion matrix: 
 [[ 6 14]
 [ 0 20]]


## Sample weight class 2

In [6]:
over_fra = 1.6
weight_fra = 100
vector_size = 50

## _______________ Read data from domain 1 _______________
man_1_df = pd.read_json(HUMAN_1_P)
man_1_df["label"] = HUMAN_IND
mac_1_df = pd.read_json(MACHINE_1_P).drop("machine_id", axis = 1)
mac_1_df["label"] = MACHINE_IND
domain_1_df = pd.concat([man_1_df, mac_1_df])

domain_1 = DomainData(domain_1_df[["prompt", "txt"]], domain_1_df["label"])
domain_1.t_t_spli(TEST_FRA, RANDOM_SEED)
# domain_1.over_sampling(over_fra)
# domain_1.down_sampling()
domain_1.add_padding('post', MAX_PRO_LEN, MAX_TXT_LEN)

## _______________ Read data from domain 2 _______________
man_2_df = pd.read_json(HUMAN_2_P)
man_2_df["label"] = HUMAN_IND
mac_2_df = pd.read_json(MACHINE_2_P).drop("machine_id", axis = 1)
mac_2_df["label"] = MACHINE_IND
domain_2_df = pd.concat([man_2_df, mac_2_df])

domain_2 = DomainData(domain_2_df[["prompt", "txt"]], domain_2_df["label"])
domain_2.t_t_spli(TEST_FRA, RANDOM_SEED)
# domain_2.over_sampling(over_fra)
domain_2.test_down()
domain_2.add_padding('post', MAX_PRO_LEN, MAX_TXT_LEN)
# domain_2.word2vec(vector_size, 1, "Word2Vec_2.model")
domain_2.word2vec(vector_size, 1)
domain_2.w2v_model.save("Word2Vec_2.model")

## _______________ weight data _______________
sample_weight_1 = np.ones(len(domain_1.train_label))
sample_weight_2 = np.ones(len(domain_2.train_label))
sample_weight_2 *= weight_fra
sample_weight = np.concatenate([sample_weight_1, sample_weight_2])

train_prompt = np.array([np.mean([domain_2.w2v_model.wv[i] for i in j], axis=0) for j in domain_1.train_prompt])
train_txt = np.array([np.mean([domain_2.w2v_model.wv[i] for i in j], axis=0) for j in domain_1.train_txt])
train_prompt = np.concatenate([train_prompt, domain_2.train_prompt])
train_txt = np.concatenate([train_txt, domain_2.train_txt])
train_label = np.concatenate([domain_1.train_label, domain_2.train_label])

data = list(zip(train_prompt, train_txt, train_label, sample_weight))
random.shuffle(data)

train_prompt, train_txt, train_label, sample_weight = zip(*data)
train_prompt = np.array(train_prompt)
train_txt = np.array(train_txt)
train_label = np.array(train_label)
sample_weight = np.array(sample_weight)




text merged


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np



# Train a logistic regression model on the training data
clf = LogisticRegression(class_weight=domain_2.class_weights)
clf.fit(train_txt, train_label, sample_weight = sample_weight)

y_pred = clf.predict(domain_2.test_txt)

# Calculate the F1 score for the predictions
f1 = f1_score(domain_2.test_label, y_pred)

print(domain_2.test_label, y_pred)

print("Domain 2 -- F1 score on the test data: ", f1)
print("confusion matrix: \n", confusion_matrix(domain_2.test_label, y_pred))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
Domain 2 -- F1 score on the test data:  0.6666666666666666
confusion matrix: 
 [[ 0 20]
 [ 0 20]]


### test on test

In [8]:
DOMAIN_SPL = 600

w2v_model_1 = Word2Vec.load("Word2Vec_1.model")
w2v_model_2 = Word2Vec.load("Word2Vec_2.model")
test_df = pd.read_json(TEST_P)
test_txt = list(pad_sequences(test_df["txt"], padding="post", maxlen=MAX_TXT_LEN))


test_txt[:DOMAIN_SPL] = np.array([np.mean([w2v_model_1.wv[i] for i in j], axis=0) for j in test_txt[:DOMAIN_SPL]])
test_txt[DOMAIN_SPL:] = np.array([np.mean([w2v_model_2.wv[i] for i in j], axis=0) for j in test_txt[DOMAIN_SPL:]])
with open("lr_d1_v1.pkl", "rb") as f:
    model_1 = pickle.load(f)
with open("lr_d2_v1.pkl", "rb") as f:
    model_2 = pickle.load(f)

pred = []
pred += model_1.predict(test_txt[:DOMAIN_SPL]).tolist()
pred += model_2.predict(test_txt[DOMAIN_SPL:]).tolist()
pred = [int(i) for i in np.round(pred).flatten()]


In [9]:
pred_df = pd.DataFrame(pred)
pred_df.columns = ["Predicted"]
pred_df.index.names = ['Id']

pred_df.to_csv("./data/result7.csv")

## Draft