From 879bd9c81f26f62e0ecb3eb9b030484368d98dc0 Mon Sep 17 00:00:00 2001
From: Hamid Karimi
Date: Tue, 26 Feb 2019 13:27:38 -0500
Subject: [PATCH] Add files via upload

---
 config.py |  33 +++++++++
 model.py  | 152 +++++++++++++++++++++++++++++++++++++++++
 test.py   |  96 ++++++++++++++++++++++++++
 train.py  | 133 ++++++++++++++++++++++++++++++++++++
 utils.py  | 197 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 611 insertions(+)
 create mode 100644 config.py
 create mode 100644 model.py
 create mode 100644 test.py
 create mode 100644 train.py
 create mode 100644 utils.py

diff --git a/config.py b/config.py
new file mode 100644
index 0000000..2f1c231
--- /dev/null
+++ b/config.py
@@ -0,0 +1,33 @@
+import argparse
+import os
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+def path(p):
+    return os.path.expanduser(p)
+
+
+parser = argparse.ArgumentParser(description='Arguments for DHSF Project')
+
+parser.add_argument("--project_dir", type=str, required=False, default='/PATH/DHSF/', help="project directory")
+parser.add_argument("--dropout", type=float, required=False, default=0.0, help="dropout probability")
+parser.add_argument("--sim_name", type=str, required=False, default="sim1", help="unique, arbitrary name of the simulation")
+parser.add_argument("--l2_coeff", type=float, required=False, default=0.00, help="L2 regularization coefficient")
+parser.add_argument("--train_embeddings", type=str2bool, required=False, default=True, help="whether to train the word embeddings")
+parser.add_argument("--lr", type=float, required=False, default=0.01, help="initial learning rate")
+parser.add_argument("--batch_size", type=int, required=False, default=40, help="batch size")
+parser.add_argument('--step_num', type=int, required=False, default=200, help='number of steps to run the simulation')
+parser.add_argument('--gpu', default=-1, type=int, help='GPU id; a value > -1 moves the model to that GPU')
+parser.add_argument('--word_dim', default=300, type=int)
+parser.add_argument("--fill_embedding", type=str2bool, required=False, default=True, help="whether to initialize embeddings from pre-trained word vectors")
+parser.add_argument('--blstm_hidden_unit_dim', default=100, type=int)
+
+args = parser.parse_args()
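A minimal invocation sketch (illustrative, assuming the default project layout under --project_dir); any flag omitted falls back to the defaults declared above:

    python train.py --project_dir /PATH/DHSF/ --sim_name sim1 --gpu 0 --batch_size 40 --lr 0.01 --step_num 200
    python test.py --project_dir /PATH/DHSF/ --sim_name sim1 --gpu 0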
diff --git a/model.py b/model.py
new file mode 100644
index 0000000..1cdaa80
--- /dev/null
+++ b/model.py
@@ -0,0 +1,152 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import config
+import utils
+args = config.args
+class DependencyBLSTM(nn.Module):
+    def __init__(self, num_words, max_sen_length, max_doc_sent_length):
+        super(DependencyBLSTM, self).__init__()
+        self.max_sen_length = max_sen_length
+        self.max_doc_sent_length = max_doc_sent_length
+        self.dropout = nn.Dropout(p=args.dropout)
+        self.word_embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=args.word_dim)
+        self.Softmax = nn.Softmax(dim=0)
+        ############################# Sentence level Functions #######################################
+
+        self.forwardLSTM_sent = nn.LSTM(num_layers=1, input_size=args.word_dim,
+                                        dropout=args.dropout,
+                                        hidden_size=int(args.blstm_hidden_unit_dim),
+                                        batch_first=True)
+        self.backwardLSTM_sent = nn.LSTM(num_layers=1, input_size=args.word_dim,
+                                         dropout=args.dropout,
+                                         hidden_size=args.blstm_hidden_unit_dim,
+                                         batch_first=True)
+        self.sentence_encoder = nn.Sequential(nn.Linear(args.word_dim, args.blstm_hidden_unit_dim),
+                                              nn.LeakyReLU(),
+                                              nn.Dropout(p=args.dropout))
+        ############################# Doc level Functions #######################################
+
+        self.parent_encoder_doc = nn.Sequential(
+            nn.Linear(args.blstm_hidden_unit_dim, int(args.blstm_hidden_unit_dim)),
+            nn.LeakyReLU(), nn.Dropout(p=args.dropout))
+        self.child_encoder_doc = nn.Sequential(
+            nn.Linear(args.blstm_hidden_unit_dim, int(args.blstm_hidden_unit_dim)),
+            nn.LeakyReLU(), nn.Dropout(p=args.dropout))
+
+
+        self.root_score_encoder_doc = nn.Linear(args.blstm_hidden_unit_dim, 1)
+        self.root_embed_doc = \
+            utils.wrap_with_variable(torch.FloatTensor(np.zeros(shape=(args.blstm_hidden_unit_dim))),
+                                     gpu=args.gpu,
+                                     requires_grad=True)
+        self.r_embeds_doc = nn.Sequential(
+            nn.Linear(3 * args.blstm_hidden_unit_dim,
+                      int(args.blstm_hidden_unit_dim)),
+            nn.LeakyReLU(), nn.Dropout(p=args.dropout))
+
+        self.final_binary_classifier = nn.Linear(int(args.blstm_hidden_unit_dim), 2)
+
+    def create_sentence_batches(self, docs):
+        all_doc_batches = []
+        all_doc_batches_inverse = []
+        all_doc_seq_lengths = []
+        for doc in docs:
+            doc_sent_embed, doc_sent_embed_inverse = [], []
+            seq_lengths = []
+            for sent_word_indices in doc['word_indices']:
+                j = utils.wrap_with_variable(torch.LongTensor(np.array(sent_word_indices).astype(int)), gpu=args.gpu, requires_grad=False)
+
+                word_embed = self.word_embedding(j)
+                word_embed = self.dropout(word_embed)
+                X = torch.zeros(self.max_sen_length, args.word_dim)
+                X[0:len(sent_word_indices)] = word_embed.data
+                X = utils.wrap_with_variable(X, gpu=args.gpu, requires_grad=True)
+                doc_sent_embed.append(X)
+
+                idx = [i for i in range(word_embed.data.size(0) - 1, -1, -1)]
+                if args.gpu > -1:
+                    idx = torch.LongTensor(idx).cuda(args.gpu)
+                else:
+                    idx = torch.LongTensor(idx)
+                X_inverse = torch.zeros(self.max_sen_length, args.word_dim)
+                X_inverse[0:len(sent_word_indices)] = word_embed.data.index_select(0, idx)
+                X_inverse = utils.wrap_with_variable(X_inverse, gpu=args.gpu, requires_grad=True)
+                doc_sent_embed_inverse.append(X_inverse)
+
+                seq_lengths.append(len(sent_word_indices))
+
+            doc_sent_embed = torch.stack(doc_sent_embed)
+            doc_sent_embed_inverse = torch.stack(doc_sent_embed_inverse)
+            all_doc_batches.append(doc_sent_embed)
+            all_doc_batches_inverse.append(doc_sent_embed_inverse)
+            all_doc_seq_lengths.append(seq_lengths)
+        return all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths
+
+    def get_sentence_encodings(self, all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths):
+        all_doc_sentence_encodings = []
+        for doc_batch, doc_batch_inverse, doc_seq_length in zip(all_doc_batches, all_doc_batches_inverse,
+                                                                all_doc_seq_lengths):
+            doc_sentence_encodings = []
+
+            fwrd_outputs, _ = self.forwardLSTM_sent(doc_batch)
+            bwrd_outputs, _ = self.backwardLSTM_sent(doc_batch_inverse)
+            for sent_forward, sent_backward, l in zip(doc_batch, doc_batch_inverse, doc_seq_length):
+                idx = [i for i in range(l - 1, -1, -1)]
+                if args.gpu > -1:
+                    idx = torch.LongTensor(idx).cuda(args.gpu)
+                else:
+                    idx = torch.LongTensor(idx)
+                bwrd_outputs_inverse = utils.wrap_with_variable(sent_backward.data.index_select(0, idx), gpu=args.gpu,
+                                                                requires_grad=True)
+
+                h = self.sentence_encoder(0.5 * (sent_forward[l - 1] + bwrd_outputs_inverse[l - 1]))
+                doc_sentence_encodings.append(h)
+            doc_sentence_encodings = torch.stack(doc_sentence_encodings)
+            all_doc_sentence_encodings.append(doc_sentence_encodings)
+        return all_doc_sentence_encodings
+
+    def forward(self, docs):
+        all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths = self.create_sentence_batches(docs)
+        all_doc_sentence_encodings = self.get_sentence_encodings(all_doc_batches, all_doc_batches_inverse,
+                                                                 all_doc_seq_lengths)
+        all_doc_doc_dependency_tree_info = []
+        all_final_features = []
+        for sentence_encodings in all_doc_sentence_encodings:
+            fri = []
+            Aij = []
+            for i in range(len(sentence_encodings)):
+                fri.append(self.root_score_encoder_doc(sentence_encodings[i]))
+                for j in range(len(sentence_encodings)):
+                    if i == j:
+                        Aij.append(utils.wrap_with_variable(torch.tensor(-9999999.000), gpu=args.gpu, requires_grad=True))
+                        continue
+
+                    x = torch.dot(self.parent_encoder_doc(sentence_encodings[i]),
+                                  self.child_encoder_doc(sentence_encodings[j]))
+                    Aij.append(x)
+            Aij = torch.stack(Aij)
+            Aij = Aij.view(len(sentence_encodings), len(sentence_encodings))
+            Aij = self.Softmax(Aij)
+            fri = torch.stack(fri)
+            fri = self.Softmax(fri)
+            ri = []
+            for i in range(len(sentence_encodings)):
+                tmp = []
+                tmp3 = []
+                for k in range(len(sentence_encodings)):
+                    tmp.append(torch.mul(Aij[k, i], sentence_encodings[k]))
+                    tmp3.append(torch.mul(Aij[i, k], sentence_encodings[i]))
+                tmp3 = torch.stack(tmp3)
+                tmp3 = torch.sum(tmp3, 0)
+                tmp = torch.stack(tmp)
+                tmp = torch.sum(tmp, 0)
+                tmp2 = torch.mul(fri[i], self.root_embed_doc)
+                ri.append(self.r_embeds_doc(torch.cat((sentence_encodings[i], tmp + tmp2, tmp3))))
+            ri = torch.stack(ri)
+            final_feature = torch.mean(ri, 0)
+            all_final_features.append(final_feature)
+            all_doc_doc_dependency_tree_info.append([Aij.data, fri.data])
+
+        all_final_features = torch.stack(all_final_features)
+        output = self.final_binary_classifier(all_final_features)
+        return output, all_doc_doc_dependency_tree_info
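A minimal sketch of the input DependencyBLSTM.forward expects, based on get_split_data in utils.py below; the document and index values are hypothetical (indices refer to rows of Data/words.csv, and strings are accepted because create_sentence_batches casts them to int):

    doc = {'word_indices': [['12', '5', '891'], ['44', '7']],  # two sentences
           'label': 1}                                          # 1 = Fake, 0 = Real
    logits, tree_info = model([doc])   # logits: shape (1, 2); tree_info[0] = [Aij, fri]

Here Aij holds the softmax-normalized sentence-pair scores and fri the root scores that test.py later pickles into matrix.pkl.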
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..37db462
--- /dev/null
+++ b/test.py
@@ -0,0 +1,96 @@
+import os
+import pickle
+import numpy as np
+import torch
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import confusion_matrix, classification_report
+from torch import nn
+from torch.autograd import Variable
+import config
+import utils
+from model import DependencyBLSTM
+
+args = config.args
+output_dir = args.project_dir + 'Models/' + args.sim_name + '/'
+utils.creat_word_embedding()
+
+def test():
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    save_path = args.project_dir + 'Models/' + args.sim_name + '/model.pt'
+
+    if os.path.exists(save_path):
+        model = torch.load(save_path)
+        print('Great!!! Pre-Trained Model Loaded !!!')
+    else:
+        print('No pre-trained model')
+        model = DependencyBLSTM(num_words=utils.get_num_words(), max_sen_length=97, max_doc_sent_length=326)
+
+    if not os.path.exists(output_dir + 'test_performance_log.txt'):
+        test_performance_log = open(output_dir + 'test_performance_log.txt', 'w')
+    else:
+        test_performance_log = open(output_dir + 'test_performance_log.txt', 'a')
+
+    if args.gpu > -1:
+        model.cuda(device=int(args.gpu))
+    if args.fill_embedding:
+        embed = utils.get_word_embeddings(source='google')
+        if args.gpu > -1:
+            model.word_embedding.weight.data.set_(torch.FloatTensor(embed).cuda(int(args.gpu)))
+        else:
+            model.word_embedding.weight.data.set_(torch.FloatTensor(embed))
+    else:
+        if args.gpu > -1:
+            model.word_embedding.weight.data.set_(
+                torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)).cuda(int(args.gpu)))
+        else:
+            model.word_embedding.weight.data.set_(
+                torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)))
+
+    if args.train_embeddings == False:
+        model.word_embedding.weight.requires_grad = False
+
+    criterion = nn.CrossEntropyLoss()
+    test_set = utils.get_split_data(split='test')
+    print('Start Test ...')
+    model.eval()
+    test_labels = [d['label'] for d in test_set]
+    labels = Variable(torch.LongTensor(test_labels))
+
+    lengths = [len(doc['word_indices']) for doc in test_set]
+    doc_encodings, all_doc_doc_dependency_tree_info = model(test_set)
+
+    outputs = doc_encodings.cpu()
+    loss = criterion(outputs, labels).data[0]
+    _, predictions = torch.max(outputs.data, 1)
+    predictions = predictions.numpy()
+
+    with open(output_dir + "matrix.pkl", 'wb') as f:
+        pickle.dump([lengths, all_doc_doc_dependency_tree_info, test_labels], f)
+
+    accuracy = accuracy_score(y_true=np.array(test_labels), y_pred=predictions)
+    report = classification_report(y_true=np.array(test_labels), y_pred=predictions, target_names=['Real', 'Fake'])
+    conf_matrix = confusion_matrix(y_true=np.array(test_labels), y_pred=predictions)
+    test_performance_log.write(" Loss {} Accuracy {} \n".format(loss, accuracy))
+    test_performance_log.write("{}\n".format(report))
+    test_performance_log.write("{}\n".format(conf_matrix))
+    test_performance_log.write("{}\n".format('=' * 50))
+
+    print('************* Test ****************')
+    print("Loss {} Accuracy {} ".format(loss, accuracy))
+    print(report)
+    print(conf_matrix)
+    print('*****************************************')
+    return accuracy
+
+
+test()
+
+fake_doc_stat, real_doc_stat = utils.dependecy_tree_stat(dir=output_dir)
+#print(fake_doc_stat)
+#print('*' * 100)
+#print(real_doc_stat)
+#print('*' * 100)
+print("Avg. Number of Leaf Nodes: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 2]), np.mean(real_doc_stat[:, 2])))
+print("Avg. Preorder Difference: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 4]), np.mean(real_doc_stat[:, 4])))
+print("Avg. Parent-Child Distance: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 3]), np.mean(real_doc_stat[:, 3])))
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..b9a0e95
--- /dev/null
+++ b/train.py
@@ -0,0 +1,133 @@
+import os
+import random
+import numpy as np
+import torch
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import confusion_matrix, classification_report
+from torch import nn
+from torch.autograd import Variable
+from torch.optim.lr_scheduler import StepLR
+import config
+import utils
+from model import DependencyBLSTM
+
+args = config.args
+output_dir = args.project_dir + 'Models/' + args.sim_name + '/'
+
+utils.creat_word_embedding()
+
+def run():
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    save_path = args.project_dir + 'Models/' + args.sim_name + '/model.pt'
+
+    with open(output_dir + 'config', 'w') as config_file:
+        argss = str(args).split('(')[1].split(')')[0].split(',')
+        for a in argss:
+            config_file.write("{}\n".format(a))
+    if os.path.exists(save_path):
+        model = torch.load(save_path)
+        model_loaded = True
+        print('Great!!! Pre-Trained Model Loaded !!!')
+    else:
+        model_loaded = False
+        print('No pre-trained model')
+        model = DependencyBLSTM(num_words=utils.get_num_words(), max_sen_length=97, max_doc_sent_length=326)
+
+    if not os.path.exists(output_dir + 'train_performance_log.csv'):
+        train_performance_log = open(output_dir + 'train_performance_log.csv', 'w')
+        train_performance_log.write('Step,Loss\n')
+    else:
+        train_performance_log = open(output_dir + 'train_performance_log.csv', 'a')
+
+    if not os.path.exists(output_dir + 'eval_performance_log.txt'):
+        eval_performance_log = open(output_dir + 'eval_performance_log.txt', 'w')
+    else:
+        eval_performance_log = open(output_dir + 'eval_performance_log.txt', 'a')
+
+    if args.gpu > -1:
+        model.cuda(device=int(args.gpu))
+
+    if not model_loaded:
+        if args.fill_embedding:
+            embed = utils.get_word_embeddings(source='google')
+            if args.gpu > -1:
+                model.word_embedding.weight.data.set_(torch.FloatTensor(embed).cuda(int(args.gpu)))
+            else:
+                model.word_embedding.weight.data.set_(torch.FloatTensor(embed))
+        else:
+            if args.gpu > -1:
+                model.word_embedding.weight.data.set_(
+                    torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)).cuda(
+                        int(args.gpu)))
+            else:
+                model.word_embedding.weight.data.set_(
+                    torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)))
+
+    if args.train_embeddings == False:
+        model.word_embedding.weight.requires_grad = False
+
+    params = [p for p in model.parameters() if p.requires_grad]
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(params=params, lr=args.lr, weight_decay=args.l2_coeff)
+    model.zero_grad()
+    scheduler = StepLR(optimizer, step_size=50, gamma=0.9)
+    print('Loading sets...')
+    dev_set = utils.get_split_data(split='dev')
+    train_set = utils.get_split_data(split='train')
+    train_set = train_set[0:int(len(train_set) / 2)]
+    print('Train and dev sets loaded')
+
+    def train():
+        prev_accuracy = 0
+        model.train()
+        for step in range(args.step_num + 1):
+            random.shuffle(train_set)
+            docs = train_set[0:args.batch_size]
+            labels = Variable(torch.LongTensor([d['label'] for d in docs]))
+            doc_encodings, _ = model(docs)
+            optimizer.zero_grad()
+            outputs = doc_encodings.cpu()
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optimizer.step()
+            scheduler.step()
+            print("Step {} Loss {}".format(step, loss.data[0]))
+            train_performance_log.write("{},{}\n".format(step, loss.data[0]))
+            if step % 20 == 0 and step:
+                accuracy = evaluation(step)
+                if accuracy >= prev_accuracy:
+                    torch.save(model, save_path)
+                    print("Best model saved in {} Accuracy {}".format(save_path, accuracy))
+                    prev_accuracy = accuracy
+
+    def evaluation(step):
+        print('Start evaluation ...')
+        model.eval()
+        eval_labels = [d['label'] for d in dev_set]
+        labels = Variable(torch.LongTensor(eval_labels))
+        doc_encodings, _ = model(dev_set)
+        outputs = doc_encodings.cpu()
+        loss = criterion(outputs, labels).data[0]
+        _, predictions = torch.max(outputs.data, 1)
+        predictions = predictions.numpy()
+
+        accuracy = accuracy_score(y_true=np.array(eval_labels), y_pred=predictions)
+        report = classification_report(y_true=np.array(eval_labels), y_pred=predictions, target_names=['Real', 'Fake'])
+        conf_matrix = confusion_matrix(y_true=np.array(eval_labels), y_pred=predictions)
+        eval_performance_log.write("Step {}, Loss {} Accuracy {} \n".format(step, loss, accuracy))
+        eval_performance_log.write("{}\n".format(report))
+        eval_performance_log.write("{}\n".format(conf_matrix))
+        eval_performance_log.write("{}\n".format('=' * 50))
+
+        print('************* Evaluation ****************')
+        print("Step {}, Loss {} Accuracy {} ".format(step, loss, accuracy))
+        print(report)
+        print(conf_matrix)
+        print('*****************************************')
+        return accuracy
+
+    train()
+
+
+run()
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..ee8ad01
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,197 @@
+import pickle
+import numpy as np
+import pandas as pd
+from torch.autograd import Variable
+import config
+import gensim.models.keyedvectors as word2vec
+import os
+import urllib.request
+args = config.args
+
+
+###################### Tree ################################################
+class myTree(object):
+    def __init__(self, name='Node', children=None, data=None, parent=None):
+        self.parent = parent
+        self.name = name
+        self.index = -1
+        self.children = []
+        self.data = data
+        self.characters = []
+        self.parent_relation_index = -1
+        if children is not None:
+            for child in children:
+                self.add_child(child)
+
+    def __repr__(self):
+        return self.name
+
+    def add_child(self, node):
+        assert isinstance(node, myTree)
+        self.children.append(node)
+
+    def __str__(self):
+        if len(self.children) == 0:
+            x = 'NONE'
+        else:
+            x = [a.name for a in self.children]
+        return "{} Children:{}".format(self.name, x)
+
+    def __getitem__(self, item):
+        return self.name
+
+
+def get_leaves(node):
+    leaves = []
+    if len(node.children) == 0:
+        leaves.append(node)
+    else:
+        for child in node.children:
+            leaves.extend(get_leaves(child))
+    return leaves
+
+
+def get_inorder(tree, X):
+    X.append(tree.name)
+    if len(tree.children) == 0:
+        return
+    for t in tree.children:
+        X.append(get_inorder(t, X))
+
+
+###################### Utility Functions ################################################
+
+def wrap_with_variable(tensor, gpu, requires_grad=True):
+    if gpu > -1:
+        return Variable(tensor.cuda(gpu), requires_grad=requires_grad)
+    else:
+        return Variable(tensor, requires_grad=requires_grad)
+
+
+def get_word_embeddings(source='google'):
+    with open(args.project_dir + 'Data/word_emb_' + source + '.pkl', 'rb') as f:
+        embed = pickle.load(f)
+    return embed
+
+
+def creat_word_embedding():
+    if os.path.exists(args.project_dir + 'Data/word_emb_google.pkl'):
+        return
+    if not os.path.exists(args.project_dir + 'GoogleNews-vectors-negative300.bin'):
+        print("Please download https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit and place it in {}".format(args.project_dir))
+        exit(-1)
+    print('Creating word embeddings ...')
+    model = word2vec.KeyedVectors.load_word2vec_format(args.project_dir + 'GoogleNews-vectors-negative300.bin',
+                                                       binary=True)
+    all_words = pd.read_csv(args.project_dir + "Data/words.csv", names=['Index', 'Word', 'Freq'])
+    word_embeddings = np.zeros(shape=(len(all_words) + 1, args.word_dim))
+    non_exist_words = []
+    for w in all_words.values:
+        if w[1] in model:
+            word_embeddings[w[0]] = model[w[1]]
+        else:
+            word_embeddings[w[0]] = np.random.rand(1, args.word_dim)[0]
+            non_exist_words.append(w[1])
+    word_embeddings[len(all_words)] = np.random.rand(1, args.word_dim)[0]
+    with open(args.project_dir + 'Data/word_emb_google.pkl', 'wb') as pkl:
+        pickle.dump(word_embeddings, pkl)
+    print('Word embedding file created and saved in {}'.format(args.project_dir + 'Data/word_emb_google.pkl'))
+    #print(non_exist_words, len(non_exist_words))
+
+
+def get_split_data(split='train'):
+    documents = []
+    with open(args.project_dir + 'Splits/' + split + '.split.csv', 'r') as f:
+        records = f.read().splitlines()
+
+    for record in records:
+        label = record.split(',')[-1]
+        if label == 'Fake':
+            l = 1
+        else:
+            l = 0
+        with open(args.project_dir + 'Data/' + record.split(',')[1], 'r') as f:
+            word_indices = f.read().splitlines()
+            word_indices = [w.split(',') for w in word_indices]
+            documents.append({"word_indices": word_indices, 'label': l})
+    return documents
+
+
+def get_num_words():
+    all_words = pd.read_csv(args.project_dir + "Data/words.csv", names=['Index', 'Word', 'Freq'])
+    return len(all_words)
+
+
+def construct_dependecy_tree(matrixp, rootp):
+    trees = []
+    root_index = np.argmax(rootp)
+    root = myTree(name=str(root_index))
+    trees.append(root)
+    current_nodes = []
+    current_nodes.append(root_index)
+    matrixp[:, root_index] = np.array([-1 for _ in range(len(rootp))])
+    flag = True
+    child_parent_diff = 0
+    while flag:
+        currmax = -99
+        father = -1
+        child = -1
+        for c in current_nodes:
+            m = np.argmax(matrixp[c, :])
+            if matrixp[c, m] > currmax:
+                currmax = matrixp[c, m]
+                father = c
+                child = m
+
+        father_index_in_the_list = -1
+        for i, t in enumerate(trees):
+            if t.name == str(father):
+                father_index_in_the_list = i
+                break
+        node = myTree(name=str(child))
+        child_parent_diff += np.abs(int(child) - int(trees[father_index_in_the_list].name))
+        node.parent = trees[father_index_in_the_list]
+        trees[father_index_in_the_list].add_child(node)
+        trees[father_index_in_the_list].data = currmax
+        trees.append(node)
+        current_nodes.append(child)
+        matrixp[:, child] = np.array([-1 for _ in range(len(rootp))])
+        if len(current_nodes) == len(rootp):
+            flag = False
+    return trees[0], child_parent_diff
+
+
+def dependecy_tree_stat(dir):
+    with open(dir + 'matrix.pkl', 'rb') as f:
+        x = pickle.load(f)
+    lengths, all_doc_doc_dependency_tree_info, labels = x[0], x[1], x[2]
+    pijs = [a[0].numpy() for a in all_doc_doc_dependency_tree_info]
+    piroots = [a[1].numpy() for a in all_doc_doc_dependency_tree_info]
+    fake_doc_stat = []
+    real_doc_stat = []
+    for index, (matrixp, rootp, label) in enumerate(zip(pijs, piroots, labels)):
+        tree, child_parent_diff = construct_dependecy_tree(matrixp, rootp)
+        X = []
+        get_inorder(tree, X)
+        Y = []
+        for ss in X:
+            if ss is not None:
+                Y.append(int(ss))
+        inorder_diff = 0
+        for i, n in enumerate(Y):
+            inorder_diff += np.abs(i + 1 - n)
+        leaves = get_leaves(tree)
+        if label == 1:
+            fake_doc_stat.append([label, len(matrixp), len(leaves) / (np.log10(len(matrixp))),
+                                  child_parent_diff / (np.log10(len(matrixp))),
+                                  inorder_diff / (np.log10(len(matrixp)))])
+        else:
+            real_doc_stat.append([label, len(matrixp), len(leaves) / (np.log10(len(matrixp))),
+                                  child_parent_diff / (np.log10(len(matrixp))),
+                                  inorder_diff / (np.log10(len(matrixp)))])
+
+    fake_doc_stat = np.array(fake_doc_stat)
+    real_doc_stat = np.array(real_doc_stat)
+    return fake_doc_stat, real_doc_stat
+
+#creat_word_embedding()
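A small follow-up sketch, assuming test.py has already written matrix.pkl into the simulation directory (the path below is illustrative, built from the default --project_dir and --sim_name): the saved scores can be turned back into a single document's dependency tree with the helpers above, mirroring what dependecy_tree_stat does internally.

    import pickle
    import utils

    with open('/PATH/DHSF/Models/sim1/matrix.pkl', 'rb') as f:
        lengths, tree_info, labels = pickle.load(f)

    Aij, fri = tree_info[0]  # scores for the first test document
    tree, dist = utils.construct_dependecy_tree(Aij.numpy(), fri.numpy())
    print(tree, 'leaves:', len(utils.get_leaves(tree)), 'parent-child distance:', dist)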