Commit
1 parent d5a9c0a · commit 879bd9c
Showing 5 changed files with 611 additions and 0 deletions.
@@ -0,0 +1,33 @@
import argparse
import os


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def path(p):
    return os.path.expanduser(p)


parser = argparse.ArgumentParser(description='Arguments for DHSF Project')

parser.add_argument("--project_dir", type=str, required=False, default='/PATH/DHSF/', help="project directory")
parser.add_argument("--dropout", type=float, required=False, default=0.0, help="value of dropout")
parser.add_argument("--sim_name", type=str, required=False, default="sim1", help="unique, arbitrary name of the simulation")
parser.add_argument("--l2_coeff", type=float, required=False, default=0.00, help="value of the L2 regularization coefficient")
parser.add_argument("--train_embeddings", type=str2bool, required=False, default=True, help="whether to train the word embeddings")
parser.add_argument("--lr", type=float, required=False, default=0.01, help="initial learning rate")
parser.add_argument("--batch_size", type=int, required=False, default=40, help="size of batch")
parser.add_argument('--step_num', type=int, required=False, default=200, help='number of steps to run the simulation')
parser.add_argument('--gpu', default=-1, type=int, help='GPU id; a value > -1 moves the model to that GPU')
parser.add_argument('--word_dim', default=300, type=int)
parser.add_argument("--fill_embedding", type=str2bool, required=False, default=True, help="whether to initialize the embedding matrix from pretrained vectors")
parser.add_argument('--blstm_hidden_unit_dim', default=100, type=int)

args = parser.parse_args()
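
A minimal usage sketch (not part of the commit; values are placeholders). This module parses its arguments at import time and the other files read them through config.args, so the defaults can be overridden on the command line, or, for a quick check, by handing the parser an explicit argument list:

    # Hypothetical example, assuming this file is the repo's config.py (as suggested by `import config` elsewhere)
    args = parser.parse_args(['--gpu', '0', '--dropout', '0.2', '--sim_name', 'demo'])
    assert args.gpu == 0 and args.dropout == 0.2 and args.sim_name == 'demo'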
@@ -0,0 +1,152 @@
import numpy as np
import torch
import torch.nn as nn

import config
import utils

args = config.args


class DependencyBLSTM(nn.Module):
    def __init__(self, num_words, max_sen_length, max_doc_sent_length):
        super(DependencyBLSTM, self).__init__()
        self.max_sen_length = max_sen_length
        self.max_doc_sent_length = max_doc_sent_length
        self.dropout = nn.Dropout(p=args.dropout)
        self.word_embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=args.word_dim)
        self.Softmax = nn.Softmax(dim=0)

        ############################# Sentence level Functions #######################################
        self.forwardLSTM_sent = nn.LSTM(num_layers=1, input_size=args.word_dim,
                                        dropout=args.dropout,
                                        hidden_size=int(args.blstm_hidden_unit_dim),
                                        batch_first=True)
        self.backwardLSTM_sent = nn.LSTM(num_layers=1, input_size=args.word_dim,
                                         dropout=args.dropout,
                                         hidden_size=args.blstm_hidden_unit_dim,
                                         batch_first=True)
        self.sentence_encoder = nn.Sequential(nn.Linear(args.word_dim, args.blstm_hidden_unit_dim),
                                              nn.LeakyReLU(), nn.Dropout(p=args.dropout))

        ############################# Doc level Functions #######################################
        self.parent_encoder_doc = nn.Sequential(
            nn.Linear(args.blstm_hidden_unit_dim, int(args.blstm_hidden_unit_dim)),
            nn.LeakyReLU(), nn.Dropout(p=args.dropout))
        self.child_encoder_doc = nn.Sequential(
            nn.Linear(args.blstm_hidden_unit_dim, int(args.blstm_hidden_unit_dim)),
            nn.LeakyReLU(), nn.Dropout(p=args.dropout))

        self.root_score_encoder_doc = nn.Linear(args.blstm_hidden_unit_dim, 1)
        self.root_embed_doc = \
            utils.wrap_with_variable(torch.FloatTensor(np.zeros(shape=(args.blstm_hidden_unit_dim))),
                                     gpu=args.gpu,
                                     requires_grad=True)
        self.r_embeds_doc = nn.Sequential(
            nn.Linear(3 * args.blstm_hidden_unit_dim,
                      int(args.blstm_hidden_unit_dim)),
            nn.LeakyReLU(), nn.Dropout(p=args.dropout))

        self.final_binary_classifier = nn.Linear(int(args.blstm_hidden_unit_dim), 2)

    def create_sentence_batches(self, docs):
        # For every document, build a padded tensor of word embeddings per sentence
        # (plus a word-reversed copy for the backward LSTM) and record the true sentence lengths.
        all_doc_batches = []
        all_doc_batches_inverse = []
        all_doc_seq_lengths = []
        for doc in docs:
            doc_sent_embed, doc_sent_embed_inverse = [], []
            seq_lengths = []
            for sent_word_indices in doc['word_indices']:
                j = utils.wrap_with_variable(torch.LongTensor(np.array(sent_word_indices).astype(int)),
                                             gpu=args.gpu, requires_grad=False)

                word_embed = self.word_embedding(j)
                word_embed = self.dropout(word_embed)
                X = torch.zeros(self.max_sen_length, args.word_dim)
                X[0:len(sent_word_indices)] = word_embed.data
                X = utils.wrap_with_variable(X, gpu=args.gpu, requires_grad=True)
                doc_sent_embed.append(X)

                # Reverse the word order for the backward pass.
                idx = [i for i in range(word_embed.data.size(0) - 1, -1, -1)]
                if args.gpu > -1:
                    idx = torch.LongTensor(idx).cuda(args.gpu)
                else:
                    idx = torch.LongTensor(idx)
                X_inverse = torch.zeros(self.max_sen_length, args.word_dim)
                X_inverse[0:len(sent_word_indices)] = word_embed.data.index_select(0, idx)
                X_inverse = utils.wrap_with_variable(X_inverse, gpu=args.gpu, requires_grad=True)
                doc_sent_embed_inverse.append(X_inverse)

                seq_lengths.append(len(sent_word_indices))

            doc_sent_embed = torch.stack(doc_sent_embed)
            doc_sent_embed_inverse = torch.stack(doc_sent_embed_inverse)
            all_doc_batches.append(doc_sent_embed)
            all_doc_batches_inverse.append(doc_sent_embed_inverse)
            all_doc_seq_lengths.append(seq_lengths)
        return all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths

    def get_sentence_encodings(self, all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths):
        # Encode each sentence; note that the LSTM outputs are computed below, but the encoding
        # itself is built from the padded embedding tensors (sent_forward / sent_backward) as written.
        all_doc_sentence_encodings = []
        for doc_batch, doc_batch_inverse, doc_seq_length in zip(all_doc_batches, all_doc_batches_inverse,
                                                                all_doc_seq_lengths):
            doc_sentence_encodings = []

            fwrd_outputs, _ = self.forwardLSTM_sent(doc_batch)
            bwrd_outputs, _ = self.backwardLSTM_sent(doc_batch_inverse)
            for sent_forward, sent_backward, l in zip(doc_batch, doc_batch_inverse, doc_seq_length):
                idx = [i for i in range(l - 1, -1, -1)]
                if args.gpu > -1:
                    idx = torch.LongTensor(idx).cuda(args.gpu)
                else:
                    idx = torch.LongTensor(idx)
                bwrd_outputs_inverse = utils.wrap_with_variable(sent_backward.data.index_select(0, idx), gpu=args.gpu,
                                                                requires_grad=True)

                h = self.sentence_encoder(0.5 * (sent_forward[l - 1] + bwrd_outputs_inverse[l - 1]))
                doc_sentence_encodings.append(h)
            doc_sentence_encodings = torch.stack(doc_sentence_encodings)
            all_doc_sentence_encodings.append(doc_sentence_encodings)
        return all_doc_sentence_encodings

    def forward(self, docs):
        all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths = self.create_sentence_batches(docs)
        all_doc_sentence_encodings = self.get_sentence_encodings(all_doc_batches, all_doc_batches_inverse,
                                                                 all_doc_seq_lengths)
        all_doc_doc_dependency_tree_info = []
        all_final_features = []
        for sentence_encodings in all_doc_sentence_encodings:
            # fri holds a root score per sentence; Aij holds pairwise parent/child dependency scores.
            fri = []
            Aij = []
            for i in range(len(sentence_encodings)):
                fri.append(self.root_score_encoder_doc(sentence_encodings[i]))
                for j in range(len(sentence_encodings)):
                    if i == j:
                        # A sentence cannot be its own parent: mask the diagonal before the softmax.
                        Aij.append(utils.wrap_with_variable(torch.tensor(-9999999.000), gpu=args.gpu,
                                                            requires_grad=True))
                        continue

                    x = torch.dot(self.parent_encoder_doc(sentence_encodings[i]),
                                  self.child_encoder_doc(sentence_encodings[j]))
                    Aij.append(x)
            Aij = torch.stack(Aij)
            Aij = Aij.view(len(sentence_encodings), len(sentence_encodings))
            Aij = self.Softmax(Aij)
            fri = torch.stack(fri)
            fri = self.Softmax(fri)
            ri = []
            for i in range(len(sentence_encodings)):
                # Combine the sentence encoding with its dependency-weighted neighbours
                # and the root embedding into a structure-aware representation.
                tmp = []
                tmp3 = []
                for k in range(len(sentence_encodings)):
                    tmp.append(torch.mul(Aij[k, i], sentence_encodings[k]))
                    tmp3.append(torch.mul(Aij[i, k], sentence_encodings[i]))
                tmp3 = torch.stack(tmp3)
                tmp3 = torch.sum(tmp3, 0)
                tmp = torch.stack(tmp)
                tmp = torch.sum(tmp, 0)
                tmp2 = torch.mul(fri[i], self.root_embed_doc)
                ri.append(self.r_embeds_doc(torch.cat((sentence_encodings[i], tmp + tmp2, tmp3))))
            ri = torch.stack(ri)
            final_feature = torch.mean(ri, 0)
            all_final_features.append(final_feature)
            all_doc_doc_dependency_tree_info.append([Aij.data, fri.data])

        all_final_features = torch.stack(all_final_features)
        output = self.final_binary_classifier(all_final_features)
        return output, all_doc_doc_dependency_tree_info
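
A hedged sketch of how this model might be driven (inferred from create_sentence_batches, which expects each document as a dict whose 'word_indices' entry is a list of per-sentence word-index lists; the numbers are placeholders and the repo's utils and config modules must be importable):

    # Hypothetical example, not part of the commit
    model = DependencyBLSTM(num_words=1000, max_sen_length=97, max_doc_sent_length=326)
    docs = [{'word_indices': [[1, 5, 7], [2, 3, 8, 9]]}]  # one document with two sentences
    logits, tree_info = model(docs)                       # logits: shape (1, 2), real/fake scores
    # tree_info[0] holds the softmaxed dependency matrix Aij and the root scores fri for that document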
@@ -0,0 +1,96 @@
import os
import pickle

import numpy as np
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from torch import nn
from torch.autograd import Variable

import config
import utils
from model import DependencyBLSTM

args = config.args
output_dir = args.project_dir + 'Models/' + args.sim_name + '/'
utils.creat_word_embedding()


def test():
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    save_path = args.project_dir + 'Models/' + args.sim_name + '/model.pt'

    if os.path.exists(save_path):
        model = torch.load(save_path)
        print('Great!!! Pre-Trained Model Loaded !!!')
    else:
        print('No pre-trained model found; building a new DependencyBLSTM')
        model = DependencyBLSTM(num_words=utils.get_num_words(), max_sen_length=97, max_doc_sent_length=326)

    if not os.path.exists(output_dir + 'test_performance_log.txt'):
        test_performance_log = open(output_dir + 'test_performance_log.txt', 'w')
    else:
        test_performance_log = open(output_dir + 'test_performance_log.txt', 'a')

    if args.gpu > -1:
        model.cuda(device=int(args.gpu))

    # Either fill the embedding matrix from pretrained (Google) vectors or zero it out.
    if args.fill_embedding:
        embed = utils.get_word_embeddings(source='google')
        if args.gpu > -1:
            model.word_embedding.weight.data.set_(torch.FloatTensor(embed).cuda(int(args.gpu)))
        else:
            model.word_embedding.weight.data.set_(torch.FloatTensor(embed))
    else:
        if args.gpu > -1:
            model.word_embedding.weight.data.set_(
                torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)).cuda(int(args.gpu)))
        else:
            model.word_embedding.weight.data.set_(
                torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)))

    if not args.train_embeddings:
        model.word_embedding.weight.requires_grad = False

    criterion = nn.CrossEntropyLoss()
    test_set = utils.get_split_data(split='test')
    print('Start Test ...')
    model.eval()
    test_labels = [d['label'] for d in test_set]
    labels = Variable(torch.LongTensor(test_labels))

    lengths = [len(doc['word_indices']) for doc in test_set]
    doc_encodings, all_doc_doc_dependency_tree_info = model(test_set)

    outputs = doc_encodings.cpu()
    loss = criterion(outputs, labels).item()  # .item() extracts the scalar from the 0-dim loss tensor
    _, predictions = torch.max(outputs.data, 1)
    predictions = predictions.numpy()

    # Save the per-document dependency information (Aij, fri) for later analysis.
    with open(output_dir + "matrix.pkl", 'wb') as f:
        pickle.dump([lengths, all_doc_doc_dependency_tree_info, test_labels], f)

    accuracy = accuracy_score(y_true=np.array(test_labels), y_pred=predictions)
    report = classification_report(y_true=np.array(test_labels), y_pred=predictions, target_names=['Real', 'Fake'])
    conf_matrix = confusion_matrix(y_true=np.array(test_labels), y_pred=predictions)
    test_performance_log.write("Loss {} Accuracy {}\n".format(loss, accuracy))
    test_performance_log.write("{}\n".format(report))
    test_performance_log.write("{}\n".format(conf_matrix))
    test_performance_log.write("{}\n".format('=' * 50))

    print('************* Test ****************')
    print("Loss {} Accuracy {}".format(loss, accuracy))
    print(report)
    print(conf_matrix)
    print('*****************************************')
    return accuracy


test()

fake_doc_stat, real_doc_stat = utils.dependecy_tree_stat(dir=output_dir)
# print(fake_doc_stat)
# print('*' * 100)
# print(real_doc_stat)
# print('*' * 100)
print("Avg. Number of Leaf Nodes: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 2]), np.mean(real_doc_stat[:, 2])))
print("Avg. Preorder Difference: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 4]), np.mean(real_doc_stat[:, 4])))
print("Avg. Parent-Child Distance: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 3]), np.mean(real_doc_stat[:, 3])))