Commit
1 parent d5a9c0a · commit 879bd9c
Showing 5 changed files with 611 additions and 0 deletions.
@@ -0,0 +1,33 @@
import argparse
import os


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def path(p):
    return os.path.expanduser(p)


parser = argparse.ArgumentParser(description='Arguments for DHSF Project')

parser.add_argument("--project_dir", type=str, required=False, default='/PATH/DHSF/', help="project directory")
parser.add_argument("--dropout", type=float, required=False, default=0.0, help="value of dropout")
parser.add_argument("--sim_name", type=str, required=False, default="sim1", help="unique, arbitrary name of the simulation")
parser.add_argument("--l2_coeff", type=float, required=False, default=0.00, help="value of the L2 regularization coefficient")
parser.add_argument("--train_embeddings", type=str2bool, required=False, default=True, help="whether to train the word embeddings")
parser.add_argument("--lr", type=float, required=False, default=0.01, help="initial learning rate")
parser.add_argument("--batch_size", type=int, required=False, default=40, help="size of batch")
parser.add_argument('--step_num', type=int, required=False, default=200, help='number of steps to run the simulation')
parser.add_argument('--gpu', default=-1, type=int, help='GPU id; a value > -1 moves the model to that GPU')
parser.add_argument('--word_dim', default=300, type=int)
parser.add_argument("--fill_embedding", type=str2bool, required=False, default=True, help="whether to initialize the embedding matrix from pretrained vectors")
parser.add_argument('--blstm_hidden_unit_dim', default=100, type=int)

args = parser.parse_args()
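
A minimal usage sketch (not part of the commit; values are placeholders). This module parses its arguments at import time and the other files read them through config.args, so the defaults can be overridden on the command line, or, for a quick check, by handing the parser an explicit argument list:

    # Hypothetical example, assuming this file is the repo's config.py (as suggested by `import config` elsewhere)
    args = parser.parse_args(['--gpu', '0', '--dropout', '0.2', '--sim_name', 'demo'])
    assert args.gpu == 0 and args.dropout == 0.2 and args.sim_name == 'demo'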
@@ -0,0 +1,152 @@
import numpy as np
import torch
import torch.nn as nn

import config
import utils

args = config.args


class DependencyBLSTM(nn.Module):
    def __init__(self, num_words, max_sen_length, max_doc_sent_length):
        super(DependencyBLSTM, self).__init__()
        self.max_sen_length = max_sen_length
        self.max_doc_sent_length = max_doc_sent_length
        self.dropout = nn.Dropout(p=args.dropout)
        self.word_embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=args.word_dim)
        self.Softmax = nn.Softmax(dim=0)

        ############################# Sentence level Functions #######################################
        self.forwardLSTM_sent = nn.LSTM(num_layers=1, input_size=args.word_dim,
                                        dropout=args.dropout,
                                        hidden_size=int(args.blstm_hidden_unit_dim),
                                        batch_first=True)
        self.backwardLSTM_sent = nn.LSTM(num_layers=1, input_size=args.word_dim,
                                         dropout=args.dropout,
                                         hidden_size=args.blstm_hidden_unit_dim,
                                         batch_first=True)
        self.sentence_encoder = nn.Sequential(nn.Linear(args.word_dim, args.blstm_hidden_unit_dim),
                                              nn.LeakyReLU(), nn.Dropout(p=args.dropout))

        ############################# Doc level Functions #######################################
        self.parent_encoder_doc = nn.Sequential(
            nn.Linear(args.blstm_hidden_unit_dim, int(args.blstm_hidden_unit_dim)),
            nn.LeakyReLU(), nn.Dropout(p=args.dropout))
        self.child_encoder_doc = nn.Sequential(
            nn.Linear(args.blstm_hidden_unit_dim, int(args.blstm_hidden_unit_dim)),
            nn.LeakyReLU(), nn.Dropout(p=args.dropout))

        self.root_score_encoder_doc = nn.Linear(args.blstm_hidden_unit_dim, 1)
        self.root_embed_doc = \
            utils.wrap_with_variable(torch.FloatTensor(np.zeros(shape=(args.blstm_hidden_unit_dim))),
                                     gpu=args.gpu,
                                     requires_grad=True)
        self.r_embeds_doc = nn.Sequential(
            nn.Linear(3 * args.blstm_hidden_unit_dim,
                      int(args.blstm_hidden_unit_dim)),
            nn.LeakyReLU(), nn.Dropout(p=args.dropout))

        self.final_binary_classifier = nn.Linear(int(args.blstm_hidden_unit_dim), 2)

    def create_sentence_batches(self, docs):
        # For every document, build a padded tensor of word embeddings per sentence
        # (plus a word-reversed copy for the backward LSTM) and record the true sentence lengths.
        all_doc_batches = []
        all_doc_batches_inverse = []
        all_doc_seq_lengths = []
        for doc in docs:
            doc_sent_embed, doc_sent_embed_inverse = [], []
            seq_lengths = []
            for sent_word_indices in doc['word_indices']:
                j = utils.wrap_with_variable(torch.LongTensor(np.array(sent_word_indices).astype(int)),
                                             gpu=args.gpu, requires_grad=False)

                word_embed = self.word_embedding(j)
                word_embed = self.dropout(word_embed)
                X = torch.zeros(self.max_sen_length, args.word_dim)
                X[0:len(sent_word_indices)] = word_embed.data
                X = utils.wrap_with_variable(X, gpu=args.gpu, requires_grad=True)
                doc_sent_embed.append(X)

                # Reverse the word order for the backward pass.
                idx = [i for i in range(word_embed.data.size(0) - 1, -1, -1)]
                if args.gpu > -1:
                    idx = torch.LongTensor(idx).cuda(args.gpu)
                else:
                    idx = torch.LongTensor(idx)
                X_inverse = torch.zeros(self.max_sen_length, args.word_dim)
                X_inverse[0:len(sent_word_indices)] = word_embed.data.index_select(0, idx)
                X_inverse = utils.wrap_with_variable(X_inverse, gpu=args.gpu, requires_grad=True)
                doc_sent_embed_inverse.append(X_inverse)

                seq_lengths.append(len(sent_word_indices))

            doc_sent_embed = torch.stack(doc_sent_embed)
            doc_sent_embed_inverse = torch.stack(doc_sent_embed_inverse)
            all_doc_batches.append(doc_sent_embed)
            all_doc_batches_inverse.append(doc_sent_embed_inverse)
            all_doc_seq_lengths.append(seq_lengths)
        return all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths

    def get_sentence_encodings(self, all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths):
        # Encode each sentence; note that the LSTM outputs are computed below, but the encoding
        # itself is built from the padded embedding tensors (sent_forward / sent_backward) as written.
        all_doc_sentence_encodings = []
        for doc_batch, doc_batch_inverse, doc_seq_length in zip(all_doc_batches, all_doc_batches_inverse,
                                                                all_doc_seq_lengths):
            doc_sentence_encodings = []

            fwrd_outputs, _ = self.forwardLSTM_sent(doc_batch)
            bwrd_outputs, _ = self.backwardLSTM_sent(doc_batch_inverse)
            for sent_forward, sent_backward, l in zip(doc_batch, doc_batch_inverse, doc_seq_length):
                idx = [i for i in range(l - 1, -1, -1)]
                if args.gpu > -1:
                    idx = torch.LongTensor(idx).cuda(args.gpu)
                else:
                    idx = torch.LongTensor(idx)
                bwrd_outputs_inverse = utils.wrap_with_variable(sent_backward.data.index_select(0, idx), gpu=args.gpu,
                                                                requires_grad=True)

                h = self.sentence_encoder(0.5 * (sent_forward[l - 1] + bwrd_outputs_inverse[l - 1]))
                doc_sentence_encodings.append(h)
            doc_sentence_encodings = torch.stack(doc_sentence_encodings)
            all_doc_sentence_encodings.append(doc_sentence_encodings)
        return all_doc_sentence_encodings

    def forward(self, docs):
        all_doc_batches, all_doc_batches_inverse, all_doc_seq_lengths = self.create_sentence_batches(docs)
        all_doc_sentence_encodings = self.get_sentence_encodings(all_doc_batches, all_doc_batches_inverse,
                                                                 all_doc_seq_lengths)
        all_doc_doc_dependency_tree_info = []
        all_final_features = []
        for sentence_encodings in all_doc_sentence_encodings:
            # fri holds a root score per sentence; Aij holds pairwise parent/child dependency scores.
            fri = []
            Aij = []
            for i in range(len(sentence_encodings)):
                fri.append(self.root_score_encoder_doc(sentence_encodings[i]))
                for j in range(len(sentence_encodings)):
                    if i == j:
                        # A sentence cannot be its own parent: mask the diagonal before the softmax.
                        Aij.append(utils.wrap_with_variable(torch.tensor(-9999999.000), gpu=args.gpu,
                                                            requires_grad=True))
                        continue

                    x = torch.dot(self.parent_encoder_doc(sentence_encodings[i]),
                                  self.child_encoder_doc(sentence_encodings[j]))
                    Aij.append(x)
            Aij = torch.stack(Aij)
            Aij = Aij.view(len(sentence_encodings), len(sentence_encodings))
            Aij = self.Softmax(Aij)
            fri = torch.stack(fri)
            fri = self.Softmax(fri)
            ri = []
            for i in range(len(sentence_encodings)):
                # Combine the sentence encoding with its dependency-weighted neighbours
                # and the root embedding into a structure-aware representation.
                tmp = []
                tmp3 = []
                for k in range(len(sentence_encodings)):
                    tmp.append(torch.mul(Aij[k, i], sentence_encodings[k]))
                    tmp3.append(torch.mul(Aij[i, k], sentence_encodings[i]))
                tmp3 = torch.stack(tmp3)
                tmp3 = torch.sum(tmp3, 0)
                tmp = torch.stack(tmp)
                tmp = torch.sum(tmp, 0)
                tmp2 = torch.mul(fri[i], self.root_embed_doc)
                ri.append(self.r_embeds_doc(torch.cat((sentence_encodings[i], tmp + tmp2, tmp3))))
            ri = torch.stack(ri)
            final_feature = torch.mean(ri, 0)
            all_final_features.append(final_feature)
            all_doc_doc_dependency_tree_info.append([Aij.data, fri.data])

        all_final_features = torch.stack(all_final_features)
        output = self.final_binary_classifier(all_final_features)
        return output, all_doc_doc_dependency_tree_info
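
A hedged sketch of how this model might be driven (inferred from create_sentence_batches, which expects each document as a dict whose 'word_indices' entry is a list of per-sentence word-index lists; the numbers are placeholders and the repo's utils and config modules must be importable):

    # Hypothetical example, not part of the commit
    model = DependencyBLSTM(num_words=1000, max_sen_length=97, max_doc_sent_length=326)
    docs = [{'word_indices': [[1, 5, 7], [2, 3, 8, 9]]}]  # one document with two sentences
    logits, tree_info = model(docs)                       # logits: shape (1, 2), real/fake scores
    # tree_info[0] holds the softmaxed dependency matrix Aij and the root scores fri for that document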
@@ -0,0 +1,96 @@
import os
import pickle

import numpy as np
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from torch import nn
from torch.autograd import Variable

import config
import utils
from model import DependencyBLSTM

args = config.args
output_dir = args.project_dir + 'Models/' + args.sim_name + '/'
utils.creat_word_embedding()


def test():
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    save_path = args.project_dir + 'Models/' + args.sim_name + '/model.pt'

    if os.path.exists(save_path):
        model = torch.load(save_path)
        print('Great!!! Pre-Trained Model Loaded !!!')
    else:
        print('No pre-trained model found; building a new DependencyBLSTM')
        model = DependencyBLSTM(num_words=utils.get_num_words(), max_sen_length=97, max_doc_sent_length=326)

    if not os.path.exists(output_dir + 'test_performance_log.txt'):
        test_performance_log = open(output_dir + 'test_performance_log.txt', 'w')
    else:
        test_performance_log = open(output_dir + 'test_performance_log.txt', 'a')

    if args.gpu > -1:
        model.cuda(device=int(args.gpu))

    # Either fill the embedding matrix from pretrained (Google) vectors or zero it out.
    if args.fill_embedding:
        embed = utils.get_word_embeddings(source='google')
        if args.gpu > -1:
            model.word_embedding.weight.data.set_(torch.FloatTensor(embed).cuda(int(args.gpu)))
        else:
            model.word_embedding.weight.data.set_(torch.FloatTensor(embed))
    else:
        if args.gpu > -1:
            model.word_embedding.weight.data.set_(
                torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)).cuda(int(args.gpu)))
        else:
            model.word_embedding.weight.data.set_(
                torch.FloatTensor(np.zeros((utils.get_num_words() + 1, args.word_dim)).astype(float)))

    if not args.train_embeddings:
        model.word_embedding.weight.requires_grad = False

    criterion = nn.CrossEntropyLoss()
    test_set = utils.get_split_data(split='test')
    print('Start Test ...')
    model.eval()
    test_labels = [d['label'] for d in test_set]
    labels = Variable(torch.LongTensor(test_labels))

    lengths = [len(doc['word_indices']) for doc in test_set]
    doc_encodings, all_doc_doc_dependency_tree_info = model(test_set)

    outputs = doc_encodings.cpu()
    loss = criterion(outputs, labels).item()  # .item() extracts the scalar from the 0-dim loss tensor
    _, predictions = torch.max(outputs.data, 1)
    predictions = predictions.numpy()

    # Save the per-document dependency information (Aij, fri) for later analysis.
    with open(output_dir + "matrix.pkl", 'wb') as f:
        pickle.dump([lengths, all_doc_doc_dependency_tree_info, test_labels], f)

    accuracy = accuracy_score(y_true=np.array(test_labels), y_pred=predictions)
    report = classification_report(y_true=np.array(test_labels), y_pred=predictions, target_names=['Real', 'Fake'])
    conf_matrix = confusion_matrix(y_true=np.array(test_labels), y_pred=predictions)
    test_performance_log.write("Loss {} Accuracy {}\n".format(loss, accuracy))
    test_performance_log.write("{}\n".format(report))
    test_performance_log.write("{}\n".format(conf_matrix))
    test_performance_log.write("{}\n".format('=' * 50))

    print('************* Test ****************')
    print("Loss {} Accuracy {}".format(loss, accuracy))
    print(report)
    print(conf_matrix)
    print('*****************************************')
    return accuracy


test()

fake_doc_stat, real_doc_stat = utils.dependecy_tree_stat(dir=output_dir)
# print(fake_doc_stat)
# print('*' * 100)
# print(real_doc_stat)
# print('*' * 100)
print("Avg. Number of Leaf Nodes: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 2]), np.mean(real_doc_stat[:, 2])))
print("Avg. Preorder Difference: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 4]), np.mean(real_doc_stat[:, 4])))
print("Avg. Parent-Child Distance: Fake {} Real {}".format(np.mean(fake_doc_stat[:, 3]), np.mean(real_doc_stat[:, 3])))