In [10]:
import time
import os
import torch
import json
import tqdm
import random
import numpy as np
import argparse
import pickle as pkl
import networkx as nx
import networkx.algorithms as nxalg

from pycorenlp import StanfordCoreNLP
from data_utils import SymbolsManager
from sys import path
from data_utils import convert_to_tree
from collections import OrderedDict

In [2]:
# some basic configuration
# These module-level names are read directly by the preprocessing functions below.

data_dir = "../dataset/"  # joined with "{}/{}".format(...), so the built paths contain "//"
batch_size = 20  # number of parsed sentences grouped into one graph batch
min_freq = 2  # passed to word_manager.init_from_file for the question vocabulary
max_vocab_size = 15000  # passed to init_from_file for both vocabularies
seed = 123  # shared seed for random, numpy and torch

In [14]:
# set random seed
# Seed each library's global generator with the same configured value.
random.seed(seed)
np.random.seed(seed)
# Last expression of the cell, so the notebook displays the returned Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x111860b10>

In [46]:
# start the corenlp server first
class InputPreprocessor(object):
    """Turn a sentence into a token list plus labelled dependency edges.

    Parsing is delegated to a Stanford CoreNLP server reached through
    ``pycorenlp``; the server has to be running before ``featureExtract``
    is called.
    """

    # Dependency relations that are kept as labelled edges. Any relation not
    # listed here (for example "punct" or "ROOT") is ignored.
    CORE_ARGUMENTS = frozenset([
        'cc', 'ccomp', 'conj', 'neg', 'dobj', 'nsubj', 'compound', 'case',
        'cop', 'dep', 'det', 'nmod', 'amod', 'acl:relcl', 'iobj', 'expl',
    ])

    def __init__(self, url = 'http://localhost:9000'):
        """Create the client for the CoreNLP server listening at ``url``."""
        self.nlp = StanfordCoreNLP(url)

    def featureExtract(self,src_text,whiteSpace=True):
        """Parse one sentence and return its tokens and dependency edges.

        Args:
            src_text: the sentence; surrounding whitespace is stripped before
                it is sent to the server.
            whiteSpace: forwarded as the ``tokenize.whitespace`` property.

        Returns:
            A dict with three entries:
              * ``"word_list"``: the sentence tokens followed by the distinct
                dependency labels that were kept, in order of first use.
              * ``"fw_adj"``: maps each 0-based token index to a list of
                ``(dependent_index, label)`` tuples for the kept relations
                governed by that token.
              * ``"seq_len"``: the number of sentence tokens, i.e. the offset
                in ``word_list`` at which appended labels start.

        Raises:
            TypeError: if the annotate call did not return a parsed JSON
                object.
        """
        output = self.nlp.annotate(src_text.strip(), properties={
            'annotators': "tokenize,ssplit,pos,depparse",
            "tokenize.options": "splitHyphenated=true,normalizeParentheses=false",
            "tokenize.whitespace": whiteSpace,
            'ssplit.isOneSentence': True,
            'outputFormat': 'json'
        })

        # Without this check a plain-text reply would fail further down with
        # an opaque "string indices must be integers" TypeError. The exception
        # type is kept; only the message is made actionable.
        # NOTE(review): presumably pycorenlp hands back the raw response text
        # when it cannot decode JSON - confirm against the installed version.
        if not isinstance(output, dict):
            raise TypeError(
                "CoreNLP did not return a JSON object "
                "(is the server running and healthy?): "
                + repr(output)[:200])

        # 'ssplit.isOneSentence' is set above, so only the first sentence is read.
        sentence = output['sentences'][0]

        word_list = [str(token['word']) for token in sentence["tokens"]]
        seq_len = len(word_list)
        fw_adj = dict((idx, []) for idx in range(seq_len))

        for relation in sentence["basicDependencies"]:
            label = str(relation['dep'])
            if label not in self.CORE_ARGUMENTS:
                continue
            # NOTE(review): membership is tested against the whole list, so a
            # label that is spelled like one of the sentence tokens is not
            # appended a second time.
            if label not in word_list:
                word_list.append(label)
            # 'governor' and 'dependent' are shifted down by one to index word_list.
            fw_adj[relation['governor'] - 1].append(
                (relation['dependent'] - 1, label))

        # Key order matches the original insertion order so the JSON lines
        # written by callers keep the same layout.
        data = {}
        data["word_list"] = word_list
        data["fw_adj"] = fw_adj
        data["seq_len"] = seq_len
        return data

def read_parsed_result(input_file):
    """Load a JSON-lines file of parsed sentences.

    Args:
        input_file: path to a text file holding one JSON document per line.

    Returns:
        A list with the decoded object of every non-blank line, in file order.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
    """
    graphs_new = []
    with open(input_file, "r") as f:
        # Iterate the handle directly instead of loading all lines up front.
        for line in f:
            line = line.strip()
            # A blank line (e.g. a trailing newline added by an editor)
            # carries no record; json.loads('') would raise on it.
            if not line:
                continue
            graphs_new.append(json.loads(line))
    return graphs_new

def begin_parsing_dep():
    """Dependency-parse the train and test sentences and store the results.

    For each split, the text before the first tab of every line of
    ``<data_dir>/<split>.txt`` is parsed with ``InputPreprocessor`` and the
    result is written as one JSON document per line to
    ``<data_dir>/dependency_parsed_result.<split>``. The train split is
    processed first, then the test split.
    """
    extractor = InputPreprocessor()

    # (input split name, output file name) in processing order.
    splits = (
        ("train", "dependency_parsed_result.train"),
        ("test", "dependency_parsed_result.test"),
    )
    for split_name, result_name in splits:
        with open("{}/{}.txt".format(data_dir, split_name), "r") as source:
            sentences = [row.strip().split('\t')[0] for row in source.readlines()]

        with open("{}/{}".format(data_dir, result_name), "w") as sink:
            for sentence in tqdm.tqdm(sentences):
                parsed = extractor.featureExtract(sentence)
                sink.write(json.dumps(parsed) + '\n')

In [41]:
def _add_directed_edge(successors, source, target):
    """Record the edge source -> target, creating either node if it is new."""
    successors.setdefault(source, set()).add(target)
    successors.setdefault(target, set())


def create_with_wo_dep_info(output_file, src, graph_scale):
    """Build one directed graph per parsed sentence and append them to a file.

    Every graph has the nodes ``0 .. graph_scale - 1``. Neighbouring sentence
    tokens are linked in both directions, and each dependency
    ``(dependent, label)`` of token ``idx`` becomes the two edges
    ``idx -> label_node -> dependent``, where ``label_node`` is the position
    of ``label`` in the sample's ``word_list``.

    Args:
        output_file: path of the JSON-lines file the graphs are appended to.
        src: list of parsed samples, each a dict with ``'seq_len'``,
            ``'word_list'`` and ``'fw_adj'``. ``'fw_adj'`` is looked up with
            string keys (``str(idx)``), which is what a JSON round trip of the
            parser output produces.
        graph_scale: number of nodes described for every graph.

    Returns:
        A list with one dict per sample holding ``'g_ids'`` (node -> node),
        ``'g_ids_features'`` (node -> word, or ``'<P>'`` past the end of
        ``word_list``), ``'g_adj'`` (node -> sorted successor list) and
        ``'word_list'``.

    Raises:
        ValueError: if a dependency label does not occur in ``word_list``.
    """
    graph_list = []
    for sample in src:
        graph_node_size = sample['seq_len']
        source_text = sample['word_list']
        dependency_edge = sample['fw_adj']

        # successors[node] is the set of targets of node's outgoing edges.
        # Rows are looked up by node id below. Taking them by iteration
        # position instead is wrong whenever an edge introduces a
        # higher-numbered label node before the lower-numbered nodes have
        # been added.
        successors = dict((node, set()) for node in range(graph_scale))

        # Only sentence tokens (idx < seq_len) carry edges of their own.
        for idx in range(min(graph_scale, graph_node_size)):
            if idx >= 1:
                _add_directed_edge(successors, idx, idx - 1)
                _add_directed_edge(successors, idx - 1, idx)
            for entry in dependency_edge.get(str(idx), ()):
                dependent, label = entry[0], entry[1]
                label_node = source_text.index(label)
                _add_directed_edge(successors, idx, label_node)
                _add_directed_edge(successors, label_node, dependent)

        g_ids = {}
        g_ids_features = {}
        g_adj = {}
        for node in range(graph_scale):
            g_ids[node] = node
            if node < len(source_text):
                g_ids_features[node] = source_text[node]
            else:
                g_ids_features[node] = '<P>'
            g_adj[node] = sorted(successors[node])

        info = {}
        info['g_ids'] = g_ids
        info['g_ids_features'] = g_ids_features
        info['g_adj'] = g_adj
        info['word_list'] = source_text
        graph_list.append(info)

    # Append mode: callers invoke this once per batch on the same file.
    with open(output_file, "a+") as f:
        for info in graph_list:
            f.write(json.dumps(info) + '\n')

    return graph_list

# process dep data

In [42]:
def train_data_preprocess():
    """Build the training artefacts under ``data_dir``.

    Steps, in order:
      1. Initialise the word and form ``SymbolsManager`` vocabularies from
         ``vocab.q.txt`` and ``vocab.f.txt``.
      2. Read ``train.txt`` (tab separated: question, then form) into
         ``(words, form_ids, tree)`` tuples.
      3. Recreate ``graph.train`` from ``dependency_parsed_result.train``,
         one batch of ``batch_size`` parsed sentences at a time.
      4. Add every dependency label appended to a sentence's ``word_list`` to
         the word vocabulary if it is missing.
      5. Pickle the samples to ``train.pkl`` and both managers to ``map.pkl``.

    Progress figures (vocabulary sizes, sample counts, added symbols, elapsed
    time) are printed along the way.
    """
    def _duplicate_tail(items):
        # Duplicates, in place, each of the last len(items) % batch_size
        # entries next to itself.
        # NOTE(review): this adds `len % batch_size` items, which in general
        # does not make the length a multiple of batch_size; the batching
        # loop below ignores a trailing partial batch. Behaviour is kept
        # unchanged on purpose - confirm what downstream loaders expect
        # before altering it.
        n = len(items)
        for i in range(n % batch_size):
            items.insert(n - i - 1, items[n - i - 1])

    time_start = time.time()
    word_manager = SymbolsManager(True)
    word_manager.init_from_file("{}/vocab.q.txt".format(data_dir), min_freq, max_vocab_size)
    form_manager = SymbolsManager(True)
    form_manager.init_from_file("{}/vocab.f.txt".format(data_dir), 0, max_vocab_size)
    print(word_manager.vocab_size)
    print(form_manager.vocab_size)

    data = []
    with open("{}/{}.txt".format(data_dir, "train"), "r") as f:
        for line in f:
            l_list = line.split("\t")
            w_list = l_list[0].strip().split(' ')
            r_list = form_manager.get_symbol_idx_for_list(l_list[1].strip().split(' '))
            cur_tree = convert_to_tree(r_list, 0, len(r_list), form_manager)
            data.append((w_list, r_list, cur_tree))

    # create_with_wo_dep_info appends, so start from a clean graph file.
    out_graphfile = "{}/graph.train".format(data_dir)
    if os.path.exists(out_graphfile):
        os.remove(out_graphfile)

    # generate batch graph here
    _duplicate_tail(data)

    # Read from data_dir, the same location begin_parsing_dep() writes to,
    # instead of a separately hard-coded path.
    dependency_parsed_result = read_parsed_result(
        "{}/{}".format(data_dir, "dependency_parsed_result.train"))
    _duplicate_tail(dependency_parsed_result)

    # Call form of print: identical output on Python 2 for a single argument
    # and valid syntax on Python 3.
    print(len(data))
    print(len(dependency_parsed_result))

    for index in range(0, len(data) - batch_size + 1, batch_size):
        # generate graphs with order and dependency information
        dependency_batch = [dependency_parsed_result[index + idx] for idx in range(batch_size)]
        max_dependency_node_size = max(len(sample['word_list']) for sample in dependency_batch)
        # Called for its side effect of appending this batch to the graph file.
        create_with_wo_dep_info(out_graphfile, dependency_batch, max_dependency_node_size)

        for sample in dependency_batch:
            # Entries past seq_len are the dependency labels appended by the parser step.
            for label in sample['word_list'][sample['seq_len']:]:
                if label not in word_manager.symbol2idx:
                    word_manager.add_symbol(label)
                    print("{} Added.".format(label))

    out_datafile = "{}/train.pkl".format(data_dir)
    with open(out_datafile, "wb") as out_data:
        pkl.dump(data, out_data)

    out_mapfile = "{}/map.pkl".format(data_dir)
    with open(out_mapfile, "wb") as out_map:
        pkl.dump([word_manager, form_manager], out_map)

    print(word_manager.vocab_size)
    print(form_manager.vocab_size)

    time_end = time.time()
    print("time used:" + str(time_end - time_start))

In [43]:
def test_data_preprocess():
    """Build the test artefacts under ``data_dir``.

    Loads the vocabulary managers from ``map.pkl``, converts ``test.txt``
    (tab separated: question, then form) into ``(words, form_ids, tree)``
    tuples pickled to ``test.pkl``, and recreates ``graph.test`` from
    ``dependency_parsed_result.test`` one batch of ``batch_size`` parsed
    sentences at a time.
    """
    def _duplicate_tail(items):
        # Duplicates, in place, each of the last len(items) % batch_size
        # entries next to itself.
        # NOTE(review): this adds `len % batch_size` items, which in general
        # does not make the length a multiple of batch_size; the batching
        # loop below ignores a trailing partial batch. Behaviour is kept
        # unchanged on purpose - confirm what downstream loaders expect
        # before altering it.
        n = len(items)
        for i in range(n % batch_size):
            items.insert(n - i - 1, items[n - i - 1])

    # SECURITY: unpickling executes arbitrary code; map.pkl must be a file
    # produced locally, never one obtained from an untrusted source.
    # The context manager closes the handle, which the bare open() call did not.
    with open("{}/map.pkl".format(data_dir), "rb") as map_file:
        managers = pkl.load(map_file)
    word_manager, form_manager = managers

    data = []
    with open("{}/{}.txt".format(data_dir, "test"), "r") as f:
        for line in f:
            l_list = line.split("\t")
            w_list = l_list[0].strip().split(' ')
            r_list = form_manager.get_symbol_idx_for_list(l_list[1].strip().split(' '))
            cur_tree = convert_to_tree(r_list, 0, len(r_list), form_manager)
            data.append((w_list, r_list, cur_tree))

    # The samples are pickled before any tail duplication is applied.
    out_datafile = "{}/test.pkl".format(data_dir)
    with open(out_datafile, "wb") as out_data:
        pkl.dump(data, out_data)

    # create_with_wo_dep_info appends, so start from a clean graph file.
    out_graphfile = "{}/graph.test".format(data_dir)
    if os.path.exists(out_graphfile):
        os.remove(out_graphfile)

    # generate batch graph here
    _duplicate_tail(data)

    # Read from data_dir, the same location begin_parsing_dep() writes to,
    # instead of a separately hard-coded path.
    dependency_parsed_result = read_parsed_result(
        "{}/{}".format(data_dir, "dependency_parsed_result.test"))
    _duplicate_tail(dependency_parsed_result)

    for index in range(0, len(data) - batch_size + 1, batch_size):
        # generate graphs with order and dependency information
        dependency_batch = [dependency_parsed_result[index + idx] for idx in range(batch_size)]
        max_dependency_node_size = max(len(sample['word_list']) for sample in dependency_batch)
        # Called for its side effect of appending this batch to the graph file.
        create_with_wo_dep_info(out_graphfile, dependency_batch, max_dependency_node_size)

In [48]:
# Needs a CoreNLP server reachable at InputPreprocessor's default URL (http://localhost:9000).
begin_parsing_dep()

100%|██████████| 500/500 [00:05<00:00, 83.65it/s]
100%|██████████| 140/140 [00:01<00:00, 84.95it/s]


In [49]:
# Reads the dependency_parsed_result.train file that begin_parsing_dep() writes.
train_data_preprocess()

loading vocabulary file: ../dataset//vocab.q.txt
loading vocabulary file: ../dataset//vocab.f.txt
129
52
500
500
det Added.
case Added.
nmod Added.
nsubj Added.
ccomp Added.
compound Added.
dobj Added.
expl Added.
iobj Added.
amod Added.
dep Added.
cop Added.
neg Added.
acl:relcl Added.
cc Added.
conj Added.
145
52
time used:0.27027797699


In [50]:
# Loads map.pkl, which train_data_preprocess() writes, so that cell has to run first.
test_data_preprocess()