<h1>INDEX<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#构建constituency-tree" data-toc-modified-id="构建constituency-tree-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>构建constituency tree</a></span></li><li><span><a href="#process-dep-data" data-toc-modified-id="process-dep-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>process dep data</a></span></li></ul></div>

In [1]:
import time
import os
import torch
import json
import tqdm
import random
import numpy as np
import argparse
import pickle as pkl
import networkx as nx
import networkx.algorithms as nxalg

from pycorenlp import StanfordCoreNLP
from data_utils import SymbolsManager
from sys import path
from data_utils import convert_to_tree
from collections import OrderedDict
from pythonds.basic.stack import Stack

# 构建constituency tree

In [15]:
# some basic configuration

data_dir = "../dataset"
batch_size = 20
min_freq = 2
max_vocab_size = 15000
seed = 123

In [3]:
# set random seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x10ae19270>

In [4]:
class InputPreprocessor(object):
    def __init__(self, url = 'http://localhost:9000'):
        self.nlp = StanfordCoreNLP(url)

    def featureExtract(self,src_text,whiteSpace=True):
        data = {}
        output = self.nlp.annotate(src_text.strip(), properties={
        'annotators': "tokenize,ssplit,pos,parse",
        "tokenize.options":"splitHyphenated=true,normalizeParentheses=false",
		"tokenize.whitespace": whiteSpace,
        'ssplit.isOneSentence': True,
        'outputFormat': 'json'
    })

        snt = output['sentences'][0]["tokens"]
        depency = output['sentences'][0]["basicDependencies"]
        data["tok"] = []
        data["pos"] = []
        data["dep"] = []
        data["governor"] = []
        data["dependent"] = []
        data['parse'] = output['sentences'][0]['parse']
        for snt_tok in snt:
            data["tok"].append(snt_tok['word'])
            data["pos"].append(snt_tok['pos'])
        for deps in depency:
            data["dep"].append(deps['dep'])
            data["governor"].append(deps['governor'])
            data["dependent"].append(deps['dependent'])        
        return data

In [5]:
def get_constituency_graph(input_tmp):
    tmp_result = input_tmp
    parse_str = str(tmp_result['parse'])
    for punc in ['(',')']:
        parse_str = parse_str.replace(punc,' ' + punc + ' ')
    parse_list = str(parse_str).split()
    # print len(parse_list)
    # print parse_list
    res_graph = nx.DiGraph()
    pstack = Stack()
    idx = 0
    while idx < len(parse_list):
        if parse_list[idx] == '(':
            res_graph.add_node(idx + 1)
            pstack.push(idx+1)
            if pstack.size() > 1:
                node_2 = pstack.pop()
                node_1 = pstack.pop()
                res_graph.add_edge(node_1, node_2)
                pstack.push(node_1)
                pstack.push(node_2)
        elif parse_list[idx] == ')':
            pstack.pop()
        elif parse_list[idx] in tmp_result['tok']:
            node_1 = pstack.pop()
            res_graph.add_edge(node_1, idx)
            pstack.push(node_1)
        idx += 1
    for parse_num in range(len(parse_list)):
        if parse_list.count(parse_list[parse_num]) > 1:
            if parse_list[parse_num] != '(' and parse_list[parse_num] != ')':
                parse_list[parse_num] = parse_list[parse_num] + u'_' + str(parse_num)
        
    const_result = {}
    const_result['word_list'] = []
    const_result['graph_nodes'] = [parse_list[idx] for idx in list(res_graph.nodes())]
    for node in const_result['graph_nodes']:
        if node not in tmp_result['tok']:
            const_result['word_list'].append(node)
                
    const_result['graph_edges'] = []
    for e in list(res_graph.edges()):
        const_result['graph_edges'].append((parse_list[e[0]],parse_list[e[1]]))
    const_result['seq'] = tmp_result['tok']
#     res_str = parse_str.replace("(", " ").replace(")", " ").split()

    const_result['word_list'] = tmp_result['tok'] + const_result['word_list']
#     const_result['parse_full'] = parse_list
    const_result['seq_len'] = len(tmp_result['tok'])
    return const_result

In [6]:
def read_parsed_result(input_file):
        graphs_new = []
        with open(input_file, "r") as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                jo = json.loads(line)
                graphs_new.append(jo)
        return graphs_new

def begin_parsing_con():
    feature_extractor = InputPreprocessor()

    src_txt = []
    with open("{}/{}.txt".format(data_dir, "train"), "r") as f:
        lines = f.readlines()
        for line in lines:
            src_txt.append(line.strip().split('\t')[0])

    with open("{}/{}".format(data_dir, "constituency_parsed_result.train"), "w") as f:
        for sentence in tqdm.tqdm(src_txt):
            parsed_result = feature_extractor.featureExtract(sentence)
            f.write(json.dumps(get_constituency_graph(parsed_result)) + '\n')

    src_txt = []
    with open("{}/{}.txt".format(data_dir, "test"), "r") as f:
        lines = f.readlines()
        for line in lines:
            src_txt.append(line.strip().split('\t')[0])

    with open("{}/{}".format(data_dir, "constituency_parsed_result.test"), "w") as f:
        for sentence in tqdm.tqdm(src_txt):
            parsed_result = feature_extractor.featureExtract(sentence)
            f.write(json.dumps(get_constituency_graph(parsed_result)) + '\n')

In [7]:
# input_prep = InputParser()
# print get_constituency_graph(input_prep.featureExtract("what job are avail"))

In [8]:
def create_with_wo_con_info(output_file, src, graph_scale):
    graph_list = []
    batch_size = len(src)
    for num in range(batch_size):
        info = {}
        graph = nx.DiGraph()
        graph_node_size = src[num]['seq_len']
        source_text = src[num]['word_list']
        source_text_for_edge = list(source_text)
        const_edge = src[num]['graph_edges']
        for idx in range(graph_scale):
            graph.add_node(idx)
#             if(idx >= 1 and idx <= graph_node_size - 1):
#                 graph.add_edge(idx, idx-1)
#                 graph.add_edge(idx-1, idx)
        for edge in const_edge:
            id0 = source_text_for_edge.index(edge[0])
            id1 = source_text_for_edge.index(edge[1])
#             graph.add_edge(id0, id1)
            graph.add_edge(id1, id0)

        adj_list = [sorted(n_dict.keys()) for nodes, n_dict in graph.adjacency()]

        g_ids = {}
        g_ids_features = {}
        g_adj = {}
        for i in range(graph_scale):
            g_ids[i] = i

            if i < len(source_text):
                if '_' in source_text[i]:
                    idx_dash = source_text[i].index('_')
                    g_ids_features[i] = source_text[i][:idx_dash]
                else:
                    g_ids_features[i] = source_text[i]
            else:
                g_ids_features[i] = '<P>'
            
            g_adj[i] = adj_list[i]

        info['g_ids'] = g_ids
        info['g_ids_features'] = g_ids_features
        info['g_adj'] = g_adj
        for w in range(len(source_text)):
            if '_' in source_text[w]:
                idx_dash = source_text[w].index('_')
                source_text[w] = source_text[w][:idx_dash]
        info['word_list'] = source_text
        info['word_len'] = graph_node_size
        graph_list.append(info)
        
    with open(output_file, "a+") as f:
        for idx in range(len(graph_list)):
            f.write(json.dumps(graph_list[idx]) + '\n')

    return graph_list

In [None]:
def create_with_wo_con_info(output_file, src, graph_scale):
    graph_list = []
    batch_size = len(src)
    for num in range(batch_size):
        info = {}
        graph = nx.DiGraph()
        graph_node_size = src[num]['seq_len']
        source_text = src[num]['word_list']
        source_text_for_edge = list(source_text)
        const_edge = src[num]['graph_edges']
        for idx in range(graph_scale):
            graph.add_node(idx)
            if(idx >= 1 and idx <= graph_node_size - 1):
                graph.add_edge(idx, idx-1)
                graph.add_edge(idx-1, idx)
        for edge in const_edge:
            id0 = source_text_for_edge.index(edge[0])
            id1 = source_text_for_edge.index(edge[1])
            graph.add_edge(id0, id1)
            graph.add_edge(id1, id0)

        adj_list = [sorted(n_dict.keys()) for nodes, n_dict in graph.adjacency()]
        
        for w in range(len(source_text)):
            if w >= graph_node_size:
                source_text[w] = '<P>'
        g_ids = {}
        g_ids_features = {}
        g_adj = {}
        for i in range(graph_scale):
            g_ids[i] = i
            if i < len(source_text):
                g_ids_features[i] = source_text[i]
            else:
                g_ids_features[i] = '<P>'
            
            g_adj[i] = adj_list[i]

        info['g_ids'] = g_ids
        info['g_ids_features'] = g_ids_features
        info['g_adj'] = g_adj
        info['word_list'] = source_text
        info['word_len'] = graph_node_size
        graph_list.append(info)
        
    with open(output_file, "a+") as f:
        for idx in range(len(graph_list)):
            f.write(json.dumps(graph_list[idx]) + '\n')

    return graph_list

# process dep data

In [28]:
# tes_res = read_parsed_result("../dataset/constituency_parsed_result.test")
# print tes_res[0]

In [29]:
# cut_root(tes_res[0])

In [30]:
# def cut_root(graph):
#     g = graph
#     word_list = g['word_list']
#     edge_list = g['graph_edges']
#     cnt_list = []
#     for w in word_list:
#         cnt = 0
#         for e in edge_list:
#             cnt += e.count(w)
#         cnt_list.append(cnt)
        
#     near_list = []
    
#     print cnt_list

def cut_one_layer(graph):
    g = graph
    word_list = g['word_list']
    edge_list = g['graph_edges']
    cnt_list = []
    for w in word_list:
        cnt = 0
        for e in edge_list:
            cnt += e.count(w)
        cnt_list.append(cnt)
#     print cnt_list
    
    leaf_list = g['seq']
    near_leaf_list = []
    for e in edge_list:
        if e[1] in leaf_list and cnt_list[word_list.index(e[0])] == 2:
            near_leaf_list.append(e[0])
#     print near_leaf_list
    
    new_edge_list = []
    for n in near_leaf_list:
        fw_node = n
        for e in g['graph_edges']:
            if e[1] == n:
                fw_node = e[0]
        for e in g['graph_edges']:
            if e[0] == n:
                new_edge = [fw_node, e[1]]
                new_edge_list.append(new_edge)
#     print new_edge_list        
    
    old_edge_list = []
    for e in g['graph_edges']:
        if e[0] not in near_leaf_list and e[1] not in near_leaf_list:
            old_edge_list.append(e)
#     print old_edge_list
    g['graph_edges'] = old_edge_list + new_edge_list
    for n in near_leaf_list:
        g['word_list'].remove(n)
        g['graph_nodes'].remove(n)
    return g

def cut_two_layer(graph):
    return cut_one_layer(cut_one_layer(graph))

In [20]:
def train_data_preprocess():
    time_start = time.time()
    word_manager = SymbolsManager(True)
    word_manager.init_from_file("{}/vocab.q.txt".format(data_dir), min_freq, max_vocab_size)
    form_manager = SymbolsManager(True)
    form_manager.init_from_file("{}/vocab.f.txt".format(data_dir), 0, max_vocab_size)
    print(word_manager.vocab_size)
    print(form_manager.vocab_size)
    data = []
    with open("{}/{}.txt".format(data_dir, "train"), "r") as f:
        for line in f:
            l_list = line.split("\t")
            w_list = l_list[0].strip().split(' ')
            r_list = form_manager.get_symbol_idx_for_list(l_list[1].strip().split(' '))
            cur_tree = convert_to_tree(r_list, 0, len(r_list), form_manager)

            data.append((w_list, r_list, cur_tree))

    out_graphfile = "{}/graph.train".format(data_dir)
    if os.path.exists(out_graphfile):
        os.remove(out_graphfile)
    # generate batch graph here
    if len(data) % batch_size != 0:
        n = len(data)
        for i in range(len(data)%batch_size):
            data.insert(n-i-1, data[n-i-1])
            
    constituency_parsed_result = read_parsed_result("{}/{}".format(data_dir, "constituency_parsed_result.train"))
    if len(constituency_parsed_result) % batch_size != 0:
        n = len(constituency_parsed_result)
        for i in range(len(constituency_parsed_result)%batch_size):
            constituency_parsed_result.insert(n-i-1, constituency_parsed_result[n-i-1])
    print len(data)
    print len(constituency_parsed_result)
    index = 0
    while index + batch_size <= len(data):
        # generate graphs with order and dependency information
        constituency_batch = [cut_two_layer(constituency_parsed_result[index+idx]) for idx in range(batch_size)]
        max_constituency_node_size = max([len(constituency_batch[idx]['word_list']) for idx in range(batch_size)])
        constituency_graph_batch = create_with_wo_con_info(out_graphfile, constituency_batch, max_constituency_node_size)
        
        for idx in range(len(constituency_batch)):
            w_list_len = constituency_batch[idx]['seq_len']
            w_list = constituency_batch[idx]['word_list'][w_list_len:]
            for j in w_list:
                if j not in word_manager.symbol2idx:
                        word_manager.add_symbol(j)
                        print "{} Added.".format(j)
        index += batch_size

    out_datafile = "{}/train.pkl".format(data_dir)
    with open(out_datafile, "wb") as out_data:
        pkl.dump(data, out_data)
    
    out_mapfile = "{}/map.pkl".format(data_dir)
    with open(out_mapfile, "wb") as out_map:
        pkl.dump([word_manager, form_manager], out_map)

    print(word_manager.vocab_size)
    print(form_manager.vocab_size)

    time_end = time.time()
    print "time used:" + str(time_end - time_start)

In [21]:
def test_data_preprocess():
    data = []
    managers = pkl.load( open("{}/map.pkl".format(data_dir), "rb" ) )
    word_manager, form_manager = managers
    with open("{}/{}.txt".format(data_dir, "test"), "r") as f:
        for line in f:
            l_list = line.split("\t")
            w_list = l_list[0].strip().split(' ')
            r_list = form_manager.get_symbol_idx_for_list(l_list[1].strip().split(' '))
            cur_tree = convert_to_tree(r_list, 0, len(r_list), form_manager)
            data.append((w_list, r_list, cur_tree))
    out_datafile = "{}/test.pkl".format(data_dir)
    with open(out_datafile, "wb") as out_data:
        pkl.dump(data, out_data)

    out_graphfile = "{}/graph.test".format(data_dir)
    if os.path.exists(out_graphfile):
        os.remove(out_graphfile)

    # generate batch graph here
    if len(data) % batch_size != 0:
        n = len(data)
        for i in range(len(data)%batch_size):
            data.insert(n-i-1, data[n-i-1])
    constituency_parsed_result = read_parsed_result("{}/{}".format(data_dir, "constituency_parsed_result.test"))
    if len(constituency_parsed_result) % batch_size != 0:
        n = len(constituency_parsed_result)
        for i in range(len(constituency_parsed_result)%batch_size):
            constituency_parsed_result.insert(n-i-1, constituency_parsed_result[n-i-1])
            
    index = 0
    while index + batch_size <= len(data):
        # generate graphs with order and dependency information
        constituency_batch = [cut_two_layer(constituency_parsed_result[index+idx]) for idx in range(batch_size)]
        max_constituency_node_size = max([len(constituency_batch[idx]['word_list']) for idx in range(batch_size)])
        constituency_graph_batch = create_with_wo_con_info(out_graphfile, constituency_batch, max_constituency_node_size)
       
        index += batch_size

In [22]:
begin_parsing_con()

100%|██████████| 500/500 [00:10<00:00, 21.79it/s]
100%|██████████| 140/140 [00:02<00:00, 28.98it/s]


In [23]:
train_data_preprocess()

loading vocabulary file: ../dataset/vocab.q.txt
loading vocabulary file: ../dataset/vocab.f.txt
129
52
500
500
ROOT Added.
FRAG Added.
X Added.
NP Added.
PP Added.
S Added.
VP Added.
NX Added.
SBARQ Added.
WHNP Added.
SQ Added.
SBAR Added.
ADJP Added.
SINV Added.
NP-TMP Added.
IN Added.
UCP Added.
ADVP Added.
CONJP Added.
QP Added.
WHADJP Added.
DT Added.
VBP Added.
NN Added.
RRC Added.
CC Added.
NNS Added.
EX Added.
. Added.
FW Added.
159
52
time used:0.616720914841


In [24]:
test_data_preprocess()