DependencyTagging.py
import json
import re

import hanlp

# hanlp.pretrained.mtl.ALL lists the available multi-task models.
# Load the multi-task model covering tokenization, POS, NER, SRL, dependency
# parsing, semantic dependency parsing, and constituency parsing.
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
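# For tasks='dep', the returned Document is dict-like; this script uses
# "tok/fine" (fine-grained tokens per sentence) and "dep" (one
# (head_index, relation) pair per token).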
def cut_sent(para):
    """Split a Chinese paragraph into sentences at end-of-sentence punctuation."""
    para = re.sub(r'([。!?\?])([^”’])', r"\1\n\2", para)           # single terminators
    para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)              # ASCII ellipsis "......"
    para = re.sub(r'(…{2})([^”’])', r"\1\n\2", para)               # Chinese ellipsis "……"
    para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)  # terminator before a closing quote
    para = para.rstrip()
    return para.split("\n")
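# Illustrative call (assumed input, not from the original data):
#   cut_sent("他来了。她笑了！")  ->  ["他来了。", "她笑了！"]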
def load_file(filename, pred=False):
    """Load a file line by line: JSONL records by default, bare labels if pred=True."""
    data = []
    with open(filename, "r", encoding='utf-8') as f:
        for line in f:
            if pred:
                data.append({"label": line.strip()})
            else:
                data.append(json.loads(line))
    return data
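# Each record in train.jsonl is expected to provide the fields indexed below
# (illustrative shape, inferred from the writer functions):
#   {"title": "...", "outline": ["...", "..."], "story": "..."}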
# Dependency relations to tag inline: nominal subject, root predicate, direct
# object, and (in the extended list) prepositional object.
token_list_1 = ["nsubj", "root", "dobj", "pobj"]
token_list = ["nsubj", "root", "dobj"]
def get_token(text):
    """Run dependency parsing and tag selected relations inline, e.g. '词<nsubj>'."""
    sentences = cut_sent(text)
    doc = HanLP(sentences, tasks='dep')
    strs = ""
    for toks, deps in zip(doc["tok/fine"], doc["dep"]):
        for tok, dep in zip(toks, deps):
            if dep[1] in token_list:  # dep is a (head_index, relation) pair
                strs = strs + tok + "<" + dep[1] + ">"
            else:
                strs = strs + tok
    return strs
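# Illustrative output (the exact parse depends on the model): a sentence like
# "他喜欢苹果。" would come back roughly as "他<nsubj>喜欢<root>苹果<dobj>。"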
def write_txt_file_source(filename, data):
    """Write one 'title#outline...#outline<extra_id_1>' source line per record."""
    with open(filename, "w+", encoding='utf-8') as f:
        for i in data:
            line = i['title']  # avoid shadowing the built-in str
            for s in i['outline']:
                line = line + "#" + s
            line = line + "<extra_id_1>\n"
            f.write(line)
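# Illustrative source line (assumed record): "标题#大纲一#大纲二<extra_id_1>"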
def write_txt_file_target(filename, data):
    """Parse each story, tag its dependencies, and write '<extra_id_1>story' target lines."""
    with open(filename, "w+", encoding='utf-8') as f:
        count = 0
        for i in data:
            count += 1
            if count % 10 == 0:
                print(count, len(data))  # progress indicator
            truth = get_token(i['story'])
            f.write("<extra_id_1>" + truth + "\n")
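# Matching target line for the example above (illustrative):
#   "<extra_id_1>他<nsubj>喜欢<root>苹果<dobj>。"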
def trans_txt2json_file(filename_w, filename_r):
    """Strip the dependency tags from a .target file and rewrite it as JSONL."""
    with open(filename_w, "w+", encoding='utf-8') as wf, open(filename_r, "r", encoding='utf-8') as rf:
        for line in rf:
            story = (line.strip()
                     .replace("<nsubj>", "").replace("<root>", "")
                     .replace("<dobj>", "").replace("<pobj>", ""))
            # json.dumps is safer than str(dic).replace("'", '"'), which breaks
            # whenever the story itself contains quote characters.
            wf.write(json.dumps({"story": story}, ensure_ascii=False) + "\n")
def trans_txt2txt_file(filename_w, filename_r):
    """Strip the dependency tags from a .target file and rewrite it as plain text."""
    with open(filename_w, "w+", encoding='utf-8') as wf, open(filename_r, "r", encoding='utf-8') as rf:
        for line in rf:
            wf.write(line.replace("<nsubj>", "").replace("<root>", "")
                         .replace("<dobj>", "").replace("<pobj>", ""))
"""
text = cut_sent(data[4]['story'])
doc = HanLP(text, tasks='dep')
doc.pretty_print()
"""
if __name__ == '__main__':
    # Read the training data from the JSONL file.
    data = load_file("./outgen/train.jsonl")
    # Save the title and outline of each record as the model's source file.
    write_txt_file_source("./outgen/train.source", data)
    # Add the dependency tokens to each story text and save as the target file.
    write_txt_file_target("./outgen/train.target", data)
    # Remove the tags from the story text and rewrite as a .jsonl file:
    # trans_txt2json_file("train.jsonl", "train.target")
    # Remove the tags from the story text and rewrite as a .txt file:
    # trans_txt2txt_file("trainW.target", "trainR.target")