-
Notifications
You must be signed in to change notification settings - Fork 0
/
LxGrTgr_04.py
552 lines (484 loc) · 21.1 KB
/
LxGrTgr_04.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 10 11:47:46 2020
@author: kkyle2
This script is loosely based on TAASSC 2.0.0.58, which was used in Kyle et al. 2021a,b.
The data for those publications were processed using
- TAASSC 2.0.0.58 (see https://github.com/kristopherkyle/TAASSC)
- Python version 3.7.3
- spaCy version 2.1.8
- spaCy `en_core_web_sm` model version 2.1.0.
This version of the code is part of a project designed to:
a) make a more user-friendly interface (including a python package, and online tool, and a desktop tool)
b) ensure that the tags are as accurate as possible
License:
The Lexicogrammatical Tagger
Copyright (C) 2022 Kristopher Kyle and Douglas Biber [and others?]
This program is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)
See https://creativecommons.org/licenses/by-nc-sa/4.0/ for a summary of the license (and a link to the full license).
"""
### imports ####################################
version = "0.0.4"
version_notes = "0.0.4 - More bug fixes on 20221203; Tags follow discussions with Doug, Randi, and Jesse (with a few tweaks)."
import glob #for finding all filenames in a folder
import os #for making folders
# from xml.dom import minidom #for pretty printing
# import xml.etree.ElementTree as ET #for xml parsing
from random import sample #for random samples
import re #for regular expressions
#from lexical_diversity import lex_div as ld #for lexical diversity. Should probably upgrade to TAALED
### spacy
import spacy #base NLP
#nlp = spacy.load("en_core_web_sm") #load model
nlp = spacy.load("en_core_web_trf") #load transformer-based model (slower but more accurate than en_core_web_sm)
nlp.max_length = 1728483 #allow more characters to be processed than default. This allows longer documents to be processed. This may need to be made longer.
######################################################
### Load lists, etc. #################################
# NOTE(review): these three files are read relative to the working directory and the handles are not closed explicitly
nominal_stop = open("lists_LGR/nom_stop_list_edited.txt").read().split("\n") # created based on frequently occuring nouns with [potential] nominalizer suffixes in TMLE + T2KSWAL
prepVerbList = open("lists_LGR/prepVerbList.txt").read().split("\n") # From LGSWE; currently ignored in favor of OntoNotes classifications
phrasalVerbList = open("lists_LGR/phrasalVerbList.txt").read().split("\n") # From LGSWE
### Utility Functions ####################
def safe_divide(numerator, denominator):
    """Divide numerator by denominator, returning 0.0 instead of raising on a zero denominator."""
    return 0.0 if float(denominator) == 0.0 else numerator / denominator
###########################################
#### other functions ###
# def prettify(elem):
# """Return a pretty-printed XML string for the Element.
# """
# rough_string = ET.tostring(elem, 'utf-8')
# reparsed = minidom.parseString(rough_string)
# return(reparsed.toprettyxml(indent=" "))
###########################################
class tokenInfo():
    """Per-token container: copies the spaCy attributes the tagger needs and
    reserves the lexicogrammatical tag slots filled in by the analysis functions."""
    def __init__(self, spacyToken):
        self.idx = None  # sentence-internal position; assigned later by preprocess()
        self.word = spacyToken.text          # raw text
        self.lemma = spacyToken.lemma_.lower()  # lowercased lemma
        self.upos = spacyToken.pos_          # Universal POS
        self.xpos = spacyToken.tag_          # Penn Treebank tag
        self.deprel = spacyToken.dep_        # dependency relation (CLEAR tagset)
        self.head = spacyToken.head          # head token (spaCy object)
        self.headidx = spacyToken.head.i     # document-level index of head (adjusted by preprocess)
        self.children = spacyToken.children
        # tag slots populated by the linguistic analysis functions
        self.lxgrtag = None  # main tag
        for slot_number in range(1, 10):     # cat1 ... cat9, additional tags per schema
            setattr(self, "cat{}".format(slot_number), None)
        self.semtag = None  # semantic tag
### Linguistic Analysis Functions ###
def nouns(token, nominalStopList=nominal_stop):  # revised 2022-11-22
    """Tag nouns and proper nouns (lxgrtag "nn").

    cat1: "pl" for plural forms (NNS/NNPS).
    cat2: "title"/"proper" for proper nouns, "nom" for [suffix-derived] nominalizations.
    cat3: NP-internal function -- "npremod" (premodifier), "nappos" (apposition),
          "sgen" (s-genitive).

    NOTE(review): nominalStopList is accepted for interface compatibility but is
    never consulted in this body -- confirm whether suffix matches were meant to
    be filtered against it.
    """
    if token.upos not in ["NOUN", "PROPN"]:
        return
    token.lxgrtag = "nn"
    if token.xpos in ["NNS", "NNPS"]:
        token.cat1 = "pl"
    # [potential] nominalizer suffixes; probably overly greedy (filtered via
    # frequent candidates in T2KSWAL and TMLE); zero derivation not included
    nominalSuff = ["al","cy","ee","er","or","ry","ant","ent","dom","ing","ity","ure","age","ese","ess","ful","ism","ist","ite","let","als","ees","ers","ors","ate","ance","ence","ment","ness","tion","ship","ette","hood","cies","ries","ants","ents","doms","ings","ages","fuls","isms","ists","ites","lets","eses","ates","ician","ities","ances","ences","ments","tions","ships","esses","ettes","hoods","nesses"]
    # the titles list is incomplete
    titles = ["mr","mr.","mister","ms","ms.", "miss","mrs","mrs.","missus","mistress","dr","dr.","doctor","professor"]
    word = token.word.lower()
    # nominalization: the word must be strictly longer than the suffix it ends with
    nominalization = any(len(token.word) > len(suff) and word.endswith(suff) for suff in nominalSuff)
    if token.upos == "PROPN":
        token.cat2 = "title" if word in titles else "proper"
    elif nominalization:
        token.cat2 = "nom"
    # cat3: syntactic function within the NP
    if token.deprel in ["compound", "nmod"]:
        token.cat3 = "npremod"
    elif token.deprel == "appos":
        token.cat3 = "nappos"
    elif token.deprel == "poss":
        token.cat3 = "sgen"
def adjectives(token):  # 2022-11-22
    """Tag adjectives (lxgrtag "jj"): predicative (acomp) vs. attributive (amod),
    with participial morphology in cat2 ("ing"/"ed")."""
    if token.deprel not in ("acomp", "amod"):
        return
    token.lxgrtag = "jj"
    token.cat1 = "pred" if token.deprel == "acomp" else "attr"
    # gerundial or participial? We may have to rely on morphology here.
    if token.xpos == "VBG":
        token.cat2 = "ing"
    elif token.xpos == "VBN":
        token.cat2 = "ed"
    # (a previous iteration also checked token.pos_ == "ADV" or
    # token.dep_ in ["npadvmod","advmod","intj"] -- effects not re-verified)
def adverbs(token, sent):  # 2022-11-22; tagged on adverb
    """Tag adverbs (lxgrtag "rb").

    cat1: "link" (linking adverbial before the verb), "advl" (other adverbial on a
          verbal head), "adjmod" (modifying a predicative adjective), "prtcle"
          (verb particle), or "othr".
    cat2: "ly" for -ly morphology.
    cat3: "splaux" when the adverb splits an auxiliary from its main verb.

    The linking list needs to be more robust -- taken from Table 10.17, p. 879
    LGSWE; prepositional linking adverbials must be handled elsewhere.
    """
    linking = ["so","then","though","anyway","however","thus","therefore","e.g.","i.e.","first","finally","furthermore","hence","nevertheless","rather","yet"]
    if token.deprel == "advmod":
        token.lxgrtag = "rb"
        if token.word[-2:].lower() == "ly":
            token.cat2 = "ly"
    if token.deprel == "advmod" and token.head.pos_ in ["VERB", "AUX"]:  # note that copular verbs get "AUX"
        # BUG FIX: compare sentence-internal positions (token.headidx, which
        # preprocess() adjusts) rather than the head's document-level index
        # (token.head.i); the old comparison was wrong for every sentence
        # after the first.
        if token.word.lower() in linking and token.idx < token.headidx:
            # the word can be a linking adverb and occurs before the main verb
            token.cat1 = "link"
        else:
            token.cat1 = "advl"
    elif token.deprel == "advmod" and token.head.dep_ == "acomp":
        token.cat1 = "adjmod"
    # split aux: an auxiliary of the same head precedes this adverb, and the
    # adverb itself precedes its VERB head
    for tkn in sent:
        if tkn.headidx == token.headidx and tkn.deprel == "aux" and tkn.idx < token.idx:
            if token.idx < token.headidx and token.head.pos_ == "VERB":
                token.cat3 = "splaux"
                break
    # particles (same sentence-index fix as above); head must precede the
    # particle, which also keeps infinitive "to" out
    if token.deprel == "prt" and token.headidx < token.idx:
        token.lxgrtag = "rb"
        token.cat1 = "prtcle"
    if token.lxgrtag == "rb" and token.cat1 is None:
        token.cat1 = "othr"
def verbs(token, sent):  # need to add separate tags for tense/aspect and passives
    """Tag main verbs and auxiliaries.

    Main verbs (lxgrtag "vbmain") receive:
      cat1 -- verb class: "be", "prepv" (prepositional verb), "phrsv" (phrasal), "vblex"
      cat2 -- VP tense/modality: "vp_w_modal", "pres", "past", "nonfinite"
      cat3 -- aspect: "perfprog", "perf", "prog", "simple"
      cat4 -- voice: "pasv_by", "pasv_agls", "active"
      cat5 -- clause type of the verb's own clause: "compcls", "advlcls", "nmod_cls"
      cat6 -- clause marker: "thatcls", "whcls", "tocls", "ingcls", "edcls", "compdel"
      cat7 -- complement/adverbial subtype: "vcomp", "jcomp", "ncomp", "incomp",
              "causative", "conditional", "concessive", "other_advl"
      cat8 -- relativizer deletion: "reldel"
    Auxiliaries get lxgrtag "to"/"vbaux" with modal subtags in cat1/cat2.
    """
    # NOTE(review): that0_list is defined but never referenced in this function.
    that0_list = "check consider ensure illustrate fear say assume understand hold appreciate insist feel reveal indicate wish decide express follow suggest saw direct pray observe record imagine see think show confirm ask meant acknowledge recognize need accept contend come maintain believe claim verify demonstrate learn hope thought reflect deduce prove find deny wrote read repeat remember admit adds advise compute reach trust yield state describe realize expect mean report know stress note told held explain hear gather establish suppose found use fancy submit doubt felt".split(" ")
    #don't tag verbs functioning as adjectives as verbs; only tag main verbs
    if token.upos in ["VERB","AUX"] and token.deprel not in ["amod","acomp","aux","auxpass"]:
        token.lxgrtag = "vbmain"
        ### cat1: verb class ###
        if token.lemma == "be": #need to add "+ NP" frames; need to deal with negation. This is kind of a mess
            # "be" + the next two words may form a multi-word prepositional verb
            if token.idx +2 <= sent[-1].idx and " ".join([token.lemma,sent[token.idx + 1].word.lower(),sent[token.idx + 2].word.lower()]) in prepVerbList:
                token.cat1 = "prepv"
            else:
                token.cat1 = "be"
        elif token.idx +1 <= sent[-1].idx and " ".join([token.lemma,sent[token.idx + 1].word.lower()]) in prepVerbList: #need to add "+ NP" frames
            token.cat1 = "prepv"
        #need to add phrasal verbs
        elif "prt" in [x.deprel for x in sent if x.headidx == token.idx]:
            token.cat1 = "phrsv"
        else:
            token.cat1 = "vblex"
        #this is an ordered list of POS tags for the main verb and auxs represented in the VP
        vp_pos = [x.xpos for x in sent if (x.headidx == token.idx and x.deprel in ["aux","auxpass"]) or x.idx == token.idx]
        #print(vp_pos)
        if "MD" in vp_pos:
            token.cat2 = "vp_w_modal"
        elif "VBZ" in vp_pos:
            token.cat2 = "pres"
        elif "VBP" in vp_pos:
            token.cat2 = "pres"
        elif "VBD" in vp_pos:
            token.cat2 = "past"
        else:
            token.cat2 = "nonfinite" #probably need more testing here
        ### aspect analysis ###
        # lemmas of the bare "aux" dependents of this verb
        aux_text = [x.lemma.lower() for x in sent if (x.headidx == token.idx and x.deprel in ["aux"])]
        if "have" in aux_text:
            if token.xpos == "VBG":
                token.cat3 = "perfprog"
            else:
                token.cat3 = "perf"
        elif token.xpos == "VBG" and "be" in aux_text:
            token.cat3 = "prog"
        else:
            token.cat3 = "simple"
        ### voice analysis ####
        pass_deps = [x.deprel for x in sent if x.headidx == token.idx]
        if "auxpass" in pass_deps:
            # "agent" dependent indicates an overt by-phrase
            if "agent" in pass_deps:
                token.cat4 = "pasv_by"
            else:
                token.cat4 = "pasv_agls"
        else:
            token.cat4 = "active"
        ### cat5 analysis
        if token.deprel in ["ccomp","csubj"]: #may need to add dependent relations here ("csubj"?)
            token.cat5 = "compcls"
        elif token.deprel == "advcl":
            token.cat5 = "advlcls"
        elif token.deprel in ["relcl","acl"]:
            token.cat5 = "nmod_cls"
        # elif sent[token.headidx].upos in ["NOUN","PROPN","PRON"]:
        #     token.cat5 = "nmod_cls"
        #####################
        ### cat6 analysis ###
        #####################
        ### that ###
        # overt "that" complementizer/relativizer attached to this verb
        if "that" in [x.word.lower() for x in sent if x.word.lower() == "that" and x.headidx == token.idx and x.deprel in ["mark","nsubj","nsubjpass"]]:
            if token.cat5 in ["compcls","nmod_cls"]:
                token.cat6 = "thatcls"
        ### wh clause: ###
        if token.cat5 in ["compcls","nmod_cls","advlcls"]:
            if len([x.word.lower() for x in sent if x.headidx == token.idx and x.deprel in ["nsubj","nsubjpass","advmod","attr"] and x.xpos in ["WDT","WP", "WP$", "WRB"] and x.word.lower() != "that"]) > 0:
                token.cat6 = "whcls"
        if token.cat2 == "nonfinite":
            ### to clause ###
            # infinitival "to" (PART) preceding the verb
            if "to" in [x.word.lower() for x in sent if x.headidx == token.idx and x.idx < token.idx and x.upos == "PART"]:
                token.cat6 = "tocls" #probably needs more work
            ### ing clause ###
            if token.xpos == "VBG":
                token.cat6 = "ingcls"
            ### ed clause ###
            if token.xpos == "VBN" and token.cat4 == "active":
                token.cat6 = "edcls"
        #####################
        ### cat7 analysis ###
        #####################
        # complement-clause subtype, keyed on the category of the verb's head
        if token.cat5 == "compcls":
            if sent[token.headidx].upos in ["VERB","AUX"] and token.idx != token.headidx:
                token.cat7 = "vcomp"
            elif sent[token.headidx].upos == "ADJ":
                token.cat7 = "jcomp"
            elif sent[token.headidx].upos in ["NOUN","PROPN","PRON"]:
                token.cat7 = "ncomp"
            elif token.deprel == "pcomp":
                token.cat7 = "incomp"
        markL = [x.word.lower() for x in sent if x.headidx == token.idx and x.deprel == "mark"] #list of subordinators
        if len(markL) != 0:
            mark = markL[0]
            if mark == "because":
                token.cat7 = "causative"
            elif mark in ["if","unless"]:
                token.cat7 = "conditional"
            elif mark in ["though","although","while"]:
                token.cat7 = "concessive"
            elif mark != "that":
                token.cat7 = "other_advl"
        ######################
        ### cat 8 analysis ###
        ######################
        ### relativizer deletion ###
        if token.cat5 == "nmod_cls" and token.cat6 not in ["tocls"]:
            if "dobj" in [x.deprel for x in sent if x.headidx == token.idx and x.idx < token.idx]:
                # NOTE(review): the filter below tests token.word (the verb's own
                # word) rather than x.word -- likely intended to test each
                # dependent x for an overt relativizer; confirm.
                if len([x.deprel for x in sent if x.headidx == token.idx and x.idx < token.idx and token.word.lower() in ["that","who","which", "what","how","where","why","when","whose","whom","whomever"]]) == 0:
                    token.cat8 = "reldel"
            # NOTE(review): this branch assigns cat6 while the parallel branch
            # above assigns cat8 -- possibly intended to be cat8; confirm.
            elif len ([x.word.lower() for x in sent if x.headidx == token.idx and x.deprel in ["nsubj","nsubjpass"] and x.word.lower() in ["that","who","which","what","how","where","why","when","whose","whom","whomever"]]) == 0:
                token.cat6 = "reldel"
        ### complementizer deletion ###
        if token.cat5 == "compcls" and token.cat6 not in ["tocls","ingcls","thatcls","whcls"]:
            token.cat6 = "compdel"
    ### aux analysis ####
    if token.deprel in ["aux","auxpass"]: #check for auxilliaries
        if token.word.lower() == "to":
            token.lxgrtag = "to"
        else:
            token.lxgrtag = "vbaux"
        if token.xpos == "MD":
            token.cat1 = "mod"
            if token.word.lower() in "can may might could".split(" "):
                token.cat2 = "pos" #possibility modal
            elif token.word.lower() in "ought must should".split(" "):
                token.cat2 = "nec" #necessity modal
            elif token.word.lower() in "will would shall".split(" "):
                token.cat2 = "prd" #prediction modal
def personal_pronouns(token):  # updated 2022-11-02
    """Tag personal pronouns (lxgrtag "pro"): person in cat1 ("1st"/"2nd"/"3rd"),
    number in cat2 ("sg"/"pl").

    NOTE: spaCy tags "our" and "my" as determiners, so possessives are not
    handled here (previously "our my your thy thine their his her" were tagged
    as pronouns).
    """
    pp1 = "i we us me ourselves myself".split(" ")
    pp2 = "you yourself ya thee".split(" ")
    pp3 = "he she they them him themselves himself herself it".split(" ")  # "it" included, cf. Biber et al., 2004
    pp_sg = "i me myself you ya thee he she his her him himself herself it".split(" ")
    pp_pl = "us ourselves they them themselves".split()  # not sure how we want to deal with "they"
    word = token.word.lower()
    if word not in pp1 + pp2 + pp3:
        return
    token.lxgrtag = "pro"
    if word in pp1:
        token.cat1 = "1st"
    elif word in pp2:
        token.cat1 = "2nd"
    elif word in pp3:
        token.cat1 = "3rd"
    if word in pp_sg:
        token.cat2 = "sg"
    elif word in pp_pl:
        token.cat2 = "pl"  # BUG FIX: was assigned to cat1, clobbering the person tag
def advanced_pronoun(token, sent):  # updated 2022-11-02
    """Tag indefinite and demonstrative pronouns (lxgrtag "pro").

    Indefinite list from Longman grammar pp. 352-355; note that "one" also
    captures "no one". Demonstratives are only tagged as pronouns when they fill
    an argument slot (nsubj/nsubjpass/dobj/pobj), which separates pronominal
    from determiner uses; this can be tricky with spaCy parses.
    """
    demonstrative_list = ["this", "that", "these", "those"]
    sg = ["this", "that"]
    argument_deps = ["nsubj", "nsubjpass", "dobj", "pobj"]
    indefinite_l = "everybody everyone everything somebody someone something anybody anyone anything nobody noone none nothing one ones".split(" ")
    # BUG FIX: indef_pl was the bare string "ones", so the membership test was a
    # substring check and singular "one" ("one" in "ones") was tagged plural.
    indef_pl = ["ones"]
    word = token.word.lower()
    if word in indefinite_l and token.deprel in argument_deps:
        token.lxgrtag = "pro"
        token.cat1 = "other"  # consider changing to "indefinite"
        token.cat2 = "pl" if word in indef_pl else "sg"
    elif word in demonstrative_list and token.deprel in argument_deps:
        # Simplified: the previous version branched on deprel (plus a
        # next-token heuristic), but every reachable branch assigned the same
        # tags and the four deprels were already guaranteed by this elif's
        # condition; the separate "advmod" branch was unreachable dead code.
        token.lxgrtag = "pro"
        token.cat1 = "dem"
        token.cat2 = "sg" if word in sg else "pl"
def prepositions(token, sent):
    """Tag prepositions (lxgrtag "in") by the category of their head:
    adverbial (verbal head), noun modifier, adjective complement, or other."""
    if token.deprel != "prep":
        return
    token.lxgrtag = "in"
    head_upos = sent[token.headidx].upos
    if head_upos in ("VERB", "AUX"):
        token.cat1 = "advl"
    elif head_upos in ("NOUN", "PROPN", "PRON"):
        token.cat1 = "nmod"
    elif head_upos == "ADJ":
        token.cat1 = "jcomp"
    else:
        token.cat1 = "in_othr"
def coordinators(token, sent):  # takes spaCy's definition of "cc"
    """Tag coordinating conjunctions (lxgrtag "cc") as phrasal ("phrs") or
    clausal ("cls") coordination."""
    if token.deprel != "cc":
        return
    token.lxgrtag = "cc"
    head_upos = sent[token.headidx].upos
    if head_upos in ("NOUN", "ADJ", "ADV", "PRON", "PROPN", "PART"):
        token.cat1 = "phrs"
    elif head_upos in ("VERB", "AUX"):
        # positions of conjuncts that share this cc's head
        conjunct_positions = [x.idx for x in sent if x.deprel == "conj" and x.headidx == token.headidx]
        if token.idx == 0:
            # sentence-initial cc: clausal
            token.cat1 = "cls"
        elif conjunct_positions and any(
                x.headidx == conjunct_positions[0] and x.deprel in ("csubj", "nsubj")
                for x in sent):
            # the first conjunct has its own subject: clausal coordination
            token.cat1 = "cls"
        else:
            token.cat1 = "phrs"
def subordinators(token, sent):
    """Tag subordinating conjunctions (lxgrtag "cs"): causative ("cos"),
    conditional ("cnd"), concessive ("con"), or other ("cs_othr")."""
    wh_and_that = ["that", "which", "who", "whom", "whose", "what", "how", "where", "why", "when"]
    word = token.word.lower()
    if token.deprel == "mark" and word not in wh_and_that:
        token.lxgrtag = "cs"
        if word == "because":
            token.cat1 = "cos"
        elif word in ("if", "unless"):
            token.cat1 = "cnd"
        elif word in ("though", "although", "while"):
            token.cat1 = "con"
        else:
            token.cat1 = "cs_othr"
    # "that" marking an adverbial clause also counts as a subordinator
    if word == "that" and token.deprel == "mark" and sent[token.headidx].deprel == "advcl":
        token.lxgrtag = "cs"
        token.cat1 = "cs_othr"
def determiners(token):
    """Tag determiners (lxgrtag "dt"): demonstrative ("dt_dem"), article ("art"),
    or possessive ("poss")."""
    demonstrative_list = ["this", "that", "these", "those"]
    if token.deprel == "det":
        token.lxgrtag = "dt"
        if token.word.lower() in demonstrative_list:
            token.cat1 = "dt_dem"
        # BUG FIX: the indefinite article "an" was missing from the article list
        if token.word.lower() in ["a", "an", "the"]:
            token.cat1 = "art"
    if token.deprel == "poss":
        token.lxgrtag = "dt"
        token.cat1 = "poss"
def that_wh(token, sent):  # tweaked version
    """Tag "that" and wh-words as relative pronouns ("relpro") or complementizers
    ("comp"), and clear the head verb's relativizer-deletion flag (cat8) once an
    overt marker is found."""
    wh_words = ["which", "who", "whom", "whose", "what", "how", "where", "why", "when"]
    word = token.word.lower()
    head = sent[token.headidx]
    if word == "that" and token.deprel in ("nsubj", "nsubjpass", "mark"):
        if head.deprel in ("relcl", "acl"):
            token.lxgrtag = "relpro"
            token.cat1 = "relpro_that"
            head.cat8 = None
        elif head.deprel in ("ccomp", "csubj"):
            token.lxgrtag = "comp"
            token.cat1 = "comp_that"
            head.cat8 = None
    # Note the asymmetry: a wh-word whose deprel is in the first set but whose
    # head is not relcl/acl is deliberately left untouched (the elif below only
    # fires when the first condition did not match at all).
    if word in wh_words and token.deprel in ("nsubj", "nsubjpass", "mark", "attr"):
        if head.deprel in ("relcl", "acl"):
            token.lxgrtag = "relpro"
            token.cat1 = "relpro_wh"
            head.cat8 = None
    elif word in wh_words and token.deprel in ("nsubj", "nsubjpass", "mark", "attr", "advmod", "dobj"):
        if head.deprel in ("ccomp", "csubj"):
            head.cat6 = "whcls"
            token.lxgrtag = "comp"
            token.cat1 = "comp_wh"
            head.cat8 = None
#############################
#### These functions use the previous functions to conduct tagging and tallying of lexicogramamtical features ###
def preprocess(text):
    """Run spaCy over raw text and return a list of sentences, each a list of
    tokenInfo objects whose idx/headidx are sentence-internal positions."""
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        tokens = []
        sent_start = None  # document index of the sentence's first token
        for sidx, spacy_tok in enumerate(sent):
            if sent_start is None:
                sent_start = spacy_tok.i
            info = tokenInfo(spacy_tok)
            info.idx = sidx
            # shift the head reference from document position to sentence position
            info.headidx -= sent_start
            tokens.append(info)
        sentences.append(tokens)
    return sentences
def tag(sstring):  # tags :)
    """Run the full LxGr tagging pipeline over a raw-text string.

    Returns a list of sentences, each a list of tagged tokenInfo objects.
    The call order below matters (e.g. that_wh can clear flags set by verbs).
    """
    tagged_sents = []
    for sentence in preprocess(sstring):
        for tok in sentence:
            personal_pronouns(tok)
            advanced_pronoun(tok, sentence)
            adverbs(tok, sentence)
            adjectives(tok)
            nouns(tok)
            verbs(tok, sentence)
            prepositions(tok, sentence)
            coordinators(tok, sentence)
            subordinators(tok, sentence)
            determiners(tok)
            that_wh(tok, sentence)
        tagged_sents.append(sentence)
    return tagged_sents
def printer(loToks):
    """Print one space-separated line per token: index, forms, LxGr tags, parse info."""
    for sentence in loToks:
        for tok in sentence:
            fields = (tok.idx, tok.word, tok.lemma, tok.lxgrtag, tok.cat1, tok.cat2,
                      tok.cat3, tok.cat4, tok.cat5, tok.cat6, tok.cat7, tok.cat8,
                      tok.xpos, tok.upos, tok.deprel, tok.headidx)
            print(*fields)
def writer(outname, loToks, joiner="\t"):
    """Write tagged tokens to *outname*.

    One token per line with *joiner*-separated fields (idx, word, lemma,
    lxgrtag, cat1-cat8, xpos, upos, deprel, headidx; None becomes an empty
    string); sentences are separated by a blank line.
    """
    docout = []
    for sent in loToks:
        sentout = []
        for token in sent:
            items = [token.idx, token.word, token.lemma, token.lxgrtag, token.cat1,
                     token.cat2, token.cat3, token.cat4, token.cat5, token.cat6,
                     token.cat7, token.cat8, token.xpos, token.upos, token.deprel,
                     token.headidx]
            sentout.append(joiner.join("" if item is None else str(item) for item in items))
        docout.append("\n".join(sentout))
    # context manager guarantees the file is closed even on error; explicit
    # UTF-8 avoids platform-default encoding failures on non-ASCII tokens
    with open(outname, "w", encoding="utf-8") as outf:
        outf.write("\n\n".join(docout))