# tokenize
## text2conll

In [1]:
import re

In [338]:
reurl = re.compile(
	r'''(https?://|\w+@)?[\w\d\%\.]*\w\w\.\w\w[\w\d~/\%\#]*(\?[\w\d~/\%\#]+)*''', 
	re.U+re.M+re.I)
respacenum = re.compile(
	r'\d+[ ,.]+[0-9 ,.]*\d+'
)
rerematch = re.compile(
	r'\\\d+\\'
)

def tokenize(	text, 
	     		sent_ends='.;!?\\n', # these characters end a sentence backslach escapes should be double escaped like \\n
	     		new_sent_upper=".!?", # if not empty, these characters end a sentence only if the following character is upper case, should be a subset of sent_ends
				char_in_word='_-', # characters that should be treated as letters inside words
				glue_left="'~", # cut token after these characters 
				glue_right="", # cut token before these characters 
				whole_words="aujourd'hui l'on etc. Mr. M. Nr. N° ;) ;-)", # keep these space-separated words as one tokens
				special_suffix="n't -je -tu -il -elle -on -nous -vous -ils -ils -elles -y -t-il -t-elle -t-ils -t-ils -t-on",
				# keep these space-separated suffixes as separate tokens
				keep_url=True, # look for URLs and keep them together
				combine_numbers=True, # spaces, commas, and points between numbers are grouped together such as 999 349
				sent_cut="", # a unique word or sequence where cutting should be done. if set, sent_ends is ignored
				escape = '____' # no need to change this. should be letters (\w) used to escape internally. Should not appear anywhere in the text
				# sent_not_cut="", # TOD: should be added eventually
				):
	"""
	TODO: sent_not_cut: symbols to place after the potential sent_ends that i don't want to cut.
	"""

	# replacing words that have to remain untouched
	whole_words = whole_words.strip().split()
	special_suffix = special_suffix.strip()
	num_dot = (escape+'{}'+escape).format('NUMBERDOT')
	space_after_esc = (escape+'{}'+escape).format('NOSPACEAFTER')
	ind = 0
	ntext = text
	for w in whole_words: 
		ntext = ntext.replace(w,'\\{ind}\\'.format(ind=ind))
		ind +=1
	if special_suffix:
		respecial_suffix = re.compile(r'({})\b'.format('|'.join(special_suffix.split())))
		for m in respecial_suffix.finditer(ntext):
			ntext = ntext.replace(m.group(0),'\\{ind}\\'.format(ind=ind))
			whole_words += [m.group(0)]
			ind +=1
	if keep_url:
		for murl in reurl.finditer(ntext):
			ntext = ntext.replace(murl.group(0),'\\{ind}\\'.format(ind=ind))
			whole_words += [murl.group(0)]
			ind +=1
	if combine_numbers:
		for mnum in respacenum.finditer(ntext):
			ntext = ntext.replace(mnum.group(0),'\\{ind}\\'.format(ind=ind))
			whole_words += [mnum.group(0)]
			ind +=1

	# replace "the 2. guy" by "the 2___NUMDOT___ guy":
	re_num_dot = re.compile(r'\b(\d+)\.(?! [0-9A-ZÀÈÌÒÙÁÉÍÓÚÝÂÊÎÔÛÄËÏÖÜÃÑÕÆÅÐÇØ])') # num followed by . not followed by upper case
	ntext = re_num_dot.sub(r'\1'+num_dot, ntext)
	# now we split into sentences:
	if sent_cut:
		sents = ntext.split(sent_cut)
	else:
		if new_sent_upper:
			sent_ends_nopoint = re.sub(r'[{new_sent_upper}]+'.format(new_sent_upper=new_sent_upper),'', sent_ends)
			re_sent_bounds = re.compile(
				'(([{sent_ends_nopoint}]+\s*)|([{sent_ends}]+\s*(?=[0-9\\\A-ZÀÈÌÒÙÁÉÍÓÚÝÂÊÎÔÛÄËÏÖÜÃÑÕÆÅÐÇØ])))'.format(
							sent_ends_nopoint=sent_ends_nopoint, sent_ends=new_sent_upper.replace('.','\.')), re.U+re.M)
		else:
			re_sent_bounds = re.compile(
				'([{sent_ends}]+\s*)'.format(sent_ends=sent_ends), re.U+re.M)
		doubsents = re_sent_bounds.split(ntext)+['']
		sents = []
		for i in range(0, len(doubsents), 2):
			if doubsents[i] and doubsents[i+1]:
				sents += [(doubsents[i] + (doubsents[i+1] if i+1 < len(doubsents) else '')).strip()]
	
	### now we got the sents list, making the actual tokens
	retok = re.compile("(?!(\\\\d+\\\)|([\\\{} ]+))(\W+)(?!\d)".format(re.escape((char_in_word+glue_left+glue_right).replace('-','\-'))))
	reglue_left = re.compile(r'([{}])'.format(glue_left)) if glue_left else None
	reglue_right = re.compile(r'([{}])'.format(glue_right)) if glue_right else None
	stoks = {}
	def simplerematchreplace(matchobj): # used to reconstruct the sentence
		return whole_words[int(matchobj.group(0)[1:-1])]
	def rematchreplace(matchobj): # used to build the correct tokens
		if special_suffix and respecial_suffix.match(whole_words[int(matchobj.group(0)[1:-1])]):
			return space_after_esc+whole_words[int(matchobj.group(0)[1:-1])]
		return whole_words[int(matchobj.group(0)[1:-1])]

	for si,s in enumerate(sents):
		rs = rerematch.sub(simplerematchreplace,s.replace(num_dot,'.'))
		if glue_left: s = reglue_left.sub(r'\1 ', s)
		if glue_right: s = reglue_right.sub(r' \1', s)
		s = retok.sub(r'{}\3 '.format(space_after_esc), s) # adding the additional spaces
		toks = []
		spaceafters = []
		for t in s.split():
			t = t.replace(num_dot,'.')
			ts = rerematch.sub(rematchreplace,t) if rerematch.search(t) else t
			tsl = [tt for tt in ts.split(space_after_esc) if tt] 
			toks+= tsl
			spaceafters += [ii==len(tsl)-1 for ii,tt in enumerate(tsl)]
		stoks[(si,rs)] = list(zip(toks,spaceafters)) # 'si' makes keys unique and allows duplicate sentences
	return stoks


In [339]:
text = """Voici le test N° 17. C'est quand-même 
bête, tout ça, aujourd'hui, avec le la~ lave-linge etc., peu importe ce que l'on dit sur https://blöd.com!!! T'en dis quoi, toi ;) ;-) ? Vas-y ! Ouais, M. le professeur Nr. 2. J'y crois à 100,00% !
This is a 2. type of sample text! It contains (different) types of punctuation. 
How to split it? Let's see; this is another example: not split here. Split Here!123 also works. but here it won't split if new_sent_upper is set.
H_ello! regex-fan, this is a $$-test. What do you think???!!
"""
sent2toks = tokenize(text)
print(sent2toks)

{(0, 'Voici le test N° 17.'): [('Voici', True), ('le', True), ('test', True), ('N°', True), ('17', False), ('.', True)], (1, "C'est quand-même"): [("C'", True), ('est', True), ('quand-même', True)], (2, "bête, tout ça, aujourd'hui, avec le la~ lave-linge etc., peu importe ce que l'on dit sur https://blöd.com!!!"): [('bête', False), (',', True), ('tout', True), ('ça', False), (',', True), ("aujourd'hui", False), (',', True), ('avec', True), ('le', True), ('la~', True), ('lave-linge', True), ('etc.', False), (',', True), ('peu', True), ('importe', True), ('ce', True), ('que', True), ("l'on", True), ('dit', True), ('sur', True), ('https://blöd.com', False), ('!!!', True)], (3, "T'en dis quoi, toi ;) ;-) ?"): [("T'", True), ('en', True), ('dis', True), ('quoi', False), (',', True), ('toi', True), (';)', True), (';-)', True), ('?', True)], (4, 'Vas-y !'): [('Vas', False), ('-y', True), ('!', True)], (5, 'Ouais, M. le professeur Nr. 2.'): [('Ouais', False), (',', True), ('M.', True), ('le', 

In [341]:
def conllize(sent2toks,id='my_sample',start=1):
    conlls=[]
    for (si,s),toksas in sent2toks.items():
        conllines=[
            '# sent_id = {id}__{ind}'.format(id=id,ind=start),
            '# text = {s}'.format(s=s)
        ]
        for i,(tok,sa) in enumerate(toksas):
            li = '{ind}\t{tok}\t_\t_\t_\t_\t_\t_\t_\t{spac}\t'.format(ind=i+1,tok=tok,spac='_' if sa else 'SpaceAfter=No')
            conllines+=[li]
        conlls+=['\n'.join(conllines)]
        start+=1
    return '\n\n'.join(conlls)+'\n'
conll = conllize(sent2toks)
open('test.conllu','w').write(conll)

4391

In [None]:
# one liner:
open('test.conllu','w').write(conllize(tokenize(text)))


# playground:

In [193]:
retok = re.compile("(?!(\\\\d+\\\)|([\\\% ]+))(\W+)(?!\d)")
print(retok)
s='azer \\ wxcv$ \\0\\ qsdf*** dfg%'
print([s])
for m in retok.findall(s):
    print(m)
print(retok.sub(r' \3 ',s))

re.compile('(?!(\\\\d+\\\\)|([\\\\ ]+))(\\W+)(?!\\d)')
['azer \\ wxcv$ \\0\\ qsdf*** dfg%']
('', '', '$ ')
('', '', '*** ')
('', '', '%')
azer \ wxcv $  \0\ qsdf ***  dfg % 


In [4]:
import glob, os

from parseSentences import simpletokenize

In [5]:
def emptyFromSentence(sentencefile, outfolder="."):
	"""
	file with one sentence per line --> conll10
	"""
	scode = os.path.basename(sentencefile).split('.')[0]
	outname=os.path.join(outfolder,scode+".conllu")
	# outname=os.path.join(outfolder,sentencefile.split('.')[0]+".conll")
	print("emptyFromSentence:",sentencefile, outfolder, outname)
	counter = 1
	with open(sentencefile) as f, open(outname,"w") as g:
		for li in f:
			if li.strip():
				g.write('# sent_id = '+scode+'__'+str(counter)+'\n')
				g.write('# text = '+li)
				toks = simpletokenize(li)
				for num,tok in enumerate( toks ):
					g.write("\t".join([str(num+1), tok ]+["_"]*8)+'\n')
				g.write("\n")
				counter+=1

	return outname

In [6]:
for sentencefile in glob.glob("/home/kim/Downloads/phrases_ester_pour_dependances/*.txt"):
	print(sentencefile)
	outfolder="emptyconlls"
	emptyConll = emptyFromSentence(sentencefile,outfolder=outfolder)
	print(emptyConll)
	

/home/kim/Downloads/phrases_ester_pour_dependances/20030505_1400_1500_rfi_elda_phrases.txt
emptyFromSentence: /home/kim/Downloads/phrases_ester_pour_dependances/20030505_1400_1500_rfi_elda_phrases.txt emptyconlls emptyconlls/20030505_1400_1500_rfi_elda_phrases.conllu
emptyconlls/20030505_1400_1500_rfi_elda_phrases.conllu
/home/kim/Downloads/phrases_ester_pour_dependances/20030507_1400_1500_rfi_elda_phrases.txt
emptyFromSentence: /home/kim/Downloads/phrases_ester_pour_dependances/20030507_1400_1500_rfi_elda_phrases.txt emptyconlls emptyconlls/20030507_1400_1500_rfi_elda_phrases.conllu
emptyconlls/20030507_1400_1500_rfi_elda_phrases.conllu
/home/kim/Downloads/phrases_ester_pour_dependances/20000525_0930_1030_rfi_fm_dga_phrases.txt
emptyFromSentence: /home/kim/Downloads/phrases_ester_pour_dependances/20000525_0930_1030_rfi_fm_dga_phrases.txt emptyconlls emptyconlls/20000525_0930_1030_rfi_fm_dga_phrases.conllu
emptyconlls/20000525_0930_1030_rfi_fm_dga_phrases.conllu
/home/kim/Downloads/phr