Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
34 lines (25 sloc) 847 Bytes
import thulac
import os
dir_input = "./data/"
file_output = "./results/tokenization/tokens_raw.txt"
file_input_temp = "./temp/input_temp.txt"
file_output_temp = "./temp/output_temp.txt"
fid_output = open(file_output, 'w')
tokenizer = thulac.thulac(filt=True, deli='/')
for file_txt in os.listdir(dir_input):
if file_txt.endswith(".txt"):
file_input = dir_input + file_txt
with open(file_input, 'r') as fid_input:
for line in fid_input:
fid_input_temp = open(file_input_temp, 'w')
fid_input_temp.write(line)
fid_input_temp.close()
try:
tokenizer.cut_f(file_input_temp, file_output_temp)
fid_output_temp = open(file_output_temp, 'r')
tokens_cur = fid_output_temp.read()
fid_output_temp.close()
fid_output.write(tokens_cur)
except:
print("Unexpected Error !!!")
fid_output.close()