# Tokenize
  
Data format:  
(time)(time_devider)(token1_text)(tag_devider)(token1_tag)(tag_devider)(token1_BIO)(token_devider)(token2_text)...  
  
Default:  
(time)@@-(token1_text),@-(token1_tag),@-(token1_BIO)|@-(token2_text)...  

## Requirements  
  
* pip install pythainlp
* pip install pythainlp[thai_ner]
* pip install python-crfsuite

In [None]:
from concurrent import futures
import threading
from threading import Lock

#from pythainlp import correct
from pythainlp.tag import NER
#from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize

## Adjust here:

In [None]:
# Separators used in the output format:
#   (time) TIME_DEVIDER (text) TAG_DEVIDER (tag) TAG_DEVIDER (BIO) TOKEN_DEVIDER (text) ...
TIME_DEVIDER = "@@-"   # between the timestamp and the token list
TOKEN_DEVIDER = "|@-"  # between two consecutive tokens
TAG_DEVIDER = ",@-"    # between the fields of a single token

# Dataset identifier; it selects both the input and the output file name.
ID = "Mitsubishi"

# Number of worker threads used for tagging.
WORKERS_N = 4

# Input (raw text) and output (tagged text) paths derived from ID.
text_file = "TextData/PantipTextData_{}.txt".format(ID)
tag_file = "Tag/PantipTag_{}.txt".format(ID)

Spell correction is commented out because it is too slow.

In [None]:
#Data of each line(=post)
#usage: construct "Post(LINE)", calculation automated
#       to_string() to get string result
class Post:
    """One input row (= one post): its timestamp, its text and its NER tags.

    Constructing ``Post(line)`` runs the whole pipeline: the row is split into
    time and content, the content is normalized, checked against the divider
    strings and, unless it conflicts with them, tagged. Call ``to_string()``
    to get the serialized result.
    """

    # Class-level defaults; __init__ overwrites them on every instance.
    time = ""
    original_content = ""

    # normalized text (spell correction is currently disabled, see fix())
    fixed_content = ""

    # Default only: __init__ gives each instance its own list so that no
    # mutable object is shared between posts.
    tagged = []

    conflicted = False

    # Per-thread storage for the NER engine. Building NER("thainer") for every
    # row is wasteful (presumably it loads the model each time), so each
    # thread builds it once and reuses it. The engine is deliberately NOT
    # shared between threads, so no two threads ever call the same instance.
    _thread_state = threading.local()

    def __init__(self, line):
        """Parse and process one raw input row.

        Args:
            line: raw row of the form ``(time)TIME_DEVIDER(content)``,
                optionally terminated by a newline.
        """
        self.tagged = []
        self.conflicted = False

        # Only the first divider separates the time from the content; any
        # later occurrence belongs to the content and is kept as is.
        time_part, _, content = line.partition(TIME_DEVIDER)

        # A row without any divider would otherwise keep its line terminator
        # inside the time field and break the one-row-per-line output.
        self.time = time_part.rstrip("\n")
        self.original_content = content.replace("\n", "")

        self.fix()

        if self.has_conflict():
            return

        self.tag()

    def fix(self):
        """Store the normalized content in ``fixed_content``."""
        self.fixed_content = normalize(self.original_content)

        # Spell correction, disabled because it is too slow:
        # word_list = word_tokenize(self.fixed_content)
        # fixed_list = []
        #
        # for word in word_list:
        #     fixed_list.append(correct(word))
        #
        # self.fixed_content = "".join(fixed_list)

    def has_conflict(self):
        """Return True if the text contains a token or tag divider.

        Such a row cannot be serialized unambiguously; it is flagged through
        ``conflicted`` and a warning is printed.
        """
        text = self.fixed_content
        if TOKEN_DEVIDER in text or TAG_DEVIDER in text:
            self.conflicted = True
            print("WORNING: devider CONFLICTION; "+self.original_content)
            return True

        return False

    @classmethod
    def _ner_engine(cls):
        """Return the calling thread's NER engine, creating it on first use."""
        engine = getattr(cls._thread_state, "engine", None)
        if engine is None:
            engine = NER("thainer")
            cls._thread_state.engine = engine
        return engine

    def tag(self):
        """Run named-entity tagging on ``fixed_content`` into ``tagged``."""
        self.tagged = self._ner_engine().tag(self.fixed_content)

    def to_string(self):
        """Serialize the post, or return "" when the post is conflicted.

        Format:
            (time)TIME_DEVIDER(text)TAG_DEVIDER(tag)TAG_DEVIDER(BIO)
            TOKEN_DEVIDER(text)TAG_DEVIDER(tag)TAG_DEVIDER(BIO)...
        """
        if self.conflicted:
            return ""

        # Each tagged token is indexed as (text, tag, BIO).
        tokens = TOKEN_DEVIDER.join(
            TAG_DEVIDER.join((token[0], token[1], token[2]))
            for token in self.tagged
        )
        return self.time + TIME_DEVIDER + tokens

In [None]:
def process(line):
    """Tag one input row and append its serialized form to the output file.

    Runs on the worker threads. It relies on the module-level ``fw`` (output
    file), ``write_lock`` and ``processed`` (progress counter).

    Args:
        line: one raw row of the input file.
    """
    global processed

    # The expensive part (normalizing and tagging) runs outside the lock.
    serialized = Post(line).to_string() + "\n"

    # Both the output file and the progress counter are shared between the
    # worker threads. The counter update is a read-modify-write, so it is
    # guarded by the same lock as the write to avoid lost increments.
    with write_lock:
        fw.write(serialized)
        processed += 1
        done = processed

    ##show progress
    if done % 100000 == 0:
        print(done)

In [None]:
#read
# Shared state used by process(): progress counter and write lock. The output
# handle `fw` is bound by the `with` statement below.
processed = 0
future_list = []
write_lock = Lock()

# The input is read as bytes and decoded row by row, so that a row which is
# not valid UTF-8 can be skipped on its own without affecting its neighbours.
# Both files are closed by the `with` statement even if an error occurs.
with open(text_file, "rb") as f, open(tag_file, "w", encoding="utf-8") as fw:
    with futures.ThreadPoolExecutor(max_workers=WORKERS_N) as executor:
        for raw_line in f:
            try:
                line = raw_line.decode("utf-8")
            except UnicodeDecodeError:
                #skip decode-error row
                continue

            # Binary mode does not translate line endings; normalize the
            # terminator so that "\r\n" rows look like "\n" rows downstream.
            line = line.rstrip("\r\n") + "\n"

            #register to future
            future_list.append(executor.submit(process, line=line))

        #wait until finish
        # as_completed() is lazy: it has to be iterated to wait at all.
        # Iterating it also lets worker errors be reported instead of being
        # silently dropped; a failed row does not abort the whole run.
        failed = 0
        first_error = None
        for future in futures.as_completed(future_list):
            error = future.exception()
            if error is not None:
                failed += 1
                if first_error is None:
                    first_error = error

        if failed:
            print("WARNING: %d row(s) failed; first error: %r" % (failed, first_error))