In [1]:
import sefr_cut

# Load Model

In [2]:
'''
Available engines:
    SEFR Cut models
        - ws1000: trained on Wisesight1000
        - tnhc: trained on TNHC (train:80/test:20)
    DeepCut
        - deepcut: the original DeepCut, trained on the BEST corpus
    Transfer-learning DeepCut
        - tl-deepcut-ws1000
        - tl-deepcut-tnhc
'''
# Pick one of the engine names listed above and load it.
engine_name = 'tnhc'
sefr_cut.load_model(engine=engine_name)

loading model.....
Success


# Tokenize

In [3]:
# tokenize() is called with a list of two texts, a list of one text, and a bare
# string; each call prints a list holding one token list per input text.
greeting = 'สวัสดีพ่อแม่พี่น้องชาวไทย'
question = 'วันนี้คุณยิ้มแล้วหรือยัง?'
for texts in ([greeting, question], [greeting], greeting):
    print(sefr_cut.tokenize(texts))

[['สวัสดี', 'พ่อ', 'แม่', 'พี่', 'น้อง', 'ชาว', 'ไทย'], ['วัน', 'นี้', 'คุณ', 'ยิ้ม', 'แล้ว', 'หรือ', 'ยัง', '?']]
[['สวัสดี', 'พ่อ', 'แม่', 'พี่', 'น้อง', 'ชาว', 'ไทย']]
[['สวัสดี', 'พ่อ', 'แม่', 'พี่', 'น้อง', 'ชาว', 'ไทย']]


## You can also vary the value of $k$

In [4]:
# k=1 refines only 1% of the characters; k=100 refines 100% of them.
sentences = ['สวัสดีพ่อแม่พี่น้องชาวไทย', 'วันนี้คุณยิ้มแล้วหรือยัง?']
for k_percent in (1, 100):
    print(sefr_cut.tokenize(sentences, k=k_percent))

[['สวัสดี', 'พ่อแม่พี่น้อง', 'ชาว', 'ไทย'], ['วัน', 'นี้', 'คุณ', 'ยิ้ม', 'แล้ว', 'หรือ', 'ยัง', '?']]
[['สวัสดี', 'พ่อแม่', 'พี่น้อง', 'ชาว', 'ไทย'], ['วัน', 'นี้', 'คุณ', 'ยิ้ม', 'แล้ว', 'หรือ', 'ยัง', '?']]


# We also provide the evaluation method from our paper

### It accepts raw text, a list, or a 2D list

In [5]:
# Gold and predicted segmentations given as raw '|'-delimited strings.
answer = 'สวัสดี|ประเทศไทย'
pred = 'สวัสดี|ประเทศ|ไทย'
scores = sefr_cut.evaluation(answer, pred)
char_score, word_score = scores  # the character-level score comes first
print(f'Word Score: {word_score} Char Score: {char_score}')

Word Score: 0.4 Char Score: 0.8


In [6]:
# Same example, this time with each segmentation wrapped in a one-element list.
answer = [
    'สวัสดี|ประเทศไทย',
]
pred = [
    'สวัสดี|ประเทศ|ไทย',
]
(char_score, word_score) = sefr_cut.evaluation(answer, pred)
print(f'Word Score: {word_score} Char Score: {char_score}')

Word Score: 0.4 Char Score: 0.8


In [7]:
# Same example as a 2D list; the pieces below correspond to the strings
# 'สวัสดี|ประเทศไทย' (answer) and 'สวัสดี|ประเทศ|ไทย' (pred).
answer = [
    ['สวัสดี|'],
    ['ประเทศไทย'],
]
pred = [
    ['สวัสดี|'],
    ['ประเทศ|ไทย'],
]
result = sefr_cut.evaluation(answer, pred)
char_score, word_score = result
print(f'Word Score: {word_score} Char Score: {char_score}')

Word Score: 0.4 Char Score: 0.8


## Some segmenters do not put '|' at the end of a sentence; in that case, use evaluation(sep='|')

In [8]:
# Here the first piece has no trailing '|', so sep='|' is passed to evaluation().
# The pieces correspond to 'สวัสดี|ประเทศไทย' (answer) and 'สวัสดี|ประเทศ|ไทย' (pred).
answer = [
    ['สวัสดี'],
    ['ประเทศไทย'],
]
pred = [
    ['สวัสดี'],
    ['ประเทศ|ไทย'],
]
char_score, word_score = sefr_cut.evaluation(
    answer,
    pred,
    sep='|',
)
print(f'Word Score: {word_score} Char Score: {char_score}')

Word Score: 0.4 Char Score: 0.8


# Comparison between DeepCut, TL-DeepCut-ws1000, and our method (ws1000)

### Example 1.

In [9]:
# Gold-standard segmentation for Example 1; words are delimited by '|'.
# NOTE(review): there is an empty token ('||') between 'เหลือ' and 'ก็มี' -- confirm this is intended.
answer = '''ขี้เกียจ|ไป|ยืม|รถ|พี่| |พี่|แม่ง|ขับ|เทียน่า| |คัน|ใหญ่|เกิ๊นนนนนนน| |ที่|เหลือ||ก็มี|แต่|คัน|ใหญ่|ๆ| |ขับ|ไม่|ถนัด| |5555''' 

In [10]:
# Our method: the SEFR Cut model with the 'ws1000' engine.
sefr_cut.load_model(engine='ws1000')
raw_text = answer.replace('|', '')        # drop the gold boundaries to get unsegmented input
tokens = sefr_cut.tokenize(raw_text)[0]   # one input text -> take its token list
pred = '|'.join(tokens)
char_score, word_score = sefr_cut.evaluation(answer, pred)
print(f'WS1000: Word Score: {word_score} Char Score: {char_score}')

loading model.....
Success
WS1000: Word Score: 0.8813559322033899 Char Score: 0.9655172413793104


In [11]:
# Baseline: transfer-learning DeepCut with the 'tl-deepcut-ws1000' engine.
sefr_cut.load_model(engine='tl-deepcut-ws1000')
raw_text = answer.replace('|', '')        # drop the gold boundaries to get unsegmented input
tokens = sefr_cut.tokenize(raw_text)[0]   # one input text -> take its token list
pred = '|'.join(tokens)
char_score, word_score = sefr_cut.evaluation(answer, pred)
print(f'TL: Word Score: {word_score} Char Score: {char_score}')

loading model.....
Success
TL: Word Score: 0.9310344827586207 Char Score: 0.9824561403508771


In [12]:
# Baseline: DeepCut with the 'deepcut' engine.
sefr_cut.load_model(engine='deepcut')
raw_text = answer.replace('|', '')        # drop the gold boundaries to get unsegmented input
tokens = sefr_cut.tokenize(raw_text)[0]   # one input text -> take its token list
pred = '|'.join(tokens)
char_score, word_score = sefr_cut.evaluation(answer, pred)
print(f'DeepCut: Word Score: {word_score} Char Score: {char_score}')

loading model.....
Success
DeepCut: Word Score: 0.8333333333333334 Char Score: 0.9491525423728813


### Example 2.

In [13]:
# Gold-standard segmentation for Example 2; words are delimited by '|'.
# NOTE(review): the last token is an emoji followed by a Thai thanthakhat mark (U+0E4C) -- confirm it is not a stray character.
answer = '''แพง|เว่อร์| |เบียร์|ช้าง|ต้นทุน|ขวด|ละ|ไม่|ถึง| |50| |ขาย| |120| |😰|😰|😰์''' 

In [14]:
# Our method: the SEFR Cut model with the 'ws1000' engine.
sefr_cut.load_model(engine='ws1000')
unsegmented = answer.replace('|', '')             # gold boundaries removed
predicted_words = sefr_cut.tokenize(unsegmented)[0]  # token list for the single input text
pred = '|'.join(predicted_words)
char_score, word_score = sefr_cut.evaluation(answer, pred)
print(f'WS1000: Word Score: {word_score} Char Score: {char_score}')

loading model.....
Success
WS1000: Word Score: 0.9268292682926829 Char Score: 0.975609756097561


In [15]:
# Baseline: transfer-learning DeepCut with the 'tl-deepcut-ws1000' engine.
sefr_cut.load_model(engine='tl-deepcut-ws1000')
unsegmented = answer.replace('|', '')             # gold boundaries removed
predicted_words = sefr_cut.tokenize(unsegmented)[0]  # token list for the single input text
pred = '|'.join(predicted_words)
char_score, word_score = sefr_cut.evaluation(answer, pred)
print(f'TL: Word Score: {word_score} Char Score: {char_score}')

loading model.....
Success
TL: Word Score: 0.8421052631578948 Char Score: 0.9473684210526316


In [16]:
# Baseline: DeepCut with the 'deepcut' engine.
sefr_cut.load_model(engine='deepcut')
unsegmented = answer.replace('|', '')             # gold boundaries removed
predicted_words = sefr_cut.tokenize(unsegmented)[0]  # token list for the single input text
pred = '|'.join(predicted_words)
char_score, word_score = sefr_cut.evaluation(answer, pred)
print(f'DeepCut: Word Score: {word_score} Char Score: {char_score}')

loading model.....
Success
DeepCut: Word Score: 0.85 Char Score: 0.9500000000000001
