In [None]:
!pip install -q --user --upgrade --pre pythainlp 

# Workshop Notebook 1: Getting started with PyThaiNLP üòÜ


Updated: 31 October 2019

## Header


In [None]:
from typing import Set, List
from functools import reduce
import re

## 1. Word Tokenization

Word tokenization is the process of determining word boundaries in a text or sentence.


Given a sentence, the tokenizer reads it and returns a list of words (i.e. tokens).

```python

    definition: Tokenizer(str) -> List[str]
    
    
    
    Tokenizer(str:"‡πÄ‡∏ò‡∏≠‡∏Ñ‡∏∑‡∏≠ My Ambulance ‡∏Ç‡∏≠‡∏á‡∏â‡∏±‡∏ô")  -> List["‡πÄ‡∏ò‡∏≠", "‡∏Ñ‡∏∑‡∏≠", "My", "Ambulance", "‡∏Ç‡∏≠‡∏á", "‡∏â‡∏±‡∏ô"]

```



### Dictionary-based tokenizer


A dictionary-based tokenizer is an algorithm that reads through the sentence character by character. If it finds a sequence of characters that matches a word in the pre-defined dictionary, it emits that sequence as a token.
https://www.cs.ait.ac.th/~mdailey/papers/Choochart-Wordseg.pdf


```python

dictionary = Set["‡∏â‡∏±‡∏ô", "‡∏ä‡∏≠‡∏ö", "‡∏£‡∏ñ‡πÑ‡∏ü", "‡∏£‡∏ñ", "‡∏£‡∏î", "‡∏ô‡πà‡∏≥", "‡∏ï‡πâ‡∏ô", "‡πÑ‡∏°‡πâ", "‡∏ï‡πâ‡∏ô‡πÑ‡∏°‡πâ", " "]


Dictionary_Tokenizer(dictionary:Set[str])

```




#### 1.1 Longest matching (LM)

Longest matching is an algorithm that splits a sentence into words by considering the longest vocabulary entry first.

In [None]:
dictionary = set(["‡∏â‡∏±‡∏ô", "‡∏ä‡∏≠‡∏ö", "‡∏£‡∏ñ‡πÑ‡∏ü", "‡∏£‡∏ñ", "‡∏£‡∏î", "‡∏ô‡πà‡∏≥", "‡∏ï‡πâ‡∏ô", "‡πÑ‡∏°‡πâ", "‡∏ï‡πâ‡∏ô‡πÑ‡∏°‡πâ", " ", "‡∏ü‡πâ‡∏≤"])

def search_longest(term, dictionary):
    """Tell whether ``term`` is a dictionary word that no longer entry can extend.

    The tokenizers below grow ``term`` one character at a time from the left,
    so only entries that *start with* ``term`` can still be completed by
    reading more characters. The previous substring test (``term in vocab``)
    also counted entries that merely contain ``term`` in the middle or at the
    end, which made such words impossible to emit.

    Args:
        term: The characters accumulated so far.
        dictionary: Iterable of vocabulary strings.

    Returns:
        True when the longest entry starting with ``term`` has exactly the
        length of ``term`` (i.e. ``term`` itself is the longest match),
        otherwise False.
    """
    # default=0 covers the case where no entry starts with `term`.
    longest_candidate = max(
        (len(vocab) for vocab in dictionary if vocab.startswith(term)),
        default=0,
    )
    return longest_candidate == len(term)

def Dictionary_Tokenizer_LM_debug(sentence:str, dictionary: Set[str]):
    """Verbose variant of the longest-matching tokenizer.

    Reads ``sentence`` one character at a time, printing the growing buffer,
    and emits the buffer as a token whenever ``search_longest`` accepts it.
    Characters still in the buffer when the sentence ends are not emitted.

    Args:
        sentence: Text to tokenize.
        dictionary: Vocabulary passed through to ``search_longest``.

    Returns:
        The list of tokens found, in order.
    """
    found = []
    pending = ""
    for character in sentence:
        pending = pending + character
        print("buffer", pending)
        if not search_longest(pending, dictionary):
            # Not a complete longest match yet: keep reading.
            continue
        print("select this token: {}".format(pending))
        found.append(pending)
        pending = ""
        print("clear the buffer.")
        print("")
    return found

def Dictionary_Tokenizer_LM(sentence:str, dictionary: Set[str]):
    """Tokenize ``sentence`` by longest matching against ``dictionary``.

    A buffer is extended character by character and flushed as a token as
    soon as ``search_longest`` accepts it. Characters left in the buffer at
    the end of the sentence are dropped, so the tokens may cover fewer
    characters than the input.

    Args:
        sentence: Text to tokenize.
        dictionary: Vocabulary passed through to ``search_longest``.

    Returns:
        The list of tokens found, in order.
    """
    found = []
    pending = ""
    for character in sentence:
        pending = pending + character
        if not search_longest(pending, dictionary):
            continue
        found.append(pending)
        pending = ""
    return found

In [None]:
Dictionary_Tokenizer_LM_debug("‡∏â‡∏±‡∏ô‡∏ä‡∏≠‡∏ö ‡∏£‡∏ñ‡πÑ‡∏ü‡∏ü‡πâ‡∏≤", dictionary)

#### __Question 1:__ Create your own dictionary so that every word in the following sentences can be tokenized.

In [None]:
test_sentences = [
    "‡∏Å‡∏£‡∏∞‡∏ó‡∏£‡∏ß‡∏á‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏°‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∑‡πà‡∏≠‡∏™‡∏≤‡∏£‡∏Å‡∏≤‡∏ï‡∏≤‡∏£‡πå ‡∏à‡∏±‡∏î‡∏á‡∏≤‡∏ô Qatar Information Technology Exhibition and Conference (QITCOM 2019)",
    "‡∏ì ‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏î‡∏Æ‡∏≤ ‡∏£‡∏±‡∏ê‡∏Å‡∏≤‡∏ï‡∏≤‡∏£‡πå",
]

In [None]:
# Fill the vocabulary to dictionary_lm

dictionary_lm = set([
    " ",
    "Qatar",
    "Information",
    "‡∏Å‡∏£‡∏∞‡∏ó‡∏£‡∏ß‡∏á‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏°‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∑‡πà‡∏≠‡∏™‡∏≤‡∏£",
    # add more vocab
])

__Test:__

In [None]:
def test_Dictionary_Tokenizer_LM(dictionary_lm):
    """Check that ``dictionary_lm`` lets the tokenizer cover every character.

    Each sentence in the notebook-level ``test_sentences`` is tokenized with
    ``Dictionary_Tokenizer_LM``; the check passes when the total length of
    all tokens equals the total length of all sentences. The outcome is
    printed, nothing is returned.

    Args:
        dictionary_lm: Vocabulary to tokenize with.
    """
    tokens_list = []
    for sentence in test_sentences:
        tokens_list.append(Dictionary_Tokenizer_LM(sentence, dictionary_lm))

    character_count_expect = sum(len(sentence) for sentence in test_sentences)
    character_count_actual = sum(
        len(token) for tokens in tokens_list for token in tokens
    )

    if character_count_actual != character_count_expect:
        print("Test failed. üò≠\n")

        print("test_sentences", test_sentences)
        print("tokens_list", tokens_list)

        print('')
        print("character_count_actual != character_count_expect")
        print("{} != {}".format(character_count_actual, character_count_expect))
        return

    print("‚úÖ Test succeed. üòÅ")

    print("\n tokens_list: ", tokens_list)

In [None]:
# Run this block to test the code
test_Dictionary_Tokenizer_LM(dictionary_lm)

__Solution:__

In [None]:
dictionary_lm = set([
    "Qatar",
    "Information",
    "‡∏Å‡∏£‡∏∞‡∏ó‡∏£‡∏ß‡∏á‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏°‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∑‡πà‡∏≠‡∏™‡∏≤‡∏£",
    "‡∏Å‡∏≤‡∏ï‡∏≤‡∏£‡πå",
    "‡∏ì",
    "‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏î‡∏Æ‡∏≤",
    "‡∏£‡∏±‡∏ê",
    " ",
    "‡∏à‡∏±‡∏î‡∏á‡∏≤‡∏ô",
    "(",
    ")",
    "QITCOM", "2019",
    "Qatar",
    "Information",
    "Technology",
    "Exhibition",
    "and",
    "Conference"
])

In [None]:
test_Dictionary_Tokenizer_LM(dictionary_lm)

#### 1.2 Maximal matching (MM)


Unlike longest matching, maximal matching is an algorithm that splits a sentence into words while preferring the segmentation with the minimum number of tokens.


```python

dictionary = set(["‡∏£‡∏ñ", "‡∏£‡∏ñ‡πÑ‡∏ü", "‡∏ü‡πâ‡∏≤", "‡πÑ‡∏ü‡∏ü‡πâ‡∏≤", "‡πÉ‡∏ï‡πâ‡∏î‡∏¥‡∏ô"])


sentence = "‡∏£‡∏ñ‡πÑ‡∏ü‡∏ü‡πâ‡∏≤‡πÉ‡∏ï‡πâ‡∏î‡∏¥‡∏ô"

Possible_segments(sentence) ->
["‡∏£‡∏ñ‡πÑ‡∏ü", "‡∏ü‡πâ‡∏≤", "‡πÉ‡∏ï‡πâ‡∏î‡∏¥‡∏ô"]
["‡∏£‡∏ñ", "‡πÑ‡∏ü‡∏ü‡πâ‡∏≤", "‡πÉ‡∏ï‡πâ‡∏î‡∏¥‡∏ô"]
["‡∏£‡∏ñ‡πÑ‡∏ü‡∏ü‡πâ‡∏≤", "‡πÉ‡∏ï‡πâ‡∏î‡∏¥‡∏ô"]


selected_segment = ["‡∏£‡∏ñ‡πÑ‡∏ü‡∏ü‡πâ‡∏≤", "‡πÉ‡∏ï‡πâ‡∏î‡∏¥‡∏ô"]

```


#### PyThaiNLP's Tokenizer (newmm)

In [None]:
from pythainlp.tokenize import word_tokenize

In [None]:
test_sentence = "‡∏Å‡∏£‡∏∞‡∏ó‡∏£‡∏ß‡∏á‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏°‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∑‡πà‡∏≠‡∏™‡∏≤‡∏£‡∏Å‡∏≤‡∏ï‡∏≤‡∏£‡πå ‡∏à‡∏±‡∏î‡∏á‡∏≤‡∏ô Qatar Information Technology Exhibition and Conference (QITCOM 2019)"


In [None]:
tokens = word_tokenize(test_sentence, engine="newmm")
print(tokens)

__Try out:__

2.1 Try adding your own sentence.



In [None]:
# Example sentence
print(word_tokenize("‡∏â‡∏±‡∏ô‡∏≠‡∏¢‡∏∏‡πã‡∏ó‡∏µ‡πà ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ö‡∏±‡∏ì‡∏ë‡∏¥‡∏ï‡∏û‡∏±‡∏í‡∏ô‡∏ö‡∏£‡∏¥‡∏´‡∏≤‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå", engine="newmm"))

In [None]:
# Enter your own sentence
print(word_tokenize("....", engine="newmm"))

2.2 Try adding your own sentence with misspelling.


In [None]:
# Example sentence with misspelling words
print(word_tokenize("‡∏â‡∏±‡∏ô‡∏≠‡∏¢‡∏∏‡πã‡∏ó‡∏µ‡πà ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ö‡∏±‡∏ì‡∏ë‡∏¥‡∏ï‡∏û‡∏±‡∏í‡∏ô‡∏ö‡∏¢‡∏£‡∏¥‡∏´‡∏≤‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå", engine="newmm"))

In [None]:
# Enter your own sentence
print(word_tokenize("....", engine="newmm"))

#### __Question 2:__ Add your own custom dictionary for the `newmm` tokenizer so that the following text is tokenized into the expected tokens:

```

"‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 22 ‡∏ï.‡∏Ñ. ‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏ß‡πà‡∏≤ ‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞ ‡∏ó‡∏£‡∏á‡πÄ‡∏Ç‡πâ‡∏≤‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡πÅ‡∏´‡πà‡∏á‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô‡πÇ‡∏î‡∏¢‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå‡πÅ‡∏•‡πâ‡∏ß‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ ‡∏ó‡∏µ‡πà‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á‡∏´‡∏•‡∏ß‡∏á‡πÉ‡∏ô‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß",

```

Result with the default dictionary:

```
['‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà', ' ', '22', ' ', '‡∏ï.‡∏Ñ.', ' ', '‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ', '‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô', '‡∏ß‡πà‡∏≤', ' ', '‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à', '‡∏û‡∏£‡∏∞', '‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥', '‡∏ô‡∏≤', '‡∏£‡∏∏', '‡∏Æ‡∏¥', '‡πÇ‡∏ï‡∏∞', ' ', '‡∏ó‡∏£‡∏á', '‡πÄ‡∏Ç‡πâ‡∏≤', '‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ', '‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å', ' ', '‡πÄ‡∏õ‡πá‡∏ô', '‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à', '‡∏û‡∏£‡∏∞', '‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥', '‡πÅ‡∏´‡πà‡∏á', '‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô', '‡πÇ‡∏î‡∏¢', '‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå', '‡πÅ‡∏•‡πâ‡∏ß', '‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ', ' ', '‡∏ó‡∏µ‡πà', '‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á', '‡∏´‡∏•‡∏ß‡∏á', '‡πÉ‡∏ô', '‡∏Å‡∏£‡∏∏‡∏á', '‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß']
```

Expectation:
```
['‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà', ' ', '22', ' ', '‡∏ï.‡∏Ñ.', ' ', '‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ', '‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô', '‡∏ß‡πà‡∏≤', ' ', '‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥', '‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞', ' ', '‡∏ó‡∏£‡∏á', '‡πÄ‡∏Ç‡πâ‡∏≤', '‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ', '‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å', ' ', '‡πÄ‡∏õ‡πá‡∏ô', '‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥', '‡πÅ‡∏´‡πà‡∏á', '‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô', '‡πÇ‡∏î‡∏¢', '‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå', '‡πÅ‡∏•‡πâ‡∏ß', '‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ', ' ', '‡∏ó‡∏µ‡πà', '‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á', '‡∏´‡∏•‡∏ß‡∏á', '‡πÉ‡∏ô', '‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß']
```

In [None]:
from pythainlp.tokenize.trie import Trie
from pythainlp.corpus import thai_words

In [None]:
text_from_news = """‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 22 ‡∏ï.‡∏Ñ. ‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏ß‡πà‡∏≤ ‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞ ‡∏ó‡∏£‡∏á‡πÄ‡∏Ç‡πâ‡∏≤‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡πÅ‡∏´‡πà‡∏á‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô‡πÇ‡∏î‡∏¢‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå‡πÅ‡∏•‡πâ‡∏ß‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ ‡∏ó‡∏µ‡πà‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á‡∏´‡∏•‡∏ß‡∏á‡πÉ‡∏ô‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß"""

In [None]:
# Add vocab in this list
custom_vocab = [
    
    
]


__Test:__

In [None]:
def test_tokenize_japan_news(custom_vocab):
    """Check that ``custom_vocab`` yields the expected tokens for the news text.

    A trie is built from the default word list (``thai_words()``) plus
    ``custom_vocab`` and passed to ``word_tokenize`` (engine ``newmm``) as the
    custom dictionary for the notebook-level ``text_from_news``. The outcome
    is printed; on failure both token sequences are shown joined with ``|``.

    Args:
        custom_vocab: List of extra vocabulary entries to add.
    """
    expect = ['‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà', ' ', '22', ' ', '‡∏ï.‡∏Ñ.', ' ', '‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ', '‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô', '‡∏ß‡πà‡∏≤', ' ',
              '‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥', '‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞', ' ', '‡∏ó‡∏£‡∏á', '‡πÄ‡∏Ç‡πâ‡∏≤', '‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ', '‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å',
              ' ', '‡πÄ‡∏õ‡πá‡∏ô', '‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥', '‡πÅ‡∏´‡πà‡∏á', '‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô', '‡πÇ‡∏î‡∏¢', '‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå', '‡πÅ‡∏•‡πâ‡∏ß', '‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ',
              ' ', '‡∏ó‡∏µ‡πà', '‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á', '‡∏´‡∏•‡∏ß‡∏á', '‡πÉ‡∏ô', '‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß']

    # Default vocabulary first, learner-supplied entries appended.
    custom_dict_trie = Trie( list(thai_words()) + custom_vocab)

    actual = word_tokenize(text_from_news, custom_dict=custom_dict_trie, engine="newmm")

    if actual == expect:
        print("‚úÖ Test succeed. üòÅ")
    else:
        print("‚ùå Test failed. üò≠")
        print("\nYour result    :\n\n", "|".join(actual))
        print("\nExpected result:\n\n", "|".join(expect))

In [None]:
test_tokenize_japan_news(custom_vocab)

__Solution:__

In [None]:
# Add vocab
custom_vocab = [
    "‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥",
    "‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß",
    "‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞"
]

test_tokenize_japan_news(custom_vocab)

### Learning-based tokenizer


A learning-based tokenizer is a machine learning model trained on a supervised (labeled) dataset.


For example, one tokenizer of PyThaiNLP (`attacut`) uses a convolutional neural network to read the whole text and then determine word boundaries.

![attacut](images/attacut.png)

#### attacut

In [None]:
test_sentence = "‡∏Å‡∏£‡∏∞‡∏ó‡∏£‡∏ß‡∏á‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏°‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏™‡∏∑‡πà‡∏≠‡∏™‡∏≤‡∏£‡∏Å‡∏≤‡∏ï‡∏≤‡∏£‡πå ‡∏à‡∏±‡∏î‡∏á‡∏≤‡∏ô Qatar Information Technology Exhibition and Conference (QITCOM 2019)"

tokens = word_tokenize(test_sentence, engine="attacut")
print(tokens)

In [None]:
test_sentence = "‡∏â‡∏±‡∏ô‡∏≠‡∏¢‡∏∏‡πã‡∏ó‡∏µ‡πà ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ö‡∏±‡∏ì‡∏ë‡∏¥‡∏ï‡∏û‡∏±‡∏í‡∏ô‡∏ö‡∏£‡∏¥‡∏´‡∏≤‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå"

tokens = word_tokenize(test_sentence, engine="attacut")
print(tokens)

__Try out:__ Try adding your own sentence.



In [None]:
# Enter your own sentence
print(word_tokenize("....", engine="attacut"))



## 2. Part of speech and Named Entity Recognition Tagging


In [None]:
from pythainlp.tag.named_entity import ThaiNameTagger

tagger = ThaiNameTagger()


#### Named Entity Recognition (NER) Tags:

|       Tags       |      Examples                       |
|------------------|-------------------------------------|
        DATE       |   1 ‡∏ï‡∏∏‡∏•‡∏≤‡∏Ñ‡∏° 2012                      |
        EMAIL      |   hr@mycompany.com                  |    
        LAW        |  ‡∏û‡∏£‡∏ö.‡∏Ñ‡∏∏‡πâ‡∏°‡∏Ñ‡∏£‡∏≠‡∏á‡∏ú‡∏π‡πâ‡∏ö‡∏£‡∏¥‡πÇ‡∏†‡∏Ñ                   |
        LEN        |       80 ‡∏Å‡∏¥‡πÇ‡∏•‡πÄ‡∏°‡∏ï‡∏£                    |     
      LOCATION     |  ‡∏Å‡∏£‡∏∏‡∏á‡πÄ‡∏ó‡∏û, ‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏à‡∏µ‡∏ô, ‡πÄ‡∏≠‡πÄ‡∏ß‡∏≠‡πÄ‡∏£‡∏™‡∏ï‡πå        | 
        MONEY      |   2,190 ‡∏•‡πâ‡∏≤‡∏ô‡∏ö‡∏≤‡∏ó                      |
    ORGANIZATION   |  ‡∏Ñ‡∏ì‡∏∞‡∏≠‡∏±‡∏Å‡∏©‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå ‡∏à‡∏∏‡∏¨‡∏≤‡∏•‡∏á‡∏Å‡∏£‡∏ì‡πå‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢     |
       PERCENT     |   95.34%, 10‡πÄ‡∏õ‡∏≠‡∏£‡πå‡πÄ‡∏ã‡∏ô‡∏ï‡πå                |
       PERSON      |   ‡∏≠‡∏£‡∏£‡∏ñ‡∏û‡∏• ‡∏ò‡∏≥‡∏£‡∏á‡∏£‡∏±‡∏ï‡∏ô‡∏§‡∏ó‡∏ò‡∏¥‡πå                 |
        PHONE      |   +6611-123-1123                    |
         TIME      |      14:20 ‡∏ô, ‡πÄ‡∏ß‡∏•‡∏≤‡πÄ‡∏ó‡∏µ‡πà‡∏¢‡∏á‡∏ï‡∏£‡∏á           |
          URL      |     mycompany.com                   |
         ZIP       |     ‡∏£‡∏´‡∏±‡∏™‡πÑ‡∏õ‡∏£‡∏ì‡∏µ‡∏¢‡πå 21210                  |

In [None]:
sentence = "‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡πÑ‡∏î‡πâ‡πÑ‡∏õ‡∏á‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î‡∏ö‡πâ‡∏≤‡∏ô ‡∏ó‡∏µ‡πà‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢‡∏ò‡∏£‡∏£‡∏°‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå"

In [None]:
tagger.get_ner(sentence, pos=False)

In [None]:
tagger.get_ner(sentence, pos=True)

#### __Question 3:__ How many types of named entity appear in the following sentence?


```text
‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà ‡πì‡πë ‡∏ï‡∏∏‡∏•‡∏≤‡∏Ñ‡∏° ‡πí‡πï‡πñ‡πí ‡πÄ‡∏ß‡∏•‡∏≤ 13:00 ‡∏ô. ‡∏ï‡∏≤‡∏°‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢
```





In [None]:
tagger.get_ner("‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà ‡πì‡πë ‡∏ï‡∏∏‡∏•‡∏≤‡∏Ñ‡∏° ‡πí‡πï‡πñ‡πí ‡πÄ‡∏ß‡∏•‡∏≤ 13:00 ‡∏ô. ‡∏ï‡∏≤‡∏°‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢", pos=False)

#### __Question 4:__ How many types of named entity appear in the following sentence?

Reference: https://www.khaosod.co.th/around-the-world-news/news_2993136

```text
‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 22 ‡∏ï.‡∏Ñ. ‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏ß‡πà‡∏≤ ‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞ ‡∏ó‡∏£‡∏á‡πÄ‡∏Ç‡πâ‡∏≤‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å
‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡πÅ‡∏´‡πà‡∏á‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô‡πÇ‡∏î‡∏¢‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå‡πÅ‡∏•‡πâ‡∏ß‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ ‡∏ó‡∏µ‡πà‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á‡∏´‡∏•‡∏ß‡∏á‡πÉ‡∏ô‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß
```





In [None]:
# Try out
tagger.get_ner("‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 22 ‡∏ï.‡∏Ñ. ‡πÄ‡∏≠‡πÄ‡∏≠‡∏ü‡∏û‡∏µ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏ß‡πà‡∏≤ ‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡∏ô‡∏≤‡∏£‡∏∏‡∏Æ‡∏¥‡πÇ‡∏ï‡∏∞ ‡∏ó‡∏£‡∏á‡πÄ‡∏Ç‡πâ‡∏≤‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏û‡∏¥‡∏ò‡∏µ‡∏ö‡∏£‡∏°‡∏£‡∏≤‡∏ä‡∏≤‡∏†‡∏¥‡πÄ‡∏©‡∏Å ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏°‡πÄ‡∏î‡πá‡∏à‡∏û‡∏£‡∏∞‡∏à‡∏±‡∏Å‡∏£‡∏û‡∏£‡∏£‡∏î‡∏¥‡πÅ‡∏´‡πà‡∏á‡∏ç‡∏µ‡πà‡∏õ‡∏∏‡πà‡∏ô‡πÇ‡∏î‡∏¢‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå‡πÅ‡∏•‡πâ‡∏ß‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ ‡∏ó‡∏µ‡πà‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏ä‡∏ß‡∏±‡∏á‡∏´‡∏•‡∏ß‡∏á‡πÉ‡∏ô‡∏Å‡∏£‡∏∏‡∏á‡πÇ‡∏ï‡πÄ‡∏Å‡∏µ‡∏¢‡∏ß", pos=False)

#### __Question 5:__ How many types of named entity appear in the following text?

Reference: [link](http://www.arts.chula.ac.th/ling/blog/tag/%E0%B8%AD%E0%B8%A3%E0%B8%A3%E0%B8%96%E0%B8%9E%E0%B8%A5-%E0%B8%98%E0%B8%B3%E0%B8%A3%E0%B8%87%E0%B8%A3%E0%B8%B1%E0%B8%95%E0%B8%99%E0%B8%A4%E0%B8%97%E0%B8%98%E0%B8%B4%E0%B9%8C/)

```text
‡∏Ñ‡∏ì‡∏∞‡∏≠‡∏±‡∏Å‡∏©‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå ‡∏à‡∏∏‡∏¨‡∏≤‡∏•‡∏á‡∏Å‡∏£‡∏ì‡πå‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢ ‡∏Ç‡∏≠‡πÄ‡∏ä‡∏¥‡∏ç‡∏ä‡∏ß‡∏ô‡∏ú‡∏π‡πâ‡∏™‡∏ô‡πÉ‡∏à‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡πà‡∏ß‡∏°‡∏ü‡∏±‡∏á‡∏ö‡∏£‡∏£‡∏¢‡∏≤‡∏¢‡∏û‡∏¥‡πÄ‡∏®‡∏©
‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á ‚Äú‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡∏õ‡∏£‡∏¥‡∏à‡πÄ‡∏â‡∏ó‡πÅ‡∏ö‡∏ö‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å‡∏Ñ‡∏≥‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°‚Äù
‡πÇ‡∏î‡∏¢ ‡∏î‡∏£.‡∏≠‡∏£‡∏£‡∏ñ‡∏û‡∏• ‡∏ò‡∏≥‡∏£‡∏á‡∏£‡∏±‡∏ï‡∏ô‡∏§‡∏ó‡∏ò‡∏¥‡πå

‡∏ß‡∏±‡∏ô‡∏®‡∏∏‡∏Å‡∏£‡πå‡∏ó‡∏µ‡πà 17 ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏¢‡∏ô 2560 ‡πÄ‡∏ß‡∏•‡∏≤ 13.30-14.30 ‡∏ô.
‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡πâ‡∏ô‡πÑ‡∏õ ‡∏ì ‡∏´‡πâ‡∏≠‡∏á 401/5 ‡∏≠‡∏≤‡∏Ñ‡∏≤‡∏£‡∏°‡∏´‡∏≤‡∏à‡∏±‡∏Å‡∏£‡∏µ‡∏™‡∏¥‡∏£‡∏¥‡∏ô‡∏ò‡∏£ ‡∏Ñ‡∏ì‡∏∞‡∏≠‡∏±‡∏Å‡∏©‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå ‡∏à‡∏∏‡∏¨‡∏≤‡∏•‡∏á‡∏Å‡∏£‡∏ì‡πå‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢

‡∏™‡∏≠‡∏ö‡∏ñ‡∏≤‡∏°‡∏£‡∏≤‡∏¢‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ï‡∏¥‡∏°‡πÑ‡∏î‡πâ‡∏ó‡∏µ‡πà 0-2218-4692
```





In [None]:
text = """‡∏Ñ‡∏ì‡∏∞‡∏≠‡∏±‡∏Å‡∏©‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå ‡∏à‡∏∏‡∏¨‡∏≤‡∏•‡∏á‡∏Å‡∏£‡∏ì‡πå‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢
‡∏Ç‡∏≠‡πÄ‡∏ä‡∏¥‡∏ç‡∏ä‡∏ß‡∏ô‡∏ú‡∏π‡πâ‡∏™‡∏ô‡πÉ‡∏à‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡πà‡∏ß‡∏°‡∏ü‡∏±‡∏á‡∏ö‡∏£‡∏£‡∏¢‡∏≤‡∏¢‡∏û‡∏¥‡πÄ‡∏®‡∏© 
‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á ‚Äú‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡∏õ‡∏£‡∏¥‡∏à‡πÄ‡∏â‡∏ó‡πÅ‡∏ö‡∏ö‡∏≠‡∏±‡∏ï‡πÇ‡∏ô‡∏°‡∏±‡∏ï‡∏¥‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å‡∏Ñ‡∏≥‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°‚Äù
‡πÇ‡∏î‡∏¢ ‡∏î‡∏£.‡∏≠‡∏£‡∏£‡∏ñ‡∏û‡∏• ‡∏ò‡∏≥‡∏£‡∏á‡∏£‡∏±‡∏ï‡∏ô‡∏§‡∏ó‡∏ò‡∏¥‡πå
‡∏ß‡∏±‡∏ô‡∏®‡∏∏‡∏Å‡∏£‡πå‡∏ó‡∏µ‡πà 17 ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏¢‡∏ô 2560 ‡πÄ‡∏ß‡∏•‡∏≤ 13.30-14.30 ‡∏ô. ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡πâ‡∏ô‡πÑ‡∏õ ‡∏ì ‡∏´‡πâ‡∏≠‡∏á 401/5 ‡∏≠‡∏≤‡∏Ñ‡∏≤‡∏£‡∏°‡∏´‡∏≤‡∏à‡∏±‡∏Å‡∏£‡∏µ‡∏™‡∏¥‡∏£‡∏¥‡∏ô‡∏ò‡∏£ ‡∏Ñ‡∏ì‡∏∞‡∏≠‡∏±‡∏Å‡∏©‡∏£‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå ‡∏à‡∏∏‡∏¨‡∏≤‡∏•‡∏á‡∏Å‡∏£‡∏ì‡πå‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢
‡∏™‡∏≠‡∏ö‡∏ñ‡∏≤‡∏°‡∏£‡∏≤‡∏¢‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏ï‡∏¥‡∏°‡πÑ‡∏î‡πâ‡∏ó‡∏µ‡πà 0-2218-4692
"""

tagger.get_ner(text, pos=False)

#### Part of Speech (POS) Tags:


Reference: [PUD Tags](https://universaldependencies.org/u/pos/all.html)




|  Abbreviation |      Part-of-Speech tag    |            Examples             |       
|---------------|----------------------------|---------------------------------|
| ADJ           |  Adjective                 |    ‡πÉ‡∏´‡∏°‡πà, ‡∏û‡∏¥‡πÄ‡∏®‡∏© , ‡∏Å‡πà‡∏≠‡∏ô, ‡∏°‡∏≤‡∏Å, ‡∏™‡∏π‡∏á     |   
| ADP           |  Adposition                |   ‡πÅ‡∏°‡πâ, ‡∏ß‡πà‡∏≤, ‡πÄ‡∏°‡∏∑‡πà‡∏≠, ‡∏Ç‡∏≠‡∏á, ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö       |   
| ADV           |  Adverb                    |   ‡∏Å‡πà‡∏≠‡∏ô, ‡∏Å‡πá, ‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢, ‡πÄ‡∏•‡∏¢, ‡∏™‡∏∏‡∏î       |   
| AUX           |  Auxiliary                 |   ‡πÄ‡∏õ‡πá‡∏ô, ‡πÉ‡∏ä‡πà, ‡∏Ñ‡∏∑‡∏≠, ‡∏Ñ‡∏•‡πâ‡∏≤‡∏¢             |   
| CCONJ         |  Coordinating conjunction  |   ‡πÅ‡∏ï‡πà, ‡πÅ‡∏•‡∏∞, ‡∏´‡∏£‡∏∑‡∏≠                  |        
| DET           |  Determiner                |   ‡∏ô‡∏µ‡πâ, ‡∏ô‡∏±‡πâ‡∏ô, ‡∏ó‡∏±‡πâ‡∏á, ‡πÄ‡∏û‡∏µ‡∏¢‡∏á, (‡∏´‡∏ô‡∏∂‡πà‡∏á)‡∏Ñ‡∏ô      |   
| INTJ          |  Interjection              |   ‡∏≠‡∏∏‡πâ‡∏¢, ‡πÇ‡∏≠‡πâ‡∏¢                       |   
| NOUN          |  Noun                      |   ‡∏Å‡∏≥‡∏°‡∏∑‡∏≠, ‡∏û‡∏ß‡∏Å, ‡∏™‡∏ô‡∏≤‡∏°, ‡∏Å‡∏µ‡∏¨‡∏≤, ‡∏ö‡∏±‡∏ç‡∏ä‡∏µ     |   
| NUM           |  Numeral                   |   5,000, 103.7, 2004, ‡∏´‡∏ô‡∏∂‡πà‡∏á, ‡∏£‡πâ‡∏≠‡∏¢  |   
| PART          |  Particle                  |   ‡∏°‡∏≤ ‡∏Ç‡∏∂‡πâ‡∏ô ‡πÑ‡∏°‡πà ‡πÑ‡∏î‡πâ ‡πÄ‡∏Ç‡πâ‡∏≤               |      
| PRON          |  Pronoun                   |   ‡πÄ‡∏£‡∏≤, ‡πÄ‡∏Ç‡∏≤, ‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á, ‡πÉ‡∏Ñ‡∏£, ‡πÄ‡∏ò‡∏≠     |   
| PROPN         |  Proper noun               |   ‡πÇ‡∏≠‡∏ö‡∏≤‡∏°‡∏≤, ‡πÅ‡∏Ñ‡∏õ‡∏¥‡∏ï‡∏≠‡∏•‡∏Æ‡∏¥‡∏•, ‡∏à‡∏µ‡πÇ‡∏≠‡∏û‡∏µ, ‡πÑ‡∏°‡πÄ‡∏Ñ‡∏¥‡∏• |   
| PUNCT         |  Punctuation               |   (, ), ", ', :                 |    
| SCONJ         |  Subordinating conjunction |    ‡∏´‡∏≤‡∏Å, ‡πÄ‡∏û‡πà‡∏£‡∏≤‡∏∞‡∏ß‡πà‡∏≤, ‡∏ñ‡πâ‡∏≤             |   
| VERB          |  Verb                      |   ‡πÄ‡∏õ‡∏¥‡∏î, ‡πÉ‡∏´‡πâ, ‡πÉ‡∏ä‡πâ, ‡πÄ‡∏ú‡∏ä‡∏¥‡∏ç, ‡∏≠‡πà‡∏≤‡∏ô        |


In [None]:
from pythainlp.tag import pos_tag

In [None]:

sentence = "‡∏â‡∏±‡∏ô‡πÑ‡∏õ‡πÄ‡∏î‡∏¥‡∏ô‡πÉ‡∏ô‡∏™‡∏ß‡∏ô‡∏™‡∏≤‡∏ò‡∏≤‡∏£‡∏ì‡∏∞"
tokens = word_tokenize(sentence, keep_whitespace=False)
pos_tag(tokens, corpus="pud", engine="perceptron")



```
Explanation:

PRON = Pronoun 

VERB = Verb

ADP = Adposition

NOUN = Noun

```

In [None]:

sentence = "‡∏â‡∏±‡∏ô‡πÑ‡∏õ‡πÄ‡∏î‡∏¥‡∏ô‡πÉ‡∏ô‡∏™‡∏ß‡∏ô‡∏™‡∏≤‡∏ò‡∏≤‡∏£‡∏ì‡∏∞"
tokens = word_tokenize(sentence, keep_whitespace=False)
pos_tag(tokens, corpus="orchid", engine="perceptron")


```
Explanation:

PPRS = Personal pronoun 

VACT = Active verb

RPRE = Preposition

NCMN = Common noun
```

#### __Question 6:__ What are the POS tags (based on UD) of the following sentence?


```text
‡∏´‡∏°‡∏≤ ‡πÅ‡∏•‡∏∞ ‡πÅ‡∏°‡∏ß ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏Å‡∏¥‡∏ô ‡∏≠‡∏≤‡∏´‡∏≤‡∏£
```


Hint: Here is the list of POS tags of this sentence.

- NOUN = Noun
- CCONJ = Coordinating Conjunction
- VERB = Verb



In [None]:
# Run this block to see the result

sentence = "‡∏´‡∏°‡∏≤‡πÅ‡∏•‡∏∞‡πÅ‡∏°‡∏ß‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏Å‡∏¥‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£"
tokens = word_tokenize(sentence, keep_whitespace=False)
# Use the same corpus name ("pud") as the earlier UD tagging example in this notebook.
pos_tag(tokens, corpus="pud", engine="perceptron")


#### __Question 7:__ What are the POS tags (based on Orchid) of the following sentence?


```text
‡∏´‡∏°‡∏≤ ‡πÅ‡∏•‡∏∞ ‡πÅ‡∏°‡∏ß ‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏Å‡∏¥‡∏ô ‡∏≠‡∏≤‡∏´‡∏≤‡∏£
```

Hint: Here is the list of POS tags of this sentence.

- NCMN = Common Noun
- JCRG = Coordinating Conjunction
- VACT = Active Verb

In [None]:
# Run this block to see the result

sentence = "‡∏´‡∏°‡∏≤‡πÅ‡∏•‡∏∞‡πÅ‡∏°‡∏ß‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏Å‡∏¥‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£"
tokens = word_tokenize(sentence, keep_whitespace=False)
pos_tag(tokens, corpus="orchid", engine="perceptron")

## 3. Spell checking




### 3.1 PyThaiNLP's spell checker

In [None]:
from pythainlp.spell import correct

In [None]:
mispelled_words = [
    "‡πÇ‡∏£‡∏á‡∏û‡∏¢‡∏≤‡∏ö‡∏≤‡∏ô",
    "‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏¥",
    "‡∏õ‡∏£‡∏∞‡∏ò‡∏≤‡∏£‡∏≤‡∏ò‡∏¥‡∏õ‡∏î‡∏µ",
    "‡∏™‡∏±‡∏õ‡∏õ‡∏∞‡∏£‡∏î",
    "‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï‡∏∏",
    "‡πÄ‡∏´‡∏ï‡∏Å‡∏≤‡∏£‡∏ì‡πå",
    "‡∏≠‡∏ô‡∏∏‡∏ç‡∏≤‡∏ï‡∏¥",
    "‡∏ù‡∏±‡∏Å‡πÑ‡∏ù‡πà",
    "‡∏ô‡∏≤‡∏¢‡∏Å‡∏£‡∏±‡∏ç‡∏°‡∏ô‡∏ï‡∏µ"
]

In [None]:
for word in mispelled_words:
    print("{} -> {}".format(word, correct(word)))
    print("")

__Try out:__ Enter any misspelled words and correct them.

In [None]:
correct("..")



## 4. Utility functions

### 4.1 Thai digits and currency Conversion


In [None]:
from pythainlp.util import (
    thai_digit_to_arabic_digit,
    arabic_digit_to_thai_digit,
    bahttext,
    digit_to_text,
    thaiword_to_num)


In [None]:
thai_digit_to_arabic_digit("‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà ‡πì‡πë ‡∏ï‡∏∏‡∏•‡∏≤‡∏Ñ‡∏° ‡πí‡πï‡πñ‡πí ‡πÄ‡∏ß‡∏•‡∏≤ ‡πë‡πì:‡πê‡πê ‡∏ô. ‡∏ï‡∏≤‡∏°‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢")

In [None]:
arabic_digit_to_thai_digit("‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 31 ‡∏ï‡∏∏‡∏•‡∏≤‡∏Ñ‡∏°2562 ‡πÄ‡∏ß‡∏•‡∏≤ 13:00 ‡∏ô. ‡∏ï‡∏≤‡∏°‡πÄ‡∏ß‡∏•‡∏≤‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÑ‡∏ó‡∏¢")

In [None]:
bahttext(1234.24)

In [None]:
bahttext(21)

In [None]:
bahttext(240000000000)

In [None]:
thaiword_to_num("‡∏´‡∏ô‡∏∂‡πà‡∏á‡∏£‡πâ‡∏≠‡∏¢")

In [None]:
digit_to_text("‡πì ‡∏£‡πâ‡∏≠‡∏¢‡∏•‡πâ‡∏≤‡∏ô")

In [None]:
digit_to_text("‡πì‡πë")

In [None]:
thaiword_to_num("‡∏´‡∏ô‡∏∂‡πà‡∏á‡∏û‡∏±‡∏ô‡∏´‡∏ô‡∏∂‡πà‡∏á")

In [None]:
thaiword_to_num("‡∏´‡∏ô‡∏∂‡πà‡∏á‡∏•‡πâ‡∏≤‡∏ô‡∏´‡∏Å‡∏™‡∏¥‡∏ö‡πÄ‡∏≠‡πá‡∏î")

In [None]:
thaiword_to_num("‡∏û‡∏±‡∏ô‡∏•‡πâ‡∏≤‡∏ô")

#### __Question 8 :__ Given a text containing an amount of money written with Thai digits, convert the Thai digits into Arabic digits.

```
‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏°‡∏π‡∏•‡∏Ñ‡πà‡∏≤‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏≤‡∏ã‡∏∂‡πà‡∏á‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô‡∏°‡∏µ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì ‡πë‡πí,‡πï‡πê‡πê ‡∏•‡πâ‡∏≤‡∏ô‡∏î‡∏≠‡∏•‡∏•‡∏≤‡∏£‡πå‡∏™‡∏´‡∏£‡∏±‡∏ê ‡πÉ‡∏´‡πâ‡∏ó‡∏ß‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô‡πÄ‡∏õ‡πá‡∏ô ‡πì ‡∏´‡∏°‡∏∑‡πà‡∏ô‡∏•‡πâ‡∏≤‡∏ô
```
->
```
‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏°‡∏π‡∏•‡∏Ñ‡πà‡∏≤‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏≤‡∏ã‡∏∂‡πà‡∏á‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô‡∏°‡∏µ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì 12,500 ‡∏•‡πâ‡∏≤‡∏ô‡∏î‡∏≠‡∏•‡∏•‡∏≤‡∏£‡πå‡∏™‡∏´‡∏£‡∏±‡∏ê ‡πÉ‡∏´‡πâ‡∏ó‡∏ß‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô‡πÄ‡∏õ‡πá‡∏ô 3 ‡∏´‡∏°‡∏∑‡πà‡∏ô‡∏•‡πâ‡∏≤‡∏ô

```


In [None]:
text_with_thai_digits = "‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏°‡∏π‡∏•‡∏Ñ‡πà‡∏≤‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏≤‡∏ã‡∏∂‡πà‡∏á‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô‡∏°‡∏µ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì ‡πë‡πí,‡πï‡πê‡πê ‡∏•‡πâ‡∏≤‡∏ô‡∏î‡∏≠‡∏•‡∏•‡∏≤‡∏£‡πå‡∏™‡∏´‡∏£‡∏±‡∏ê ‡πÉ‡∏´‡πâ‡∏ó‡∏ß‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô‡πÄ‡∏õ‡πá‡∏ô ‡πì ‡∏´‡∏°‡∏∑‡πà‡∏ô‡∏•‡πâ‡∏≤‡∏ô"

In [None]:
def convert(text):
    """Exercise scaffold: replace Thai digits in ``text`` with Arabic digits.

    The text is split on single spaces; every piece matched by the digit
    pattern is reported on stdout and written back. As shipped, the piece is
    written back unchanged, so the returned string equals the input until the
    marked line is filled in.

    Args:
        text: Input string.

    Returns:
        The pieces joined back together with single spaces.
    """
    splits = text.split(" ")
    
    for index, split in enumerate(splits):
        
        if re.search(r"[‡πê-‡πô]", split):
            print("\nselected split: ", split)
            ## Modify the following line to convert from thai digits to arabic

            splits[index] = split
            
            ##--------------------- ##
            print("convert to: ", splits[index])
    
    return " ".join(splits)
    

In [None]:
convert(text_with_thai_digits)

__Test:__

In [None]:
def test_convert_thai_digits(convert):
    """Check a ``convert`` implementation against the Question 8 sample text.

    Calls ``convert`` on the notebook-level ``text_with_thai_digits`` and
    compares the result with the expected string. The outcome is printed;
    nothing is returned.

    Args:
        convert: Callable taking the text and returning the converted text.
    """
    expect = "‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏°‡∏π‡∏•‡∏Ñ‡πà‡∏≤‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏≤‡∏ã‡∏∂‡πà‡∏á‡∏õ‡∏±‡∏à‡∏à‡∏∏‡∏ö‡∏±‡∏ô‡∏°‡∏µ‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì 12,500 ‡∏•‡πâ‡∏≤‡∏ô‡∏î‡∏≠‡∏•‡∏•‡∏≤‡∏£‡πå‡∏™‡∏´‡∏£‡∏±‡∏ê ‡πÉ‡∏´‡πâ‡∏ó‡∏ß‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô‡πÄ‡∏õ‡πá‡∏ô 3 ‡∏´‡∏°‡∏∑‡πà‡∏ô‡∏•‡πâ‡∏≤‡∏ô"
    actual = convert(text_with_thai_digits)

    if actual == expect:
        print("‚úÖ Test succeed. üòÅ")
    else:
        print("‚ùå Test failed. üò≠")
        print("The actual results:", actual)
        # Show the target as well so the two strings can be compared side by side.
        print("The expected results:", expect)

test_convert_thai_digits(convert)

__Solution:__

In [None]:
def convert(text):
    """Replace Thai digits in ``text`` with Arabic digits, piece by piece.

    The text is split on single spaces; every piece matched by the digit
    pattern is passed through ``thai_digit_to_arabic_digit`` and the
    before/after values are printed. Other pieces are left untouched.

    Args:
        text: Input string.

    Returns:
        The pieces joined back together with single spaces.
    """
    splits = text.split(" ")

    for index, split in enumerate(splits):

        if re.search(r"[‡πê-‡πô]", split):
            print("\nselected split: ", split)
            ## Write the code to convert

            splits[index] = thai_digit_to_arabic_digit(split)

            print("convert to: ", splits[index])

    return " ".join(splits)
    

In [None]:
test_convert_thai_digits(convert)

### 4.2 Thai Word count

In [None]:
from pythainlp.util import countthai



Count the percentage of Thai characters in a text.

```python
 countthai(text:str) -> percentage:float
```



In [None]:
countthai("Hello world.")

In [None]:
countthai("‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ ‡∏â‡∏±‡∏ô‡∏ä‡∏≠‡∏ö‡∏ô‡∏±‡πà‡∏á‡∏£‡∏ñ‡πÑ‡∏ü")

In [None]:
countthai("‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ Jane Doe")

#### __Question 9:__ Given a list of sentences, please return only Thai sentences.

In [None]:
en_th_sentences = [
    "‡∏°‡∏±‡∏ô‡∏à‡∏∞‡∏°‡∏µ‡∏≠‡∏∞‡πÑ‡∏£‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡πÉ‡∏´‡πâ ‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÑ‡∏î‡πâ‡∏•‡πà‡∏∞?",
    "‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ... ...‡∏î‡∏¥‡πä‡∏Å",
    "# Just to get a glimpse beyond this illusion #",
    "# I was soaring ever higher #",
    "# but I flew too high #",
    "    ‡πÉ‡∏ä‡πà",
    "‡πÉ‡∏ä‡πà ‡πÄ‡∏Ç‡∏≤‡πÅ‡∏•‡∏∞‡πÅ‡∏ü‡∏£‡∏á‡∏Ñ‡πå ‡πÅ‡∏•‡∏∞ ‡πÅ‡∏Ñ‡∏™ ‡∏ñ‡πâ‡∏≤‡∏Ç‡∏ß‡∏î‡πÄ‡∏´‡∏•‡πâ‡∏≤‡πÄ‡∏Ç‡∏≤ ‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡∏Å‡∏£‡∏∞‡πÄ‡∏õ‡πã‡∏≤",
    "‡∏°‡∏±‡∏ô‡∏Å‡πá‡∏î‡∏µ‡∏ó‡∏µ‡πà‡πÄ‡∏£‡∏≤‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏≤‡∏ß‡∏•‡∏µ‡∏¢‡πå ‡∏°‡∏≤‡∏≠‡∏¢‡∏π‡πà‡∏ù‡∏±‡πà‡∏á‡πÄ‡∏£‡∏≤‡∏ñ‡∏π‡∏Å‡∏°‡∏±‡πâ‡∏¢?",
    "‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏£‡∏≤‡∏ß‡∏•‡∏µ‡∏¢‡πå ‡πÄ‡∏£‡∏≤‡∏°‡∏µ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏±‡∏ô‡πÄ‡∏¢‡∏≠‡∏∞‡πÄ‡∏•‡∏¢",
    "‡πÄ‡∏ä‡∏¥‡∏ç‡∏ô‡∏±‡πà‡∏á",
    "== sync, corrected by elderman ==",
    "# though my eyes could see, I still was a blind man #",
    "# though my mind could think, I still was a madman #",
    "‡πÑ‡∏î‡πâ‡πÄ‡∏´‡πá‡∏ô‡∏ß‡πà‡∏≤ ‡∏û‡∏ß‡∏Å‡∏°‡∏±‡∏ô‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡∏°‡∏≤‡∏ï‡∏≤‡∏°‡∏•‡πà‡∏≤‡πÄ‡∏Ç‡∏≤",
    "# I hear the voices when I'm dreaming #",
]

In [None]:
def test_thai_sentence(sentence: str) -> bool:
    """Exercise scaffold: decide whether ``sentence`` is a Thai sentence.

    Used as the predicate for ``filter`` over the mixed Thai/English list.
    As shipped it always returns False; the learner fills in the marked area.
    """

    ## Write down the code, to return value True if the sentence is in Thai language
    
    
    
    ##
    return False

__Test:__

In [None]:
th_sentences = [
    "‡∏°‡∏±‡∏ô‡∏à‡∏∞‡∏°‡∏µ‡∏≠‡∏∞‡πÑ‡∏£‡∏ó‡∏µ‡πà‡∏ó‡∏≥‡πÉ‡∏´‡πâ ‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÑ‡∏î‡πâ‡∏•‡πà‡∏∞?",
    "‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ... ...‡∏î‡∏¥‡πä‡∏Å",
    "    ‡πÉ‡∏ä‡πà",
    "‡πÉ‡∏ä‡πà ‡πÄ‡∏Ç‡∏≤‡πÅ‡∏•‡∏∞‡πÅ‡∏ü‡∏£‡∏á‡∏Ñ‡πå ‡πÅ‡∏•‡∏∞ ‡πÅ‡∏Ñ‡∏™ ‡∏ñ‡πâ‡∏≤‡∏Ç‡∏ß‡∏î‡πÄ‡∏´‡∏•‡πâ‡∏≤‡πÄ‡∏Ç‡∏≤ ‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡∏Å‡∏£‡∏∞‡πÄ‡∏õ‡πã‡∏≤",
    "‡∏°‡∏±‡∏ô‡∏Å‡πá‡∏î‡∏µ‡∏ó‡∏µ‡πà‡πÄ‡∏£‡∏≤‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏≤‡∏ß‡∏•‡∏µ‡∏¢‡πå ‡∏°‡∏≤‡∏≠‡∏¢‡∏π‡πà‡∏ù‡∏±‡πà‡∏á‡πÄ‡∏£‡∏≤‡∏ñ‡∏π‡∏Å‡∏°‡∏±‡πâ‡∏¢?",
    "‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏£‡∏≤‡∏ß‡∏•‡∏µ‡∏¢‡πå ‡πÄ‡∏£‡∏≤‡∏°‡∏µ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏ï‡πâ‡∏≠‡∏á‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏±‡∏ô‡πÄ‡∏¢‡∏≠‡∏∞‡πÄ‡∏•‡∏¢",
    "‡πÄ‡∏ä‡∏¥‡∏ç‡∏ô‡∏±‡πà‡∏á",
    "‡πÑ‡∏î‡πâ‡πÄ‡∏´‡πá‡∏ô‡∏ß‡πà‡∏≤ ‡∏û‡∏ß‡∏Å‡∏°‡∏±‡∏ô‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î‡∏°‡∏≤‡∏ï‡∏≤‡∏°‡∏•‡πà‡∏≤‡πÄ‡∏Ç‡∏≤",
]

In [None]:
actual = list(filter(test_thai_sentence, en_th_sentences))
expect = th_sentences

if actual == expect:
    print("‚úÖ Test succeed. üòÅ")
else:
    print("‚ùå Test failed. üò≠")
    print("The actual results:", actual)
    print("\nThe expected sentences to be returned:\n")
    for i, sentence in enumerate(th_sentences):
        print(i+1, sentence)

__Solution:__

In [None]:
def test_thai_sentence(sentence):
    """Return True when ``countthai`` reports ``sentence`` as 100% Thai.

    Args:
        sentence: Text to classify.

    Returns:
        True if ``countthai(sentence)`` equals 100.0, otherwise False.
    """
    # The comparison already yields the boolean the caller needs.
    return countthai(sentence) == 100.0

In [None]:
actual = list(filter(test_thai_sentence, en_th_sentences))
expect = th_sentences

if actual == expect:
    print("‚úÖ Test succeed. üòÅ")
else:
    print("‚ùå Test failed. üò≠")
    print("The actual results:", actual)

### 4.3 Date and time

This function uses Thai names and Thai Buddhist Era for these directives:

- __%a__ - abbreviated weekday name (e.g. ‚Äú‡∏à‚Äù, ‚Äú‡∏≠‚Äù, ‚Äú‡∏û‚Äù, ‚Äú‡∏û‡∏§‚Äù, ‚Äú‡∏®‚Äù, ‚Äú‡∏™‚Äù, ‚Äú‡∏≠‡∏≤‚Äù)

- __%A__ - full weekday name (e.g.‚Äú‡∏ß‡∏±‡∏ô‡∏à‡∏±‡∏ô‡∏ó‡∏£‡πå‚Äù, ‚Äú‡∏ß‡∏±‡∏ô‡∏≠‡∏±‡∏á‡∏Ñ‡∏≤‡∏£‚Äù, ‚Äú‡∏ß‡∏±‡∏ô‡πÄ‡∏™‡∏≤‡∏£‡πå‚Äù, ‚Äú‡∏ß‡∏±‡∏ô‡∏≠‡∏≤‡∏ó‡∏¥‡∏ï‡∏¢‡πå‚Äù)

- __%b__ - abbreviated month name (e.g.‚Äú‡∏°.‡∏Ñ.‚Äù,‚Äù‡∏Å.‡∏û.‚Äù,‚Äù‡∏°‡∏µ.‡∏Ñ.‚Äù,‚Äù‡πÄ‡∏°.‡∏¢.‚Äù,‚Äù‡∏û.‡∏Ñ.‚Äù,‚Äù‡∏°‡∏¥.‡∏¢.‚Äù, ‚Äú‡∏ò.‡∏Ñ.‚Äù)

- __%B__ - full month name (e.g. ‚Äú‡∏°‡∏Å‡∏£‡∏≤‡∏Ñ‡∏°‚Äù, ‚Äú‡∏Å‡∏∏‡∏°‡∏†‡∏≤‡∏û‡∏±‡∏ô‡∏ò‡πå‚Äù, ‚Äú‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏¢‡∏ô‚Äù, ‚Äú‡∏ò‡∏±‡∏ô‡∏ß‡∏≤‡∏Ñ‡∏°‚Äù,)

- __%y__ - year without century (e.g. ‚Äú56‚Äù, ‚Äú10‚Äù)

- __%Y__ - year with century (e.g. ‚Äú2556‚Äù, ‚Äú2410‚Äù)

- __%c__ - date and time representation (e.g. ‚Äú‡∏û 6 ‡∏ï.‡∏Ñ. 01:40:00 2519‚Äù)

- __%v__ - short date representation (e.g. ‚Äù 6-‡∏°.‡∏Ñ.-2562‚Äù, ‚Äú27-‡∏Å.‡∏û.-2555‚Äù)

- __%d__ - day (e.g. "01", "07", "10", "31")

- __%-d__ - day with no zero padding (e.g. "1", "7", "10", "31")
 
- __%H__  - hour (e.g. "01", "06", "23")

- __%-H__ - hour with no zero padding (e.g. "1", "6", "23")

- __%M__  - minute (e.g. "1", "2", "11", "12")

- __%S__  - second (e.g. "1", "2", "11", "12")

In [None]:
from datetime import datetime
from pythainlp.util import thai_strftime


In [None]:
# Print the current date in Thai format

thai_strftime(datetime.now(), "%d %B %Y")

#### __Question 10:__ Given a date time object, return the datetime string in the following format


```
‡∏ß‡∏±‡∏ô‡∏®‡∏∏‡∏Å‡∏£‡πå ‡∏ó‡∏µ‡πà 1 ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏¢‡∏ô ‡∏û.‡∏®. 2562 ‡πÄ‡∏ß‡∏•‡∏≤ 11 ‡∏ô‡∏≤‡∏¨‡∏¥‡∏Å‡∏≤ 30 ‡∏ô‡∏≤‡∏ó‡∏µ 10 ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ
```

In [None]:
datetime_object_workshop_day = datetime(year=2019, month=11, day=1,hour=11,minute=30,second=10)

def print_datetime_thai(datetime_object):
    """Exercise scaffold: format ``datetime_object`` with ``thai_strftime``.

    The format string is left empty for the learner to fill in using the
    directives listed above; the formatted string is returned, not printed.
    """
    # Write down the format string.
    fmt = ""
    return thai_strftime(datetime_object, fmt)

__Test:__

In [None]:
def test_print_datetime_thai(fn):
    """Check a date formatter against the expected Question 10 string.

    Calls ``fn`` on the notebook-level ``datetime_object_workshop_day`` and
    compares the result with the expected text. The outcome is printed;
    nothing is returned.

    Args:
        fn: Callable taking a datetime and returning the formatted string.
    """
    expect = "‡∏ß‡∏±‡∏ô‡∏®‡∏∏‡∏Å‡∏£‡πå ‡∏ó‡∏µ‡πà 1 ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏¢‡∏ô ‡∏û.‡∏®. 2562 ‡πÄ‡∏ß‡∏•‡∏≤ 11 ‡∏ô‡∏≤‡∏¨‡∏¥‡∏Å‡∏≤ 30 ‡∏ô‡∏≤‡∏ó‡∏µ 10 ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ"
    actual = fn(datetime_object_workshop_day)

    if actual == expect:
        print("‚úÖ Test succeed. üòÅ\n")
        # Report the value the checked function produced, as the label says.
        print("Your Result:", actual)
    else:
        print("‚ùå Test failed. üò≠")
        print("\nYour result    :", actual)
        print("\nExpected result:", expect)

test_print_datetime_thai(print_datetime_thai)

__Solution:__

In [None]:
def print_datetime_thai(datetime_object):
    """Format ``datetime_object`` as the Thai date-time sentence of Question 10.

    Args:
        datetime_object: The datetime to format.

    Returns:
        The string produced by ``thai_strftime`` for the fixed format below.
    """
    return thai_strftime(
        datetime_object,
        "%A ‡∏ó‡∏µ‡πà %-d %B ‡∏û.‡∏®. %Y ‡πÄ‡∏ß‡∏•‡∏≤ %H ‡∏ô‡∏≤‡∏¨‡∏¥‡∏Å‡∏≤ %M ‡∏ô‡∏≤‡∏ó‡∏µ %S ‡∏ß‡∏¥‡∏ô‡∏≤‡∏ó‡∏µ",
    )

test_print_datetime_thai(print_datetime_thai)