In [42]:
# Sample sentences (Thai and English) used as the toy corpus for this notebook.
text_list = [
    'อาหารอร่อยมากเลยครับ',
    'ชอบรายการอาหารที่หลากหลาย',
    'มีกลิ่นติดตัว',
    'รายการอาหารไม่มากพอ',
    'บริการดีมาก',
    'พนักงานไม่เอาใจใส่',
    'วันนี้คุณสวยจัง',
    'hi how are you.',
    'นิสัยไม่ดีเลย',
    'welcome to my world.',
]

In [43]:
import re
import string

# Characters deleted by clean_msg: every ASCII punctuation mark (this already
# includes '#') plus the single-character ellipsis.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + '…')

# Non-greedy match of anything enclosed in angle brackets on a single line.
_TAG_RE = re.compile(r'<.*?>')


def clean_msg(msg):
    """Strip markup, punctuation and redundant whitespace from a message.

    Args:
        msg (str): Raw text.

    Returns:
        str: ``msg`` with ``<...>`` spans removed, ASCII punctuation and the
        ellipsis character deleted, and every run of whitespace (including
        newlines and tabs) collapsed to a single space with the ends trimmed.
    """
    # Remove all text enclosed in <> first, while the brackets still exist.
    msg = _TAG_RE.sub('', msg)

    # Delete hashtags, the ellipsis and all punctuation in one pass instead of
    # running one regex substitution per punctuation character.
    msg = msg.translate(_STRIP_TABLE)

    # Collapse separators such as \n and \t into single spaces.
    return ' '.join(msg.split())

In [44]:
# Compare the first sentence before and after cleaning.
sample = text_list[0]
print('original text:\n', sample)
print('clean text:\n', clean_msg(sample))

original text:
 อาหารอร่อยมากเลยครับ
clean text:
 อาหารอร่อยมากเลยครับ


In [45]:
# Compare the second sentence before and after cleaning.
sample = text_list[1]
print('original text:\n', sample)
print('clean text:\n', clean_msg(sample))

original text:
 ชอบรายการอาหารที่หลากหลาย
clean text:
 ชอบรายการอาหารที่หลากหลาย


In [46]:
# Cleaned version of every sentence, in the same order as text_list.
clean_text = list(map(clean_msg, text_list))

In [47]:
#!pip install pythainlp
#!pip install stop_words
import pythainlp
from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords
from pythainlp.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.corpus import words
from stop_words import get_stop_words

In [48]:
import nltk
# Fetch the NLTK "words" corpus.
nltk.download('words')

# Stop-word collections for Thai and English, materialised as tuples.
th_stop, en_stop = tuple(thai_stopwords()), tuple(get_stop_words('en'))

# Porter stemmer instance used by the tokenisation step.
p_stemmer = PorterStemmer()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Seena\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [49]:
def split_word(text):
    """Tokenise ``text`` and normalise the tokens for bag-of-words features.

    Steps, in order: newmm tokenisation, Thai/English stop-word removal,
    Porter stemming, replacement of a token by the first Thai lemma of its
    first WordNet synset when one exists, then removal of numeric tokens and
    of tokens containing a space.

    Args:
        text (str): Text to tokenise (callers in this notebook pass the
            output of ``clean_msg``).

    Returns:
        list[str]: The normalised tokens.
    """
    tokens = word_tokenize(text, engine='newmm')

    # Remove Thai and English stop words. The English check is done on the
    # lower-cased token: the stemmer below lower-cases its input, so a
    # capitalised stop word ("How") would otherwise survive as "how".
    # NOTE(review): assumes the entries of en_stop are lower-case — confirm.
    tokens = [
        tok for tok in tokens
        if tok not in th_stop and tok.lower() not in en_stop
    ]

    # Stemming (Porter is an English stemmer; it is applied to every token).
    tokens = [p_stemmer.stem(tok) for tok in tokens]

    # Thai: substitute the first Thai lemma of the first synset, if any;
    # otherwise keep the token unchanged.
    normalised = []
    for tok in tokens:
        synsets = wordnet.synsets(tok)
        thai_lemmas = synsets[0].lemma_names('tha') if synsets else []
        normalised.append(thai_lemmas[0] if thai_lemmas else tok)

    # Drop numeric tokens and tokens that contain a space.
    return [
        tok for tok in normalised
        if not tok.isnumeric() and ' ' not in tok
    ]

In [50]:
# Show the tokenised form of each of the ten sample sentences.
for idx in range(10):
    print('tokenized text:\n', split_word(clean_msg(text_list[idx])))

tokenized text:
 ['อาหาร', 'อร่อย']
tokenized text:
 ['ชอบ', 'เมนู', 'หลากหลาย']
tokenized text:
 ['กลิ่น', 'ติดตัว']
tokenized text:
 ['เมนู']
tokenized text:
 ['บริการ', 'ดีมาก']
tokenized text:
 ['พนักงาน', 'เอาใจใส่']
tokenized text:
 ['สวย']
tokenized text:
 ['hi']
tokenized text:
 ['นิสัย', 'ไม่ดี']
tokenized text:
 ['welcom', 'world']


In [51]:
# Token list for every cleaned sentence, in the same order as clean_text.
tokens_list = list(map(split_word, clean_text))

In [52]:
# Display the token lists produced for every sentence.
tokens_list

[['อาหาร', 'อร่อย'],
 ['ชอบ', 'เมนู', 'หลากหลาย'],
 ['กลิ่น', 'ติดตัว'],
 ['เมนู'],
 ['บริการ', 'ดีมาก'],
 ['พนักงาน', 'เอาใจใส่'],
 ['สวย'],
 ['hi'],
 ['นิสัย', 'ไม่ดี'],
 ['welcom', 'world']]

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
# Encode each token list as one comma-separated string so it can be fed to the
# vectorizer as a "document".
tokens_list_j = [','.join(tkn) for tkn in tokens_list]

# The analyzer splits on the same delimiter. Empty pieces are dropped: a
# document with no tokens is '' and ''.split(',') yields [''], which would
# otherwise add an empty-string term to the vocabulary.
cvec = CountVectorizer(analyzer=lambda x: [tok for tok in x.split(',') if tok])
c_feat = cvec.fit_transform(tokens_list_j)

In [54]:
import pickle
# Save the count-feature matrix to disk.
# NOTE(review): despite the variable name, the pickled object is the feature
# matrix c_feat, not a trained model.
model = 'c_feat.pkl'
# The context manager closes the file handle even if pickle.dump() raises.
with open(model, 'wb') as pkl_file:
    pickle.dump(c_feat, pkl_file)

In [55]:
# Mapping from each token to its column index in the count matrix.
vocab = cvec.vocabulary_
print(vocab)

{'อาหาร': 13, 'อร่อย': 12, 'ชอบ': 4, 'เมนู': 14, 'หลากหลาย': 11, 'กลิ่น': 3, 'ติดตัว': 6, 'บริการ': 8, 'ดีมาก': 5, 'พนักงาน': 9, 'เอาใจใส่': 15, 'สวย': 10, 'hi': 0, 'นิสัย': 7, 'ไม่ดี': 16, 'welcom': 1, 'world': 2}


In [56]:
# Dense view of the count matrix, limited to at most the first 20 columns.
c_feat[:,:20].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the same comma-joined token strings. Empty pieces are
# dropped so that a document without tokens ('' -> ['']) does not add an
# empty-string term to the vocabulary.
tvec = TfidfVectorizer(analyzer=lambda x: [tok for tok in x.split(',') if tok])
t_feat = tvec.fit_transform(tokens_list_j)

In [58]:
# Dense view of the first five TF-IDF columns.
t_feat[:,:5].todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.60604332],
        [0.        , 0.        , 0.        , 0.70710678, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.70710678, 0.70710678, 0.        , 0.        ]])

In [59]:
# Number of IDF weights and number of vocabulary terms of the TF-IDF vectorizer.
print(len(tvec.idf_),len(tvec.vocabulary_))

17 17


In [60]:
# Dense view of the first five count columns.
c_feat[:,:5].todense()

matrix([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0]], dtype=int64)

In [61]:
# Print the full count matrix in dense form.
print(c_feat.todense())

[[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1]
 [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [62]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

In [63]:
# Convert the sparse count matrix into a plain ndarray for the classifier.
dense_counts = c_feat.todense()
X = np.array(dense_counts)

In [64]:
# Display the dense count matrix that is passed to the classifier.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [65]:
 y = np.array(['+', '+', '-', '-','+','-','+','+','-','+'])

In [66]:
# Class priors should form a probability distribution. [1/3, 2/3] keeps the
# 1:2 ratio of the previously used, unnormalised values [0.25, 0.5], so the
# predicted labels are unchanged while the priors now sum to 1.
# Priors follow the sorted class order, i.e. ['+', '-'] for the labels in y.
# NOTE(review): this favours '-' although '+' is the more frequent label in
# y — confirm it is intended, or omit class_prior to learn priors from the data.
clf = MultinomialNB(class_prior=[1 / 3, 2 / 3])

# Train model
model = clf.fit(X, y)

In [67]:
# Create new observation
# The row needs one entry per vocabulary term (X.shape[1] columns); a row of
# any other width is rejected by the fitted model. Only column 1 is set.
new_observation = [[0, 1] + [0] * (X.shape[1] - 2)]

In [68]:
# Predict the class of every row of X, the same matrix the model was fitted on.
# NOTE(review): new_observation is not used here — confirm whether it was
# meant to be predicted instead.
y_predicted = model.predict(X)

In [69]:
# Show the predicted labels.
print(y_predicted)

['+' '+' '-' '-' '+' '-' '-' '-' '-' '+']
