In [5]:
import pandas as pd
import numpy as np
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import dict_trie, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the expense log from the CSV next to the notebook and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer="Expenses.csv")
df.head(n=5)

Unnamed: 0,date,text,money,cate
0,1/10/19,นมช็อคโกแลต,12.0,Food
1,1/10/19,ข้าวเที่ยง,35.0,Food
2,1/10/19,น้ำกระเจี๊ยบ,5.0,Food
3,1/10/19,ชานม,35.0,Food
4,1/10/19,ข้าวเย็น,50.0,Food


In [3]:
# Start from PyThaiNLP's stock word list and add two domain words
# (presumably so the dictionary tokenizer keeps them as single tokens).
custom_dict = set(thai_words()) | {'ราเมง', 'คาปูชิโน่'}
trie = dict_trie(dict_source=custom_dict)

# Vocabulary: every distinct token found in the expense descriptions, in
# order of first appearance. dict.fromkeys() de-duplicates while keeping
# insertion order, avoiding a linear scan of the list for every token.
corpus = list(dict.fromkeys(
    token
    for text in df.text
    for token in word_tokenize(text, engine='dict', custom_dict=trie)
))
print(corpus)

['นม', 'ช็อคโกแลต', 'ข้าว', 'เที่ยง', 'น้ำ', 'กระเจี๊ยบ', 'ชา', 'ข้าวเย็น', 'ค่า', 'อินเตอร์เน็ต', 'ค่าโทรศัพท์', 'หมา', 'ล่า', 'ไอติม', 'ตัดผม', 'ซาลาเปา', 'ไส้กรอก', 'ดอย', 'คำ', 'ใบมีดโกน', 'ห้อง', 'Netflix', 'เงินเก็บ', 'ราเมง', 'นมเปรี้ยว', 'ขนมปัง', 'ขนม', 'arl', 'mrt', 'ซื้อ', 'กางเกง', 'ถั่ว', 'มาม่า', 'อิชิตัน', 'คาปูชิโน่', 'viu', 'บัวลอย']


In [4]:
def encode_texts(texts, vocabulary, tokenizer_trie):
    """Encode each text as a presence vector over ``vocabulary`` plus a leftover count.

    Args:
        texts: iterable of strings to tokenize.
        vocabulary: sequence of known tokens; one output column per entry,
            in the same order.
        tokenizer_trie: custom dictionary passed to ``word_tokenize``.

    Returns:
        A list with one row per text. Each row has ``len(vocabulary) + 1``
        integers: a 1/0 flag per vocabulary entry, followed by the number of
        tokens left over after every matched vocabulary entry consumed one
        occurrence (0 when nothing is left).
    """
    rows = []
    for text in texts:
        leftover = word_tokenize(text, engine='dict', custom_dict=tokenizer_trie)
        row = []
        for word in vocabulary:
            if word in leftover:
                # Consume one occurrence so it is not counted as leftover.
                leftover.remove(word)
                row.append(1)
            else:
                row.append(0)
        # The final column is always present so every row has the same width.
        row.append(len(leftover))
        rows.append(row)
    return rows


data = encode_texts(df.text, corpus, trie)
data
    

[[1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,


In [6]:
# Targets are the expense categories; inputs are the encoded rows in `data`.
# NOTE(review): the displayed predictions contain the label 'Transportation '
# with a trailing space — consider normalising `cate` upstream.
ytrain = df.cate
xtrain = data

# Fix the seed: scikit-learn permutes features at every split, so without a
# random_state two fits on the same data may produce different trees.
dtree = DecisionTreeClassifier(random_state=42)
fitted_tree = dtree.fit(X=xtrain, y=ytrain)

# These predictions are made on the same rows the tree was fitted on, so
# they show how well the tree memorised the data, not how it generalises.
predictiontree = dtree.predict(xtrain)

# Dense node-indicator matrix: one row per sample, one column per tree node.
dense_matrix = fitted_tree.decision_path(xtrain).todense()

In [7]:
# Show the labels the fitted tree assigned to the training rows.
predictiontree

array(['Food', 'Food', 'Food', 'Food', 'Food', 'Bill', 'Bill', 'Food',
       'Food', 'Food', 'Food', 'Food', 'Other', 'Food', 'Food', 'Food',
       'Shopping', 'Bill', 'Food', 'Bill', 'Food', 'Food', 'Saving',
       'Food', 'Food', 'Food', 'Food', 'Food', 'Food', 'Transportation ',
       'Transportation ', 'Transportation ', 'Transportation ',
       'Transportation ', 'Shopping', 'Food', 'Food', 'Food', 'Food',
       'Food', 'Food', 'Food', 'Bill', 'Food', 'Food', 'Food', 'Food',
       'Food'], dtype=object)

In [8]:
# Encode unseen text with the same layout as the training rows: one 1/0 flag
# per vocabulary word in `corpus`, followed by the count of leftover tokens.
txt = ['น้ำดื่ม']
data = []
for text in txt:
    leftover = word_tokenize(text, engine='dict', custom_dict=trie)
    row = []
    for word in corpus:
        if word in leftover:
            # Consume one occurrence so it is not counted as leftover.
            leftover.remove(word)
            row.append(1)
        else:
            row.append(0)
    # Always emit the leftover count, even when it is 0. The training rows
    # always carry this column, so skipping it for fully-known text would
    # produce a row one feature short and make dtree.predict() fail.
    row.append(len(leftover))
    data.append(row)
data

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1]]

In [9]:
# Classify the encoded new text with the fitted tree and show the result.
# Note that this rebinds `predictiontree`, replacing the earlier predictions.
predictiontree = dtree.predict(X=data)
predictiontree

array(['Food'], dtype=object)