# Classify Category (Decision Tree)
##### Mr.Jakkrit Sonsanit

***

### Import Libraries

In [2]:
import pandas as pd
import numpy as np
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import dict_trie, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

### Import data

In [42]:
# Load the expense log; later cells read its `text` and `cate` columns.
df = pd.read_csv(filepath_or_buffer="Expenses.csv")
# Show the last five rows as a quick sanity check of the load.
df.tail(n=5)

Unnamed: 0,date,text,money,cate
50,9/10/2019,ค่ารถ,40.0,Transportation
51,9/10/2019,ชานม,90.0,Food
52,9/10/2019,ข้าวเย็น,80.0,Food
53,9/10/2019,โกโก้,35.0,Food
54,9/10/2019,ค่าหนัง,140.0,Entertainment


### Create Corpus

In [18]:
# Tokenizer dictionary: PyThaiNLP's Thai word list plus two words added by hand.
custom_dict = set(thai_words())
custom_dict.update(['ราเมง', 'คาปูชิโน่'])
trie = dict_trie(dict_source=custom_dict)

# Vocabulary: every distinct token found in the expense descriptions, kept in
# order of first appearance. dict.fromkeys de-duplicates while preserving
# insertion order, avoiding a linear `in` scan of a list for every token.
corpus = list(dict.fromkeys(
    token
    for text in df.text
    for token in word_tokenize(text, engine='dict', custom_dict=trie)
))
print(corpus)

['นม', 'ช็อคโกแลต', 'ข้าว', 'เที่ยง', 'น้ำ', 'กระเจี๊ยบ', 'ชา', 'ข้าวเย็น', 'ค่า', 'อินเตอร์เน็ต', 'ค่าโทรศัพท์', 'หมา', 'ล่า', 'ไอติม', 'ตัดผม', 'ซาลาเปา', 'ไส้กรอก', 'ดอย', 'คำ', 'ใบมีดโกน', 'ห้อง', 'Netflix', 'เงินเก็บ', 'ราเมง', 'นมเปรี้ยว', 'ขนมปัง', 'ขนม', 'arl', 'mrt', 'ซื้อ', 'กางเกง', 'ถั่ว', 'มาม่า', 'อิชิตัน', 'คาปูชิโน่', 'viu', 'บัวลอย', 'ค่ารถ', 'โกโก้', 'หนัง']


### Create Bag of Words

In [53]:
# Build one feature row per expense description.
# Each row has len(corpus) + 1 entries: a 0/1 presence flag for every
# vocabulary word (in corpus order), followed by the number of tokens still
# left once one occurrence of each matched word has been removed.
BOW = []
for text in df.text:
    leftover = word_tokenize(text, engine='dict', custom_dict=trie)
    row = []
    for word in corpus:
        if word in leftover:
            row.append(1)
            # Drop one occurrence so it is not counted in the trailing feature.
            leftover.remove(word)
        else:
            row.append(0)
    # Trailing feature: tokens not accounted for by the presence flags
    # (0 when every token was matched).
    row.append(len(leftover))
    BOW.append(row)

### Train Decision Tree model

In [54]:
# Labels are the expense categories; features are the bag-of-words rows.
ytarget = df.cate
xtrain = BOW

# Fix random_state so repeated runs fit the same tree: scikit-learn permutes
# the candidate features at each split, so an unseeded tree can differ between
# runs when several splits are equally good.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X=xtrain, y=ytarget)

# Predictions on the training rows themselves (not a held-out evaluation).
predictiontree = dtree.predict(xtrain)

### Create Bag of Words from Test Data

In [58]:
txt = ['ค่าหนัง', 'นม', 'ขนม']

# Encode the test descriptions with the same layout as the training matrix
# `BOW`: a 0/1 presence flag per vocabulary word (in corpus order), then the
# number of tokens left once one occurrence of each matched word is removed.
BOW_t = []
for text in txt:
    leftover = word_tokenize(text, engine='dict', custom_dict=trie)
    row = []
    for word in corpus:
        if word in leftover:
            row.append(1)
            # Drop one occurrence so it is not counted in the trailing feature.
            leftover.remove(word)
        else:
            row.append(0)
    # Trailing feature: tokens not accounted for by the presence flags
    # (0 when every token was matched).
    row.append(len(leftover))
    BOW_t.append(row)

### Predict Category 

In [59]:
# Classify the encoded test descriptions (one row of BOW_t per entry of `txt`).
predictiontree = dtree.predict(X=BOW_t)
# Bare name as the last expression so the notebook displays the label array.
predictiontree

array(['Entertainment', 'Food', 'Food'], dtype=object)