In [42]:
import pandas as pd
import numpy as np
import json
import xgboost as xgb
import joblib
import sys

from pythainlp.corpus.common import thai_words
from sklearn.feature_extraction.text import TfidfTransformer
from pythainlp.tokenize import dict_trie, word_tokenize
from sklearn.model_selection import cross_val_score

In [45]:
sys.path.append('..')

#### Function

In [14]:
def get_dict(word_list):
    """
    This function customize dict
    
    Parameters
    ------------
    word_list: add word to dict for tokenize
    """
    custom_dict = set(thai_words())
    for i in word_list:
        custom_dict.add(i)
    trie = dict_trie(dict_source=custom_dict)
    return trie

def get_corpus(text, trie):
    """
    This function create corpus from list text
    
    Parameters
    ------------
    text: list of text
    """
    corpus = []
    for i in text:
        for j in word_tokenize(i, engine='dict', custom_dict=trie):
            if j not in corpus:
                corpus.append(j)
    return corpus

def get_BOW(text, corpus, trie, tfidf = True):
    BOW = [list() for i in range(len(text))]
    l = 0
    count = 1
    for i in text:
        tmp = word_tokenize(i, engine='dict', custom_dict=trie)
        for j in corpus:

            if j in tmp:

                BOW[l].append(tmp.count(j))
                tmp.remove(j)

            else:
                BOW[l].append(0)

        if len(tmp) != 0:
            BOW[l].append(len(tmp))
        elif len(tmp) == 0:
            BOW[l].append(0)
        l += 1
        
    BOW = np.array(BOW)
    
    if tfidf:
        tfidf_transformer = TfidfTransformer()
        return tfidf_transformer.fit_transform(BOW)
    else:
        return BOW

In [15]:
file = open('Data/line-bot-chrins-export.json', encoding='utf-8')
data = json.load(file)
del data['bmi']
expenses_df = pd.DataFrame.from_dict(data['expenses']).T
income_df = pd.DataFrame.from_dict(data['income']).T
income_df.drop("-LwSJ24Y1cVwv83oCbTJ", inplace = True)

income_df.loc[:, "Label"] = 'income'
expenses_df.loc[:, "Label"] = 'expenses'

df = pd.concat([expenses_df, income_df])

In [16]:
word_list = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน','หอยลาย', 'ปุ้มปุ้ย']
cus_dict = get_dict(word_list)
corpus = get_corpus(df.text, cus_dict)
X_train_tfidf = get_BOW(df.text, corpus, cus_dict)

In [17]:
clf_xgb = xgb.XGBRFClassifier()
clf_xgb.fit(X_train_tfidf, df.Label)

XGBRFClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
                colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1,
                importance_type='gain', interaction_constraints=None,
                learning_rate=1, max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=0, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8,
                tree_method=None, validate_parameters=False, verbosity=None)

In [18]:
X_test_tfidf = get_BOW(['ได้เงิน'], corpus, cus_dict)

In [19]:
clf_xgb.predict(X_test_tfidf)

array(['income'], dtype=object)

#### Export finalized model

In [20]:
joblib.dump(clf_xgb, 'text_clf.sav')

['text_clf.sav']

In [21]:
joblib.dump(corpus, 'corpus.sav')
joblib.dump(cus_dict, 'cus_dict.sav')

['cus_dict.sav']

---

#### classify category

In [22]:
df = pd.read_csv("Data/Expenses.csv")
df.tail()

Unnamed: 0,date,text,money,cate
642,2020-02-27,ค่าอินเตอร์เน็ต,631.3,Bill
643,2020-02-27,ข้าวเย็น,40.0,Food
644,2020-02-27,ไข่ต้ม,11.0,Food
645,2020-02-27,น้ำ,35.0,Food
646,2020-02-27,ขนม,58.0,Food


In [23]:
word_list = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน','หอยลาย', 'ปุ้มปุ้ย']
cus_dict = get_dict(word_list)
corpus = get_corpus(df.text, cus_dict)
X_train_tfidf = get_BOW(df.text, corpus, cus_dict)

In [24]:
clf_cate_xgb = xgb.XGBRFClassifier()
clf_cate_xgb.fit(X_train_tfidf, df.cate)

XGBRFClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
                colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1,
                importance_type='gain', interaction_constraints=None,
                learning_rate=1, max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=0, num_parallel_tree=100,
                objective='multi:softprob', random_state=0, reg_alpha=0,
                reg_lambda=1e-05, scale_pos_weight=None, subsample=0.8,
                tree_method=None, validate_parameters=False, verbosity=None)

#### Export finalized model

In [25]:
joblib.dump(clf_cate_xgb, 'cate_clf.sav')

['cate_clf.sav']

In [None]:
import pandas as pd
import joblib
import re
import numpy as np
import datetime
import pyrebase
import firebase_admin

from firebase_admin import credentials
from firebase_admin import db
from pythainlp.tokenize import dict_trie, word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

class chat:

    def __init__(self, label, text, money):
        self.label = label
        self.money = money
        self.text = text

        if self.label == 'expenses':
            cate_clf = joblib.load('asset/cate_clf.sav')
            tmp = get_BOW([self.text], corpus, cus_dict)
            pred = cate_clf.predict(tmp)
            
            self.cate = pred
            
        elif self.label == 'income':
            self.cate = 'income'

def main(chat_text):
    model = joblib.load('asset/text_clf.sav')
    corpus = joblib.load('asset/corpus.sav')
    cus_dict = joblib.load('asset/cus_dict.sav')

    money = int(re.findall("[0-9]+", chat_text)[0])
    expression = r"(.*?)[0-9]"
    t = re.findall(expression, chat_text)[0].replace(' ', '')

    tmp = get_BOW([chat_text], corpus, cus_dict)
    pred = model.predict(tmp)

    chat_data = chat(pred[0], t, money)

    update_db(chat_data)

def update_db(chat_data)
    # Fetch the service account key JSON file contents
    cred = credentials.Certificate('line-bot-chrins-firebase-adminsdk.json')

    # Initialize the app with a service account, granting admin privileges
    firebase_admin.initialize_app(cred, {
        'databaseURL': 'https://line-bot-chrins.firebaseio.com'
    })
    ref = db.reference("/")

    d = datetime.datetime.today().strftime("%Y/%m/%d")
    data = {"Date":d, "money":chat_data.money, "text":chat_data.text, 'type':chat_data.label, 'cate':chat_data.cate}
    ref.push(data)


def get_BOW(text, corpus, trie, tfidf = True):
    BOW = [list() for i in range(len(text))]
    l = 0
    count = 1
    for i in text:
        tmp = word_tokenize(i, engine='dict', custom_dict=trie)
        for j in corpus:

            if j in tmp:

                BOW[l].append(tmp.count(j))
                tmp.remove(j)

            else:
                BOW[l].append(0)

        if len(tmp) != 0:
            BOW[l].append(len(tmp))
        elif len(tmp) == 0:
            BOW[l].append(0)
        l += 1
        
    BOW = np.array(BOW)
    
    if tfidf:
        tfidf_transformer = TfidfTransformer()
        return tfidf_transformer.fit_transform(BOW)
    else:
        return BOW

In [35]:
class chat:

    def __init__(self, label, text, money):
        self.label = label
        self.money = money
        self.text = text

        if self.label == 'expenses':
            cate_clf = joblib.load('cate_clf.sav')
            tmp = get_BOW([self.text], corpus, cus_dict)
            pred = cate_clf.predict(tmp)
            
            self.cate = pred
            
        elif self.label == 'income':
            self.cate = 'income'
            
def main(chat_text):
    model = joblib.load('text_clf.sav')
    corpus = joblib.load('corpus.sav')
    cus_dict = joblib.load('cus_dict.sav')

    money = int(re.findall("[0-9]+", chat_text)[0])
    expression = r"(.*?)[0-9]"
    t = re.findall(expression, chat_text)[0].replace(' ', '')

    tmp = get_BOW([chat_text], corpus, cus_dict)
    pred = model.predict(tmp)

    return chat(pred[0], t, money)

In [39]:
d = main('ขนม 25')

In [41]:
d.cate

array(['Food'], dtype=object)