In [137]:
import json
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import dict_trie, word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score

#### Functions

In [138]:
def get_dict(word_list):
    """
    Build the tokenizer dictionary: the default Thai words plus extra words

    Parameters
    ------------
    word_list: iterable of words to add on top of thai_words()

    Returns
    ------------
    trie built by dict_trie from the combined word set
    """
    # set(...) makes a fresh mutable copy, so the library's word list itself
    # is never modified.
    vocabulary = set(thai_words()).union(word_list)
    return dict_trie(dict_source=vocabulary)

def get_corpus(text, trie):
    """
    This function create corpus (list of distinct tokens) from list text

    Parameters
    ------------
    text: list of text
    trie: custom dict passed to word_tokenize

    Returns
    ------------
    list of distinct tokens, in the order they are first seen
    """
    corpus = []
    # Membership is checked against a set so the cost per token stays constant;
    # the list alone keeps the first-seen order that fixes the feature columns.
    seen = set()
    for doc in text:
        for token in word_tokenize(doc, engine='dict', custom_dict=trie):
            if token not in seen:
                seen.add(token)
                corpus.append(token)
    return corpus

def get_BOW(text, corpus, trie, tfidf = True):
    """
    This function build bag-of-words features from list text

    Each row holds one count per corpus word, in corpus order, followed by one
    extra column with the number of tokens that are not in the corpus.

    Parameters
    ------------
    text: list of text
    corpus: list of words (see get_corpus); fixes the order of the columns
    trie: custom dict passed to word_tokenize
    tfidf: if True, weight the counts with a new TfidfTransformer fitted on
        this same text; if False, return the raw counts

    Returns
    ------------
    result of TfidfTransformer.fit_transform when tfidf is True, otherwise a
    numpy array of counts with shape (len(text), len(corpus) + 1)
    """
    vocabulary = set(corpus)
    rows = []
    for doc in text:
        token_counts = Counter(word_tokenize(doc, engine='dict', custom_dict=trie))
        row = [token_counts[word] for word in corpus]
        # Trailing column: tokens outside the corpus only. Repeated in-corpus
        # tokens are already fully counted in their own column and must not be
        # counted again here.
        row.append(sum(n for token, n in token_counts.items()
                       if token not in vocabulary))
        rows.append(row)

    # The explicit reshape keeps the result 2-D even when text is empty.
    BOW = np.array(rows, dtype=int).reshape(len(rows), len(corpus) + 1)

    if tfidf:
        # The IDF weights are learned from the text passed in this call.
        tfidf_transformer = TfidfTransformer()
        return tfidf_transformer.fit_transform(BOW)
    return BOW

In [139]:
# Read the exported records; the with-block closes the file handle as soon as
# the JSON has been parsed.
with open('Data/line-bot-chrins-export.json', encoding='utf-8') as file:
    data = json.load(file)
del data['bmi']

# from_dict puts the outer keys in the columns, so transpose to get one row per
# outer key. Label: 1 = expenses, 0 = income.
expenses_df = pd.DataFrame.from_dict(data['expenses']).T.assign(Label=1)
income_df = (
    pd.DataFrame.from_dict(data['income']).T
    # NOTE(review): this one income row is excluded by its key; the reason is
    # not recorded in this notebook.
    .drop("-LwSJ24Y1cVwv83oCbTJ")
    .assign(Label=0)
)

df = pd.concat([expenses_df, income_df])

In [140]:
# Words added to the tokenizer dictionary on top of thai_words().
word_list = [
    'ราเมง',
    'อิเกีย',
    'คาปูชิโน่',
    'น้ำมัน',
    'หอยลาย',
    'ปุ้มปุ้ย',
]
cus_dict = get_dict(word_list)

# The corpus fixes the feature columns; the training matrix is built from the
# same texts the corpus was collected from.
corpus = get_corpus(df["text"], cus_dict)
X_train_tfidf = get_BOW(df["text"], corpus, cus_dict)

In [141]:
# XGBoost random-forest classifier with the library's default settings.
# The fit call is left as the last expression so the cell displays the
# fitted estimator.
clf_xgb = xgb.XGBRFClassifier()
clf_xgb.fit(X_train_tfidf, df["Label"])

XGBRFClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
                colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1,
                importance_type='gain', interaction_constraints=None,
                learning_rate=1, max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=0, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8,
                tree_method=None, validate_parameters=False, verbosity=None)

In [142]:
# Weight the query with the IDF learned from the training texts. Calling
# get_BOW(..., tfidf=True) here would fit a new TfidfTransformer on this single
# query, so its features would not use the weights the classifier was trained on.
# The training counts are rebuilt here so this cell does not depend on any
# intermediate left over from another cell.
train_counts = get_BOW(df.text, corpus, cus_dict, tfidf=False)
test_counts = get_BOW(['ได้เงิน'], corpus, cus_dict, tfidf=False)
X_test_tfidf = TfidfTransformer().fit(train_counts).transform(test_counts)

In [143]:
# Predict the label of the example query: 0 = income, 1 = expenses
# (the values assigned to the Label column when df was built).
clf_xgb.predict(X_test_tfidf)

array([0], dtype=int64)

#### Export finalized model

In [144]:
# Save the fitted classifier to 'text_clf.sav' in the working directory.
joblib.dump(clf_xgb, 'text_clf.sav')

['text_clf.sav']

In [145]:
# Save the corpus (the feature column order) next to the model.
# NOTE(review): the TF-IDF weights used for the training features are not saved
# in the cells shown here — confirm how the consumer of these files rebuilds them.
joblib.dump(corpus, 'corpus.sav')

['corpus.sav']

In [146]:
# Load the corpus back from the file written above.
loaded_corpus = joblib.load('corpus.sav')