# `true-intent` Destination Classification

We provide 3 benchmarks for the 7-class multi-class classification of `destination` column in `truevoice-intnet` dataset: [fastText](https://github.com/facebookresearch/fastText), LinearSVC and [ULMFit](https://github.com/cstorm125/thai2fit). In the transfer learning cases, we first finetune the embeddings using all data. The test set contains 20% of all data split by [TrueVoice](http://www.truevoice.co.th/). The rest is split into 85/15 train-validation split randomly. Performance metrics are micro-averaged accuracy and F1 score.

| model     | accuracy | micro-F1 |
|-----------|----------|----------|
| fastText  | 0.384116 | 0.384116 |
| LinearSVC | 0.807876 | 0.327565 |
| ULMFit    |       0  |      0   |

In [1]:
import pandas as pd
import numpy as np
from pythainlp import word_tokenize
from tqdm import tqdm_notebook
from collections import Counter
import re

#viz
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns

def replace_newline(t):
    return re.sub('[\n]{1,}', ' ', t)

ft_data = 'ft_data/'

y = 'destination'
nb_class = 6

In [2]:
import string
import emoji
def replace_url(text):
    URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    return re.sub(URL_PATTERN, 'xxurl', text)

def replace_rep(text):
    def _replace_rep(m):
        c,cc = m.groups()
        return f'{c}xxrep'
    re_rep = re.compile(r'(\S)(\1{2,})')
    return re_rep.sub(_replace_rep, text)

def ungroup_emoji(toks):
    res = []
    for tok in toks:
        if emoji.emoji_count(tok) == len(tok):
            for char in tok:
                res.append(char)
        else:
            res.append(tok)
    return res

def process_text(text):
    #pre rules
    res = text.lower().strip()
    res = replace_url(res)
    res = replace_rep(res)
    
    #tokenize
    res = [word for word in res.split('|') if word and not re.search(pattern=r"\s+", string=word)]
    
    #post rules
    res = ungroup_emoji(res)
    
    return res

## Train-validation-test Split

We perform 85/15 train-validation split in addition to the test split by [TrueVoice](http://www.truevoice.co.th/).

In [27]:
from sklearn.model_selection import train_test_split
all_df = pd.read_csv(f'mari_train.csv')
all_df['destination'] = all_df.destination.map(lambda x: x.replace(' ','_'))
train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=1412)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = pd.read_csv(f'mari_test.csv')
test_df['destination'] = test_df.destination.map(lambda x: x.replace(' ','_'))
print(train_df.shape, valid_df.shape, test_df.shape)

(10998, 5) (1941, 5) (3236, 5)


In [28]:
#test set prevalence
test_df['destination'].value_counts() / test_df.shape[0]

billing_and_payment      0.301916
internet                 0.190049
promotions               0.182324
other_queries            0.169654
international_dialing    0.062732
lost_and_stolen          0.060569
true_money               0.032756
Name: destination, dtype: float64

## [fastText](https://github.com/facebookresearch/fastText) Model

We used embeddings pretrained on [Thai Wikipedia Dump](https://github.com/facebookresearch/fastText/blob/master/docs/pretrained-vectors.md) and finetuned them using all of `truevoice-intent` using skipgram model. After that, we do a multi-class classification and compute performance metrics.

In [29]:
df_txts = ['train','valid','test']
dfs = [train_df,valid_df,test_df]

for i in range(3):
    df = dfs[i]
    ft_lines = []
    for _,row in df.iterrows():
        ft_lab = f'__label__{row[y]}'
        ft_text = replace_newline(f'{row["texts"]}')
        ft_line = f'{ft_lab} {ft_text}'
        ft_lines.append(ft_line)

    doc = '\n'.join(ft_lines)
    with open(f'{ft_data}{df_txts[i]}.txt','w') as f:
        f.write(doc)
    f.close()

In [30]:
#for fasttext embedding finetuning
ft_lines = []
for _,row in all_df.iterrows():
    ft_lab = '__label__0'
    ft_text = replace_newline(f'{row["texts"]}')
    ft_line = f'{ft_lab} {ft_text}'
    ft_lines.append(ft_line)

doc = '\n'.join(ft_lines)
with open(f'{ft_data}df_all.txt','w') as f:
    f.write(doc)
f.close()

In [31]:
#finetune with all data
!/home/charin/fastText-0.1.0/fasttext skipgram \
-pretrainedVectors 'model/wiki.th.vec' -dim 300 \
-input ft_data/df_all.txt -output 'model/finetuned'

Read 0M words
Number of words:  538
Number of labels: 1
Progress: 100.0%  words/sec/thread: 31923  lr: 0.000000  loss: 2.440569  eta: 0h0m 


In [32]:
#train classifier
!/home/charin/fastText-0.1.0/fasttext supervised \
-input 'ft_data/train.txt' -output 'model/classifier' \
-pretrainedVectors 'model/finetuned.vec' -epoch 5 -dim 300 -wordNgrams 2 

Read 0M words
Number of words:  19060
Number of labels: 7
Progress: 100.0%  words/sec/thread: 296072  lr: 0.000000  loss: 0.471060  eta: 0h0m 


In [33]:
#get prediction
preds = !/home/charin/fastText-0.1.0/fasttext predict 'model/classifier.bin' 'ft_data/test.txt'

In [35]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
pred_lab = np.array([i[9:] for i in preds])

enc_fit = enc.fit(test_df[y][:,None])
pred_ohe = enc_fit.transform(pred_lab[:,None]).toarray()
y_ohe = enc_fit.transform(test_df[y][:,None]).toarray()

In [36]:
import warnings
warnings.filterwarnings("ignore")
#macro metrics
for i in range(nb_class):
    print(
        (pred_ohe[:,i]==y_ohe[:,i]).mean(),
        f1_score(pred_ohe[:,i],y_ohe[:,i]),
        precision_score(pred_ohe[:,i],y_ohe[:,i]),
        recall_score(pred_ohe[:,i],y_ohe[:,i])
         )

0.4020395550061805 0.48687350835322196 0.9396110542476971 0.3285612025769506
0.9409765142150803 0.11981566820276499 0.06403940886699508 0.9285714285714286
0.8241656365883807 0.1929078014184397 0.11056910569105691 0.7555555555555555
0.9508652657601978 0.31759656652360513 0.18877551020408162 1.0
0.8340543881334982 0.12965964343598058 0.07285974499089254 0.5882352941176471
0.8389987639060569 0.32947232947232946 0.21694915254237288 0.6844919786096256


In [37]:
print('micro metrics')
(pred_lab==test_df[y]).mean(), \
f1_score(test_df[y],pred_lab,average='micro'), \
precision_score(test_df[y],pred_lab,average='micro'), \
recall_score(test_df[y],pred_lab,average='micro')

micro metrics


(0.3841161928306551,
 0.3841161928306552,
 0.3841161928306551,
 0.3841161928306551)

## LinearSVC Model

Code for LinearSVC is provided by [@lukkiddd](https://github.com/lukkiddd).

In [38]:
X_train, y_train = train_df['texts'], train_df[y]
X_test, y_test = test_df['texts'], test_df[y]

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2))),
    ('clf', LinearSVC()),
])

text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [40]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
pred_lab = text_clf.predict(X_test)

enc_fit = enc.fit(test_df[y][:,None])
pred_ohe = enc_fit.transform(pred_lab[:,None]).toarray()
y_ohe = enc_fit.transform(test_df[y][:,None]).toarray()

In [41]:
import warnings
warnings.filterwarnings("ignore")
#macro metrics
for i in range(nb_class):
    print(
        (pred_ohe[:,i]==y_ohe[:,i]).mean(),
        f1_score(pred_ohe[:,i],y_ohe[:,i]),
        precision_score(pred_ohe[:,i],y_ohe[:,i]),
        recall_score(pred_ohe[:,i],y_ohe[:,i])
         )

0.32756489493201485 0.4731234866828087 1.0 0.30986362194735173
0.9372682323856613 0.0 0.0 0.0
0.8105686032138443 0.006482982171799028 0.0032520325203252032 1.0
0.9493201483312732 0.2807017543859649 0.16326530612244897 1.0
0.830964153275649 0.007259528130671506 0.0036429872495446266 1.0
0.8260197775030902 0.08752025931928686 0.04576271186440678 1.0


In [42]:
print('micro metrics')
(pred_ohe==y_ohe).mean(), \
f1_score(y_ohe,pred_ohe,average='micro'), \
precision_score(y_ohe,pred_ohe,average='micro'), \
recall_score(y_ohe,pred_ohe,average='micro')

micro metrics


(0.80787568426629,
 0.32756489493201485,
 0.32756489493201485,
 0.32756489493201485)

## [ULMFit](https://github.com/cstorm125/thai2fit) Model