In [None]:
! pip install pythainlp
! pip install -U tensorflow-text==2.6.0

In [None]:
import re
import string

import numpy as np
import pandas as pd
import pythainlp.util
import tensorflow as tf
from pythainlp import thai_punctuations
from pythainlp.corpus.common import thai_stopwords
from pythainlp.tag import pos_tag
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

In [None]:
# Tokenizer engine name handed to word_tokenize() inside process_text()
engine = 'newmm'

In [None]:
#function
def process_text(text):
  """Tokenize a text and drop stopword and punctuation tokens.

  The text is split with ``word_tokenize`` (using the module-level ``engine``,
  whitespace tokens discarded), every token is passed through ``normalize``,
  and tokens that are Thai stopwords or consist only of punctuation
  characters are removed.

  Args:
    text: the raw string to tokenize.

  Returns:
    A list of the remaining normalized tokens, in their original order.
  """
  stopwords = thai_stopwords()
  # Character set rather than one long string: a substring test against the
  # concatenated string would let runs such as "!!" or "..." slip through.
  punctuation_chars = set(string.punctuation + thai_punctuations)

  final_token = []
  for raw_token in word_tokenize(text, engine=engine, keep_whitespace=False):
    item = normalize(raw_token)
    if item in stopwords:
      continue
    # all() is True for an empty string, so empty tokens are still dropped.
    if all(ch in punctuation_chars for ch in item):
      continue
    final_token.append(item)
  return final_token

# preprocess

In [None]:
# Location of the labelled CSV; change this when running elsewhere.
DATA_PATH = '/content/dataset(4class_agent).csv'

df = pd.read_csv(DATA_PATH)
df = df.rename(columns={"text":"texts","class":"category"})
# New columns are added to all_df in a later cell. Taking an explicit copy of
# the dropna() result keeps those assignments from raising
# SettingWithCopyWarning and makes it clear that df itself is not modified.
all_df = df.dropna().copy()
all_df

Unnamed: 0,filename,texts,category
0,_10_10_7073379372317606809_1_74,สวัสดีค่ะวรัสรินยินดีให้บริการค่ะรถทะเบียนอะไร...,1557menu1
1,_10_10_7073450200623291783_1_74,สวัสดีค่ะวิริยะประกันภัยสุมิตตายินดีให้บริการค...,1557menu1
2,_10_10_7079087469753341967_1_74,สวัสดีค่ะวิริยะประกันภัยณัฐสรณ์ยินดีให้บริการค...,1557menu1
3,_10_10_7079297824366592124_1_74,สวัสดีค่ะบริษัทวิริยะประกันภัยประภานิชายินดีบร...,1557menu1
4,_10_10_7080719527261054596_1_74,สวัสดีค่ะวิริยะประกันภัย ธนวรรณดียินดีให้บริกา...,1557menu1
...,...,...,...
344,_1_Telesale_1904_02.03.2565_8.36.43,วิริยะประกันภัยสวัสดีค่ะ ติดต่อเรื่องอะไรคะสวั...,A&H
345,_1_Telesale_1904_02.03.2565_8.58.55,สวัสดีค่ะ ขอเรียนสายคุณณัชญาภา ดิฉันโทรจากบริษ...,A&H
346,_1_Telesale_1904_02.03.2565_9.42.59,สวัสดีค่ะขอเรียนสายคุณศักดาค่ะค่ะรุ้งเพชรโทรจา...,A&H
347,_1_Telesale_1904_02.03.2565__8.49.46,วิริยะประกันภัยสวัสดีค่ะติดต่อเรื่องอะไรคะค่ะเ...,A&H


In [None]:
# Share of rows carrying each label
label_counts = all_df['category'].value_counts()
label_counts / len(all_df)

1557menu3    0.275072
1557menu2    0.260745
1557menu1    0.243553
A&H          0.220630
Name: category, dtype: float64

# RNN/LSTM

In [None]:
# Tokenize every text once and derive all three columns from the token lists.
token_lists = all_df['texts'].map(process_text)

# assign() builds a new frame instead of mutating all_df in place, so the cell
# can be re-run safely. The counts come from the token lists themselves:
# re-splitting the joined string would report 1 word for an empty token list,
# because ''.split('|') == [''].
all_df = all_df.assign(
    processed=token_lists.map('|'.join),            # '|'-joined tokens
    wc=token_lists.map(len),                        # token count
    uwc=token_lists.map(lambda tokens: len(set(tokens))),  # unique token count
)

In [None]:
# Train/validation split (15% held out); both parts get a fresh 0..n-1 index
train_df, valid_df = (
    part.reset_index(drop=True)
    for part in train_test_split(all_df, test_size=0.15, random_state=1412)
)

In [None]:
# Row counts of the training and validation parts
print(f"{len(train_df)} {len(valid_df)}")

296 53


In [None]:
# Label columns of the two splits
y_train, y_valid = train_df['category'], valid_df['category']

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Tokens are stored '|'-joined in the 'processed' column; turn each row into a
# space-separated string.
text_train = [joined.replace('|', ' ') for joined in train_df['processed']]
text_valid = [joined.replace('|', ' ') for joined in valid_df['processed']]

**create lookup dictionary**

In [None]:
# Flat list of every whitespace-separated token in the training texts
# (duplicates kept)
word_count = [w for sent in text_train for w in sent.split()]

In [None]:
# Number of distinct words in the training texts (duplicates removed)
VOCAB_SIZE = len(set(word_count))

# max_tokens counts the reserved padding ('') and out-of-vocabulary ('[UNK]')
# entries as well, so add two slots; otherwise the two rarest training words
# would be left out of the vocabulary.
NUM_RESERVED_TOKENS = 2
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE + NUM_RESERVED_TOKENS)
encoder.adapt(text_train)

In [None]:
# Display the number of distinct training words
VOCAB_SIZE

3160

In [None]:
# Vocabulary held by the encoder after adapt(), as a NumPy array
vocab = np.array(encoder.get_vocabulary())
vocab[:20] 
# In the displayed output, index 0 is '' and index 1 is '[UNK]' (unknown word)

array(['', '[UNK]', 'นะคะ', 'ลูกค้า', 'ศูนย์', 'สวัสดี', 'สอง', 'ห้า',
       'วิริยะ', 'ไหม', 'รถ', 'เดี๋ยว', 'สาม', 'ข้อมูล', 'เจ้าหน้าที่',
       'เคลม', 'เก้า', 'สี่', 'ติดต่อ', 'แปด'], dtype='<U19')

**create model**

In [None]:
# Distinct labels present in the training split
set(y_train)

{'1557menu1', '1557menu2', '1557menu3', 'A&H'}

In [None]:
# Label columns of the two splits, plus the full list of class names
y_train, y_valid = train_df['category'], valid_df['category']
y_class = ['1557menu1', '1557menu2', '1557menu3', 'A&H']

In [None]:
# Map class names to integer ids, then to one-hot rows.
le = preprocessing.LabelEncoder()
le.fit(y_class)
num_classes = len(le.classes_)

# Encode straight from the data frames so the cell can be re-run: feeding the
# already encoded y_train back into le.transform() would fail.
# num_classes fixes the one-hot width even if a split lacks the highest class.
y_train = to_categorical(le.transform(train_df['category']), num_classes=num_classes)
y_val = to_categorical(le.transform(valid_df['category']), num_classes=num_classes)
print(y_train.shape)
# y is now one-hot encoded

(296, 4)


In [None]:
# Seed TensorFlow's global RNG so a fresh Restart & Run All starts from the
# same initial weights (same value as the random_state used for the splits).
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.random.set_seed(1412)

# Text -> token ids -> embeddings -> bidirectional LSTM -> dense classifier
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=512,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dense(128, activation='relu'),
    # One output unit per one-hot column of y_train
    tf.keras.layers.Dense(y_train.shape[1], activation='softmax')
])

In [None]:
# Categorical cross-entropy on the one-hot labels, Adam with learning rate 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 3 epochs, validating on the held-out split
train_inputs = np.array(text_train)
valid_inputs = np.array(text_valid)
model.fit(
    train_inputs,
    y_train,
    validation_data=(valid_inputs, y_val),
    epochs=3,
    verbose=2,
)

Epoch 1/3
10/10 - 102s - loss: 1.3775 - accuracy: 0.5034 - val_loss: 1.3692 - val_accuracy: 0.5094
Epoch 2/3
10/10 - 83s - loss: 1.3543 - accuracy: 0.6520 - val_loss: 1.3513 - val_accuracy: 0.5849
Epoch 3/3
10/10 - 81s - loss: 1.3270 - accuracy: 0.7331 - val_loss: 1.3268 - val_accuracy: 0.6604


<keras.callbacks.History at 0x7f15f04c1850>

In [None]:
# Class probabilities for the validation texts
value = model.predict(np.array(text_valid))
# Reduce probabilities and one-hot rows to integer class ids
y_val_pred = np.argmax(value, axis=1)
y_val_true = np.argmax(y_val, axis=1)
# Report rows by class name (as the SVM report does) rather than by integer id.
# labels= keeps the names aligned even if a class is absent from this split.
print(classification_report(
    y_val_true,
    y_val_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

              precision    recall  f1-score   support

           0       1.00      0.17      0.29        12
           1       0.86      0.86      0.86        14
           2       0.46      0.86      0.60        14
           3       0.82      0.69      0.75        13

    accuracy                           0.66        53
   macro avg       0.78      0.64      0.62        53
weighted avg       0.78      0.66      0.63        53



# CNN

**Universal-sentence-encoder-multilingual:รองรับคำที่ไม่เคยเห็นได้ดีกว่า**

In [None]:
import tensorflow_hub as hub
# Not referenced by name in this notebook.
# NOTE(review): presumably imported for its side effect of registering the
# text ops the multilingual encoder needs — confirm on the TF Hub model page.
import tensorflow_text
# NOTE(review): the install cell pins tensorflow-text==2.6.0, so an earlier
# "tensorflow 2.1.0" note here looked stale — confirm the intended TF version.
import tensorflow as tf

# Load the multilingual Universal Sentence Encoder (version 3) from TF Hub
enc = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')

In [None]:
# Fresh 80/20 split for the sentence-encoder experiment, re-indexed from 0
train_df, valid_df = (
    part.reset_index(drop=True)
    for part in train_test_split(all_df, test_size=0.2, random_state=1412)
)

In [None]:
# Row counts of the two parts (valid_df is reported under the "test_df" label)
print("train_df :{}, test_df:{}".format(len(train_df), len(valid_df)))

train_df :279, test_df:70


In [None]:
# String labels of the new splits (used as-is by the SVM)
y_train, y_valid = train_df['category'], valid_df['category']

In [None]:
# Per-batch embedding arrays are collected in these two separate lists
X_trains, X_vals = [], []
# Number of texts sent to the encoder per call
bs = 10

In [None]:
# Embed the texts bs rows at a time. The lists are rebuilt from scratch so
# re-running the cell cannot duplicate rows, and stepping by bs avoids sending
# an empty trailing batch when the row count is an exact multiple of bs.
X_vals = [
    enc(valid_df.texts[start:start + bs]).numpy()
    for start in tqdm(range(0, len(valid_df), bs))
]

X_trains = [
    enc(train_df.texts[start:start + bs]).numpy()
    for start in tqdm(range(0, len(train_df), bs))
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/8 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/28 [00:00<?, ?it/s]

In [None]:
# Stack the per-batch arrays along the row axis into one matrix per split
X_val = np.concatenate(X_vals, axis=0)
X_train = np.concatenate(X_trains, axis=0)
(X_train.shape, X_val.shape)

((279, 512), (70, 512))

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the sentence embeddings; class_weight='balanced' reweights
# classes by their frequency. random_state makes the fit repeatable (same
# value as the random_state used for the splits).
text_clf = LinearSVC(class_weight='balanced', random_state=1412)
text_clf.fit(X_train, y_train)

LinearSVC(class_weight='balanced')

In [None]:
# Predict labels for the validation embeddings and print per-class metrics
y_val_pred = text_clf.predict(X_val)
svm_report = classification_report(y_true=y_valid, y_pred=y_val_pred)
print(svm_report)

              precision    recall  f1-score   support

   1557menu1       0.94      0.94      0.94        16
   1557menu2       0.94      0.94      0.94        16
   1557menu3       0.94      0.80      0.86        20
         A&H       0.81      0.94      0.87        18

    accuracy                           0.90        70
   macro avg       0.91      0.90      0.90        70
weighted avg       0.91      0.90      0.90        70

