In [1]:
import pandas as pd
import numpy as np
from pythainlp import word_tokenize
from tqdm import tqdm_notebook
import re
import emoji

#viz
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn

In [2]:
def replace_url(text):
    URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    return re.sub(URL_PATTERN, 'xxurl', text)

def replace_rep(text):
    def _replace_rep(m):
        c,cc = m.groups()
        return f'{c}xxrep'
    re_rep = re.compile(r'(\S)(\1{2,})')
    return re_rep.sub(_replace_rep, text)

def ungroup_emoji(toks):
    res = []
    for tok in toks:
        if emoji.emoji_count(tok) == len(tok):
            for char in tok:
                res.append(char)
        else:
            res.append(tok)
    return res

def process_text(text):
    #pre rules
    res = text.lower().strip()
    res = replace_url(res)
    res = replace_rep(res)
    
    #tokenize
    res = [word for word in word_tokenize(res) if word and not re.search(pattern=r"\s+", string=word)]
    
    #post rules
    res = ungroup_emoji(res)
    
    return res

In [3]:
with open('train.txt') as f:
    texts = [line.strip() for line in f.readlines()]
f.close()

with open('train_label.txt') as f:
    categories = [line.strip() for line in f.readlines()]
f.close()

all_df = pd.DataFrame({'category':categories, 'texts':texts})
all_df.to_csv('all_df.csv',index=False)
all_df.shape

(24063, 2)

In [4]:
with open('test.txt') as f:
    texts = [line.strip() for line in f.readlines()]
f.close()

test_df = pd.DataFrame({'category':'test', 'texts':texts})
test_df.to_csv('test_df.csv',index=False)
test_df.shape

(2674, 2)

In [4]:
all_df = pd.read_csv('all_df.csv')
test_df = pd.read_csv('test_df.csv')

In [5]:
all_df

Unnamed: 0,category,texts
0,neu,ประเทศเราผลิตและส่งออกยาสูบเยอะสุดในโลกจิงป่าวคับ
1,neu,คะ
2,neg,อิเหี้ยออมทำกูอยากกินเอ็มเค
3,neu,😅😅😅
4,neu,สวัสดีวันพุธ แนน อะไรนะ
...,...,...
24058,neg,แม่งควายล้วนนน
24059,neg,ดอยสุเทพน้องง ไปหมดแล้วววว #pm25
24060,neg,ค่าชุดอาจจะแพงกว่าส่วนลด
24061,neu,รัฐต้องการแค่ภาษีครับ


In [6]:
all_df['processed'] = all_df.texts.map(lambda x: '|'.join(process_text(x)))
all_df['wc'] = all_df.processed.map(lambda x: len(x.split('|')))

test_df['processed'] = test_df.texts.map(lambda x: '|'.join(process_text(x)))
test_df['wc'] = test_df.processed.map(lambda x: len(x.split('|')))

In [7]:
all_df.category.value_counts() / all_df.shape[0]

neu    0.544612
neg    0.255164
pos    0.178698
q      0.021527
Name: category, dtype: float64

In [8]:
zero_numbering = {'neu':0, 'neg':1, 'pos':2, 'q':3}
all_df['labels'] = all_df['category'].apply(lambda x: zero_numbering[x])


In [9]:
all_df.head()

Unnamed: 0,category,texts,processed,wc,labels
0,neu,ประเทศเราผลิตและส่งออกยาสูบเยอะสุดในโลกจิงป่าวคับ,ประเทศ|เรา|ผลิต|และ|ส่งออก|ยาสูบ|เยอะ|สุด|ใน|โ...,13,0
1,neu,คะ,คะ,1,0
2,neg,อิเหี้ยออมทำกูอยากกินเอ็มเค,อิ|เหี้ย|ออม|ทำ|กู|อยาก|กิน|เอ็ม|เค,9,1
3,neu,😅😅😅,😅|xxrep,2,0
4,neu,สวัสดีวันพุธ แนน อะไรนะ,สวัสดี|วัน|พุธ|แนน|อะไร|นะ,6,0


In [10]:
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [11]:
#count number of occurences of each word
counts = Counter()
for index, row in all_df.iterrows():
    counts.update(row['processed'].split('|'))

In [12]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 26723
num_words after: 14937


In [13]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [60]:
len(vocab2index.keys())

14938

In [56]:
len(words)

14939

In [57]:
vocab2index

{'': 2710,
 'UNK': 1,
 'ประเทศ': 2,
 'เรา': 3,
 'ผลิต': 4,
 'และ': 5,
 'ส่งออก': 6,
 'ยาสูบ': 7,
 'เยอะ': 8,
 'สุด': 9,
 'ใน': 10,
 'โลก': 11,
 'จิง': 12,
 'ป่าว': 13,
 'คับ': 14,
 'คะ': 15,
 'อิ': 16,
 'เหี้ย': 17,
 'ออม': 18,
 'ทำ': 19,
 'กู': 20,
 'อยาก': 21,
 'กิน': 22,
 'เอ็ม': 23,
 'เค': 24,
 '😅': 25,
 'xxrep': 26,
 'สวัสดี': 27,
 'วัน': 28,
 'พุธ': 29,
 'แนน': 30,
 'อะไร': 31,
 'นะ': 32,
 'ก้อน': 33,
 'ขอบพระคุณ': 34,
 'มาก': 35,
 'ๆ': 36,
 'คร้าบ': 37,
 'ที่': 38,
 'มาหา': 39,
 'การ': 40,
 'บริการ': 41,
 'ของ': 42,
 'พนักงาน': 43,
 'อีกด้วย': 44,
 '<': 45,
 '3': 46,
 'เน็ต': 47,
 'ควาย': 48,
 'พูด': 49,
 'จริง': 50,
 'เสียดาย': 51,
 'ตังค์': 52,
 'ติดตั้ง': 53,
 'โค': 54,
 'รต': 55,
 'ห่วย': 56,
 'ยิ่ง': 57,
 'ดึก': 58,
 'ตอน': 59,
 'มา': 60,
 'ติด': 61,
 'ก็ดี': 62,
 'นานๆ': 63,
 'ไป': 64,
 'เเย่': 65,
 'ขึ้น': 66,
 'เรื่อยๆ': 67,
 'กาก': 68,
 'กว่า': 69,
 'ค่าย': 70,
 'อื่น': 71,
 'อ่ะ': 72,
 'บอก': 73,
 'ไม่': 74,
 'สน': 75,
 'หรอ': 76,
 'จะ': 77,
 'เป็น': 78,
 'เด็ก': 79,
 

In [55]:
len(vocab2index)

14938

In [136]:
import json

In [138]:

a_file = open("vocab2index.json", "w")
json.dump(vocab2index, a_file)
a_file.close()

a_file = open("vocab2index.json", "r")
output = json.load(a_file)
print(output)

{'': 2710, 'UNK': 1, 'ประเทศ': 2, 'เรา': 3, 'ผลิต': 4, 'และ': 5, 'ส่งออก': 6, 'ยาสูบ': 7, 'เยอะ': 8, 'สุด': 9, 'ใน': 10, 'โลก': 11, 'จิง': 12, 'ป่าว': 13, 'คับ': 14, 'คะ': 15, 'อิ': 16, 'เหี้ย': 17, 'ออม': 18, 'ทำ': 19, 'กู': 20, 'อยาก': 21, 'กิน': 22, 'เอ็ม': 23, 'เค': 24, '😅': 25, 'xxrep': 26, 'สวัสดี': 27, 'วัน': 28, 'พุธ': 29, 'แนน': 30, 'อะไร': 31, 'นะ': 32, 'ก้อน': 33, 'ขอบพระคุณ': 34, 'มาก': 35, 'ๆ': 36, 'คร้าบ': 37, 'ที่': 38, 'มาหา': 39, 'การ': 40, 'บริการ': 41, 'ของ': 42, 'พนักงาน': 43, 'อีกด้วย': 44, '<': 45, '3': 46, 'เน็ต': 47, 'ควาย': 48, 'พูด': 49, 'จริง': 50, 'เสียดาย': 51, 'ตังค์': 52, 'ติดตั้ง': 53, 'โค': 54, 'รต': 55, 'ห่วย': 56, 'ยิ่ง': 57, 'ดึก': 58, 'ตอน': 59, 'มา': 60, 'ติด': 61, 'ก็ดี': 62, 'นานๆ': 63, 'ไป': 64, 'เเย่': 65, 'ขึ้น': 66, 'เรื่อยๆ': 67, 'กาก': 68, 'กว่า': 69, 'ค่าย': 70, 'อื่น': 71, 'อ่ะ': 72, 'บอก': 73, 'ไม่': 74, 'สน': 75, 'หรอ': 76, 'จะ': 77, 'เป็น': 78, 'เด็ก': 79, 'เเล้ว': 80, 'เม้น': 81, 'ด่า': 82, 'เเบบ': 83, 'นี้': 84, 'เเต่': 85, 'หา': 86,

In [None]:
output

In [38]:
def encode_sentence(text, vocab2index, N=100):
    tokenized = text.split('|')
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [39]:
all_df['encoded'] = all_df['processed'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
all_df.head()



Unnamed: 0,category,texts,processed,wc,labels,encoded
0,neu,ประเทศเราผลิตและส่งออกยาสูบเยอะสุดในโลกจิงป่าวคับ,ประเทศ|เรา|ผลิต|และ|ส่งออก|ยาสูบ|เยอะ|สุด|ใน|โ...,13,0,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,neu,คะ,คะ,1,0,"[[15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,neg,อิเหี้ยออมทำกูอยากกินเอ็มเค,อิ|เหี้ย|ออม|ทำ|กู|อยาก|กิน|เอ็ม|เค,9,1,"[[16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 0, 0,..."
3,neu,😅😅😅,😅|xxrep,2,0,"[[25, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,neu,สวัสดีวันพุธ แนน อะไรนะ,สวัสดี|วัน|พุธ|แนน|อะไร|นะ,6,0,"[[27, 28, 29, 30, 31, 32, 0, 0, 0, 0, 0, 0, 0,..."


In [40]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(all_df, test_size=0.15, random_state=42)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [41]:
X_train = train_df['encoded']
X_valid = valid_df['encoded']
y_train = train_df['labels']
y_valid = valid_df['labels']

In [42]:
class SentimentDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [43]:
train_ds = SentimentDataset(X_train, y_train)
valid_ds = SentimentDataset(X_valid, y_valid)

In [44]:
def train_model(model, epochs=50, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 2 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [45]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [46]:
batch_size = 2048
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [152]:
model_fixed =  LSTM_fixed_len(vocab_size, 100, 50)

In [153]:
train_model(model_fixed, epochs=50, lr=0.01)

train loss 1.085, val loss 1.074, val accuracy 0.542, and val rmse 1.075
train loss 1.065, val loss 1.067, val accuracy 0.541, and val rmse 1.076
train loss 1.056, val loss 1.062, val accuracy 0.541, and val rmse 1.077
train loss 1.057, val loss 1.065, val accuracy 0.541, and val rmse 1.076
train loss 1.052, val loss 1.066, val accuracy 0.541, and val rmse 1.075
train loss 1.023, val loss 1.020, val accuracy 0.541, and val rmse 1.075
train loss 0.902, val loss 0.924, val accuracy 0.635, and val rmse 0.975
train loss 0.804, val loss 0.917, val accuracy 0.655, and val rmse 0.970
train loss 0.745, val loss 0.932, val accuracy 0.650, and val rmse 0.970
train loss 0.684, val loss 0.967, val accuracy 0.652, and val rmse 0.972
train loss 0.635, val loss 0.965, val accuracy 0.626, and val rmse 1.009
train loss 0.598, val loss 0.979, val accuracy 0.630, and val rmse 1.000
train loss 0.566, val loss 1.000, val accuracy 0.615, and val rmse 1.022
train loss 0.537, val loss 1.009, val accuracy 0.62

In [61]:
train_model(model_fixed, epochs=50, lr=0.01)

train loss 0.470, val loss 1.151, val accuracy 0.628, and val rmse 1.030
train loss 0.438, val loss 1.199, val accuracy 0.630, and val rmse 1.027
train loss 0.428, val loss 1.199, val accuracy 0.635, and val rmse 1.015
train loss 0.419, val loss 1.202, val accuracy 0.637, and val rmse 1.008
train loss 0.396, val loss 1.263, val accuracy 0.638, and val rmse 1.001
train loss 0.391, val loss 1.247, val accuracy 0.635, and val rmse 1.011
train loss 0.382, val loss 1.257, val accuracy 0.637, and val rmse 0.998
train loss 0.368, val loss 1.266, val accuracy 0.637, and val rmse 1.006
train loss 0.365, val loss 1.280, val accuracy 0.641, and val rmse 1.001
train loss 0.358, val loss 1.267, val accuracy 0.636, and val rmse 1.003
train loss 0.344, val loss 1.335, val accuracy 0.640, and val rmse 1.002
train loss 0.337, val loss 1.328, val accuracy 0.634, and val rmse 1.015
train loss 0.334, val loss 1.315, val accuracy 0.635, and val rmse 1.014
train loss 0.325, val loss 1.348, val accuracy 0.63

In [17]:
test_texts = 'มือถือเครื่องนี้ทำงานไม่ดีเลย'

In [18]:
test_texts = '|'.join(process_text(test_texts))


In [19]:
test_texts ,length_test_text = encode_sentence(test_texts,vocab2index )

In [20]:
PATH = 'model'

In [28]:
torch.save(model_loaded.state_dict(), PATH)

In [49]:
vocab_size

14939

In [62]:
model_loaded = LSTM_fixed_len(vocab_size, 50, 50)
model_loaded.load_state_dict(torch.load(PATH))
model_loaded.eval()

LSTM_fixed_len(
  (embeddings): Embedding(14939, 50, padding_idx=0)
  (lstm): LSTM(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=5, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [24]:
model_loaded = torch.load(PATH)
# model_loaded.eval()

In [51]:
y_pred = model_loaded(torch.from_numpy(test_texts.reshape(1,-1)), torch.Tensor([length_test_text]))

In [52]:
np.argmax(y_pred.detach().numpy())

1