## Reading Data

In [1]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

In [2]:
#  Seed 고정
def torch_seed(random_seed=42):

    torch.manual_seed(random_seed)

    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed) # if use multi-GPU

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    np.random.seed(random_seed)
    random.seed(random_seed)

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

In [61]:
from kiwipiepy import Kiwi, Option
kiwi = Kiwi(num_workers=4, options=Option.LOAD_DEFAULT_DICTIONARY | Option.INTEGRATE_ALLOMORPH)
kiwi.prepare()

def tokenizer_kiwi(text):
    result = kiwi.analyze(text)
    for token, _, _, _ in result[0][0]:
            yield f'{token}'
# 성능이 생각보다 안나옴

In [62]:
#from transformers import ElectraTokenizer
#tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [63]:
TEXT = data.Field(batch_first = True, fix_length = 200, 
                  tokenize=tokenizer_kiwi, pad_first=True, pad_token='[PAD]', unk_token='[UNK]')
LABEL = data.LabelField(dtype=torch.float)

train_data = data.TabularDataset(path='train_data.csv', 
                    format='csv', 
                    fields=[("text",TEXT),
                            ("label",LABEL)],  
                    skip_header=True)

test_data = data.TabularDataset(path='test_data.csv', 
                    format='csv', 
                    fields=[("text",TEXT),
                            ("label",LABEL)],  
                    skip_header=True)

In [64]:
print(f'Train Data Length : {len(train_data.examples)}')  # 데이터의 개수를 확인
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 50000
Test Data Length : 15000


In [65]:
print('---- Data Sample ----')
print('Input : ')
print(' '.join(vars(train_data.examples[0])['text']),'\n')  # vars() : 데이터의 값을 직접 확인
print('Label : ')
print(vars(train_data.examples[1])['label'])  

---- Data Sample ----
Input : 
절대 가 지 말 시 어요 몸 다 상하 ᆸ니다 부서 절대 요 

Label : 
0


## Pre-processing Data

In [66]:
def PreProcessingText(input_sentence):
    input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    return input_sentence

In [67]:
# 간단한 Data Cleansing 작업
for example in train_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()
    
for example in test_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()

## Making Vocab & Setting Embedding

In [68]:
model_config = {'emb_type' : '', 'emb_dim' : 300}

In [69]:
# build_vocab() : Text Data와 Label Data의 Vocab을 만듦
TEXT.build_vocab(train_data,  
                 min_freq = 2,  # vocab에 해당하는 token에 최소한으로 등장하는 횟수 
                 max_size = None,  # 전체 vocab size 자체에 제한
                 vectors = f"glove.6B.{model_config['emb_dim']}d")  # pre-trained vector

LABEL.build_vocab(train_data)

model_config['vocab_size'] = len(TEXT.vocab) 

In [70]:
# Vocabulary Info
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
for idx, (k, v) in enumerate(TEXT.vocab.stoi.items()):
    if idx >= 10:  # 상위 10개 Vacab의 단어와 index 값을 가져옴
        break    
    print('\t', k, v)

print('---------------------------------')

# Label Info
print(f'Label Size : {len(LABEL.vocab)}')

print('Lable Examples : ')
for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()):
    print('\t', k, v)

Vocab Size : 9136
Vocab Examples : 
	 [UNK] 0
	 [PAD] 1
	 하 2
	 이 3
	 는 4
	 ᆫ 5
	 은 6
	 고 7
	 있 8
	 회사 9
---------------------------------
Label Size : 2
Lable Examples : 
	 0 0
	 1 1


## Spliting Validation Data & Making Data Iterator

In [71]:
train_data, valid_data = train_data.split(random_state = random.seed(42), split_ratio=0.8)

In [72]:
model_config['batch_size'] = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(datasets=(train_data, valid_data, test_data), 
                                                                           batch_size=model_config['batch_size'], device=device, sort=False)

## Sample Data

In [73]:
# Check batch data
sample_for_check = next(iter(train_iterator))  # 돌아가지 않을 경우 torchtext 버전이 0.3.1 버전이 맞는지 확인
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 64]
	[.text]:[torch.cuda.LongTensor of size 64x200 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 64 (GPU 0)]
tensor([[   1,    1,    1,  ...,    3,    5,    9],
        [   1,    1,    1,  ..., 1655,    8,   54],
        [   1,    1,    1,  ...,   10,  123,  471],
        ...,
        [   1,    1,    1,  ...,  210,    5,  435],
        [   1,    1,    1,  ...,   15,    6,    9],
        [   1,    1,    1,  ...,   34,    4,   26]], device='cuda:0')
tensor([1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
        0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1.,
        0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
        0., 0., 1., 0., 1., 0., 1., 0., 1., 0.], device='cuda:0')


In [74]:
print(' '.join([TEXT.vocab.itos[int(x)] for x in sample_for_check.text[0,:] if x not in [0,1]]))
print(LABEL.vocab.itos[int(sample_for_check.label[0])]) 

사람 들 이 잘 모르 지만 자기 도 모르 게 대한민국 의 많 은 직장인 들 이 이미 이용 중 이 ᆫ 회사
1


## Modeling

In [75]:
# CODE HERE ... 
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()

        if model_config['emb_type'] == 'glove' or 'fasttext':  # Vocab size * Embedding_dimension 행렬을 만들어 학습 
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],  
                                    embedding_dim = model_config['emb_dim'],  # 원하는 Embedding_dimension을 설정 
                                    _weight = TEXT.vocab.vectors)  # Pre-Trained Vector를 Embedding 행렬의 Initial Value로 설정
                                                                   # 이 옵션이 없는 경우 정규 분포에서 생성한 값을 Initial Value로 설정
        else:
            self.emb = nn.Embedding(num_embeddings = model_config['vocab_size'],
                                    embedding_dim = model_config['emb_dim'])
        
        self.bidirectional = model_config['bidirectional']  # 두 개의 독립적인 RNN을 합친 것으로 정방향과 역방향 순서 정보를 모두 학습
        self.num_direction = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type'] 
        
        if model_config['model_type'] == 'RNN':
            self.RNN = nn.RNN (input_size = model_config['emb_dim'],  
                           hidden_size = model_config['hidden_dim'],  
                           dropout = model_config['dropout'],  
                           bidirectional = model_config['bidirectional'], 
                           batch_first = model_config['batch_first'],
                           num_layers = model_config['num_layers'])  # 사용자 지정 Hyperparameter
            
        elif model_config['model_type'] == 'LSTM':
            self.RNN = nn.LSTM (input_size = model_config['emb_dim'],  
                           hidden_size = model_config['hidden_dim'],  
                           dropout = model_config['dropout'],  
                           bidirectional = model_config['bidirectional'], 
                           batch_first = model_config['batch_first'],
                           num_layers = model_config['num_layers'])  # 사용자 지정 Hyperparameter
            
        elif model_config['model_type'] == 'GRU':
            self.RNN = nn.GRU (input_size = model_config['emb_dim'],  
                           hidden_size = model_config['hidden_dim'],  
                           dropout = model_config['dropout'],  
                           bidirectional = model_config['bidirectional'], 
                           batch_first = model_config['batch_first'],
                           num_layers = model_config['num_layers'])  # 사용자 지정 Hyperparameter
        
    
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_direction, model_config['output_dim'])
        
        self.drop = nn.Dropout(model_config['dropout'])

    def forward(self, x):
        # x : (Batch_Size, Max_Seq_Length)
        
        emb = self.emb(x) 
        # emb : (Batch_Size, Max_Seq_Length, Emb_dim)

        output, hidden = self.RNN(emb)
        # output : (Batch_Size, Max_Seq_Length, Hidden_dim * num_direction) 
        # hidden : (num_direction, Batch_Size, Hidden_dim)
        # hidden의 경우, batch_first 옵션이 안먹는 문제가 있음
        
        last_output = output[:,-1,:]
        # last_output : (Batch_Size, Hidden_dim * num_direction)
        
        return self.fc(self.drop(last_output))

In [76]:
model_config.update(dict(batch_first = True,
                         model_type = 'RNN',
                         bidirectional = True,
                         hidden_dim = 128,
                         output_dim = 1,
                         dropout = 0,
                         num_layers = 1))

In [77]:
model = SentenceClassification(**model_config).to(device)

In [78]:
predictions = model.forward(sample_for_check.text).squeeze()

In [79]:
loss_fn = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() 
    acc = correct.sum()/len(correct)
    return acc

In [80]:
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

In [81]:
print(predictions)
print(loss, acc)

tensor([0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238, 0.0238,
        0.0238], device='cuda:0', grad_fn=<SqueezeBackward0>)
tensor(0.6932, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>) tensor(0.5000, device='cuda:0')


### Training

In [82]:
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() 
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator):
        
        # Initializing
        optimizer.zero_grad()
        
        # Forward 
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}")

        # Backward 
        loss.backward()
        optimizer.step()
        
        # Update Epoch Performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator) , epoch_acc/len(iterator)

In [83]:
def evaluate(model, iterator, loss_fn):
    
    epoch_loss = 0
    epoch_acc = 0
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### bi-RNN

In [84]:
model_config['model_type'] = 'RNN'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [85]:
N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-RNN_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.5322 | Train Acc : 0.7361
	 Epoch : 0 | Valid Loss : 0.4961 | Valid Acc : 0.7688
	 Saved at 1-epoch
	 Epoch : 1 | Train Loss : 0.4415 | Train Acc : 0.8044
	 Epoch : 1 | Valid Loss : 0.4858 | Valid Acc : 0.7735
	 Saved at 2-epoch
	 Epoch : 2 | Train Loss : 0.3849 | Train Acc : 0.8365
	 Epoch : 2 | Valid Loss : 0.4854 | Valid Acc : 0.7836
	 Epoch : 3 | Train Loss : 0.341 | Train Acc : 0.8608
	 Epoch : 3 | Valid Loss : 0.5108 | Valid Acc : 0.7822
	 Epoch : 4 | Train Loss : 0.3154 | Train Acc : 0.8713
	 Epoch : 4 | Valid Loss : 0.5239 | Valid Acc : 0.7712


In [86]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.486 | Test Acc : 0.7836


### LSTM

In [87]:
model_config['model_type'] = 'LSTM'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [88]:
N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-LSTM_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.4904 | Train Acc : 0.7643
	 Epoch : 0 | Valid Loss : 0.4668 | Valid Acc : 0.7949
	 Saved at 1-epoch
	 Epoch : 1 | Train Loss : 0.3954 | Train Acc : 0.8258
	 Epoch : 1 | Valid Loss : 0.4596 | Valid Acc : 0.7896
	 Saved at 2-epoch
	 Epoch : 2 | Train Loss : 0.3537 | Train Acc : 0.8465
	 Epoch : 2 | Valid Loss : 0.4553 | Valid Acc : 0.7974
	 Epoch : 3 | Train Loss : 0.3182 | Train Acc : 0.8647
	 Epoch : 3 | Valid Loss : 0.4742 | Valid Acc : 0.7992
	 Epoch : 4 | Train Loss : 0.2858 | Train Acc : 0.8793
	 Epoch : 4 | Valid Loss : 0.5173 | Valid Acc : 0.7945


In [89]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.4519 | Test Acc : 0.802


### GRU

In [90]:
model_config['model_type'] = 'GRU'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [91]:
N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-GRU_
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.4789 | Train Acc : 0.7718
	 Epoch : 0 | Valid Loss : 0.4318 | Valid Acc : 0.804
	 Epoch : 1 | Train Loss : 0.3835 | Train Acc : 0.8341
	 Epoch : 1 | Valid Loss : 0.439 | Valid Acc : 0.8025
	 Epoch : 2 | Train Loss : 0.3361 | Train Acc : 0.8559
	 Epoch : 2 | Valid Loss : 0.4563 | Valid Acc : 0.8021
	 Epoch : 3 | Train Loss : 0.2974 | Train Acc : 0.8755
	 Epoch : 3 | Valid Loss : 0.4627 | Valid Acc : 0.8008
	 Epoch : 4 | Train Loss : 0.264 | Train Acc : 0.8918
	 Epoch : 4 | Valid Loss : 0.556 | Valid Acc : 0.7907


In [92]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.4386 | Test Acc : 0.8028
