In [1]:
import torch
import torch.nn as nn

### RNN
- 문장의 context를 바탕으로 추론

In [2]:
class RNNClassifier(nn.Module):

    def __init__(
        self,
        input_size,
        word_vec_size,
        hidden_size,
        n_classes,
        n_layers=4,
        dropout_p=.3
    ):
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        super().__init__()

        self.emb = nn.Embedding(input_size, word_vec_size)
        self.rnn = nn.LSTM(
            input_size=word_vec_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout_p,
            batch_first=True,
            bidirectional=True
        )
        self.generator = nn.Linear(hidden_size * 2, n_classes)
        # Use LogSoftmax + NLLLoss
        self.activation = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # |x| = (batch_size, length)
        x = self.emb(x)
        # |x| = (batch_size, length, word_vec_size)
        x, _ = self.rnn(x)
        # |x| = (batch_size, length, hidden_size * 2)
        # |x[:,-1]| = (bs, 1, hidden_size * 2)
        y = self.activation(self.generator(x[:, -1]))
        # |y| = (batch_size, n_classes)

        return y

### Convolutional Neural Networks
- 문장 내 단어의 패턴을 인식
- x -> Embedding
  - |CNN_input| = (length, word_vec_size)
  - |kernel_size| = (#filters, windows, word_vec_size)
  - |CNN_output| = (#filters, length - windows + 1, 1)
  - ` windows = 패턴 내 단어의 갯수, hyperparameter
- CNN -> Sentence Embedding

In [3]:
class CNNClassifier(nn.Module):

    def __init__(
        self,
        input_size,
        word_vec_size,
        n_classes,
        use_batch_norm=False,
        dropout_p=.5,
        window_sizes=[3, 4, 5],
        n_filters=[100, 100, 100]
    ):
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.n_classes = n_classes
        self.use_batch_norm = use_batch_norm
        self.dropout_p = dropout_p
        # window_size = how many words a pattern cover
        self.window_sizes = window_sizes
        # n_filters = how many patterns to cover
        self.n_filters = n_filters

        super().__init__()

        self.emb = nn.Embedding(input_size, word_vec_size)
        # Use nn.ModuleList to register each sub-modules
        # 단순 list에 담을경우, module에서 layer로 인식을 못해서 optimizer에서 parameter를 못가져옴
        self.feature_extractors = nn.ModuleList()

        for window_size, n_filter in zip(window_sizes, n_filters):
            self.feature_extractors.append(
                nn.Sequential(
                    nn.Conv2d(
                        in_channels=1, # 만약 컬러그림이었으면 3채널
                        out_channels=n_filter,
                        kernel_size=(window_size, word_vec_size),
                    ),
                    nn.ReLU(),
                    nn.BatchNorm2d(n_filter) if use_batch_norm else nn.Dropout(dropout_p)
                )
            )
                                  # 차후에 concat 해서 진행할 거라서
        self.generator = nn.Linear(sum(n_filters), n_classes)
        self.activation= nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # |x| = (batch_size, length)
        x = self.emb(x)
        # |x| = (batch_size, length, word_Vec_size)
        min_length = max(self.window_sizes)
        if min_length > x.size(1):
            # .new() x와 같은 tensor 타입, 같은 device에 있는 tnesor 생성
            pad = x.new(x.size(0), min_length - x.size(1), self.word_vec_size).zero_()
            # |pad| = (batch_size, min_length - length, word_vec_size)
            x = torch.cat([x, pad], dim=1)
            # |x| = (batch_size, min_length, word_vec_size)
        
        # cnn의 적용을 위해 적절한 shape로 바꿔줌
        # self.Conv2d에서 in_channel 1로 지정한 이유
        x = x.unsqueeze(1)
        # |x| = (batch_size, 1, length, word_vec_size)

        cnn_outs = []
        for block in self.feature_extractors:
            cnn_out = block(x)
            # |cnn_out| = (batch_size, n_filter, length - window_size + 1, 1)
            
            # 학습하는 layer가 아니기 때문에 functional로 선언 가능
            cnn_out = nn.functional.max_pool1d(
                input=cnn_out.squeeze(-1),
                kernel_size=cnn_out.size(-2) # (batch_size, n_filter, 1)
            ).squeeze(-1)
            # |cnn_out| = (batch_size, n_filter)
            cnn_outs += [cnn_out]
        cnn_outs = torch.cat(cnn_outs, dim=-1)
        # |cnn_outs| = (batch_size, sum(n_filters))
        y = self.activation(self.generator(cnn_outs))
        # |y| = (batch_size, n_classes)

        return y

## Trainer

In [7]:
from copy import deepcopy

import numpy as np

import torch

from ignite.engine import Engine
from ignite.engine import Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar

VERBOSE_SILENT = 0
VERBOSE_EPOCH_WISE = 1
VERBOSE_BATCH_WISE = 2


class MyEngine(Engine):

    def __init__(self, func, model, crit, optimizer, config):
        # Ignite Engine does not have objects in below lines.
        # Thus, we assign class variables to access these object, during the procedure.
        self.model = model
        self.crit = crit
        self.optimizer = optimizer
        self.config = config

        super().__init__(func) # Ignite Engine only needs function to run.

        self.best_loss = np.inf
        self.best_model = None

        self.device = next(model.parameters()).device

    @staticmethod
    def train(engine, mini_batch):
        # You have to reset the gradients of all model parameters
        # before to take another step in gradient descent.
        engine.model.train() # Because we assign model as class variable, we can easily access to it.
        engine.optimizer.zero_grad()

        x, y = mini_batch.text, mini_batch.label
        x, y = x.to(engine.device), y.to(engine.device)
        
        # 정제되지 않은 text여서, 무의미하게 긴 문장 축소
        x = x[:, :engine.config.max_length]

        # Take feed-forward
        y_hat = engine.model(x)

        loss = engine.crit(y_hat, y)
        loss.backward()

        # Calculate accuracy only if 'y' is LongTensor,
        # which means that 'y' is one-hot representation.
        if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
            accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
        else:
            accuracy = 0

        # Take a step of gradient descent.
        engine.optimizer.step()

        return {
            'loss': float(loss),
            'accuracy': float(accuracy),
        }

    @staticmethod
    def validate(engine, mini_batch):
        engine.model.eval()

        with torch.no_grad():
            x, y = mini_batch.text, mini_batch.label
            x, y = x.to(engine.device), y.to(engine.device)

            x = x[:, :engine.config.max_length]

            y_hat = engine.model(x)

            loss = engine.crit(y_hat, y)

            if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
            else:
                accuracy = 0

        return {
            'loss': float(loss),
            'accuracy': float(accuracy),
        }

    @staticmethod
    def attach(train_engine, validation_engine):
        def attach_running_average(engine, metric_name):
            RunningAverage(output_transform=lambda x: x[metric_name]).attach(
                engine,
                metric_name,
            )


        training_metric_names = ['loss', 'accuracy']

        for metric_name in training_metric_names:
            attach_running_average(train_engine, metric_name)

        # If the verbosity is set, progress bar would be shown for mini-batch iterations.
        # Without ignite, you can use tqdm to implement progress bar.
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)

        # If the verbosity is set, statistics would be shown after each epoch.
        @train_engine.on(Events.EPOCH_COMPLETED)
        def print_train_logs(engine):
            print('Epoch {} - loss={:.4e} accuracy={:.4f}'.format(
                engine.state.epoch,
                engine.state.metrics['loss'],
                engine.state.metrics['accuracy'],
            ))


        validation_metric_names = ['loss', 'accuracy']

        for metric_name in validation_metric_names:
            attach_running_average(validation_engine, metric_name)
            
        # Do same things for validation engine.
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_logs(engine):
            print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format(
                engine.state.metrics['loss'],
                engine.state.metrics['accuracy'],
                engine.best_loss,
            ))

    @staticmethod
    def check_best(engine):
        loss = float(engine.state.metrics['loss'])
        if loss <= engine.best_loss: # If current epoch returns lower validation loss,
            engine.best_loss = loss  # Update lowest validation loss.
            engine.best_model = deepcopy(engine.model.state_dict()) # Update best model weights.

    @staticmethod
    def save_model(engine, train_engine, config, **kwargs):
        torch.save(
            {
                'model': engine.best_model,
                'config': config,
                **kwargs
            }, config.model_fn
        )


class Trainer():

    def __init__(self, config):
        self.config = config

    def train(
        self,
        model, crit, optimizer,
        train_loader, valid_loader,
    ):
        train_engine = MyEngine(
            MyEngine.train,
            model, crit, optimizer, self.config
        )
        validation_engine = MyEngine(
            MyEngine.validate,
            model, crit, optimizer, self.config
        )

        MyEngine.attach(
            train_engine,
            validation_engine
        )

        def run_validation(engine, validation_engine, valid_loader):
            validation_engine.run(valid_loader, max_epochs=1)

        train_engine.add_event_handler(
            Events.EPOCH_COMPLETED, # event
            run_validation, # function
            validation_engine, valid_loader, # arguments
        )
        validation_engine.add_event_handler(
            Events.EPOCH_COMPLETED, # event
            MyEngine.check_best, # function
        )
        # validation_engine.add_event_handler(
        #     Events.EPOCH_COMPLETED,
        #     MyEngine.save_model,
        #     train_engine, self.config,
        # )

        train_engine.run(
            train_loader,
            max_epochs=self.config.n_epochs,
        )

        model.load_state_dict(validation_engine.best_model)

        return model


## DataLoader

In [8]:
import torchtext
version = list(map(int, torchtext.__version__.split('.')))
if version[0] <= 0 and version[1] < 9:
    from torchtext import data
else:
    from torchtext.legacy import data


class DataLoader(object):
    def __init__(
        self, train_fn,
        batch_size=64,
        valid_ratio=.2,
        device=-1,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True,
    ):
        super().__init__()

        # Define field of the input file.
        # The input file consists of two fields.
        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None,
        )
        # Those defined two columns will be delimited by TAB.
        # Thus, we use TabularDataset to load two columns in the input file.
        # We would have two separate input file: train_fn, valid_fn
        # Files consist of two columns: label field and text field.
        train, valid = data.TabularDataset(
            path=train_fn,
            format='tsv', 
            fields=[
                ('label', self.label),
                ('text', self.text),
            ],
        ).split(split_ratio=(1 - valid_ratio))
        
        # Those loaded dataset would be feeded into each iterator:
        # train iterator and valid iterator.
        # We sort input sentences by length, to group similar lengths.
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device='cuda:%d' % device if device >= 0 else 'cpu',
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )

        # At last, we make a vocabulary for label and text field.
        # It is making mapping table between words and indice.
        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)


## Train.py

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim


class Arg :

    def __init__(self):
        self.model_fn = './model.pth'
        self.train_fn = './review.sorted.uniq.refined.tok.shuf.train.tsv'
        self.gpu_id = 0
        self.min_vocab_freq = 5
        self.max_vocab_size = 999999
        self.batch_size = 128
        self.n_epochs = 5
        self.word_vec_size = 256
        self.hidden_size = 512
        self.n_layers = 4
        self.dropout = .3
        self.max_length = 256
        self.rnn = True
        self.cnn = True
        self.use_batch_norm = True
        self.window_sizes = [3, 4, 5]
        self.n_filters = [100, 100, 100]

def main(config):
    loaders = DataLoader(
        train_fn=config.train_fn,
        batch_size=config.batch_size,
        min_freq=config.min_vocab_freq,
        max_vocab=config.max_vocab_size,
        device=config.gpu_id
    )

    print(
        '|train| =', len(loaders.train_loader.dataset),
        '|valid| =', len(loaders.valid_loader.dataset),
    )
    
    vocab_size = len(loaders.text.vocab)
    n_classes = len(loaders.label.vocab)
    print('|vocab| =', vocab_size, '|classes| =', n_classes)

    if config.rnn is False and config.cnn is False:
        raise Exception('You need to specify an architecture to train. (--rnn or --cnn)')

    if config.rnn:
        # Declare model and loss.
        model = RNNClassifier(
            input_size=vocab_size,
            word_vec_size=config.word_vec_size,
            hidden_size=config.hidden_size,
            n_classes=n_classes,
            n_layers=config.n_layers,
            dropout_p=config.dropout,
        )
        optimizer = optim.Adam(model.parameters())
        crit = nn.NLLLoss()
        print(model)

        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        rnn_trainer = Trainer(config)
        rnn_model = rnn_trainer.train(
            model,
            crit,
            optimizer,
            loaders.train_loader,
            loaders.valid_loader
        )
    if config.cnn:
        # Declare model and loss.
        model = CNNClassifier(
            input_size=vocab_size,
            word_vec_size=config.word_vec_size,
            n_classes=n_classes,
            use_batch_norm=config.use_batch_norm,
            dropout_p=config.dropout,
            window_sizes=config.window_sizes,
            n_filters=config.n_filters,
        )
        optimizer = optim.Adam(model.parameters())
        crit = nn.NLLLoss()
        print(model)

        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
            crit.cuda(config.gpu_id)

        cnn_trainer = Trainer(config)
        cnn_model = cnn_trainer.train(
            model,
            crit,
            optimizer,
            loaders.train_loader,
            loaders.valid_loader
        )

    torch.save({
        'rnn': rnn_model.state_dict() if config.rnn else None,
        'cnn': cnn_model.state_dict() if config.cnn else None,
        'config': config,
        'vocab': loaders.text.vocab,
        'classes': loaders.label.vocab,
    }, config.model_fn)


if __name__ == '__main__':
    config = Arg()
    main(config)


|train| = 118314 |valid| = 29579
|vocab| = 12986 |classes| = 2
RNNClassifier(
  (emb): Embedding(12986, 256)
  (rnn): LSTM(256, 512, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True)
  (generator): Linear(in_features=1024, out_features=2, bias=True)
  (activation): LogSoftmax(dim=-1)
)


  0%|                                                                                           | 1/925 [00:00…

Epoch 1 - loss=2.1011e-01 accuracy=0.9242


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=1.7917e-01 accuracy=0.9362 best_loss=inf


  0%|                                                                                           | 1/925 [00:00…

Epoch 2 - loss=1.7959e-01 accuracy=0.9374


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=1.5846e-01 accuracy=0.9441 best_loss=1.7917e-01


  0%|                                                                                           | 1/925 [00:00…

Epoch 3 - loss=1.5974e-01 accuracy=0.9445


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=1.4590e-01 accuracy=0.9484 best_loss=1.5846e-01


  0%|                                                                                           | 1/925 [00:00…

Epoch 4 - loss=1.4941e-01 accuracy=0.9506


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=1.6026e-01 accuracy=0.9485 best_loss=1.4590e-01


  0%|                                                                                           | 1/925 [00:00…

Epoch 5 - loss=1.0395e-01 accuracy=0.9659


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=1.6445e-01 accuracy=0.9479 best_loss=1.4590e-01
CNNClassifier(
  (emb): Embedding(12986, 256)
  (feature_extractors): ModuleList(
    (0): Sequential(
      (0): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1))
      (1): ReLU()
      (2): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1))
      (1): ReLU()
      (2): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): Sequential(
      (0): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1))
      (1): ReLU()
      (2): BatchNorm2d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (generator): Linear(in_features=300, out_features=2, bias=True)
  (activation): LogSoftmax(dim=-1)
)


  0%|                                                                                           | 1/925 [00:00…

Epoch 1 - loss=2.1976e-01 accuracy=0.9208


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=3.2039e-01 accuracy=0.8795 best_loss=inf


  0%|                                                                                           | 1/925 [00:00…

Epoch 2 - loss=1.7260e-01 accuracy=0.9406


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=2.0270e-01 accuracy=0.9227 best_loss=3.2039e-01


  0%|                                                                                           | 1/925 [00:00…

Epoch 3 - loss=1.5276e-01 accuracy=0.9469


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=4.0785e-01 accuracy=0.8642 best_loss=2.0270e-01


  0%|                                                                                           | 1/925 [00:00…

Epoch 4 - loss=1.1788e-01 accuracy=0.9605


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=3.4233e-01 accuracy=0.8921 best_loss=2.0270e-01


  0%|                                                                                           | 1/925 [00:00…

Epoch 5 - loss=9.4036e-02 accuracy=0.9661


  0%|3                                                                                          | 1/232 [00:00…

Validation - loss=3.0412e-01 accuracy=0.9076 best_loss=2.0270e-01


## classify.py

In [32]:
import sys
from tqdm import tqdm

def read_text():
    lines = []

    with open('./review.sorted.uniq.refined.tok.shuf.test.tsv', 'r') as f:
        for line in f.readlines() :
            lines += [line.split('\t')[1].strip()]

    # for line in sys.stdin:
    #     if line.strip() != '':
    #         lines += [line.strip().split(' ')]
    return lines[:30]

def define_field():
    return (
        data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
        ),
        data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )
    )

def main(config):
    saved_data = torch.load(
        './model.pth',
        map_location='cuda:0'
    )

    train_config = saved_data['config']
    rnn_best = saved_data['rnn']
    cnn_best = saved_data['cnn']
    vocab = saved_data['vocab']
    classes = saved_data['classes']

    vocab_size = len(vocab)
    n_classes = len(classes)

    text_field, label_field = define_field()
    text_field.vocab = vocab
    label_field.vocab = classes

    lines = read_text()

    with torch.no_grad():
        # convert string to index(one-hot vector 인덱스로 받아옴)
        x = text_field.numericalize(
            text_field.pad(lines),
            device='cuda:0'
        )

        ensemble = []

        if rnn_best is not None:
            model = RNNClassifier(
                input_size=vocab_size,
                word_vec_size=train_config.word_vec_size,
                hidden_size=train_config.hidden_size,
                n_classes=n_classes,
                n_layers=train_config.n_layers,
                dropout_p=train_config.dropout
            )
            model.load_state_dict(rnn_best)
            ensemble += [model]

        if cnn_best is not None:
            model = CNNClassifier(
                input_size=vocab_size,
                word_vec_size=train_config.word_vec_size,
                n_classes=n_classes,
                use_batch_norm=train_config.use_batch_norm,
                dropout_p=train_config.dropout,
                window_sizes=train_config.window_sizes,
                n_filters=train_config.n_filters
            )
            model.load_state_dict(cnn_best)
            ensemble += [model]

        y_hats = []

        for model in ensemble:
            model.cuda(0)

            model.eval()

            y_hat = []
            for idx in tqdm(range(0, len(lines), 32)):
                y_hat += [model(x[idx:idx + 32])]
            y_hat = torch.cat(y_hat, dim=0)
            # |y_hat| = (len(lines), n_classes)

            y_hats += [y_hat]

        y_hats = torch.stack(y_hats).exp()

        y_hats = y_hats.sum(dim=0) / len(ensemble)

        probs, indice = y_hats.cpu().topk(1)
        for i in range(len(lines)):
            sys.stdout.write('%s\t%s\n' % (
                ' '.join([classes.itos[indice[i]]]),
                ' '.join(lines[i]))
            )


if __name__ == '__main__':
    main({})



100%|██████████| 1/1 [00:00<00:00, 14.11it/s]
100%|██████████| 1/1 [00:00<00:00, 907.47it/s]

positive	생 각   보 다   밝   아 요   ㅎ ㅎ
negative	쓸   대   가   없   네 요
negative	깔   금   해 요   .   가 벼 워   요   .   설 치   가   쉬 워 요   .   타   사 이 트   에   비 해   가 격   도   저 렴   하   답 니 다   .
negative	크 기   나   두 께   가   딱   제   가   원   하   던   사 이 즈   네 요   .   책 상   의 자   가   너 무   딱 딱   해 서   쿠 션   감   좋   은   방 석   이   필 요   하   던   차   에   좋   은   제 품   만 났   네 요   .   냄 새   얘 기   하   시   는   분   도   더 러   있   던 데   별 로   냄 새   안   나   요   .
positive	빠 르   고   괜 찬   습 니 다   .
positive	유 통   기 한   도   넉 넉   하   고   좋   아 요
positive	좋   은   가 격   에   좋   은   상 품   잘   쓰   겠   습 니 다   .
negative	사 이 트   에 서   늘   생 리 대   사   서   쓰   는 데   오 늘   처 럼   이 렇 게   비 닐   에   포 장   되   어   받   아   본   건   처 음   입 니 다   .   위 생   용 품   이   고   자 체   도   비 닐   포 장   이   건 만   소 형   박 스   에   라 도   넣   어   보 내   주   시   지   .   . .
negative	연 결   부 분   이   많 이   티   가   납 니 다   .   재 질   구 김   도   좀   있   습 니 다   .
negative	애 기   태 열   때 문   에   구 매   해 서   잘   쓰   고   있   습 니 다   .
positive	항 상   쓰   던  


