# Import requirements

In [1]:
!pip install Sentencepiece
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 14.0 MB/s 
[?25hInstalling collected packages: Sentencepiece
Successfully installed Sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 15.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 72.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 83.4 MB/s 


In [None]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW,
    
    AlbertTokenizer, AlbertForSequenceClassification
)

# 1. Preprocess

In [None]:
def make_id_file(task, tokenizer):
    def make_data_strings(file_name):
        data_strings = []
        with open(os.path.join(file_name), 'r', encoding='utf-8') as f:
            id_file_data = [tokenizer.encode(line.lower()) for line in f.readlines()]
        for item in id_file_data:
            data_strings.append(' '.join([str(k) for k in item]))
        return data_strings
    
    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [None]:
from google.colab import files
uploaded = files.upload()

Saving sentiment.train.1 to sentiment.train.1
Saving sentiment.train.0 to sentiment.train.0
Saving test_no_label.csv to test_no_label.csv
Saving sentiment.dev.1 to sentiment.dev.1
Saving sentiment.dev.0 to sentiment.dev.0


In [None]:
!ls

/bin/bash: ls3: command not found


In [None]:
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

it will take some times...
make id file finished!


In [None]:
train_pos[:10]

['2 5977 950 13 9 3',
 '2 18656 7705 365 13 9 3',
 '2 59 67 57 1954 621 18 17 1392 5262 56 25 510 254 13 9 3',
 '2 32 13 22 18 21 254 13865 69 20538 7298 13 9 3',
 '2 14 1138 25 4753 13 9 3',
 '2 254 748 950 13 9 3',
 '2 254 365 13 9 3',
 '2 11554 16 208 25 27269 17 7503 16 621 18 13 9 3',
 '2 374 209 26 4311 54 748 16385 18 17 3911 13 9 3',
 '2 14 78 978 1879 5289 13 9 3']

In [None]:
class SentimentDataset(object):
    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        for pos_sent in pos:
            self.data += [self._cast_to_int(pos_sent.strip().split())]
            self.label += [[1]]
            # print('pos_sent: ', self.label, pos_sent)
        for neg_sent in neg:
            self.data += [self._cast_to_int(neg_sent.strip().split())]
            self.label += [[0]]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

In [None]:
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [None]:
# for i, item in reversed(list(enumerate(train_dataset))):
#     print(item)
#     if i == 1:
#         break

# for i, item in enumerate(train_dataset):
#     print(item)
#     if i == 1:
#         break

In [None]:
# def collate_fn_style(samples):
#     # print('samples : ',samples)
#     input_ids, labels = zip(*samples)
#     # print('inputs : ', input_ids)
#     # print('labels : ', labels)

#     # batch 사이즈중 가장 긴 문장의 길이 추출
#     max_len = max(len(input_id) for input_id in input_ids)

#     # 조금 더 잘 pad 를 하기 위해 문장 정리
#     sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]
    
#     # 위에 오류
#     # sorted_indices = range(len(input_ids))

#     # https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html, 길이를 맞추기 위해 pad 를 해줌. 전체적인 dim 이 같아짐.
#     input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
#                              batch_first=True)
    
#     # 불필요한 영역에는 attention 을 두지 않도록 attention mask 생성.
#     attention_mask = torch.tensor(
#         [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
#          sorted_indices])

#     # attention_mask = []
#     # for seq in input_ids:
#     #   seq_mask = [float(i>0) for i in seq]
#     #   attention_mask.append(seq_mask)
    
#     # attention_mask = torch.tensor(attention_mask)
      
#     token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
#     position_ids = torch.tensor([list(range(len(input_ids[index]))) for index  in sorted_indices])
#     labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])

#     return input_ids, attention_mask, token_type_ids, position_ids, labels


def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)
    # sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1] 
    sorted_indices = range(len(input_ids))

    # https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html,
    # 길이를 맞추기 위해 pad 를 해줌. batch 마다 적용
    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)
    # attention_mask = torch.tensor(
    #     [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
    #      sorted_indices])

    attention_mask = []
    for seq in input_ids:
      seq_mask = [float(i>0) for i in seq]
      attention_mask.append(seq_mask)
    attention_mask = torch.tensor(attention_mask)

    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])
    labels = torch.tensor(np.stack(labels, axis=0)[sorted_indices])
    

    return input_ids, attention_mask, token_type_ids, position_ids, labels

In [None]:
train_batch_size=128 # 32-> 128
eval_batch_size=256 # 64-> 256

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=train_batch_size,
                                           shuffle=True, collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=eval_batch_size,
                                         shuffle=False, collate_fn=collate_fn_style,
                                         num_workers=2)

In [None]:
# for idx, i in enumerate(train_loader):
#     input_ids, attention_mask, token_type_ids, position_ids, labels = i
#     print(i)
#     if idx == 0:
#       break

[tensor([[   2,   48,  209,  ...,    0,    0,    0],
        [   2, 2170,   14,  ...,    0,    0,    0],
        [   2,   13, 1373,  ...,    0,    0,    0],
        ...,
        [   2,   31, 1905,  ...,    0,    0,    0],
        [   2, 1138,   23,  ...,    0,    0,    0],
        [   2,  207,   13,  ...,    0,    0,    0]]), tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]]), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), tensor([[ 0,  1,  2,  ..., 19, 20, 21],
        [ 0,  1,  2,  ..., 19, 20, 21],
        [ 0,  1,  2,  ..., 19, 20, 21],
        ...,
        [ 0,  1,  2,  ..., 19, 20, 21],
        [ 0,  1,  2,  ...

In [None]:
# random seed
random_seed=33 # 42 -> 33
np.random.seed(random_seed)
torch.manual_seed(random_seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# GPU 에 얹어주는 작업
model.to(device)

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [None]:
model.train()
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)



In [None]:
def compute_acc(predictions, target_labels):
    return (np.array(predictions) == np.array(target_labels)).mean()

In [None]:
train_epoch = 3
lowest_valid_loss = 9999.

# early stopping
patience_limit = 2
patience_check = 0


for epoch in range(train_epoch):

    # tqdm 은 Bar 형태로 학습량을 알려주는 module.
    with tqdm(train_loader, unit="batch") as tepoch:


        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")

            # 모든 데이터를 GPU 로 옮겨줌.
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            # forward-propagation 연산 진행
            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss

            # back-propagation 연산
            loss.backward()

            # 가중치 update
            optimizer.step()  

            # tqdm 에서 loss 를 출력하기 위해 loss 를 넣어줌.
            tepoch.set_postfix(loss=loss.item())


            # int(len(train_loader) / 5) == 0 마다 validation set 으로 evaluate.
            if iteration != 0 and iteration % int(len(train_loader) / 5) == 0:
                # Evaluate the model five times per epoch
                with torch.no_grad():
                    model.eval()
                    valid_losses = []
                    predictions = []
                    target_labels = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(dev_loader,
                                                                                                desc='Eval',
                                                                                                position=1,
                                                                                                leave=None):
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       token_type_ids=token_type_ids,
                                       position_ids=position_ids,
                                       labels=labels)

                        # logits : tensor([[ 2.0392, -1.4302], [-1.1802,  2.7419], ...])
                        logits = output.logits

                        # loss :  tensor(0.1518, device='cuda:0', grad_fn=<NllLossBackward0>)
                        loss = output.loss
                        valid_losses.append(loss.item())

                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        predictions += batch_predictions
                        target_labels += batch_labels


                # valid_losses : [0.07377569377422333, 0.0990561693906784, ..., 0.02395477145910263], len == 16
                # predictions :  [0, 0, 0, ..., 0] , len == 4000
                # target_labels : [1, 1, 1, ..., 0] , len == 4000
                # np.array(predictions) == np.array(target_labels) : [ True ...  True  True  True]
                # (np.array(predictions) == np.array(target_labels)).mean() : 0.9705

                acc = compute_acc(predictions, target_labels)
                valid_loss = sum(valid_losses) / len(valid_losses)
                if lowest_valid_loss > valid_loss:
                    lowest_valid_loss = valid_loss
                    print('Acc for model which have lower valid loss: ', acc, ', current loss is', valid_loss, ', lowest_valid_loss :', lowest_valid_loss)
                    torch.save(model.state_dict(), "./pytorch_model.bin")

                # early stopping 코드 추가
                    patience_check = 0
                else:
                    print('Lower than previous accuracy, accuracy is : ', acc, ', current loss is', valid_loss)
                    patience_limit += 1
                    if patience_check >= patience_limit: # early stopping 조건 만족 시 조기 종료
                        print('Ended training for bigger patience_check than limit')
                        break


Epoch 0:  20%|█▉        | 692/3463 [01:57<07:42,  6.00batch/s, loss=0.0767]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.49it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.75it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.41it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.79it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  6.97it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.03it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.30it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.48it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.53it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.05it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.18it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.21it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.25it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.86it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.04it/s][A
Epoch 0:  20%|██        | 693/

Acc for model which have lower valid loss:  0.9775 , current loss is 0.06242226355243474 , lowest_valid_loss : 0.06242226355243474


Epoch 0:  40%|███▉      | 1384/3463 [03:57<06:24,  5.41batch/s, loss=0.0125]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.26it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.72it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.38it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.79it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.02it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.09it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.35it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.54it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.56it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.10it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.23it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.28it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.27it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.81it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  6.96it/s][A
Epoch 0:  40%|████      | 138

Lower than previous accuracy, accuracy is :  0.977 , current loss is 0.06658012693515047


Epoch 0:  60%|█████▉    | 2076/3463 [05:57<03:50,  6.01batch/s, loss=0.0793]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.48it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.89it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:01,  6.55it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.94it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.16it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.25it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.45it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.59it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.63it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.12it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.25it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.28it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.25it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.84it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.04it/s][A
Epoch 0:  60%|█████▉    | 207

Lower than previous accuracy, accuracy is :  0.968 , current loss is 0.08190078008919954


Epoch 0:  80%|███████▉  | 2768/3463 [07:56<01:58,  5.84batch/s, loss=0.0342]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.59it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.95it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:01,  6.57it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.93it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.07it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.19it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.42it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.56it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.58it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.07it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.19it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.23it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.19it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.76it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  6.95it/s][A
Epoch 0:  80%|███████▉  | 276

Acc for model which have lower valid loss:  0.97875 , current loss is 0.057931277668103576 , lowest_valid_loss : 0.057931277668103576


Epoch 0: 100%|█████████▉| 3460/3463 [09:56<00:00,  5.59batch/s, loss=0.0435]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.54it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.92it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.50it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.91it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.15it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.16it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.37it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.53it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.59it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.05it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.19it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.20it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.20it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.78it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  6.94it/s][A
Epoch 0: 100%|█████████▉| 346

Lower than previous accuracy, accuracy is :  0.97575 , current loss is 0.060932592721655965


Epoch 0: 100%|██████████| 3463/3463 [09:59<00:00,  5.78batch/s, loss=0.0481]
Epoch 1:  20%|█▉        | 692/3463 [01:57<08:11,  5.64batch/s, loss=0.0735]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.19it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.71it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.31it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.81it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.06it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.09it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.33it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.54it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.58it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.08it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.17it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.18it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.23it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.86it/s][A
Eval:  94%

Lower than previous accuracy, accuracy is :  0.979 , current loss is 0.06464705849066377


Epoch 1:  40%|███▉      | 1384/3463 [03:56<05:32,  6.24batch/s, loss=0.0539]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.51it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.75it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.37it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.74it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  6.99it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.11it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.36it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.54it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.51it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.06it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.18it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.21it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.21it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.80it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.01it/s][A
Epoch 1:  40%|████      | 138

Lower than previous accuracy, accuracy is :  0.9755 , current loss is 0.07133784180041403


Epoch 1:  60%|█████▉    | 2076/3463 [05:56<03:59,  5.79batch/s, loss=0.0116]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.33it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.79it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.45it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.89it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.12it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.22it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.44it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.52it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.50it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.05it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.13it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.15it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.21it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.85it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  6.96it/s][A
Epoch 1:  60%|██████    | 207

Lower than previous accuracy, accuracy is :  0.978 , current loss is 0.06003027851693332


Epoch 1:  80%|███████▉  | 2768/3463 [07:56<01:57,  5.90batch/s, loss=0.0151] 
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.40it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.79it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.44it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.90it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.14it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.22it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.40it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.44it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.49it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.03it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.09it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.09it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.09it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.73it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  6.94it/s][A
Epoch 1:  80%|███████▉  | 27

Lower than previous accuracy, accuracy is :  0.9795 , current loss is 0.06021966290427372


Epoch 1: 100%|█████████▉| 3460/3463 [09:55<00:00,  5.65batch/s, loss=0.0781]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.59it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.99it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:01,  6.61it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.96it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.16it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.22it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.45it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.51it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.55it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.06it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.13it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.13it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.19it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.78it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  6.96it/s][A
Epoch 1: 100%|█████████▉| 346

Lower than previous accuracy, accuracy is :  0.97075 , current loss is 0.08041687426157296


Epoch 1: 100%|██████████| 3463/3463 [09:58<00:00,  5.79batch/s, loss=0.0482]
Epoch 2:  20%|█▉        | 692/3463 [01:57<07:43,  5.98batch/s, loss=0.0444]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.44it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.74it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.41it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.78it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.03it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.08it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.33it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.50it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.47it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.02it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.08it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.10it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.18it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.82it/s][A
Eval:  94%

Lower than previous accuracy, accuracy is :  0.97425 , current loss is 0.06766957091167569


Epoch 2:  40%|███▉      | 1384/3463 [03:57<06:00,  5.76batch/s, loss=0.0129]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.36it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.88it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:01,  6.52it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.92it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.15it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.22it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.46it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.62it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.64it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.14it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.25it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.26it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.29it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.89it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.09it/s][A
Epoch 2:  40%|████      | 138

Lower than previous accuracy, accuracy is :  0.97575 , current loss is 0.06255476688966155


Epoch 2:  60%|█████▉    | 2076/3463 [05:56<03:55,  5.90batch/s, loss=0.0565]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  3.99it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.55it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:02,  6.27it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.78it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.06it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.17it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.39it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.47it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.55it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.08it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.21it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.28it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.33it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.93it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.03it/s][A
Epoch 2:  60%|██████    | 207

Lower than previous accuracy, accuracy is :  0.97825 , current loss is 0.06794207583880052


Epoch 2:  80%|███████▉  | 2768/3463 [07:56<01:58,  5.86batch/s, loss=0.0194]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.54it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.95it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:01,  6.60it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.97it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.19it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.25it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.46it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.54it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.60it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.11it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.24it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.28it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.30it/s][A
Eval:  88%|████████▊ | 14/16 [00:01<00:00,  6.91it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.04it/s][A
Epoch 2:  80%|███████▉  | 277

Lower than previous accuracy, accuracy is :  0.97825 , current loss is 0.060896553099155426


Epoch 2: 100%|█████████▉| 3460/3463 [09:55<00:00,  5.23batch/s, loss=0.0983]
Eval:   0%|          | 0/16 [00:00<?, ?it/s][A
Eval:   6%|▋         | 1/16 [00:00<00:03,  4.57it/s][A
Eval:  12%|█▎        | 2/16 [00:00<00:02,  5.93it/s][A
Eval:  19%|█▉        | 3/16 [00:00<00:01,  6.55it/s][A
Eval:  25%|██▌       | 4/16 [00:00<00:01,  6.91it/s][A
Eval:  31%|███▏      | 5/16 [00:00<00:01,  7.05it/s][A
Eval:  38%|███▊      | 6/16 [00:00<00:01,  7.10it/s][A
Eval:  44%|████▍     | 7/16 [00:01<00:01,  7.39it/s][A
Eval:  50%|█████     | 8/16 [00:01<00:01,  7.48it/s][A
Eval:  56%|█████▋    | 9/16 [00:01<00:00,  7.54it/s][A
Eval:  62%|██████▎   | 10/16 [00:01<00:00,  7.07it/s][A
Eval:  69%|██████▉   | 11/16 [00:01<00:00,  7.21it/s][A
Eval:  75%|███████▌  | 12/16 [00:01<00:00,  7.19it/s][A
Eval:  81%|████████▏ | 13/16 [00:01<00:00,  7.26it/s][A
Eval:  88%|████████▊ | 14/16 [00:02<00:00,  6.87it/s][A
Eval:  94%|█████████▍| 15/16 [00:02<00:00,  7.05it/s][A
Epoch 2: 100%|█████████▉| 346

Lower than previous accuracy, accuracy is :  0.97725 , current loss is 0.062197313411161304


Epoch 2: 100%|██████████| 3463/3463 [09:58<00:00,  5.79batch/s, loss=0.0509]


In [None]:
import pandas as pd
test_df = pd.read_csv('test_no_label.csv')

In [None]:
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    data_strings = []
    id_file_data = [tokenizer.encode(sent.lower()) for sent in test_dataset]
    for item in id_file_data:
        data_strings.append(' '.join([str(k) for k in item]))
    return data_strings

In [None]:
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
test[:10]

['2 32 13 22 18 21 979 78 1496 17 78 11974 18 206 85 42 162 13 9 3',
 '2 86 17841 37 40 315 3839 13 9 3',
 '2 32 25 14 127 14351 7804 19 14 1152 13 9 3',
 '2 107 52 1676 21 9140 29 158 148 13 9 3',
 '2 31 23 4741 17 39 117 55 583 86 5733 17 5575 13 9 3',
 '2 14 104 584 95 420 30 23 1047 23 14 2364 13 9 3',
 '2 90 13 15 52 14 53 18 35 28 291 13 15 14 53 18 19 1630 183 13 9 3',
 '2 59 1499 32 70 431 26 42 17 50 253 15600 13 9 3',
 '2 3123 14 13533 144 13 103 22 38 166 143 184 20 170 14 1428 13 9 3',
 '2 59 57 40 5977 3155 16 22621 18 20 3538 37 13 9 3']

In [None]:
class SentimentTestDataset(object):
    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = []

        for sent in test:
            self.data += [self._cast_to_int(sent.strip().split())]

    def _cast_to_int(self, sample):
        return [int(word_id) for word_id in sample]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample)

In [None]:
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    input_ids = samples
    max_len = max(len(input_id) for input_id in input_ids)

    # 버그.
    # smaples 가 input id 만 가지고 있음. trainset 은 input_ids 와 labels 둘 다 가졌음. test_dataset 은 labels 이 없음.
    # input 의 순서를 바꿔주면 kaggle 에 내가 모르는 labels set 과 순서가 얻 바뀌게 된다.
    # sorted_indices = np.argsort([len(input_id) for input_id in input_ids])[::-1]
    sorted_indices = range(len(input_ids))

    input_ids = pad_sequence([torch.tensor(input_ids[index]) for index in sorted_indices],
                             batch_first=True)
    # attention_mask = torch.tensor(
    #     [[1] * len(input_ids[index]) + [0] * (max_len - len(input_ids[index])) for index in
    #      sorted_indices])

    attention_mask = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_mask.append(seq_mask)
    attention_mask = torch.tensor(attention_mask)

    token_type_ids = torch.tensor([[0] * len(input_ids[index]) for index in sorted_indices])
    position_ids = torch.tensor([list(range(len(input_ids[index]))) for index in sorted_indices])

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
test_batch_size = 32
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

In [None]:
with torch.no_grad():
    model.eval()
    predictions = []
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
        predictions += batch_predictions


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:03,  9.20it/s][A
Test:  19%|█▉        | 6/32 [00:00<00:00, 29.91it/s][A
Test:  34%|███▍      | 11/32 [00:00<00:00, 36.96it/s][A
Test:  53%|█████▎    | 17/32 [00:00<00:00, 42.48it/s][A
Test:  69%|██████▉   | 22/32 [00:00<00:00, 44.44it/s][A
Test:  88%|████████▊ | 28/32 [00:00<00:00, 46.60it/s][A
                                                     [A

In [None]:
test_df['Category'] = predictions

In [None]:
test_df.to_csv('submission2.csv', index=False)

# 새 섹션