In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False

# Directory prefix used when reading the review CSV files.
DATA_PATH = "../data/"
# Seed handed to reset_seeds() and to the KFold splitter.
SEED = 42
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
class Vocab:
    """Mapping between tokens and integer ids.

    Ids are assigned to ``specials`` first, in the order given, and then to
    the remaining unique tokens in sorted order. A special token that also
    occurs in the data keeps its reserved low id instead of being added twice.

    Args:
        tokens_list: Iterable of token sequences to collect the vocabulary from.
        specials: Optional sequence (list or tuple) of special tokens that
            receive the lowest ids.
        unk_token: Token whose id replaces out-of-vocabulary tokens when the
            vocabulary is called. It only takes effect if it is itself part
            of the vocabulary (normally by listing it in ``specials``).
    """
    def __init__(self, tokens_list, specials=None, unk_token=None):
        # Copy into a fresh list (accepts tuples) and drop repeated specials, keeping order.
        specials = list(dict.fromkeys(specials or []))
        collected = set()
        for tokens in tokens_list:
            collected.update(tokens)
        # A special seen in the data must not get a second, higher id.
        collected.difference_update(specials)
        self.id2token = specials + sorted(collected)
        self.token2id = {t: i for i, t in enumerate(self.id2token)}
        self.unk_token = unk_token

    def __len__(self):
        """Return the number of distinct tokens, specials included."""
        return len(self.id2token)

    def __call__(self, tokens):
        """Convert a sequence of tokens into a list of ids.

        Raises:
            KeyError: If a token is unknown and no usable ``unk_token`` exists.
        """
        unk_id = self.token2id.get(self.unk_token)
        if unk_id is None:
            # No fallback available: fail on the offending token itself.
            return [self.token2id[t] for t in tokens]
        return [self.token2id.get(t, unk_id) for t in tokens]

In [4]:
# Read the train and test review tables from DATA_PATH.
train = pd.read_csv(f"{DATA_PATH}review_train.csv")
test = pd.read_csv(f"{DATA_PATH}review_test.csv")

# Display (rows, columns) of both frames as a quick sanity check.
train.shape, test.shape

((2000, 3), (1000, 2))

In [5]:
%pip install kiwipiepy

Collecting kiwipiepy
  Downloading kiwipiepy-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting kiwipiepy-model<0.21,>=0.20 (from kiwipiepy)
  Downloading kiwipiepy_model-0.20.0.tar.gz (34.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.7/34.7 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading kiwipiepy-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: kiwipiepy-model
  Building wheel for kiwipiepy-model (setup.py) ... [?25l[?25hdone
  Created wheel for kiwipiepy-model: filename=kiwipiepy_model-0.20.0-py3-none-any.whl size=34818026 sha256=259bcffa78e2e589db752deb4faf9d8b46a52d0cc530ec9498b567ab0352272a
  Stored in directory: /root/.cache/pip/wheels/b6/b1/66/2be9840f

In [6]:
from kiwipiepy import Kiwi
# Single Kiwi instance reused by the cells that tokenize the train and test reviews.
kiwi = Kiwi()

In [7]:
# Tokenize the "review" column of the training frame with Kiwi.
result = kiwi.tokenize(train["review"])
# Keep only each token's `form` attribute: one list of forms per review.
train_list = [[t.form for t in tokens] for tokens in result]

In [8]:
# Tokenize the "review" column of the test frame with the same Kiwi instance.
result = kiwi.tokenize(test["review"])
# Keep only each token's `form` attribute: one list of forms per review.
test_list = [[t.form for t in tokens] for tokens in result]

In [9]:
# Number of tokenized reviews in each split.
len(train_list), len(test_list)

(2000, 1000)

# 단어 번호 부여

In [10]:
# Build the vocabulary from the training tokens only; "<pad>" and "<unk>" are listed first in the id table.
vocab = Vocab(train_list, ["<pad>", "<unk>"], "<unk>")
len(vocab)

4852

In [11]:
# Convert token lists to id lists; tokens absent from the vocabulary fall back to the "<unk>" id.
train_data = [vocab(t) for t in train_list]
test_data = [vocab(t) for t in test_list]

# 패딩

In [12]:
# Number of token ids in each encoded training review.
t_cnt = [len(t) for t in train_data]
# Mean, longest and shortest review length.
np.mean(t_cnt), np.max(t_cnt), np.min(t_cnt)

(19.645, 94, 1)

In [13]:
# Fixed sequence length for padding/truncation: the longest training review.
max_len = np.max(t_cnt)
max_len

94

In [14]:
# Right-pad short sequences with id 0 and cut long ones so every row holds exactly max_len ids.
train_data = np.array([
    ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
    for ids in train_data
])
train_data.shape

(2000, 94)

In [15]:
# Same treatment for the test split: pad with id 0 up to max_len, truncate anything longer.
test_data = np.array([
    ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
    for ids in test_data
])
test_data.shape

(1000, 94)

In [16]:
# Labels reshaped into a single column, i.e. shape (n_samples, 1).
target = train["target"].to_numpy().reshape(-1, 1)
target.shape

(2000, 1)

# 데이터셋 클래스

In [17]:
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset serving encoded reviews and, optionally, their labels.

    Each item is a dict with key ``"x"`` (tensor built from ``x[i]``) and,
    when labels were supplied, key ``"y"`` (float32 tensor built from ``y[i]``).

    Args:
        x: Indexable collection of samples (first axis = sample).
        y: Optional labels aligned with ``x``; omit for inference data.
    """
    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, i):
        """Return the ``i``-th sample as a dict of tensors."""
        item = {"x": torch.tensor(self.x[i])}

        if self.y is not None:
            # torch.tensor copies the *value*; the legacy torch.Tensor(...) constructor
            # would interpret a scalar label as a size instead.
            item["y"] = torch.tensor(self.y[i], dtype=torch.float32)

        return item

In [18]:
# Smoke test: wrap the training arrays and pull a single batch of two samples to inspect the item layout.
dataset = ReviewDataset(train_data, target)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2)
batch = next(iter(dataloader))
batch

{'x': tensor([[3445, 4189, 3415, 3114, 1617, 3490, 2103, 1190,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
         [2824, 2103, 2987, 3986, 3337,  990, 2934, 2237,   29,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0, 

# torch.nn.TransformerEncoderLayer
- d_model: 기본값 없음(필수)
- nhead: 기본값 없음(필수)
- dim_feedforward: 기본값 2048
- batch_first: 기본값 False

# torch.nn.TransformerEncoder
- encoder_layer: encoder_layer 객체
- num_layers: layer 개수

In [19]:
# Stand-alone shape check: embed the sample batch and run it through a 6-layer, 8-head encoder.
d_model = 512
emb_layer = torch.nn.Embedding(len(vocab), d_model)
encoder_layer = torch.nn.TransformerEncoderLayer(d_model, 8, batch_first=True)
encoder = torch.nn.TransformerEncoder(encoder_layer, 6)
encoder(emb_layer(batch["x"])).shape # batch, seq, feature

torch.Size([2, 94, 512])

# 모델 클래스

In [20]:
class Net(torch.nn.Module):
    """Transformer-encoder classifier producing one logit per sequence.

    Token embeddings are summed with learned positional embeddings, passed
    through a ``TransformerEncoder``, flattened, regularised with dropout and
    projected to a single output value.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / encoder feature size.
        max_len: Sequence length the model is built for. Inputs must contain
            exactly this many tokens because the linear head consumes the
            flattened ``max_len * d_model`` features.
        device: Device on which the position index tensor is initially created.
        nhead: Number of attention heads in the encoder layer.
        dim_feedforward: Hidden size of the encoder's feed-forward sub-layer.
        num_layers: Number of stacked encoder layers.
    """
    def __init__(self, vocab_size, d_model, max_len, device, nhead, dim_feedforward, num_layers):
        super().__init__()
        self.emb_layer = torch.nn.Embedding(vocab_size, d_model)
        # Learned positional embedding.
        # The index tensor is a non-persistent buffer: it follows the module through
        # .to()/.cuda()/.cpu() but stays out of state_dict, so checkpoint keys are unchanged.
        self.register_buffer("pos", torch.arange(max_len, device=device), persistent=False)
        self.pos_emb_layer = torch.nn.Embedding(max_len, d_model)

        # Encoder stack.
        # NOTE(review): the template layer stays registered as an attribute so checkpoint
        # keys under `encoder_layer.*` keep matching; confirm it is unused before removing it.
        self.encoder_layer = torch.nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, batch_first=True)
        self.encoder = torch.nn.TransformerEncoder(self.encoder_layer, num_layers) # batch, seq, feature

        self.flatten = torch.nn.Flatten() # batch, seq * feature
        self.dropout = torch.nn.Dropout(0.5)
        self.fc = torch.nn.Linear(max_len * d_model, 1)

    def forward(self, x):
        """Map a (batch, max_len) tensor of token ids to (batch, 1) logits."""
        x = self.emb_layer(x) # token embedding
        pos = self.pos_emb_layer(self.pos) # positional embedding
        x = torch.add(x, pos) # inject position information (broadcast over the batch)
        x = self.encoder(x) # batch, seq, feature
        x = self.flatten(x) # batch, seq * feature
        x = self.dropout(x)
        return self.fc(x) # batch, 1

In [24]:
# CPU smoke test of Net on the sample batch; 'device' is "cpu" here because `batch` is fed to the model without being moved.
hp = {
    'vocab_size': len(vocab),
    'd_model': 256,
    'max_len': max_len,
    'device': "cpu",
    'nhead': 8,
    'dim_feedforward': 512,
    'num_layers': 1
}
model = Net(**hp)
pred = model(batch["x"])
pred

tensor([[-0.0267],
        [ 1.6329]], grad_fn=<AddmmBackward0>)

# 학습 루프

In [25]:
def train_loop(dataloader, model, loss_function, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of batches, each a dict with tensors under "x" and "y".
        model: Module to optimise; switched to training mode.
        loss_function: Callable taking (prediction, target) and returning a scalar loss.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batch tensors are moved to before use.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_total = 0

    for batch in dataloader:
        logits = model(batch["x"].to(device))
        labels = batch["y"].to(device)
        batch_loss = loss_function(logits, labels)

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_total += batch_loss.item()

    return running_total / len(dataloader)

# 테스트 루프

In [26]:
@torch.no_grad()
def test_loop(dataloader, model, loss_function, device):
    epoch_loss = 0
    model.eval()

    act = torch.nn.Sigmoid()
    pred_list = []
    for batch in dataloader:
        pred = model(batch["x"].to(device))
        if batch.get("y") is not None:
            loss = loss_function(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        pred = act(pred)
        pred = pred.to("cpu").numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    epoch_loss /= len(dataloader)

    return epoch_loss, pred

# 학습

In [29]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Number of cross-validation folds.
n_splits = 5
batch_size = 32
# Upper bound on epochs per fold; the training cell stops earlier when validation accuracy stalls.
epochs = 100
# Operates on raw logits: Net ends in a plain Linear layer and test_loop applies the sigmoid itself.
loss_function = torch.nn.BCEWithLogitsLoss()
cv = KFold(n_splits, shuffle=True, random_state=SEED)
# Model hyper-parameters for training, built on the selected device.
hp = {
    'vocab_size': len(vocab),
    'd_model': 256,
    'max_len': max_len,
    'device': device,
    'nhead': 8,
    'dim_feedforward': 512,
    'num_layers': 1
}

In [None]:
# Set to True to train on the first fold only (hold-out style run).
is_holdout = False
reset_seeds(SEED)
score_list = []

# torch.save does not create missing directories, so make sure the checkpoint folder exists.
os.makedirs("../output", exist_ok=True)

for i, (tri, vai) in enumerate(cv.split(train_data)):
    # Training split
    train_dataset = ReviewDataset(train_data[tri], target[tri])
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation split
    valid_dataset = ReviewDataset(train_data[vai], target[vai])
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # Fresh model and optimizer for every fold
    model = Net(**hp).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    patience = 0 # epochs since the last improvement, used for early stopping
    # Start below any reachable accuracy so the first epoch always writes a checkpoint.
    # For metrics where lower is better (MSE, MAE, ...) start from np.inf and flip the comparison.
    best_score = float("-inf")
    for _ in tqdm(range(epochs)):
        train_loss = train_loop(train_dataloader, model, loss_function, optimizer, device)
        valid_loss, pred = test_loop(valid_dataloader, model, loss_function, device)
        pred = (pred > 0.5).astype(int)
        score = accuracy_score(target[vai], pred)
        print(train_loss, valid_loss, score)

        patience += 1
        if score > best_score:
            # New best validation accuracy: reset the counter and keep these weights.
            best_score = score
            patience = 0
            torch.save(model.state_dict(), f"../output/model{i}.pt")

        # Stop after 5 consecutive epochs without improvement.
        if patience == 5:
            break

    score_list.append(best_score)
    print(f"ACC 최고점수: {best_score}")

    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

1.1688455247879028 0.7481332329603342 0.4925
0.7352928924560547 0.75497344823984 0.57
0.6900431597232819 0.8359761650745685 0.615
0.4157574170827866 0.7230402460465064 0.6275
0.3121890276670456 0.8441453553163089 0.6675
0.16872768014669418 0.9979861814242142 0.665
0.13300941836088895 1.997189347560589 0.6
0.1496505456417799 1.4515690390880291 0.6525
0.055383563302457334 1.2781913601435149 0.665
0.04376251245383173 1.6600218988381898 0.6525
ACC 최고점수: 0.6675


  0%|          | 0/100 [00:00<?, ?it/s]

1.5788658797740935 0.7575644025435815 0.5175
0.7338177394866944 0.9289927115807166 0.4925
0.6534370636940002 0.9450446367263794 0.5475
0.5177835243940353 0.8203781797335699 0.605
0.27321470469236375 0.7564774430715121 0.6675
0.17856101721525192 0.873147175862239 0.655
0.085124482922256 1.3273303646307726 0.6325
0.09768578585237264 1.0808375890438373 0.67
0.04701994081959129 1.0889375255658076 0.685
0.022943169539794327 1.2172338595757117 0.685
0.019084877900313588 1.4573684747402484 0.685
0.006786317810183391 1.3909779282716603 0.715
0.009047729574667755 1.5405396544016325 0.705
0.05107526845415123 1.1993495959502 0.7
0.07845718767610378 1.2970846043183253 0.685
0.05622975429520011 1.5393707477129424 0.6875
0.05468544753268361 1.2733392486205468 0.705
ACC 최고점수: 0.715


  0%|          | 0/100 [00:00<?, ?it/s]

1.3398889541625976 1.0602996716132531 0.5125
0.9400234544277191 0.9301004455639765 0.5
0.7949424076080323 0.7387773027786841 0.6175
0.5525032770633698 0.6522212303601779 0.6575
0.32859320133924486 0.7072477524097149 0.685
0.2665224906802177 0.8365707649634435 0.6475
0.15947559729218483 0.9152479011278886 0.6925
0.07169504461809993 1.014506344611828 0.69
0.0280920500587672 1.086044593499257 0.72
0.018375426293350757 1.1360142391461592 0.735
0.023124751111026853 1.3447032800087562 0.67
0.039277573425788435 1.4152727172924922 0.7025
0.04870607195189223 1.3643893828758826 0.6875
0.0654824550775811 1.5832693668512197 0.67
0.07343635377008467 1.3353621042691743 0.67
ACC 최고점수: 0.735


  0%|          | 0/100 [00:00<?, ?it/s]

1.440571321249008 0.703774305490347 0.545
0.7295774590969085 0.7207405796417823 0.5775
0.6452124404907227 0.9995751656018771 0.5725
0.4258888679742813 0.7100784870294424 0.645
0.2900964434444904 0.858534927551563 0.665
0.23177308171987535 1.0238430362481337 0.6925
0.14060151919722558 1.0233175204350398 0.68
0.047525594290345904 1.2345772293897777 0.68
0.03518637337721884 1.3798878651398878 0.6625
0.026899691340513527 1.2725975559307978 0.675
0.02155058258329518 1.5171914375745332 0.67
ACC 최고점수: 0.6925


  0%|          | 0/100 [00:00<?, ?it/s]

1.384877973794937 0.7968332538237939 0.4825
0.7879127752780914 0.7028959806148822 0.58
0.6622250407934189 1.0736814737319946 0.5525
0.5917326545715332 0.6863963237175574 0.6525
0.3216567996144295 0.7060662645560044 0.6725
0.21225369788706303 1.0178312888512244 0.6275
0.11409275520592928 1.0059321476862981 0.675
0.06822361637838185 1.163360779102032 0.69
0.0334315509069711 1.3303261078321016 0.685
0.01754764930345118 1.2975743137873137 0.6925
0.01149451453005895 1.5022401076096754 0.6675
0.005655324784456752 1.5347431989816518 0.66
0.005621144052711315 1.7468997744413524 0.675
0.003996913488372229 1.7117409981214082 0.6825
0.0022775738048221683 1.818321888263409 0.655
ACC 최고점수: 0.6925


# 테스트 데이터 예측

In [31]:
# Unlabeled test set; shuffle stays off so predictions keep the row order of test_data.
test_dataset = ReviewDataset(test_data)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# One array of test-set probabilities per fold model.
pred_list = []

for i in range(n_splits):
    model = Net(**hp).to(device)
    # map_location lets a checkpoint written on a GPU load on whatever device this session uses.
    state_dict = torch.load(f"../output/model{i}.pt", map_location=device, weights_only=True)
    model.load_state_dict(state_dict)

    # The test batches carry no "y", so test_loop never calls the loss function; only the probabilities are kept.
    _, pred = test_loop(test_dataloader, model, None, device)
    pred_list.append(pred)

In [33]:
# Soft voting: average the per-fold probabilities, then threshold at 0.5 to get 0/1 labels.
pred = (np.mean(pred_list, axis=0) > 0.5).astype(int)
pred.shape

(1000, 1)