In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = os.getcwd() + '/data/review/'
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [2]:
train = pd.read_csv(f"{DATA_PATH}review_train.csv")
test = pd.read_csv(f"{DATA_PATH}review_test.csv")
train.shape, test.shape

((2000, 3), (1000, 2))

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2000 non-null   object
 1   review  2000 non-null   object
 2   target  2000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [4]:
train.head()

Unnamed: 0,id,review,target
0,train_0,이런 최고의 영화를 이제서야 보다니,1
1,train_1,안봤지만 유승준나와서 비추.,0
2,train_2,시대를 못 따라간 연출과 촌스러운 영상미.,0
3,train_3,원소전 굿,1
4,train_4,ㅋㅋㅋㅋ 개봉영화평점단사람이1명 ㅋㅋㅋㅋ,1


In [5]:
!pip install kiwipiepy





In [6]:
from kiwipiepy import Kiwi

kiwi = Kiwi()

# 토큰화

In [7]:
gen = kiwi.tokenize(train["review"])
train_list = []
for tokens in gen:
    tmp = [ t.form  for t in tokens ]
    train_list.append(tmp)

In [8]:
train_list

[['이런', '최고', '의', '영화', '를', '이제서야', '보', '다니'],
 ['안', '보', '었', '지만', '유승준', '나오', '어서', '비추', '.'],
 ['시대', '를', '못', '따라가', 'ᆫ', '연출', '과', '촌', '스럽', '은', '영상미', '.'],
 ['원소', '전', '굿'],
 ['ㅋㅋㅋㅋ', '개봉', '영화', '평점', '단', '사람', '이', '1', '명', 'ㅋㅋㅋㅋ'],
 ['실화', '이', '라니', '너무', '가슴', '아프', '다', '...'],
 ['뭐',
  '이',
  '야',
  '이거',
  'ㅡㅡ',
  '폴',
  '워커',
  '믿',
  '고',
  '보',
  'ᆯ',
  '영화',
  '는',
  '분',
  '질',
  '뿐',
  '이',
  'ᆫ가'],
 ['갑자기',
  '생각나',
  '어서',
  '오늘',
  '다시',
  '보',
  '었',
  '는데',
  '...',
  '역시',
  '...',
  '말',
  '이',
  '필요',
  '없',
  '네요'],
 ['주', '님', '사랑', '하', 'ᆸ니다', '.', '행복', '하', 'ᆸ니다', '.'],
 ['하드보일드',
  '액션',
  '...',
  '그거',
  'ᆫ',
  '봐주',
  'ᆯ',
  '만',
  '한데',
  '...',
  '스토리',
  '가',
  '약하',
  '어',
  '...',
  '그래서',
  '설득력',
  '도',
  '떨어지',
  '고'],
 ['믿',
  '을',
  '수',
  '없',
  '어',
  '이렇',
  '게',
  '평점',
  '이',
  '높',
  '다니',
  'ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ'],
 ['스토리', '가', '좀', '아쉽', '기', 'ᆫ', '하', '지만', '나름', '설레', '면서', '보', 'ᆷ'],
 ['춘추전국시대',
  '에',
  '공자',
 

In [9]:
gen = kiwi.tokenize(test["review"])
test_list = []

for tokens in gen:
    tmp = [ t.form  for t in tokens ]
    test_list.append(tmp)
    

In [10]:
test_list

[['이',
  '영화',
  '를',
  '만드',
  'ᆫ',
  '의도',
  '가',
  '뭐',
  '이',
  '지',
  '?',
  '-',
  '_',
  '-',
  ';'],
 ['굿', '굿', '역시', '우디', '앨런', '!!'],
 ['가볍',
  '게',
  '보',
  '기에',
  '엄청',
  '괜찮',
  '은',
  '영화',
  '이',
  '네요',
  '남',
  '주',
  '여주',
  '둘',
  '다',
  '매력',
  '적',
  '이',
  'ᆷ',
  ':',
  ')'],
 ['눈물',
  '이',
  '흐르',
  'ᆫ다',
  '.',
  '저',
  '분',
  '들',
  '이',
  '있',
  '기에',
  '오늘',
  '의',
  '우리',
  '가',
  '있',
  '다'],
 ['이것', '이', '대체', '뭐', '이', '지', '지루', '하', '어서', '죽', '을', '뻔'],
 ['B',
  '급',
  '영화',
  '무시하나효',
  '?',
  '이거',
  'ᆫ',
  'C',
  '급',
  '이',
  'ᆸ니다',
  '그려',
  '~',
  '.',
  '어설프',
  '고',
  '조악하',
  'ᆫ',
  '졸작'],
 ['나',
  '가',
  '보',
  'ᆫ',
  '최고',
  '의',
  '영화',
  '중',
  '한',
  '편',
  '으로',
  '각인',
  '되',
  'ᆯ',
  '것',
  '이',
  '다',
  '.'],
 ['내',
  '인생',
  '최악',
  '의',
  '영화',
  '.',
  '박보영',
  '의',
  '매력',
  '이',
  '담기',
  'ᆫ',
  ',',
  '팬심',
  '으로',
  'ᆫ',
  '좋',
  '지만',
  '영화',
  '그',
  '자체',
  '를',
  '평가',
  '하',
  'ᆫ다면',
  ',',
  '1',
  '점',
  '도',
  '아깝',


# 어휘집

In [11]:
from torchtext.vocab import build_vocab_from_iterator
vocab = build_vocab_from_iterator(train_list, specials=["<pad>","<unk>"])
vocab.set_default_index( vocab["<unk>"] )
len(vocab)


# 학습데이터의 리스트만 가지고 있어야함



4853

# 단어번호 부여

In [12]:
train_list = [ vocab(tokens)  for tokens in train_list ]
test_list = [ vocab(tokens)  for tokens in test_list ] # train 학습 정보만을 사용해야함

# 패딩

In [13]:
max_len = max( len(lst) for lst in train_list )
max_len

94

In [14]:
train_data = [  lst + [0] * ( max_len-len(lst) )  if len(lst) < max_len else lst[:max_len]  for lst in train_list ]
train_data = np.array(train_data)
train_data.shape # batch, seq

(2000, 94)

In [15]:
test_data = [  lst + [0] * ( max_len-len(lst) )  if len(lst) < max_len else lst[:max_len]  for lst in test_list ]
test_data = np.array(test_data)
test_data.shape # batch, seq

(1000, 94)

# 정답 데이터

In [16]:
target = train["target"].to_numpy().reshape(-1,1)
target.shape

(2000, 1)

# 데이터셋 클래스

In [17]:
train_data.dtype

dtype('int32')

In [18]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, x , y=None):
        self.x = x # batch, seq
        self.y = y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        item = {}
        item["x"] = torch.tensor(self.x[idx])
        if self.y is not None:
            item["y"] = torch.Tensor(self.y[idx])
        return item

In [19]:
dt = ReviewDataset(train_data, target)
dl = torch.utils.data.DataLoader(dt, batch_size=2, shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[  83,   68,   12,    7,   27, 1189,   10,  210,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
         [  48,   10,    8,   61, 3923,   64,   34, 1592,    5,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0, 

# TransformerEncoderLayer
- 주요 파라미터
    - d_model: 임베딩 벡터 사이즈
    - nhead: 헤드수
    - dim_feedforward: 피드포워드 신경망 부분에 노드수
    - batch_first: True(batch, seq, features)
        - 기본값은 False

In [20]:
d_model = 512
nhead = 8
emb_layer = torch.nn.Embedding(len(vocab), d_model)
encoder_layer = torch.nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
encoder_layer

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

In [22]:
encoder_layer.forward

<bound method TransformerEncoderLayer.forward of TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)>

# torch.nn.TransformerEncoder
- TransformerEncoderLayer 객체를 여러개 쌓을 수 있게 해주는 클래스
- 주요 파라미터
    - encoder_layer: 첫번째 파라미터로 인코더레이어 객체를 전달
    - num_layers: 인코더 레이어를 얼마나 쌓을 것인가를 지정

In [23]:
encoder = torch.nn.TransformerEncoder(encoder_layer, 6)
encoder

TransformerEncoder(
  (layers): ModuleList(
    (0-5): 6 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (linear1): Linear(in_features=512, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)

In [24]:
encoder.forward

<bound method TransformerEncoder.forward of TransformerEncoder(
  (layers): ModuleList(
    (0-5): 6 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (linear1): Linear(in_features=512, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)>

In [25]:
len(batch["x"][0,])

94

In [26]:
x = emb_layer(batch["x"]) # b, s, f
x = encoder(x) # b ,s ,f
x.shape

torch.Size([2, 94, 512])

# 모델 클래스

In [27]:
class Net(torch.nn.Module):
    def __init__(self,
                 vocab_size, # 단어사전크기
                 max_len,  # 최대시퀀스길이
                 d_model=512, # d_model,헤드수
                 nhead=8, # 헤드수
                 dim_feedforward=2048, # 피드포워드 신경망 부분의 노드수
                 num_layers=6, # 인코더 레이어수
                 device="cpu"):
        super().__init__()

        self.emb_layer = torch.nn.Embedding(vocab_size, d_model) # 단어 임베딩

        # 포지셔널 임베딩
        self.pos = torch.arange(max_len).to(device)
        self.pos_emb_layer = torch.nn.Embedding(max_len, d_model) # 위치정보 임베딩

        # 인코더 레이어
        self.encoder_layer = torch.nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, batch_first=True)
        # 인코더
        self.encoder = torch.nn.TransformerEncoder(self.encoder_layer, num_layers) # b, s, f

        # self.flatten = torch.nn.Flatten() # b, s x f
        # self.dropout = torch.nn.Dropout(0.5)
        self.gl_pool = torch.nn.AdaptiveMaxPool1d(1)
        self.flatten = torch.nn.Flatten()

        self.fc_out = torch.nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.emb_layer(x) # 단어 임베딩 b, s , f
        pos = self.pos_emb_layer(self.pos) # 위치 정보 임베딩 s, f
        x = x + pos # 배치 방향으로 브로드 캐스팅 되서 더하기 연산 된다. b, s, f
        x = self.encoder(x) # b, s ,f
        x = x.permute(0,2,1) # b , f , s
        x = self.gl_pool(x) # b, f, 1
        x = self.flatten(x) # b, f
        # x = self.dropout(x)
        return self.fc_out(x)

In [28]:
model = Net(len(vocab), max_len)
model(batch["x"])

tensor([[1.1046],
        [1.2173]], grad_fn=<AddmmBackward0>)

In [30]:
model

Net(
  (emb_layer): Embedding(4853, 512)
  (pos_emb_layer): Embedding(94, 512)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)


In [33]:
model.forward

<bound method Net.forward of Net(
  (emb_layer): Embedding(4853, 512)
  (pos_emb_layer): Embedding(94, 512)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, o

# 학습 loop 함수

In [26]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train() # 학습 모드
    
    for batch in dataloader:
        pred = model( batch["x"].to(device) )
        loss = loss_fn( pred, batch["y"].to(device) )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

# 검증 및 테스트 loop 함수

In [27]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    epoch_loss = 0
    pred_list = []
    act_func = torch.nn.Sigmoid()
    model.eval() # 평가 모드

    for batch in dataloader:
        pred = model( batch["x"].to(device) )
        
        if batch.get("y") is not None:
            loss = loss_fn( pred, batch["y"].to(device) )
            epoch_loss += loss.item()

        pred = act_func(pred) # logit 값을 확률로 변환
        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

# 하이퍼파라미터 정의

In [28]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
n_splits = 5
cv = KFold(n_splits, shuffle=True, random_state=SEED)

batch_size = 32 # 배치 사이즈
loss_fn = torch.nn.BCEWithLogitsLoss() # 손실 객체
epochs = 100 # 최대 가능한 에폭수

hp = {
    "vocab_size":len(vocab),
    "max_len":max_len,
    "d_model":256,
    "nhead":8,
    "dim_feedforward":512,
    "num_layers":1,
    "device":device
}

# 학습

In [29]:
DATA_PATH

'c:\\study\\04_NLP/data/review/'

In [30]:
is_holdout = False
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []
for i, (tri, vai) in enumerate( cv.split(train_data) ):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(train_data[tri], target[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(train_data[vai], target[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(**hp).to(device)
    optimizer = torch.optim.Adam( model.parameters() )

    best_score = 0 # 현재 최고 점수
    patience = 0 # 조기 종료 조건을 주기 위한 변수
    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        pred = (pred > 0.5).astype(int) # 이진분류 문제에서 클래스 번호 결정
        score = accuracy_score(target[vai], pred)

        #print(train_loss, valid_loss, score)
        if score > best_score:
            best_score = score # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"{DATA_PATH}weight/trans_model_{i}.pth") # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 100:
            break

    print(f"{i}번째 폴드 최고 정확도: {best_score}")
    best_score_list.append(best_score)

    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


0번째 폴드 최고 정확도: 0.7175


  0%|          | 0/100 [00:00<?, ?it/s]

1번째 폴드 최고 정확도: 0.7175


  0%|          | 0/100 [00:00<?, ?it/s]

2번째 폴드 최고 정확도: 0.675


  0%|          | 0/100 [00:00<?, ?it/s]

3번째 폴드 최고 정확도: 0.7175


  0%|          | 0/100 [00:00<?, ?it/s]

4번째 폴드 최고 정확도: 0.74


In [31]:
np.mean(best_score_list)

0.7135000000000001