#**준비 사항**

In [None]:
from google.colab import drive

drive.mount('/gdrive/')

In [3]:
!pip install transformers
!pip install torch

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 12.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 51.1MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 50.5MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=3e8c159669e3

In [4]:
import tensorflow as tf
import torch

from transformers import ElectraTokenizer
from transformers import ElectraForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

#**데이터 로드**

In [None]:
!git clone https://github.com/e9t/nsmc.git

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Total 14763 (delta 0), reused 0 (delta 0), pack-reused 14763[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 15.98 MiB/s, done.
Resolving deltas: 100% (1749/1749), done.
Checking out files: 100% (14737/14737), done.


In [None]:
# train/test set 데이터 로드

train = pd.read_csv("nsmc/ratings.txt", sep='\t')
test = pd.read_csv("ko_data.txt", sep='\t')

label이 0이면 부정, label이 1이면 긍정

#**전처리**

In [None]:
MAX_LEN = 128

def getInputs(dataset):
  data = dataset.copy(deep=True)

  if 'document' in data.columns:
    sentences = data['document']
  else:
    sentences = data['Sentence']

  sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
  
  tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v2-discriminator", do_lower_case=False)
  tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  attention_masks = []
  for seq in input_ids:
      seq_mask = [float(i>0) for i in seq]
      attention_masks.append(seq_mask)

  return input_ids, attention_masks

In [None]:
def getIndex(dataset):
  data = dataset.copy(deep = True)
  input_index = data.index.tolist()
  return torch.tensor(input_index)

In [None]:
labels = train['label'].values
ratings_inputs, ratings_masks = getInputs(train)
test_inputs, test_masks = getInputs(test)

In [None]:
# 훈련셋과 검증셋으로 분리
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(ratings_inputs, labels, random_state=2018, test_size=0.1)

# 어텐션 마스크를 훈련셋과 검증셋으로 분리
train_masks, validation_masks, _, _ = train_test_split(ratings_masks, ratings_inputs, random_state=2018, test_size=0.1)

In [None]:
# 데이터를 파이토치의 텐서로 변환
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)			

test_index = getIndex(test)
test_inputs = torch.tensor(test_inputs)
test_masks = torch.tensor(test_masks)

In [None]:
batch_size = 32

# 파이토치의 DataLoader로 입력, 마스크, 라벨을 묶어 데이터 설정
# 학습시 배치 사이즈 만큼 데이터를 가져옴
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_index, test_inputs, test_masks)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

#**모델 생성**

In [5]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# ELECTRA 모델 생성

model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v2-discriminator", num_labels = 2)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=487.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443133604.0, style=ProgressStyle(descri…




ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 3e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 4

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 학습률을 조금씩 감소시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로직과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  5,625.    Elapsed: 0:10:27.
  Batch 1,000  of  5,625.    Elapsed: 0:20:53.
  Batch 1,500  of  5,625.    Elapsed: 0:31:20.
  Batch 2,000  of  5,625.    Elapsed: 0:41:47.
  Batch 2,500  of  5,625.    Elapsed: 0:52:15.
  Batch 3,000  of  5,625.    Elapsed: 1:02:45.
  Batch 3,500  of  5,625.    Elapsed: 1:13:15.
  Batch 4,000  of  5,625.    Elapsed: 1:23:46.
  Batch 4,500  of  5,625.    Elapsed: 1:34:17.
  Batch 5,000  of  5,625.    Elapsed: 1:44:47.
  Batch 5,500  of  5,625.    Elapsed: 1:55:17.

  Average training loss: 0.29
  Training epcoh took: 1:57:55

Running Validation...
  Accuracy: 0.87
  Validation took: 0:04:43

Training...
  Batch   500  of  5,625.    Elapsed: 0:10:30.
  Batch 1,000  of  5,625.    Elapsed: 0:21:00.
  Batch 1,500  of  5,625.    Elapsed: 0:31:31.
  Batch 2,000  of  5,625.    Elapsed: 0:41:59.
  Batch 2,500  of  5,625.    Elapsed: 0:52:28.
  Batch 3,000  of  5,625.    Elapsed: 1:02:56.
  Batch 3,500  of  5,625.    Elapsed: 1:13:25.

#**test set 평가**

In [None]:
tmp_test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)
test_result = test.copy(deep = True)
test_result['Predicted'] = 'default'
classes = [0, 1]

#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(tmp_test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_index, b_input_ids, b_input_mask = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    idx = b_index.item()
    test_result['Predicted'][idx] = classes[np.argmax(logits)]
    

    nb_eval_steps += 1

print("")
print("Test took: {:}".format(format_time(time.time() - t0)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


  Batch   100  of    350.    Elapsed: 0:00:03.
  Batch   200  of    350.    Elapsed: 0:00:05.
  Batch   300  of    350.    Elapsed: 0:00:08.
  Batch   400  of    350.    Elapsed: 0:00:10.
  Batch   500  of    350.    Elapsed: 0:00:13.
  Batch   600  of    350.    Elapsed: 0:00:16.
  Batch   700  of    350.    Elapsed: 0:00:18.
  Batch   800  of    350.    Elapsed: 0:00:21.
  Batch   900  of    350.    Elapsed: 0:00:23.
  Batch 1,000  of    350.    Elapsed: 0:00:26.
  Batch 1,100  of    350.    Elapsed: 0:00:29.
  Batch 1,200  of    350.    Elapsed: 0:00:31.
  Batch 1,300  of    350.    Elapsed: 0:00:34.
  Batch 1,400  of    350.    Elapsed: 0:00:36.
  Batch 1,500  of    350.    Elapsed: 0:00:39.
  Batch 1,600  of    350.    Elapsed: 0:00:42.
  Batch 1,700  of    350.    Elapsed: 0:00:44.
  Batch 1,800  of    350.    Elapsed: 0:00:47.
  Batch 1,900  of    350.    Elapsed: 0:00:49.
  Batch 2,000  of    350.    Elapsed: 0:00:52.
  Batch 2,100  of    350.    Elapsed: 0:00:55.
  Batch 2,200

In [None]:
test_csv = test_result.to_csv('test.csv')

In [None]:
from google.colab import files

files.download('test.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**코드 참고: https://blog.naver.com/horajjan/221739630055**