<a href="https://colab.research.google.com/github/kimjjoy2/NSMC/blob/main/NSMC_KoELECTRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [91]:
# 네이버 영화리뷰 감정분석 데이터 다운로드
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [92]:
# transformers 설치
! pip install transformers



In [93]:
# import
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [94]:
# GPU 셑팅
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [95]:
# tokenizer, model 설치
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [96]:
import re

def text_cleansing(text):
  # E-mail 제거
  pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+)'
  text = re.sub(pattern=pattern, repl=' ', string=text)

  # Url 제거
  pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$\-@\.&+:/?=]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  text = re.sub(pattern=pattern, repl=' ', string=text)

  # 한글 자음, 모음 제거
  pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'
  text = re.sub(pattern=pattern, repl=' ', string=text)

  # \r, \n 제거
  pattern = '[\r|\n]'
  text = re.sub(pattern=pattern, repl=' ', string=text)
                
  # 특수문자 제거
  pattern = '[^\w\s]'
  text = re.sub(pattern=pattern, repl=' ', string=text)

  # 마지막으로 중복 스페이스 제거
  pattern = re.compile(r'\s+')
  text = re.sub(pattern=pattern, repl=' ', string=text)
                
  return text

In [97]:
text_cleansing('http://blog.naver.com/oroblast/220215679580 나쁜 인상은 아니지만 ㅋㅋ,\
                오랫동안 기억에 남아 ㅠㅠ 종종 떠올라서....조금은 사람을 피곤하게 만드는 영화. ^^')

' 나쁜 인상은 아니지만 오랫동안 기억에 남아 종종 떠올라서 조금은 사람을 피곤하게 만드는 영화 '

In [98]:
# 전처리
## NaN, 중복 제거
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file, sep, dup_field='document'):
  
    self.dataset = pd.read_csv(csv_file, sep=sep).dropna(axis=0) 
    if dup_field:
      self.dataset.drop_duplicates(subset=[dup_field], inplace=True)
    self.tokenizer = tokenizer

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:].values
    text = text_cleansing(row[0])

    if len(row) < 2:
      y = 0
    else:
      y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [99]:
# train, test dataset 준비
train_dataset = NSMCDataset("/content/nsmc/ratings_train.txt", '\t')
test_dataset = NSMCDataset("/content/nsmc/ratings_test.txt", '\t')
train_dataset[0]

                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000




(tensor([    2,  3079, 33345,  7082, 13215,  4065,  4116,  4150,  6933,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [100]:
# Train 환경 설정
epochs = 5
batch_size = 128
optimizer = AdamW(model.parameters(), lr=1e-5)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [101]:
#Training

losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))



Batch Loss: 69.10707527399063 Accuracy: tensor(0.5509, device='cuda:0')
Batch Loss: 132.6852662563324 Accuracy: tensor(0.6302, device='cuda:0')
Batch Loss: 184.6536005437374 Accuracy: tensor(0.6802, device='cuda:0')
Batch Loss: 230.98982188105583 Accuracy: tensor(0.7110, device='cuda:0')
Batch Loss: 274.6267458498478 Accuracy: tensor(0.7318, device='cuda:0')
Batch Loss: 316.35394367575645 Accuracy: tensor(0.7466, device='cuda:0')
Batch Loss: 356.7724584341049 Accuracy: tensor(0.7571, device='cuda:0')
Batch Loss: 396.54036071896553 Accuracy: tensor(0.7656, device='cuda:0')
Batch Loss: 434.3151884377003 Accuracy: tensor(0.7734, device='cuda:0')
Batch Loss: 471.88632410764694 Accuracy: tensor(0.7800, device='cuda:0')
Batch Loss: 508.91143107414246 Accuracy: tensor(0.7854, device='cuda:0')

Train Loss: 524.5880907252431 Accuracy: tensor(0.7874, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 35.43127501010895 Accuracy: tensor(0.8455, device='cuda:0')
Batch Loss: 69.99362386763096 Accuracy: tensor(0.8489, device='cuda:0')
Batch Loss: 103.5288502573967 Accuracy: tensor(0.8520, device='cuda:0')
Batch Loss: 137.87421499192715 Accuracy: tensor(0.8526, device='cuda:0')
Batch Loss: 171.999749943614 Accuracy: tensor(0.8520, device='cuda:0')
Batch Loss: 207.1256070882082 Accuracy: tensor(0.8512, device='cuda:0')
Batch Loss: 241.2078565210104 Accuracy: tensor(0.8514, device='cuda:0')
Batch Loss: 275.84064646065235 Accuracy: tensor(0.8511, device='cuda:0')
Batch Loss: 308.9022014886141 Accuracy: tensor(0.8518, device='cuda:0')
Batch Loss: 342.4180501550436 Accuracy: tensor(0.8521, device='cuda:0')
Batch Loss: 376.84874038398266 Accuracy: tensor(0.8522, device='cuda:0')

Train Loss: 391.05492383241653 Accuracy: tensor(0.8526, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 32.37973767518997 Accuracy: tensor(0.8573, device='cuda:0')
Batch Loss: 64.26474757492542 Accuracy: tensor(0.8598, device='cuda:0')
Batch Loss: 96.22224278748035 Accuracy: tensor(0.8608, device='cuda:0')
Batch Loss: 128.1199177801609 Accuracy: tensor(0.8623, device='cuda:0')
Batch Loss: 159.59329842031002 Accuracy: tensor(0.8626, device='cuda:0')
Batch Loss: 189.82957862317562 Accuracy: tensor(0.8639, device='cuda:0')
Batch Loss: 220.32567198574543 Accuracy: tensor(0.8650, device='cuda:0')
Batch Loss: 251.45565475523472 Accuracy: tensor(0.8652, device='cuda:0')
Batch Loss: 281.9277683198452 Accuracy: tensor(0.8659, device='cuda:0')
Batch Loss: 312.7619095593691 Accuracy: tensor(0.8661, device='cuda:0')
Batch Loss: 344.27640157938004 Accuracy: tensor(0.8657, device='cuda:0')

Train Loss: 357.9548992663622 Accuracy: tensor(0.8657, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 30.27489723265171 Accuracy: tensor(0.8702, device='cuda:0')
Batch Loss: 60.26538787782192 Accuracy: tensor(0.8726, device='cuda:0')
Batch Loss: 89.38130983710289 Accuracy: tensor(0.8745, device='cuda:0')
Batch Loss: 118.80826005339622 Accuracy: tensor(0.8744, device='cuda:0')
Batch Loss: 146.94555728137493 Accuracy: tensor(0.8763, device='cuda:0')
Batch Loss: 176.13868156075478 Accuracy: tensor(0.8764, device='cuda:0')
Batch Loss: 205.7573963701725 Accuracy: tensor(0.8761, device='cuda:0')
Batch Loss: 235.5367395579815 Accuracy: tensor(0.8756, device='cuda:0')
Batch Loss: 264.82060973346233 Accuracy: tensor(0.8755, device='cuda:0')
Batch Loss: 293.7229027748108 Accuracy: tensor(0.8756, device='cuda:0')
Batch Loss: 323.20567531883717 Accuracy: tensor(0.8755, device='cuda:0')

Train Loss: 335.88305404782295 Accuracy: tensor(0.8752, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 28.3016966432333 Accuracy: tensor(0.8787, device='cuda:0')
Batch Loss: 55.848214507102966 Accuracy: tensor(0.8818, device='cuda:0')
Batch Loss: 83.25853115320206 Accuracy: tensor(0.8827, device='cuda:0')
Batch Loss: 111.25274574756622 Accuracy: tensor(0.8828, device='cuda:0')
Batch Loss: 139.78401066362858 Accuracy: tensor(0.8817, device='cuda:0')
Batch Loss: 167.98809449374676 Accuracy: tensor(0.8817, device='cuda:0')
Batch Loss: 195.9541906118393 Accuracy: tensor(0.8820, device='cuda:0')
Batch Loss: 223.82260659337044 Accuracy: tensor(0.8818, device='cuda:0')
Batch Loss: 251.8782290071249 Accuracy: tensor(0.8818, device='cuda:0')
Batch Loss: 279.34421367943287 Accuracy: tensor(0.8820, device='cuda:0')
Batch Loss: 307.16702122986317 Accuracy: tensor(0.8820, device='cuda:0')

Train Loss: 319.78281730413437 Accuracy: tensor(0.8818, device='cuda:0')


In [102]:
# Test

model.eval()

test_correct = 0
test_total = 0
seq= 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  
  _, predicted = torch.max(y_pred, 1)

  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

  seq += 1
  #if seq>100: break

print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.8768, device='cuda:0')


In [75]:
# 모델 저장

import os

model_dir = '/content/model/model_KoELECTRA/'

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

print("Saving model to %s" % model_dir)

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)


Saving model to /content/model/model_KoELECTRA/


('/content/model/model_KoELECTRA/tokenizer_config.json',
 '/content/model/model_KoELECTRA/special_tokens_map.json',
 '/content/model/model_KoELECTRA/vocab.txt',
 '/content/model/model_KoELECTRA/added_tokens.json')

In [90]:
# kaggle competition의 test set에 대한 prediction 결과

output_dir = '/content/output/'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)\

final_dataset = NSMCDataset("/content/ko_data.csv", ',', dup_field=None)

wp = open('/content/output/ko_data_output_20201223.csv', 'w')
wp.write('Id,Predicted\n')

seq = 0

for text, attention_mask, y in final_dataset:
  output = model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))
  print(output)
  logits = output[0]
  logit = logits.detach().cpu().numpy()
  print(logit)
  wp.write('%d,%d\n' % (seq, np.argmax(logit)))
  seq += 1
  if seq > 10:
    break

wp.close() 

                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7114, -7.7849]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.71144  -7.784893]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7102, -7.7843]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.710239  -7.7843466]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7104, -7.7856]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.7104354 -7.7856197]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7114, -7.7851]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.711428 -7.785143]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7109, -7.7846]], device='cuda:0', grad_fn=<AddmmBackward>



SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7114, -7.7857]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.7113876 -7.785711 ]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7111, -7.7845]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.711133  -7.7844787]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7108, -7.7851]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.7108107 -7.78508  ]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7118, -7.7858]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.711816  -7.7858186]]
SequenceClassifierOutput(loss=None, logits=tensor([[ 6.7113, -7.7855]], device='cuda:0', grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
[[ 6.711316 -7.785481]]
