In [1]:
!pip install transformers
!pip install soynlp

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
import tensorflow as tf
import torch

from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, BertTokenizer
from soynlp.normalizer import repeat_normalize

import pandas as pd
import numpy as np
import random
import time
import datetime
import os
import re


In [5]:
data = pd.read_csv('final Deoksugung review.csv')
train_set, test_set = train_test_split(data, test_size=0.2,random_state=34)

print(train_set.shape,test_set.shape)

(197, 4) (50, 4)


In [6]:
# 반복 단어 축약
train_set['CONTENT'] = train_set['CONTENT'].apply(lambda sentence: repeat_normalize(sentence, num_repeats=2))
test_set['COTENT'] = test_set['CONTENT'].apply(lambda sentence: repeat_normalize(sentence, num_repeats=2))

#중복데이터제거
train_set.drop_duplicates(subset=['CONTENT'], inplace=True)
test_set.drop_duplicates(subset=['CONTENT'], inplace=True)

# 파일 저장
train_set.to_csv('preprocessed_train', encoding='utf-8')
test_set.to_csv('preprocessed_test', encoding='utf-8')

In [7]:
sentences = train_set['CONTENT'].values
sentences =["[CLS]"+ str(sentence) + "[SEP]" for sentence in sentences]
sentences[:10]

['[CLS]Its nice that you can watch it for 1000 won You can sit and listen to the sound of the water[SEP]',
 '[CLS]The night view of Deoksugung is also very good until 9pm[SEP]',
 '[CLS]Enter the Daehanmun Gate and enjoy the autumn in the city[SEP]',
 '[CLS]First time to explore Deoksugung Palace that I only heard from the lyrics[SEP]',
 '[CLS]A resting place in the city where the classics and the modern meet[SEP]',
 '[CLS]late Joseon Dynasty A piece of sad history[SEP]',
 '[CLS]It is nice to see Seokjojeon Hall and other palaces You must make a reservation to enter Seokjojeon Hall You dont have to make a reservation for the basement but there are few[SEP]',
 '[CLS]There is no one watching the work quietly and it is very noisy[SEP]',
 '[CLS]It was quiet and it was snowing during the holidays so it was nice to take a walk[SEP]',
 '[CLS]A cultural commentator gave detailed explanations starting near the entrance and saw the changing of the guards ceremony[SEP]']

In [8]:
#라벨 추출
labels = train_set['STAR'].values
labels

array([5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 4, 5, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 5, 5, 5, 4, 4, 5, 5,
       5, 5, 5, 5, 4, 4, 5, 5, 5, 5, 3, 3, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5,
       5, 5, 5, 4, 5, 4, 5, 4, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 2, 4, 5, 5, 5, 4, 5, 5, 5, 5,
       5, 5, 5, 4, 5, 1, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 4, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 3,
       5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5,
       4, 5, 3, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 4, 4, 5, 5, 5, 5, 5],
      dtype=int64)

In [9]:
print(len(sentences))
print(len(labels))

197
197


In [10]:

tokenizer = AutoTokenizer.from_pretrained('beomi/kcbert-large')
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print(sentences[0])
print(tokenized_texts[0])

[CLS]Its nice that you can watch it for 1000 won You can sit and listen to the sound of the water[SEP]
['[CLS]', 'I', '##t', '##s', 'n', '##ic', '##e', 't', '##h', '##at', 'you', 'c', '##an', 'w', '##at', '##ch', 'i', '##t', 'f', '##or', '1000', 'w', '##on', 'Y', '##ou', 'c', '##an', 's', '##it', 'a', '##nd', 'l', '##is', '##t', '##en', 'to', 'the', 's', '##ou', '##nd', 'of', 'the', 'w', '##at', '##er', '[SEP]']


In [11]:
lengths=[]
for token in tokenized_texts:
    lengths.append(len(token))
    
print(max(lengths))

159


In [12]:
MAX_LEN = 150

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
print(input_ids[0])

input_ids=pad_sequences(input_ids, maxlen=MAX_LEN,dtype='long',truncating='post',padding='post')
print(input_ids[0])

[2, 42, 4401, 4225, 79, 20121, 4226, 85, 5145, 15728, 23380, 68, 12016, 88, 15728, 28371, 74, 4401, 71, 13755, 11487, 88, 11866, 58, 14798, 68, 12016, 84, 21518, 66, 26023, 77, 21539, 4401, 17697, 26159, 19941, 84, 14798, 26023, 29162, 19941, 88, 15728, 13146, 3]
[    2    42  4401  4225    79 20121  4226    85  5145 15728 23380    68
 12016    88 15728 28371    74  4401    71 13755 11487    88 11866    58
 14798    68 12016    84 21518    66 26023    77 21539  4401 17697 26159
 19941    84 14798 26023 29162 19941    88 15728 13146     3     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0

In [13]:
attention_masks=[]

for seq in input_ids:
    seq_mask=[float(i>0) for i in seq]
    attention_masks.append(seq_mask)
    
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [14]:
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

print(test_inputs[0])
print(test_labels[0])
print(test_masks[0])

tensor([    2,    42,  4401,  4225,    79, 20121,  4226,    85,  5145, 15728,
        23380,    68, 12016,    88, 15728, 28371,    74,  4401,    71, 13755,
        11487,    88, 11866,    58, 14798,    68, 12016,    84, 21518,    66,
        26023,    77, 21539,  4401, 17697, 26159, 19941,    84, 14798, 26023,
        29162, 19941,    88, 15728, 13146,     3,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [15]:
# 배치 사이즈
batch_size = 16

# 파이토치의 DataLoader로 입력, 마스크, 라벨을 묶어 데이터 설정
# 학습시 배치 사이즈 만큼 데이터를 가져옴
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [16]:
# 분류를 위한 BERT 모델 생성
model = BertForSequenceClassification.from_pretrained("beomi/kcbert-large", num_labels=2)

Some weights of the model checkpoint at beomi/kcbert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initial

In [17]:

# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 10

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 처음에 학습률을 조금씩 변화시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



NameError: name 'train_dataloader' is not defined