In [28]:
dataset_train = pd.read_csv("train_preprocess.csv")

In [29]:
dataset_test = dataset_train[700:]
dataset_train = dataset_train[:700]

In [30]:
dataset_test.reset_index(drop=True, inplace=True)

In [45]:
df_id = dataset_train["id"]
df_review = dataset_train["review"]
df_label = dataset_train["label"]

In [57]:
mystr = 'id\treview\tlabel\n' 
for i in range(len(df_review)):
   mystr += str(df_id[i]) + '\t' + str(df_review[i]) + '\t' + str(df_label[i]) + '\n'

In [58]:
df_id2 = dataset_test["id"]
df_review2 = dataset_test["review"]
df_label2 = dataset_test["label"]

In [59]:
mystr2 = 'id\treview\tlabel\n' 
for i in range(len(df_review2)):
   mystr2 += str(df_id2[i]) + '\t' +  str(df_review2[i]) + '\t' + str(df_label2[i]) + '\n'

In [60]:
with open('train.txt', 'w', encoding='utf-8') as f: 
  f.write(mystr)

In [61]:
with open('test.txt', 'w', encoding='utf-8') as f: 
  f.write(mystr2)

In [62]:
# HuggingFace transformers 설치
!pip install transformers



In [63]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [64]:
# GPU 사용
device = torch.device("cuda")

# Dataset 만들어서 불러오기 

In [65]:
class Dataset(Dataset):
  
  def __init__(self, csv_file):
    # 일부 값중에 NaN이 있음...
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    # 중복제거
    self.dataset.drop_duplicates(subset=['review'], inplace=True)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [66]:
train_dataset = Dataset("train.txt")
test_dataset = Dataset("test.txt")

                id       label
count   700.000000  700.000000
mean    741.194286    0.500000
std     370.177222    0.500358
min     174.000000    0.000000
25%     446.750000    0.000000
50%     683.500000    0.500000
75%     992.250000    1.000000
max    1577.000000    1.000000
                id       label
count   210.000000  210.000000
mean    740.490476    0.500000
std     371.349540    0.501195
min     178.000000    0.000000
25%     448.500000    0.000000
50%     658.500000    0.500000
75%     933.750000    1.000000
max    1575.000000    1.000000


# Create Model

In [67]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)

# 한번 실행해보기
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [68]:
#model.load_state_dict(torch.load("model.pt"))

In [69]:
# 모델 레이어 보기
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32200, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

# Learn

In [70]:
epochs = 30
batch_size = 128

In [71]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [72]:
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/6 [00:00<?, ?it/s]



Train Loss: 4.153777778148651 Accuracy: tensor(0.5043, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 4.139375269412994 Accuracy: tensor(0.5029, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 4.117312908172607 Accuracy: tensor(0.5514, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 4.0955506563186646 Accuracy: tensor(0.5957, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 4.0558600425720215 Accuracy: tensor(0.7171, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 4.001126945018768 Accuracy: tensor(0.8171, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.9285035729408264 Accuracy: tensor(0.8686, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.8346917629241943 Accuracy: tensor(0.8914, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.7304588556289673 Accuracy: tensor(0.8971, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.6244364976882935 Accuracy: tensor(0.8929, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.5356324315071106 Accuracy: tensor(0.9014, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.4309403896331787 Accuracy: tensor(0.9000, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.335951268672943 Accuracy: tensor(0.9000, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.255947470664978 Accuracy: tensor(0.8986, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.17874413728714 Accuracy: tensor(0.9043, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.0849900245666504 Accuracy: tensor(0.9086, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 3.00511234998703 Accuracy: tensor(0.9057, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.9458727836608887 Accuracy: tensor(0.9086, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.8565744757652283 Accuracy: tensor(0.9057, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.807108461856842 Accuracy: tensor(0.9114, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.7429873049259186 Accuracy: tensor(0.9114, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.6541576087474823 Accuracy: tensor(0.9157, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.5800066590309143 Accuracy: tensor(0.9171, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.564111292362213 Accuracy: tensor(0.9171, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.489231526851654 Accuracy: tensor(0.9200, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.4194483757019043 Accuracy: tensor(0.9214, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.3836301267147064 Accuracy: tensor(0.9200, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.329917997121811 Accuracy: tensor(0.9229, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.2981207370758057 Accuracy: tensor(0.9257, device='cuda:0')


  0%|          | 0/6 [00:00<?, ?it/s]

Train Loss: 2.2081437706947327 Accuracy: tensor(0.9243, device='cuda:0')


In [73]:
losses, accuracies

([4.153777778148651,
  4.139375269412994,
  4.117312908172607,
  4.0955506563186646,
  4.0558600425720215,
  4.001126945018768,
  3.9285035729408264,
  3.8346917629241943,
  3.7304588556289673,
  3.6244364976882935,
  3.5356324315071106,
  3.4309403896331787,
  3.335951268672943,
  3.255947470664978,
  3.17874413728714,
  3.0849900245666504,
  3.00511234998703,
  2.9458727836608887,
  2.8565744757652283,
  2.807108461856842,
  2.7429873049259186,
  2.6541576087474823,
  2.5800066590309143,
  2.564111292362213,
  2.489231526851654,
  2.4194483757019043,
  2.3836301267147064,
  2.329917997121811,
  2.2981207370758057,
  2.2081437706947327],
 [tensor(0.5043, device='cuda:0'),
  tensor(0.5029, device='cuda:0'),
  tensor(0.5514, device='cuda:0'),
  tensor(0.5957, device='cuda:0'),
  tensor(0.7171, device='cuda:0'),
  tensor(0.8171, device='cuda:0'),
  tensor(0.8686, device='cuda:0'),
  tensor(0.8914, device='cuda:0'),
  tensor(0.8971, device='cuda:0'),
  tensor(0.8929, device='cuda:0'),
  t

테스트 데이터셋 정확도 확인하기

In [74]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/14 [00:00<?, ?it/s]



Accuracy: tensor(0.9000, device='cuda:0')


In [75]:
# 모델 저장하기
torch.save(model.state_dict(), "model.pt")