In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [2]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing at
# the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset 만들어서 불러오기

In [3]:
# Read the complete review corpus from the CSV file in the working directory.
total_dataset = pd.read_csv(filepath_or_buffer='total_dataset.csv')

In [4]:
# Cleaning pipeline, expressed as one chain instead of in-place mutations:
#   1. turn empty strings into NaN so they count as missing values,
#   2. drop every row that has a missing value,
#   3. drop rows whose 'document' text duplicates an earlier row.
total_dataset = (
    total_dataset
    .replace('', np.nan)
    .dropna()
    .drop_duplicates(subset=['document'])
)

In [5]:
# Hold out 20% of the cleaned rows for testing; the fixed random_state keeps
# the split identical from run to run.
train_dataset, test_dataset = train_test_split(
    total_dataset,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Print the (rows, columns) shape of the train split, then of the test split.
for split_frame in (train_dataset, test_dataset):
    print(split_frame.shape)

(252830, 3)
(63208, 3)


In [7]:
class ReviewDataset(Dataset):
    """Torch ``Dataset`` that tokenizes one review per item.

    The wrapped frame is accessed by position: column 1 is read as the review
    text and column 2 as its class label.

    Args:
        dataset: pandas DataFrame holding the reviews.
        max_length: Length every example is truncated/padded to (default 256,
            the value that was previously hard-coded).
        tokenizer_name: Checkpoint name passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, dataset, max_length=256,
                 tokenizer_name="monologg/koelectra-small-v3-discriminator"):
        self.dataset = dataset
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Print summary statistics of the wrapped frame as a quick sanity check.
        print(self.dataset.describe())

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for row ``idx``.

        ``input_ids`` and ``attention_mask`` are the first (and only) row of
        the tokenizer's batched output; ``label`` is a plain Python ``int``.
        """
        # Read the two cells as scalars instead of building a mixed-dtype
        # object row; str()/int() normalise the types for the tokenizer and
        # for the DataLoader's default collation.
        text = str(self.dataset.iloc[idx, 1])
        y = int(self.dataset.iloc[idx, 2])

        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` flag.
            padding='max_length',
            add_special_tokens=True,
        )

        # The tokenizer returns tensors with a leading batch axis of size 1.
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [8]:
# Wrap each DataFrame split in a ReviewDataset (train first, then test).
# NOTE: the names are rebound from DataFrames to ReviewDataset objects, so
# re-run the split cell before re-running this one; otherwise the constructor
# receives a ReviewDataset and its describe() call fails.
train_dataset = ReviewDataset(dataset=train_dataset)
test_dataset = ReviewDataset(dataset=test_dataset)

                  id          label
count  252830.000000  252830.000000
mean   220610.245200       0.591453
std    124501.500447       0.491566
min         1.000000       0.000000
25%    103259.500000       0.000000
50%    242805.500000       1.000000
75%    327442.750000       1.000000
max    411846.000000       1.000000
                  id         label
count   63208.000000  63208.000000
mean   220985.187239      0.591049
std    124628.280680      0.491644
min         0.000000      0.000000
25%    103728.500000      0.000000
50%    243701.500000      1.000000
75%    327973.000000      1.000000
max    411847.000000      1.000000


# 모델 생성

In [9]:
# Load the pretrained KoELECTRA-small discriminator checkpoint into a sequence
# classification model, then move it onto the selected device.
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-small-v3-discriminator"
)
model = model.to(device)

# Quick smoke test (uncomment to push a single example through the model):
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [10]:
# Optional: uncomment to restore weights from a previously saved "model.pt"
# checkpoint instead of starting from the freshly loaded model.
# model.load_state_dict(torch.load("model.pt"))

In [11]:
# Display the model's layer structure (the cell output is the module repr).
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

# 모델 학습

In [12]:
# Training hyper-parameters: number of passes over the training set and the
# number of examples per training batch.
epochs, batch_size = 4, 32

In [13]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of it. eps=1e-6 and weight_decay=0.0 reproduce the defaults of the
# transformers implementation so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Shuffle only the training data; evaluation does not benefit from shuffling
# and a fixed order keeps test-set results comparable between runs.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [14]:
# Per-epoch training history.
losses = []      # summed (not averaged) batch losses of each epoch
accuracies = []  # training accuracy of each epoch, stored as Python floats

for epoch in range(epochs):
    total_loss = 0.0  # running sum of the batch losses in this epoch
    correct = 0       # correctly classified examples so far
    total = 0         # examples seen so far
    batches = 0

    model.train()

    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for input_ids_batch, attention_masks_batch, y_batch in progress:
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        # The first element of the model output is used as the class logits.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted = y_pred.argmax(dim=1)
        # .item() turns the count into a Python int so no CUDA tensor is
        # accumulated across batches or stored in the history lists.
        correct += (predicted == y_batch).sum().item()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            # Note: the printed loss is the running sum for the epoch so far.
            print("Batch Loss:", total_loss, "Accuracy:", correct / total)

    # Guard against an empty loader, where no example was seen.
    accuracy = correct / total if total else 0.0
    losses.append(total_loss)
    accuracies.append(accuracy)
    print("Train Loss:", total_loss, "Accuracy:", accuracy)

  0%|          | 0/7901 [00:00<?, ?it/s]



Batch Loss: 68.03135925531387 Accuracy: tensor(0.5953, device='cuda:0')


KeyboardInterrupt: 

In [None]:
# Display the per-epoch loss and accuracy lists filled in by the training loop.
losses, accuracies