In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [19]:
tokenizer = BertTokenizer.from_pretrained("monologg/kobert")

In [11]:
train_df = pd.read_csv('/content/drive/MyDrive/NLP_commit/Comment_guard/unsmile_train_v1.0.tsv', delimiter='\t')
valid_df = pd.read_csv('/content/drive/MyDrive/NLP_commit/Comment_guard/unsmile_valid_v1.0.tsv', delimiter='\t')

In [21]:
# '문장' 열과 레이블 생성 (0: 부정적인 문장)
train_sentences = train_df['문장']
train_labels = torch.zeros(len(train_df), dtype=torch.long)

valid_sentences = valid_df['문장']
valid_labels = torch.zeros(len(valid_df), dtype=torch.long)


In [22]:
# 문장을 토크나이저로 변환하여 패딩 적용하는 함수 정의
def tokenize_sentences(sentences, max_length=50):
    tokens = tokenizer(sentences.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    return tokens

In [23]:
# 훈련 데이터와 검증 데이터 전처리
train_tokens = tokenize_sentences(train_sentences)
valid_tokens = tokenize_sentences(valid_sentences)

In [15]:
print(train_tokens)
print(valid_tokens)

{'input_ids': tensor([[2, 0, 0,  ..., 1, 1, 1],
        [2, 0, 0,  ..., 1, 1, 1],
        [2, 0, 0,  ..., 1, 1, 1],
        ...,
        [2, 0, 0,  ..., 1, 1, 1],
        [2, 0, 0,  ..., 1, 1, 1],
        [2, 0, 0,  ..., 1, 1, 1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
{'input_ids': tensor([[   2,    0, 7347,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,    0, 5782,  ...,    1,    1,    1],
        ...,
        [   2,    0,    0,  ...,    1,    1,    1],
        [   2,  497,    0,  ...,    1,    1,    1],
      

In [24]:
# 데이터셋 클래스 정의
class NegativeSentenceDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {'input_ids': self.input_ids[idx], 'labels': self.labels[idx]}

In [25]:
# DataLoader 생성
train_dataset = NegativeSentenceDataset(train_tokens['input_ids'], train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

valid_dataset = NegativeSentenceDataset(valid_tokens['input_ids'], valid_labels)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [26]:
model = BertForSequenceClassification.from_pretrained("monologg/kobert", num_labels=2)  # 이진 분류

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# 옵티마이저 및 손실 함수 정의
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()



In [28]:
# 학습 및 평가 함수 정의
def train_epoch(model, dataloader, loss_fn, optimizer):
    model.train()
    total_loss = 0.0
    for batch in dataloader:
        input_ids, labels = batch['input_ids'], batch['labels']
        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, loss_fn):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, labels = batch['input_ids'], batch['labels']
            outputs = model(input_ids, labels=labels)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return total_loss / len(dataloader), accuracy.

In [29]:
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, loss_fn, optimizer)
    valid_loss, valid_accuracy = evaluate(model, valid_dataloader, loss_fn)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: ignored