In [1]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from transformers import ElectraModel, ElectraTokenizer
from tqdm.notebook import tqdm
import torch.nn as nn
import torch.nn.init as init
from sklearn.metrics import f1_score
data = pd.read_csv('/home/kdt-admin/cleaned_data.csv').dropna(axis=0)
data.reset_index(drop=True, inplace=True)

In [2]:
data['문장'].nunique(), data['감정'].nunique()

(173247, 6)

In [3]:
data.drop_duplicates(subset=['문장'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [4]:
data.loc[(data['감정'] == "중립"), '감정'] = 0  
data.loc[(data['감정'] == "슬픔"), '감정'] = 1  
data.loc[(data['감정'] == "분노"), '감정'] = 2  
data.loc[(data['감정'] == "불안"), '감정'] = 3  
data.loc[(data['감정'] == "행복"), '감정'] = 4  
data.loc[(data['감정'] == "당황"), '감정'] = 5  

In [5]:
data

Unnamed: 0,감정,문장
0,2,일은 왜 해도 해도 끝이 없을까 ? 화가 난다 . 그냥 내가 해결하는 게 나아 . ...
1,2,이번 달에 또 급여가 깎였어 ! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나...
2,2,회사에 신입이 들어왔는데 말투가 거슬려 . 그런 애를 매일 봐야 한다고 생각하니까 ...
3,2,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜 . 일도 많은 데 정말 분하고...
4,2,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나 . 상사인 나에게...
...,...,...
173242,5,오마이갓 믿을 수가 없어
173243,5,하느님 이거 너무 힘들어요
173244,5,엄청난 쇼
173245,5,오머나 정말 놀랐습니다


In [6]:
# GPU 사용
device = torch.device("cuda")

In [7]:
class CsixDataset(Dataset):
  
  def __init__(self, data):
    self.dataset = data 
    # 중복제거
    self.dataset.drop_duplicates(subset=['문장'], inplace=True)
    self.tokenizer =  AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    #"monologg/koelectra-small-v3-discriminator" small data
    #"monologg/koelectra-base-v3-discriminator" big data

    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[1]
    y = row[0]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=75,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [8]:
# 데이터셋을 트레이닝과 테스트로 나눔
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 트레이닝 데이터셋과 테스트 데이터셋 클래스 인스턴스 생성
train_dataset = CsixDataset(train_data)
test_dataset = CsixDataset(test_data)

            감정                             문장
count   138597                         138597
unique       6                         138597
top          0  어머 이게 무슨 일이에요 저는 뭐가 잘못된 것 같아요
freq     36182                              1
           감정       문장
count   34650    34650
unique      6    34650
top         0  네 . 도련님
freq     9016        1


In [9]:
num_classes = 6  # 클래스 수에 맞게 설정
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=num_classes)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
text, attention_mask, y = train_dataset[0]
model(text.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))



SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0464,  0.0233, -0.0597, -0.0039, -0.0554, -0.0641]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [11]:
# 모델 레이어 보기
model = model.to(device)

In [12]:
epochs = 4
batch_size = 16

In [13]:
optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)



In [14]:
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        _, predicted = torch.max(y_pred, 1)
        correct_train += (predicted == y_batch).sum().item()
        total_train += len(y_batch)
        
        # Print batch accuracy
        batch_accuracy = correct_train / total_train
        print(f"Epoch {epoch+1}/{epochs}, Batch Accuracy: {batch_accuracy:.4f}")

    train_losses.append(train_loss / len(train_loader))
    train_accuracy = correct_train / total_train
    train_accuracies.append(train_accuracy)

    # Testing
    model.eval()
    test_loss = 0.0
    correct_test = 0
    total_test = 0
    
    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
            loss = F.cross_entropy(y_pred, y_batch)

            test_loss += loss.item()

            _, predicted = torch.max(y_pred, 1)
            correct_test += (predicted == y_batch).sum().item()
            total_test += len(y_batch)
    
    test_losses.append(test_loss / len(test_loader))
    test_accuracy = correct_test / total_test
    test_accuracies.append(test_accuracy)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Loss: {test_losses[-1]:.4f}, Test Accuracy: {test_accuracy:.4f}")
        
    # Calculate F1 score
    
    predicted_labels = []
    true_labels = []

    with torch.no_grad():
        for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
            y_batch = y_batch.to(device)
            y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
            _, predicted = torch.max(y_pred, 1)

            predicted_labels.extend(predicted.cpu().numpy())
            true_labels.extend(y_batch.cpu().numpy())

    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    print("F1 Score:", f1)

  0%|          | 0/8663 [00:00<?, ?it/s]



Epoch 1/4, Batch Accuracy: 0.2500
Epoch 1/4, Batch Accuracy: 0.2500
Epoch 1/4, Batch Accuracy: 0.2500
Epoch 1/4, Batch Accuracy: 0.2500
Epoch 1/4, Batch Accuracy: 0.2625
Epoch 1/4, Batch Accuracy: 0.2812
Epoch 1/4, Batch Accuracy: 0.2411
Epoch 1/4, Batch Accuracy: 0.2500
Epoch 1/4, Batch Accuracy: 0.2361
Epoch 1/4, Batch Accuracy: 0.2437
Epoch 1/4, Batch Accuracy: 0.2386
Epoch 1/4, Batch Accuracy: 0.2344
Epoch 1/4, Batch Accuracy: 0.2260
Epoch 1/4, Batch Accuracy: 0.2366
Epoch 1/4, Batch Accuracy: 0.2333
Epoch 1/4, Batch Accuracy: 0.2344
Epoch 1/4, Batch Accuracy: 0.2316
Epoch 1/4, Batch Accuracy: 0.2326
Epoch 1/4, Batch Accuracy: 0.2401
Epoch 1/4, Batch Accuracy: 0.2406
Epoch 1/4, Batch Accuracy: 0.2411
Epoch 1/4, Batch Accuracy: 0.2386
Epoch 1/4, Batch Accuracy: 0.2418
Epoch 1/4, Batch Accuracy: 0.2422
Epoch 1/4, Batch Accuracy: 0.2375
Epoch 1/4, Batch Accuracy: 0.2404
Epoch 1/4, Batch Accuracy: 0.2384
Epoch 1/4, Batch Accuracy: 0.2388
Epoch 1/4, Batch Accuracy: 0.2349
Epoch 1/4, Bat

  0%|          | 0/2166 [00:00<?, ?it/s]

Epoch 1/4, Train Loss: 0.8086, Train Accuracy: 0.7142, Test Loss: 0.6425, Test Accuracy: 0.7728


  0%|          | 0/2166 [00:00<?, ?it/s]

F1 Score: 0.7709628524644369


  0%|          | 0/8663 [00:00<?, ?it/s]



Epoch 2/4, Batch Accuracy: 0.7500
Epoch 2/4, Batch Accuracy: 0.8438
Epoch 2/4, Batch Accuracy: 0.8750
Epoch 2/4, Batch Accuracy: 0.8125
Epoch 2/4, Batch Accuracy: 0.8000
Epoch 2/4, Batch Accuracy: 0.7500
Epoch 2/4, Batch Accuracy: 0.7679
Epoch 2/4, Batch Accuracy: 0.7500
Epoch 2/4, Batch Accuracy: 0.7431
Epoch 2/4, Batch Accuracy: 0.7312
Epoch 2/4, Batch Accuracy: 0.7330
Epoch 2/4, Batch Accuracy: 0.7344
Epoch 2/4, Batch Accuracy: 0.7356
Epoch 2/4, Batch Accuracy: 0.7366
Epoch 2/4, Batch Accuracy: 0.7417
Epoch 2/4, Batch Accuracy: 0.7422
Epoch 2/4, Batch Accuracy: 0.7426
Epoch 2/4, Batch Accuracy: 0.7361
Epoch 2/4, Batch Accuracy: 0.7434
Epoch 2/4, Batch Accuracy: 0.7344
Epoch 2/4, Batch Accuracy: 0.7292
Epoch 2/4, Batch Accuracy: 0.7386
Epoch 2/4, Batch Accuracy: 0.7391
Epoch 2/4, Batch Accuracy: 0.7422
Epoch 2/4, Batch Accuracy: 0.7375
Epoch 2/4, Batch Accuracy: 0.7356
Epoch 2/4, Batch Accuracy: 0.7361
Epoch 2/4, Batch Accuracy: 0.7388
Epoch 2/4, Batch Accuracy: 0.7371
Epoch 2/4, Bat

  0%|          | 0/2166 [00:00<?, ?it/s]

Epoch 2/4, Train Loss: 0.6118, Train Accuracy: 0.7819, Test Loss: 0.6072, Test Accuracy: 0.7861


  0%|          | 0/2166 [00:00<?, ?it/s]

F1 Score: 0.7836857549385257


  0%|          | 0/8663 [00:00<?, ?it/s]



Epoch 3/4, Batch Accuracy: 0.8750
Epoch 3/4, Batch Accuracy: 0.8750
Epoch 3/4, Batch Accuracy: 0.8542
Epoch 3/4, Batch Accuracy: 0.7812
Epoch 3/4, Batch Accuracy: 0.8000
Epoch 3/4, Batch Accuracy: 0.7812
Epoch 3/4, Batch Accuracy: 0.7857
Epoch 3/4, Batch Accuracy: 0.7734
Epoch 3/4, Batch Accuracy: 0.7847
Epoch 3/4, Batch Accuracy: 0.8000
Epoch 3/4, Batch Accuracy: 0.8068
Epoch 3/4, Batch Accuracy: 0.8073
Epoch 3/4, Batch Accuracy: 0.8173
Epoch 3/4, Batch Accuracy: 0.8170
Epoch 3/4, Batch Accuracy: 0.8125
Epoch 3/4, Batch Accuracy: 0.8086
Epoch 3/4, Batch Accuracy: 0.8051
Epoch 3/4, Batch Accuracy: 0.8056
Epoch 3/4, Batch Accuracy: 0.7961
Epoch 3/4, Batch Accuracy: 0.7969
Epoch 3/4, Batch Accuracy: 0.8036
Epoch 3/4, Batch Accuracy: 0.8011
Epoch 3/4, Batch Accuracy: 0.8098
Epoch 3/4, Batch Accuracy: 0.8125
Epoch 3/4, Batch Accuracy: 0.8125
Epoch 3/4, Batch Accuracy: 0.8053
Epoch 3/4, Batch Accuracy: 0.8032
Epoch 3/4, Batch Accuracy: 0.8058
Epoch 3/4, Batch Accuracy: 0.7996
Epoch 3/4, Bat

  0%|          | 0/2166 [00:00<?, ?it/s]

Epoch 3/4, Train Loss: 0.5492, Train Accuracy: 0.8042, Test Loss: 0.5932, Test Accuracy: 0.7910


  0%|          | 0/2166 [00:00<?, ?it/s]

F1 Score: 0.7892763153405569


  0%|          | 0/8663 [00:00<?, ?it/s]



Epoch 4/4, Batch Accuracy: 0.8125
Epoch 4/4, Batch Accuracy: 0.7500
Epoch 4/4, Batch Accuracy: 0.7917
Epoch 4/4, Batch Accuracy: 0.8125
Epoch 4/4, Batch Accuracy: 0.8375
Epoch 4/4, Batch Accuracy: 0.8542
Epoch 4/4, Batch Accuracy: 0.8571
Epoch 4/4, Batch Accuracy: 0.8438
Epoch 4/4, Batch Accuracy: 0.8403
Epoch 4/4, Batch Accuracy: 0.8250
Epoch 4/4, Batch Accuracy: 0.8352
Epoch 4/4, Batch Accuracy: 0.8333
Epoch 4/4, Batch Accuracy: 0.8269
Epoch 4/4, Batch Accuracy: 0.8304
Epoch 4/4, Batch Accuracy: 0.8375
Epoch 4/4, Batch Accuracy: 0.8398
Epoch 4/4, Batch Accuracy: 0.8419
Epoch 4/4, Batch Accuracy: 0.8472
Epoch 4/4, Batch Accuracy: 0.8520
Epoch 4/4, Batch Accuracy: 0.8500
Epoch 4/4, Batch Accuracy: 0.8393
Epoch 4/4, Batch Accuracy: 0.8381
Epoch 4/4, Batch Accuracy: 0.8342
Epoch 4/4, Batch Accuracy: 0.8307
Epoch 4/4, Batch Accuracy: 0.8300
Epoch 4/4, Batch Accuracy: 0.8341
Epoch 4/4, Batch Accuracy: 0.8403
Epoch 4/4, Batch Accuracy: 0.8393
Epoch 4/4, Batch Accuracy: 0.8297
Epoch 4/4, Bat

  0%|          | 0/2166 [00:00<?, ?it/s]

Epoch 4/4, Train Loss: 0.4979, Train Accuracy: 0.8230, Test Loss: 0.6037, Test Accuracy: 0.7906


  0%|          | 0/2166 [00:00<?, ?it/s]

F1 Score: 0.7882505968421499


In [15]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/2166 [00:00<?, ?it/s]



Accuracy: tensor(0.7906, device='cuda:0')


In [21]:
# 모델 저장하기
torch.save(model.state_dict(), "modelfinal.pt")

In [17]:
# 토크나이저 설정
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [30]:
koelectra_tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

result = koelectra_tokenizer.tokenize("너는 내년 대선 때 투표할 수 있어?")
print(result)
print([koelectra_tokenizer.encode(token) for token in result])

['너', '##는', '내년', '대선', '때', '투표', '##할', '수', '있', '##어', '?']
[[2, 2267, 3], [2, 7, 7, 2331, 3], [2, 6821, 3], [2, 7167, 3], [2, 2468, 3], [2, 7070, 3], [2, 7, 7, 3758, 3], [2, 2967, 3], [2, 3249, 3], [2, 7, 7, 3114, 3], [2, 35, 3]]


In [18]:
# 감정 예측 함수
def predict_emotion(input_sentence):
    inputs = tokenizer(input_sentence, return_tensors="pt", max_length=75, padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        logits = output.logits

    predicted_label = torch.argmax(logits, dim=1).item()
    emotions = ["중립", "슬픔", "분노", "불안", "행복", "당황"]
    predicted_emotion = emotions[predicted_label]

    return predicted_emotion

In [19]:
class_names = ["중립", "슬픔", "분노", "불안", "행복", "당황"]
def predict_with_prob(input_sentence):
    inputs = tokenizer(input_sentence, return_tensors="pt", max_length=75, padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    
    # 모델을 사용하여 감정 예측
    outputs = model(input_ids, attention_mask)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)

    # 예측된 감정
    predicted_class_idx = torch.argmax(logits, dim=1).item()
    predicted_emotion = class_names[predicted_class_idx]

    # 각 클래스(감정)별 확률
    class_probs = probs.squeeze().tolist()

    # 결과 반환
    return predicted_emotion, class_probs


In [31]:
# 질문 무한반복하기! 0 입력시 종료
end = 1
while end == 1:
    sentence = input("하고싶은 말을 입력해주세요 : ")
    if sentence == "0":
        break
    print("입력한 문장:", sentence)
    
    # 감정 예측 및 각 클래스(감정)별 확률 출력
    predicted_emotion, class_probs = predict_with_prob(sentence)
    print("예측된 감정:", predicted_emotion)
    print("각 클래스별 확률:")
    for class_name, prob in zip(class_names, class_probs):
        print(f"{class_name}: {prob*100:.2f}%")
    print("\n")



입력한 문장: 너무 슬프잖아...
예측된 감정: 슬픔
각 클래스별 확률:
중립: 6.78%
슬픔: 91.82%
분노: 0.26%
불안: 0.63%
행복: 0.29%
당황: 0.21%


