In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 7.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AdamW 
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
device = torch.device("cuda")

In [None]:
train_data = pd.read_csv("__file_location__/preprocessed_emo_class.csv", encoding="cp949")

In [None]:
X_train, X_test = train_test_split(train_data, test_size = 0.2)

In [None]:
X_test

Unnamed: 0,text,label
3562,나 요즘 주식을 시작했어.,2
547,"프로젝트를 진행하느라고 친구들을 못 만났는데, 이번 기회에 만나봐야겠어.",0
645,나 축하해 달라구.,0
6466,집에 와서 조금 쉬었더니 마음이 진정되었어.,4
7248,아직까지는 집 안으로 물이 들어오지 않았어. 내 몸도 아픈데는 없어.,5
...,...,...
1166,내가 너무나 좋아하는 인플루언서가 진행하는 이벤트가 있었는데 거기에 당첨되었어.,0
3145,어. 집에만 있으니까 우울한 거 같애.,2
6334,산책하다가 목줄이 끊어졌다니까?,4
5009,그래야 할 것 같아. 냄새가 너무 독해. 냄새를 맡을 수가 없어. 청소도 더 해야될...,3


In [None]:
len(X_train)

6708

In [None]:
train_dataset = []
for sen, label in zip(X_train['text'], X_train['label']):
  data_train = []
  data_train.append(sen)
  data_train.append(str(label))

  train_dataset.append(data_train)

In [None]:
train_dataset[:5]

[['휴가 첫 날 부터 엄청 비가 오기 시작했어.', '1'],
 ['아. 어 집에 방향제 없는데 그것도 같이 사와야겠다.', '3'],
 ['맞아. 성격이 별로 안 좋아.', '1'],
 ['노력해서 받은 결과라 기분이 좋아.', '0'],
 ['스트레스를 확 날려버릴 음악으로 좀 부탁해.', '1']]

In [None]:
print(len(X_train))
print(len(X_test))

6708
1678


In [None]:
train_data['text'].values

array(['그래 그것도 좋은 방법인 것 같아.',
       '그래야지. 친구들이랑 이제 맛있는 음식도 많이 먹고 재미난 것도 많이 하려구.', '나 드디어 프로젝트 끝났어!',
       ..., '몸은 괜찮은데 언제 나갈 수 있을지 모르겠어.', '나 엘리베이터에 갇혔어.',
       '자취방 엘리베이턴데 정전인가봐.'], dtype=object)

In [None]:
str(train_data.values[0][0])

'그래 그것도 좋은 방법인 것 같아.'

In [None]:
class TrainDataset(Dataset):
  
  def __init__(self, dataset):
    self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

    self.sentences = [str([i[0]]) for i in dataset]
    self.labels = [np.int32(i[1]) for i in dataset]

  def __len__(self):
    return (len(self.labels))
  
  def __getitem__(self, i):
    text = self.sentences[i]
    y = self.labels[i]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=64,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
train_dataset = TrainDataset(train_dataset)

In [None]:
train_dataset[0]



(tensor([    2,    61,    11, 10648,  3222,   599,  9262,  8516,  1838,  4050,
          2571,  4036,  8438, 11357,    18,    11,    63,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 1)

In [None]:
tok = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

In [None]:
tok.decode(train_dataset[0][0])



"[CLS] ['휴가 첫 날 부터 엄청 비가 오기 시작했어.'] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [None]:
from torch import nn

model = AutoModel.from_pretrained("beomi/KcELECTRA-base", num_labels=6)
# model.classifier = torch.nn.Sequential(
#                                         nn.Linear(768, 768, bias=True),
#                                         nn.Dropout(p=0.1, inplace=False),
model = model.to(device)

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
batch_size = 64
epochs = 5

In [None]:
optimizer = AdamW(model.parameters(), lr=3e-5)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)



In [None]:
input_ids_batch, attention_masks_batch, y_batch = iter(train_dataloader).next()



In [None]:
print(input_ids_batch[1])
print(attention_masks_batch)
print(y_batch)

tensor([    2,    61,    11,  8523, 12466,  4006,    18,  2434, 19266,  4033,
        13235,  4422,    18,    11,    63,     3,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([4, 3, 0, 0, 5, 1, 3, 2, 2, 2, 2, 2, 5, 2, 2, 4, 0, 1, 5, 1, 1, 0, 4, 3,
        4, 0, 0, 0, 5, 4, 3, 3, 0, 3, 1, 0, 4, 3, 5, 4, 0, 4, 5, 3, 2, 3, 2, 5,
        3, 2, 4, 1, 1, 3, 0, 0, 3, 1, 3, 3, 3, 2, 4, 3], dtype=torch.int32)


In [None]:
tok = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")

In [None]:
input_ids_batch[0]

tensor([    2,    61,    11,  7983, 13509, 31436, 28977,  8179,  8155,  8516,
          702,  4679,  4006,    18,    11,    63,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])

In [None]:
tok.decode(input_ids_batch[0])

"[CLS] ['아니 아까 해피 산책시키다가 엄청 놀랬어.'] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [None]:
print(input_ids_batch.shape)
print(attention_masks_batch.shape)
print(y_batch.shape)

torch.Size([64, 64])
torch.Size([64, 64])
torch.Size([64])


In [None]:
model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0][:,-1,:].shape

torch.Size([64, 768])

In [None]:
test1 = np.array([[[0,1,2],[3,4,5],[6,7,8]],
                  [[9,10,11],[12,13,14],[15,16,17]],
                  [[18,19,20],[21,22,23],[24,25,26]]])

In [None]:
test1[:,-1,-1]

array([ 8, 17, 26])

In [None]:
losses = []
accuracies = []

#정확도 측정을 위한 함수 정의
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

loss_fn = nn.CrossEntropyLoss()

for i in range(epochs):
  train_acc = 0.0
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_dataloader):
    optimizer.zero_grad()
    y_batch = y_batch.long().to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    y_pred = y_pred[:, -1, :]
    loss = loss_fn(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    train_acc += calc_accuracy(y_pred, y_batch)
    total += len(y_batch)

    batches += 1
    if batches % 50 == 0:
      print("epoch {} loss {} train acc {}".format(i+1, loss.data.cpu().numpy(), train_acc / (batches+1)))
  print("epoch {} loss {} train acc {}".format(i+1, loss.data.cpu().numpy(), train_acc / (batches+1)))
  model.eval()
  

  0%|          | 0/105 [00:00<?, ?it/s]



epoch 1 loss 3.847790479660034 train acc 0.14399509803921567
epoch 1 loss 2.8706347942352295 train acc 0.27645420792079206
epoch 1 loss 2.1791443824768066 train acc 0.2918632075471698


  0%|          | 0/105 [00:00<?, ?it/s]

epoch 2 loss 2.035689353942871 train acc 0.6752450980392157
epoch 2 loss 1.8723305463790894 train acc 0.7074566831683168
epoch 2 loss 1.6024911403656006 train acc 0.7100190493468795


  0%|          | 0/105 [00:00<?, ?it/s]

epoch 3 loss 0.7275426983833313 train acc 0.8458946078431373
epoch 3 loss 1.0226964950561523 train acc 0.8620049504950495
epoch 3 loss 1.445241928100586 train acc 0.8638198476052249


  0%|          | 0/105 [00:00<?, ?it/s]

epoch 4 loss 1.0372668504714966 train acc 0.8964460784313726
epoch 4 loss 0.6607805490493774 train acc 0.9130569306930693
epoch 4 loss 1.0294392108917236 train acc 0.9149129172714078


  0%|          | 0/105 [00:00<?, ?it/s]

epoch 5 loss 1.1582515239715576 train acc 0.9325980392156863
epoch 5 loss 0.6606323719024658 train acc 0.9395111386138614
epoch 5 loss 0.9732204079627991 train acc 0.9406748911465893


In [None]:
def predict(sentence):
    data = [sentence, '0']
    dataset_another = [data]
    logits = 0
    another_test = TrainDataset(dataset_another)
    test_dataloader = torch.utils.data.DataLoader(another_test)

    model.eval()

    for input_ids_batch, attention_masks_batch, y_batch in test_dataloader:
        y_batch = y_batch.long().to(device)
        out = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        out = out[:, -1, :]

        for i in out:
            logits = i
            logits = logits.detach().cpu().numpy()
            logits = np.argmax(logits)
    return logits

In [None]:
predict('아니 왜 나한테만 이러는거야 생각은 열심히 잘 했는데 결과가 왜이래?')



1

In [None]:
#torch.save(model.state_dict(), "__file_location__/emo_classify_model.pt")

In [None]:
X_test

Unnamed: 0,text,label
3562,나 요즘 주식을 시작했어.,2
547,"프로젝트를 진행하느라고 친구들을 못 만났는데, 이번 기회에 만나봐야겠어.",0
645,나 축하해 달라구.,0
6466,집에 와서 조금 쉬었더니 마음이 진정되었어.,4
7248,아직까지는 집 안으로 물이 들어오지 않았어. 내 몸도 아픈데는 없어.,5
...,...,...
1166,내가 너무나 좋아하는 인플루언서가 진행하는 이벤트가 있었는데 거기에 당첨되었어.,0
3145,어. 집에만 있으니까 우울한 거 같애.,2
6334,산책하다가 목줄이 끊어졌다니까?,4
5009,그래야 할 것 같아. 냄새가 너무 독해. 냄새를 맡을 수가 없어. 청소도 더 해야될...,3


In [None]:
predict('나 요즘 주식을 시작했어.')



2

In [None]:
X_test['text'].values[0]

'나 요즘 주식을 시작했어.'

In [65]:
predicted_a = []
for i in tqdm(range(len(X_test))):
  p = predict(X_test['text'].values[i])
  predicted_a.append(p)

  0%|          | 0/1678 [00:00<?, ?it/s]



In [69]:
print(predicted_a)

[2, 0, 0, 4, 5, 3, 2, 4, 4, 5, 3, 4, 5, 5, 4, 4, 0, 4, 1, 5, 2, 4, 4, 4, 0, 4, 5, 3, 1, 3, 0, 4, 5, 1, 0, 1, 1, 2, 1, 2, 5, 2, 3, 1, 0, 1, 4, 0, 1, 5, 1, 5, 0, 5, 5, 4, 0, 2, 3, 4, 1, 2, 5, 3, 0, 1, 0, 3, 4, 2, 2, 1, 0, 2, 4, 5, 0, 3, 1, 3, 4, 1, 2, 3, 1, 2, 3, 5, 1, 2, 3, 5, 1, 0, 4, 3, 3, 0, 4, 5, 4, 0, 4, 4, 4, 4, 0, 3, 0, 2, 4, 5, 1, 1, 0, 5, 5, 1, 2, 4, 0, 1, 5, 1, 2, 4, 4, 5, 1, 4, 5, 2, 0, 1, 1, 4, 2, 3, 1, 0, 3, 3, 5, 2, 0, 1, 1, 0, 5, 4, 0, 5, 3, 5, 5, 4, 3, 2, 5, 4, 4, 3, 1, 5, 1, 2, 3, 4, 3, 5, 4, 3, 0, 0, 0, 0, 4, 3, 2, 2, 4, 0, 2, 2, 4, 1, 1, 3, 4, 2, 0, 2, 3, 0, 1, 3, 3, 3, 3, 2, 5, 0, 2, 4, 1, 2, 3, 4, 3, 2, 2, 3, 5, 0, 1, 1, 5, 2, 2, 4, 1, 5, 5, 2, 0, 0, 0, 3, 4, 3, 2, 1, 5, 3, 3, 3, 0, 3, 0, 5, 0, 1, 2, 4, 0, 3, 2, 1, 2, 3, 3, 0, 5, 0, 0, 5, 3, 0, 1, 2, 2, 2, 2, 3, 2, 5, 0, 5, 0, 2, 5, 5, 3, 4, 5, 2, 0, 2, 3, 2, 0, 5, 3, 3, 2, 4, 3, 1, 3, 0, 3, 4, 5, 0, 2, 4, 0, 2, 4, 0, 4, 3, 3, 1, 1, 4, 5, 5, 2, 0, 0, 0, 1, 1, 5, 0, 2, 2, 5, 4, 0, 0, 4, 4, 0, 3, 1, 2, 2, 0, 2, 2, 3, 

In [71]:
print(X_test['label'].values.tolist())

[2, 0, 0, 4, 5, 3, 2, 4, 4, 5, 3, 4, 5, 5, 4, 4, 0, 4, 1, 5, 2, 4, 4, 4, 0, 4, 5, 3, 1, 3, 0, 4, 5, 1, 0, 1, 1, 2, 1, 2, 5, 2, 3, 1, 0, 1, 4, 0, 1, 5, 1, 5, 0, 5, 5, 4, 0, 2, 3, 4, 1, 2, 5, 1, 0, 1, 0, 3, 4, 2, 2, 1, 0, 2, 4, 5, 0, 3, 1, 3, 4, 1, 2, 3, 1, 2, 3, 5, 1, 0, 4, 5, 1, 0, 4, 3, 2, 0, 4, 5, 4, 0, 4, 4, 4, 4, 0, 3, 0, 2, 5, 5, 1, 5, 0, 5, 5, 1, 2, 4, 0, 5, 5, 1, 2, 4, 4, 5, 3, 4, 5, 2, 0, 1, 5, 4, 2, 3, 2, 0, 3, 3, 5, 2, 0, 1, 0, 0, 5, 4, 0, 5, 3, 5, 5, 4, 3, 2, 5, 5, 4, 3, 1, 5, 2, 2, 3, 4, 3, 5, 4, 3, 0, 0, 0, 0, 4, 3, 2, 2, 4, 0, 2, 1, 4, 1, 0, 3, 4, 2, 0, 2, 3, 0, 1, 3, 3, 3, 3, 2, 5, 0, 2, 4, 1, 2, 0, 4, 1, 2, 2, 3, 5, 0, 1, 1, 5, 2, 2, 4, 1, 5, 5, 2, 0, 0, 4, 3, 4, 3, 2, 1, 5, 3, 3, 2, 0, 3, 0, 5, 0, 1, 2, 4, 0, 3, 2, 1, 2, 3, 3, 0, 5, 0, 0, 5, 3, 0, 1, 2, 2, 2, 2, 3, 2, 5, 0, 5, 2, 2, 5, 5, 3, 4, 5, 2, 0, 2, 3, 3, 0, 5, 3, 3, 2, 4, 3, 1, 3, 0, 3, 4, 5, 0, 2, 4, 0, 2, 4, 0, 4, 3, 3, 1, 1, 4, 5, 5, 2, 0, 0, 0, 1, 1, 5, 0, 2, 2, 5, 4, 0, 0, 4, 4, 0, 3, 2, 2, 2, 2, 2, 2, 3, 

In [72]:
predicted_a == X_test['label'].values.tolist()

False

In [73]:
answer = 0
for x, y in zip(predicted_a, X_test['label'].values.tolist()):
  if x == y:
    answer += 1


In [75]:
answer/len(predicted_a)

0.9165673420738975