In [None]:
# Mount Google Drive so the files under /content/drive/MyDrive used by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 14.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 67.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.6 MB/s 


In [None]:
import torch
import pandas as pd
import torch.nn as nn
from tqdm import tqdm, tqdm_notebook
import numpy as np
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer
from transformers import AdamW

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# this cell does not hand out a device that cannot be used.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
# Load the three CSV splits from the shared Drive folder (directory stated once).
_DATA_DIR = "/content/drive/MyDrive/자연어처리음성인식/조상연/BERT 감정분류"
train_data, valid_data, test_data = (
    pd.read_csv(f"{_DATA_DIR}/{_split}_data.csv")
    for _split in ("train", "valid", "test")
)

In [None]:
# Second, independent copy of the training CSV; later cells split it by label
# into the `depression` / `notdepression` frames.
d = pd.read_csv("/content/drive/MyDrive/자연어처리음성인식/조상연/BERT 감정분류/train_data.csv")

In [None]:
# Distinct label strings found in the training split. The binarisation cells
# below treat the entry at position 2 as the positive class.
classes = train_data['label'].unique().tolist()
classes

['기쁨', '불안', '슬픔', '분노']

In [None]:
# Binarise the training labels: classes[2] -> 1, every other emotion -> 0.
#
# This is one vectorised column assignment. The previous row-by-row
# `train_data.iloc[i][0] = ...` was chained indexing, which pandas does not
# guarantee to write back into the frame, and it was slow on ~35k rows.
# Column 0 is the 'label' column (`classes` was built from it), so it is
# addressed by name here.
_unknown = set(train_data["label"]) - set(classes)
if _unknown:
    # Fail fast, as the old `classes.index(...)` lookup did. This also trips if
    # the cell is re-run on labels that have already been binarised.
    raise ValueError(f"unexpected label values: {sorted(_unknown, key=str)}")
train_data["label"] = (train_data["label"] == classes[2]).astype(int)

In [None]:
# Binarise the validation and test labels with the same rule as the training
# split: classes[2] -> 1, every other emotion -> 0.
#
# Each frame gets one vectorised column assignment. The previous row-by-row
# `frame.iloc[i][0] = ...` was chained indexing, which pandas does not
# guarantee to write back into the frame.
for _frame in (valid_data, test_data):
    _unknown = set(_frame["label"]) - set(classes)
    if _unknown:
        # Fail fast, as the old `classes.index(...)` lookup did, on labels that
        # never appeared in the training split (or on an accidental re-run).
        raise ValueError(f"unexpected label values: {sorted(_unknown, key=str)}")
    _frame["label"] = (_frame["label"] == classes[2]).astype(int)

In [None]:
# Binarise the labels of `d` (the second copy of the training CSV):
# classes[2] -> 1, every other emotion -> 0.
#
# This is one vectorised column assignment. The previous row-by-row
# `d.iloc[i][0] = ...` was chained indexing, which pandas does not guarantee
# to write back into the frame.
_unknown = set(d["label"]) - set(classes)
if _unknown:
    # Fail fast, as the old `classes.index(...)` lookup did. This also trips if
    # the cell is re-run on labels that have already been binarised.
    raise ValueError(f"unexpected label values: {sorted(_unknown, key=str)}")
d["label"] = (d["label"] == classes[2]).astype(int)

In [None]:
# Partition `d` by its binarised label: rows labelled 1 and rows labelled 0.
_is_positive = d["label"].eq(1)
_is_negative = d["label"].eq(0)
depression = d[_is_positive]
notdepression = d[_is_negative]

In [None]:
class BERTDataset(Dataset):
  """Pre-tokenised text/label dataset for the klue/bert-base classifier.

  Every text is encoded once at construction time into a fixed-length pair
  ``(attention_mask, token_ids)`` of ``max_len`` int32 values each.

  Args:
    data: DataFrame whose second column (position 1) holds the text and whose
      ``'label'`` column holds an integer-convertible label.
    max_len: Exact length of every encoded sequence. Longer texts are
      truncated by the tokenizer; shorter ones are padded with ``[PAD]``.
  """

  def __init__(self, data, max_len):
    super(BERTDataset, self).__init__()
    self.data = data
    self.max_len = max_len
    self.tokenizer = AutoTokenizer.from_pretrained("klue/bert-base", use_fast = True)

    # Pull both columns out once instead of building a row Series per access.
    texts = data.iloc[:, 1].tolist()
    labels = data['label'].tolist()
    self.inputs = [self.convert_token([text]) for text in texts]
    self.label = [np.int32(label) for label in labels]

  def convert_token(self, data):
    """Encode ``data[0]`` into ``[attention_mask, token_ids]`` of length ``max_len``.

    Args:
      data: Sequence whose first element is the text to encode.

    Returns:
      A two-element list of int32 numpy arrays: the attention mask (1 for real
      tokens, 0 for padding) and the padded token ids.
    """
    # Truncate inside the tokenizer. Without this, a text longer than max_len
    # produced a negative pad count and arrays of differing lengths, which
    # cannot be batched together.
    token = self.tokenizer.encode(data[0], max_length=self.max_len, truncation=True)
    pad_count = self.max_len - len(token)
    attention_mask = [1] * len(token) + [0] * pad_count
    token = token + self.tokenizer.convert_tokens_to_ids(["[PAD]"] * pad_count)
    return [np.int32(attention_mask), np.int32(token)]

  def __getitem__(self, idx):
    """Return ``(attention_mask, token_ids, label)`` for example ``idx``."""
    attention_mask, token_ids = self.inputs[idx]
    return attention_mask, token_ids, self.label[idx]

  def __len__(self):
    """Number of examples in the dataset."""
    return len(self.label)

In [None]:
# Class balance of the binarised training labels.
train_data["label"].value_counts()

0    20980
1    13539
Name: label, dtype: int64

In [None]:
# Tokenise the training split up front; 128 is the fixed sequence length.
train_dataset = BERTDataset(train_data, 128)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Per-class datasets; the training loop draws positive/negative reference batches from these.
depression_dataset = BERTDataset(depression, 128)
notdepression_dataset = BERTDataset(notdepression, 128)

In [None]:
# Tokenise the validation and test splits with the same sequence length as training.
valid_dataset = BERTDataset(valid_data, 128)
test_dataset = BERTDataset(test_data, 128)

In [None]:
class EmotionClassifier(nn.Module):
  """klue/bert-base encoder with a sigmoid classification head.

  Args:
    num_classes: Width of the output layer. The default of 1 gives a single
      sigmoid probability per example, which is what the rest of this
      notebook expects.
  """

  def __init__(self, num_classes = 1):
    super(EmotionClassifier, self).__init__()
    self.bert = AutoModel.from_pretrained("klue/bert-base")
    # Fine-tune the whole encoder (every BERT parameter stays trainable).
    for param in self.bert.parameters():
      param.requires_grad = True
    # Size the head from the loaded encoder and the requested class count
    # instead of hard-coding 768 -> 1; `num_classes` was previously ignored.
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask, positive = None, positive_attention_mask= None, negetive= None, negetive_mask= None):
    """Classify a batch and optionally embed triplet reference batches.

    Args:
      input_ids: Token ids of the batch to classify (the anchors).
      attention_mask: Attention mask matching ``input_ids``.
      positive, positive_attention_mask: Optional token ids / mask of a
        reference batch; its pooled embeddings are averaged into one vector.
      negetive, negetive_mask: Token ids / mask of the second reference
        batch; required whenever ``positive`` is given.

    Returns:
      The sigmoid outputs alone when ``positive`` is None. Otherwise the tuple
      ``(probabilities, pos, neg, anc)``, where ``pos`` / ``neg`` are the mean
      pooled embeddings (one row each) of the reference batches and ``anc``
      holds the pooled embedding of every input example.

    Raises:
      ValueError: If ``positive`` is given without ``negetive``.
    """
    anc = self.bert(input_ids, attention_mask=attention_mask).pooler_output
    x = self.sigmoid(self.classifier(anc))
    if positive is None:
      return x
    if negetive is None:
      raise ValueError("`negetive` must be provided together with `positive`")
    # Collapse each reference batch to one mean embedding, keeping a leading
    # batch dimension of 1.
    pos = self.bert(positive, attention_mask=positive_attention_mask).pooler_output.mean(dim = 0, keepdim = True)
    neg = self.bert(negetive, attention_mask=negetive_mask).pooler_output.mean(dim = 0, keepdim = True)
    return x, pos, neg, anc

In [None]:
# Place the model on the notebook-wide `device`: the training loop moves every
# batch with `.to(device)`, so model and data must agree on one device.
model = EmotionClassifier().to(device)

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Split the parameters into two optimizer groups: names containing any
# `no_decay` fragment get no weight decay, everything else decays at 0.01.
no_decay = ['bias', 'LayerNorm.weight']
_decayed, _undecayed = [], []
for _name, _param in model.named_parameters():
    if any(nd in _name for nd in no_decay):
        _undecayed.append(_param)
    else:
        _decayed.append(_param)
optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _undecayed, 'weight_decay': 0.0},
]

In [None]:
# Training hyperparameters.
# Sequence length. NOTE(review): the dataset cells pass the literal 128 rather than this name.
max_len = 128
# Batch size of the train/valid/test DataLoaders.
batch_size = 32
# Fraction of all optimizer steps used for scheduler warmup.
warmup_ratio = 0.1
# Number of passes over the training set.
num_epochs = 20
# Gradient-norm clipping threshold applied in the training loop.
max_grad_norm = 1
# Training progress is printed every `log_interval` batches.
log_interval = 256
# Learning rate intended for the optimizer.
learning_rate = 5e-5

In [None]:
# Take the rate from `learning_rate` instead of repeating the literal 5e-5, so
# editing the hyperparameter cell actually changes the optimizer.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
# BCELoss expects probabilities; the model's forward already applies the sigmoid.
loss_fn = nn.BCELoss()



In [None]:
# Auxiliary triplet margin loss (p=2 distance, margin 1.0) added to the BCE loss in the training loop.
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)

In [None]:
# Batched loaders for the three splits; only the training loader shuffles.
_loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
valid_dataloader = DataLoader(valid_dataset, **_loader_kwargs)
test_dataloader = DataLoader(test_dataset, **_loader_kwargs)

In [None]:
# Total number of optimizer steps over all epochs, and how many of them are warmup steps.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Cosine learning-rate schedule with `warmup_step` warmup steps out of `t_total` training steps.
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows in X whose thresholded score matches Y.

    X is squeezed along dim 1 and thresholded at 0.5; the resulting booleans
    are compared element-wise with Y, and the match count is divided by the
    size of X's first dimension.
    """
    predicted = X.squeeze(dim = 1) > 0.5
    n_correct = (predicted == Y).sum()
    n_rows = X.size()[0]
    return n_correct.data.cpu().numpy() / n_rows

In [None]:
def _endless_batches(loader):
    """Yield batches from `loader` forever, restarting it whenever it runs out."""
    while True:
        for batch in loader:
            yield batch


# Train with BCE plus two small triplet terms, validate after every epoch, and
# keep the checkpoint with the best validation accuracy.
best_acc = 0
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    depression_dataloader = torch.utils.data.DataLoader(depression_dataset, batch_size=8, num_workers=2, shuffle = True)
    notdepression_dataloader = torch.utils.data.DataLoader(notdepression_dataset, batch_size=8, num_workers=2, shuffle = True)
    # One long-lived iterator per reference loader. Calling
    # `next(iter(loader))` on every step rebuilt the iterator, and with
    # num_workers=2 its worker processes, for every single training batch.
    depression_batches = _endless_batches(depression_dataloader)
    notdepression_batches = _endless_batches(notdepression_dataloader)
    for batch_id, (attention_mask, token_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        label = label.float().to(device)
        positive_mask, positive, _ = next(depression_batches)        # label == 1 references
        negetive_mask, negetive, _ = next(notdepression_batches)     # label == 0 references
        out, pos, neg, anc = model(
            token_ids, attention_mask,
            positive.long().to(device), positive_mask.long().to(device),
            negetive.long().to(device), negetive_mask.long().to(device),
        )
        loss = loss_fn(out.squeeze(dim=1), label)
        # Triplet terms use the first anchor of each class in the batch. A
        # batch can lack one class entirely (notably a short last batch), so
        # each term is added only when such an anchor exists; indexing [0]
        # unconditionally raised IndexError in that case.
        positive_anchors = anc[label == 1]
        if positive_anchors.size(0) > 0:
            loss = loss + 0.05 * triplet_loss(positive_anchors[0].unsqueeze(dim=0), pos, neg)
        negative_anchors = anc[label == 0]
        if negative_anchors.size(0) > 0:
            loss = loss + 0.05 * triplet_loss(negative_anchors[0].unsqueeze(dim=0), neg, pos)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("EPOCH {} [{}/{}]  >>>  loss : {:.6f}\t  train_acc : {:.3f}".format(e+1, batch_id+1,len(train_dataloader),
                                                                           loss.item(), train_acc / (batch_id+1)))
    print("EPOCH {}  >>>  loss : {:.6f}\t  train_acc : {:.3f}".format(e+1, loss.item(), train_acc / (batch_id+1)))

    model.eval()
    # Validation needs no gradients; skipping autograd saves memory and time.
    with torch.no_grad():
        for batch_id, (attention_mask, token_ids, label) in enumerate(tqdm_notebook(valid_dataloader)):
            token_ids = token_ids.long().to(device)
            attention_mask = attention_mask.long().to(device)
            label = label.float().to(device)
            out = model(token_ids, attention_mask)
            test_acc += calc_accuracy(out, label)
    # `test_acc` accumulates per-batch accuracy on the *validation* loader.
    print("EPOCH {}  >>>  test_acc : {:.3f}".format(e+1, test_acc / (batch_id+1)))
    if best_acc < test_acc:
        torch.save(model.state_dict(), '/content/drive/MyDrive/자연어처리음성인식/조상연/BERT 감정분류/BERTKLUE_BINARY.pt')
        best_acc = test_acc

In [None]:
import time

In [None]:
# Reload the best checkpoint and measure accuracy and wall-clock time on the test split.
model = EmotionClassifier().to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/자연어처리음성인식/조상연/BERT 감정분류/BERTKLUE_BINARY.pt', map_location=device))
model.eval()
st = time.time()
test_acc = 0
# Evaluation needs no gradients; skipping autograd saves memory and time.
with torch.no_grad():
    for batch_id, (attention_mask, token_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        attention_mask = attention_mask.long().to(device)
        label = label.float().to(device)
        out = model(token_ids, attention_mask)
        # Count correct predictions; dividing by the dataset size below gives
        # an exact accuracy even when the last batch is smaller.
        test_acc += ((out.squeeze(dim = 1) > 0.5) == label).sum().item()
# Was `ed = time`, which bound the module itself and never took a timestamp.
ed = time.time()
print("TEST  >>>  test_acc : {:.3f}".format(test_acc / len(test_dataloader.dataset)))
print("TEST  >>>  elapsed : {:.2f}s".format(ed - st))

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/4 [00:00<?, ?it/s]

TEST  >>>  test_acc : 0.890


In [None]:
# Inference wrapper used for serving
class EmotionClassifierService():
  """Loads the fine-tuned binary classifier and labels single sentences."""

  def __init__(self):
    # Use the GPU when present; otherwise load and run the checkpoint on CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = EmotionClassifier().to(self.device)
    self.model.load_state_dict(torch.load('/content/drive/MyDrive/자연어처리음성인식/조상연/BERT 감정분류/BERTKLUE_BINARY.pt', map_location=self.device))
    self.model.eval()
    self.tokenizer = AutoTokenizer.from_pretrained("klue/bert-base", use_fast = True)
    self.classes = ['슬프지않음', '슬픔']
    # Longest sequence the encoder's position embeddings can represent;
    # inputs are truncated to this length.
    self.max_len = self.model.bert.config.max_position_embeddings

  def classify(self, x):
    """Return the class name predicted for the text ``x``.

    Args:
      x: Input sentence.

    Returns:
      ``self.classes[1]`` when the model output exceeds 0.5, otherwise
      ``self.classes[0]``.
    """
    self.model.eval()
    token_ids = self.tokenizer.encode(x, max_length=self.max_len, truncation=True)
    input_ids = torch.tensor(token_ids, device=self.device).unsqueeze(dim=0)
    # A single unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(input_ids)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
      out = self.model(input_ids, attention_mask)
    class_id = 1 if out.item() > 0.5 else 0
    return self.classes[class_id]

In [None]:
# Build the service once; construction loads the encoder, the checkpoint and the tokenizer.
m = EmotionClassifierService()

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Smoke test: classify one example sentence with the loaded service.
m.classify("어머니랑 아버지가 이혼하신대. 너무 슬퍼.")

'슬픔'