In [None]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 14.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 67.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 71.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [None]:
!pip install einops

Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [None]:
import os
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from PIL import Image
from torchvision import transforms
import torchvision
from tqdm import tqdm, tqdm_notebook
import sys
from einops import repeat
from transformers import AutoModel, AutoTokenizer

In [None]:
# Load the train / test metadata tables (one row per utterance) from the mounted Drive folder.
# NOTE(review): the preview of way2_train shows an `Unnamed: 0` column, so the CSVs appear to
# have been written with their index — consider `index_col=0` if that column is not wanted.
way2_train = pd.read_csv('/content/drive/MyDrive/자연어처리음성인식/조상연/음성데이터(감정분류 or 음성인식)/way2_train.csv')
way2_test = pd.read_csv('/content/drive/MyDrive/자연어처리음성인식/조상연/음성데이터(감정분류 or 음성인식)/way2_test.csv')

In [None]:
# Preview the first rows to check the available columns (sentence, feature file paths, label).
way2_train.head()

Unnamed: 0,sentence,wav_file_path,mel_spectrum_path,ms_spectrum_img_path,mfccs_path,label
0,우리 아빠는 나한테 제대로 된 선물 한 번 준 적 없으셔.,audio_files/M_000001.wav,ms_files/M_000001.npy,ms_files_img/M_000001.png,mfccs_files/M_000001.npy,1
1,아무도 내 생일은 안 챙겨줘.,audio_files/M_000002.wav,ms_files/M_000002.npy,ms_files_img/M_000002.png,mfccs_files/M_000002.npy,1
2,내가 내 생각을 말하려고 할 때마다 아빠는 나를 때리셨어.,audio_files/M_000003.wav,ms_files/M_000003.npy,ms_files_img/M_000003.png,mfccs_files/M_000003.npy,1
3,이렇게 작은 집에 사는 사람은 나밖에 없을 것 같아.,audio_files/M_000004.wav,ms_files/M_000004.npy,ms_files_img/M_000004.png,mfccs_files/M_000004.npy,1
4,우리 집은 왜 이렇게 가난한 건지 모르겠어.,audio_files/M_000005.wav,ms_files/M_000005.npy,ms_files_img/M_000005.png,mfccs_files/M_000005.npy,1


In [None]:
class SentimentDataset(Dataset):
  """Dataset pairing a saved spectrum array with its tokenized sentence and label.

  Each item is ``(spectrum, label, attention_mask, token_ids)``.

  Args:
    data: DataFrame with ``mel_spectrum_path``, ``sentence`` and ``label`` columns.
    max_len: fixed length every token sequence is truncated / padded to.
    transform: optional callable applied to the loaded spectrum array; when
      ``None`` the raw array is returned unchanged.
    base_path: prefix prepended to every ``mel_spectrum_path``; defaults to
      ``DEFAULT_BASE_PATH``.
  """

  # Folder the relative paths in the CSV are resolved against when no base_path is given.
  DEFAULT_BASE_PATH = "/content/drive/MyDrive/자연어처리음성인식/조상연/음성데이터(감정분류 or 음성인식)/"

  def __init__(self, data, max_len, transform = None, base_path = None):
    super(SentimentDataset, self).__init__()
    self.data = data
    self.max_len = max_len
    self.base_path = self.DEFAULT_BASE_PATH if base_path is None else base_path
    self.tokenizer = AutoTokenizer.from_pretrained("klue/bert-base", use_fast = True)

    # Iterate the columns directly instead of calling .iloc once per row and field.
    self.inputs = [self.base_path + path for path in self.data["mel_spectrum_path"]]
    self.text = [self.convert_token([sentence]) for sentence in self.data["sentence"]]
    self.label = [np.int32(label) for label in self.data["label"]]
    self.transform = transform

  def convert_token(self, data):
    """Tokenize ``data[0]`` into exactly ``self.max_len`` ids.

    Sequences longer than ``max_len`` are truncated by the tokenizer; shorter
    ones are right-padded with the pad id and masked out.

    Returns:
      ``[attention_mask, token_ids]`` as two int32 arrays of length ``max_len``.
    """
    # Without truncation an over-long sentence would produce arrays longer than
    # max_len, which cannot be stacked into a batch with the others.
    token = self.tokenizer.encode(data[0], truncation = True, max_length = self.max_len)
    pad_len = self.max_len - len(token)
    attention_mask = [1] * len(token) + [0] * pad_len
    token = token + [self.tokenizer.pad_token_id] * pad_len
    return [np.int32(attention_mask), np.int32(token)]

  def __getitem__(self, idx):
    """Load item ``idx`` as ``(spectrum, label, attention_mask, token_ids)``."""
    data = np.load(self.inputs[idx])
    if self.transform is not None:
      data = self.transform(data)
    attention_mask, token = self.text[idx]
    return data, self.label[idx], attention_mask, token

  def __len__(self):
    """Number of samples."""
    return len(self.label)

In [None]:
# Preprocessing applied to every loaded spectrum: convert the array to a tensor,
# then resize it to a fixed 128 x 256 so samples can be batched together.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((128, 256)),
])

In [None]:
# Build the training set; 128 is max_len, the fixed token-sequence length per sentence.
# Constructing the dataset also fetches the klue/bert-base tokenizer and tokenizes every sentence up front.
train_dataset = SentimentDataset(way2_train, 128, transform)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Build the held-out set with the same max_len (128) and spectrum transform as the training set.
test_dataset = SentimentDataset(way2_test, 128, transform)

In [None]:
class BERTEmotionClassifier(nn.Module):
  """Text-only classifier: klue/bert-base encoder plus a sigmoid-activated linear head.

  Args:
    num_classes: number of sigmoid outputs produced per sample (default 1,
      i.e. a single binary probability).
  """

  def __init__(self, num_classes = 1):
    super(BERTEmotionClassifier, self).__init__()
    self.bert = AutoModel.from_pretrained("klue/bert-base")
    # Make it explicit that the whole encoder is fine-tuned, not just the head.
    for param in self.bert.parameters():
      param.requires_grad = True
    # Size the head from the encoder config and honour num_classes instead of
    # hard-coding a 768 -> 1 layer.
    self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    """Return sigmoid scores of shape (batch, num_classes) for the tokenized input."""
    # Keyword arguments guard against passing the mask into the wrong positional slot.
    pooled = self.bert(input_ids = input_ids, attention_mask = attention_mask).pooler_output
    return self.sigmoid(self.classifier(pooled))

In [None]:
class EmotionClassifier(nn.Module):
  """Two-branch classifier fusing a text embedding with a spectrum embedding.

  The text branch is the BERT encoder taken from a fine-tuned
  ``BERTEmotionClassifier`` checkpoint; the audio branch is an
  ImageNet-pretrained ResNet-50 whose final layer is replaced by a 768-d
  projection. Both 768-d vectors are concatenated and passed through a
  sigmoid-activated linear head.

  Args:
    num_classes: number of sigmoid outputs per sample (default 1).
    bert_checkpoint: path to the ``BERTEmotionClassifier`` state dict; defaults
      to ``DEFAULT_BERT_CHECKPOINT``.
  """

  # Checkpoint of the text-only model whose encoder is reused here.
  DEFAULT_BERT_CHECKPOINT = '/content/drive/MyDrive/자연어처리음성인식/1차 prototype/BERTKLUE_BINARY.pt'

  def __init__(self, num_classes = 1, bert_checkpoint = None):
    super(EmotionClassifier, self).__init__()
    self.resnet = torchvision.models.resnet50(pretrained = True)
    # Project image features to 768 dims, the width the fusion head expects from each branch.
    self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 768)
    self.sigmoid = nn.Sigmoid()
    # 1 -> 3 channel 3x3 conv (stride 1, padding 1) so single-channel input fits the RGB ResNet stem.
    self.conv = nn.Conv2d(1, 3, 3, 1, 1)

    checkpoint = self.DEFAULT_BERT_CHECKPOINT if bert_checkpoint is None else bert_checkpoint
    text_model = BERTEmotionClassifier()
    # Load onto CPU and leave device placement to the caller (e.g. `.cuda()` on the
    # whole model), so construction neither needs a GPU nor splits submodules across devices.
    text_model.load_state_dict(torch.load(checkpoint, map_location = "cpu"))
    # Keep only the fine-tuned encoder; the text-only classification head is discarded.
    self.bert = text_model.bert

    self.classifier = nn.Linear(768 * 2, num_classes)

  def forward(self, x, token_ids, attention_mask):
    """Return sigmoid scores of shape (batch, num_classes).

    Args:
      x: spectrum batch fed to the 1-channel conv — assumes (batch, 1, H, W); TODO confirm.
      token_ids: token id batch for the BERT encoder.
      attention_mask: matching attention mask batch.
    """
    text_feat = self.bert(input_ids = token_ids, attention_mask = attention_mask).pooler_output
    audio_feat = self.resnet(self.conv(x))
    fused = torch.cat([text_feat, audio_feat], dim = 1)
    return self.sigmoid(self.classifier(fused))

In [None]:
def calc_accuracy(X,Y):
    """Fraction of predictions in ``X`` (thresholded at 0.5) that match the labels ``Y``.

    Both tensors are flattened before comparison, so a (batch, 1) model output
    can be scored against (batch,) labels. Without flattening, ``==`` would
    broadcast the two into a (batch, batch) matrix and the count would be wrong.

    Args:
        X: predicted probabilities, one per sample.
        Y: ground-truth 0/1 labels with the same number of elements as ``X``.

    Returns:
        Number of matching elements divided by ``X.size()[0]``.
    """
    preds = (X > 0.5).reshape(-1)
    labels = Y.reshape(-1)
    correct = (preds == labels).sum().detach().cpu().numpy()
    return correct / X.size()[0]

In [None]:
def train(batch_size=32, epoch=20, lr=1e-5, save_path=None):
    """Train an ``EmotionClassifier`` on ``train_dataset`` and validate on ``test_dataset``.

    After every epoch the model is evaluated on the test set, and its weights are
    saved whenever the validation accuracy improves on the best seen so far.

    Args:
        batch_size: samples per batch for both loaders.
        epoch: number of passes over the training set.
        lr: AdamW learning rate.
        save_path: where the best state dict is written; defaults to
            ``Speech_ensemb.pt`` in the project Drive folder.

    Returns:
        dict with ``'train_loss'`` and ``'valid_loss'`` lists holding the mean
        per-batch BCE loss of each epoch.
    """
    if save_path is None:
        save_path = "/content/drive/MyDrive/자연어처리음성인식/조상연/음성데이터(감정분류 or 음성인식)/Speech_ensemb.pt"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = EmotionClassifier().to(device)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    history = {
        'train_loss': [],
        'valid_loss': [],
    }
    best_acc = 0

    def _to_device(batch):
        # The dataset yields int32 ids and masks; cast to long for the embedding lookup,
        # identically for training and validation.
        data, target, attention_mask, token_ids = batch
        return (data.to(device), target.to(device),
                attention_mask.long().to(device), token_ids.long().to(device))

    for e in range(epoch):
        model.train()
        train_loss = 0
        pbar = tqdm(train_loader, file=sys.stdout)
        for batch_idx, batch in enumerate(pbar):
            data, target, attention_mask, token_ids = _to_device(batch)
            optimizer.zero_grad()
            output = model(data, token_ids, attention_mask)
            loss = criterion(output.squeeze(dim=1), target.float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            pbar.set_postfix(epoch=f'epoch {e + 1} of {epoch}', loss=f'{train_loss / (batch_idx + 1)}')
        pbar.close()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_loss = train_loss / max(len(train_loader), 1)
        history['train_loss'].append(train_loss)

        model.eval()
        valid_loss = 0
        correct = 0
        seen = 0
        with torch.no_grad():
            for batch in tqdm(test_loader, file=sys.stdout):
                data, target, attention_mask, token_ids = _to_device(batch)
                output = model(data, token_ids, attention_mask)
                loss = criterion(output.squeeze(dim=1), target.float())
                valid_loss += loss.item()
                correct += ((output > 0.5).squeeze(dim=1) == target).sum().item()
                seen += target.size(0)

        # Average over batches so valid_loss is on the same scale as train_loss.
        valid_loss = valid_loss / max(len(test_loader), 1)
        history['valid_loss'].append(valid_loss)

        test_acc = correct / max(seen, 1)
        print(f'epoch {e + 1}: valid_loss={valid_loss:.4f} valid_acc={test_acc:.4f} ({correct}/{seen})')
        if best_acc < test_acc:
            torch.save(model.state_dict(), save_path)
            best_acc = test_acc
    return history

In [None]:
# Run the full training loop; as the cell's last expression, the returned loss history is displayed.
train()