## **0. Set Up**

In [1]:
# Install the KoBERT tokenizer (straight from the SKTBrain GitHub repository) and
# the other third-party packages this notebook imports.
# The two commented-out lines below are an optional numpy 1.23.1 pin that is
# currently disabled.
#!pip uninstall numpy
#!pip install numpy==1.23.1
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip install kobert-transformers
!pip install transformers
!pip install gluonnlp
!pip install mxnet
!pip install datasets
!pip install tensorboardX

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-su66sf4o/kobert-tokenizer_1841a89cf6f0486aa0e4863274f02147
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-su66sf4o/kobert-tokenizer_1841a89cf6f0486aa0e4863274f02147
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 5c46b1c68e4755b54879431bd302db621f4d2f47
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=5b89f2aa4810b57ce03a5ac4d878ecbfe5b281e52b42ec35645784f3b838e9dd
  Stored in directory: /tmp/pip-ephem-wheel-cache-677o_yai/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [2]:
from google.colab import drive
drive.mount("/content/drive/")

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import load_dataset

from transformers import AdamW, BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers.optimization import get_cosine_schedule_with_warmup
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from tensorboardX import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import numpy as np
import pandas as pd
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive/


In [3]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
# The device string has to be spelled "cuda:0" -- any other spelling makes
# torch.device() raise as soon as a GPU is actually present.
# Both branches go through torch.device so `device` always has the same type.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))

Using cpu device


## **1. Data Preprocessing**

In [4]:
# Load the emotion-labelled sentence corpus (columns used here: '문장' = sentence, '감정' = emotion).
final_data = pd.read_csv('https://github.com/ohgzone/file1/raw/main/aihub_coupus.csv')

# Keep only Hangul syllables and spaces, then trim surrounding spaces.
final_data['문장'] = final_data['문장'].str.replace('[^가-힣 ]', '', regex=True)
final_data['문장'] = final_data['문장'].str.strip()

# Drop rows with a missing sentence, rows whose sentence became empty after cleaning
# (dropna alone keeps "" because an empty string is not NA), and duplicate sentences.
# .copy() makes the result an independent frame so the column assignment below is safe.
final_data = final_data.dropna(subset=['문장'])
final_data = final_data[final_data['문장'] != '']
final_data = final_data.drop_duplicates(subset=['문장']).copy()

# Number the emotion classes by descending frequency: the most common emotion gets id 0.
list1 = final_data['감정'].value_counts().index.values
label2class = {la: cl for cl, la in enumerate(list1)}
class2label = {cl: la for cl, la in enumerate(list1)}
final_data['label'] = final_data['감정'].map(label2class)

# Plain numpy arrays consumed by the train/test split.
features = final_data['문장'].values
labels = final_data['label'].values

## **2. Model Definition**

In [5]:
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item, on demand.

    Args:
        sentences: indexable collection of raw texts; each item is passed through ``str()``.
        labels: indexable collection of class ids, indexed in step with ``sentences``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=..., padding=...,
            truncation=True, return_tensors='pt')``; its result must provide
            ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        max_len: forwarded to the tokenizer as ``max_length``.
        pad: when true, ask the tokenizer to pad to ``max_length``; otherwise
            padding is disabled.
        pair: stored on the instance but not used by this class.
    """

    def __init__(self, sentences, labels, tokenizer, max_len, pad=True, pair=False):
        self.sentences, self.labels = sentences, labels
        self.tokenizer, self.max_len = tokenizer, max_len
        self.pad, self.pair = pad, pair

    def __len__(self):
        """Number of items, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, token_type_ids, label)`` for item ``idx``.

        The three encoder tensors have dimension 0 squeezed away; the label is a
        ``torch.long`` scalar tensor.
        """
        text = str(self.sentences[idx])
        raw_label = self.labels[idx]

        padding_mode = 'max_length' if self.pad else False
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding=padding_mode,
            truncation=True,
            return_tensors='pt',
        )

        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            encoded['token_type_ids'].squeeze(0),
            torch.tensor(raw_label, dtype=torch.long),
        )

In [6]:
# Load the pretrained KoBERT tokenizer and encoder from the same checkpoint name,
# and open a tensorboardX writer for the training metrics.
PRETRAINED_NAME = 'skt/kobert-base-v1'

tokenizer = KoBERTTokenizer.from_pretrained(PRETRAINED_NAME)
# return_dict=False: the encoder returns plain tuples instead of output objects.
bertmodel = BertModel.from_pretrained(PRETRAINED_NAME, return_dict=False)
summary = SummaryWriter()

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [8]:
# Hyperparameters shared by the cells below.
# max_length handed to the tokenizer (padding / truncation target).
max_len = 128
# Batch size for the train/validation/test DataLoaders and for lyric prediction.
batch_size = 64
# Fraction of the total optimizer steps used for learning-rate warmup.
warmup_ratio = 0.1
# Max norm passed to clip_grad_norm_ in the training loop.
max_grad_norm = 1
# Print a training progress line every this many batches.
log_interval = 200
# Optimizer learning rate.
learning_rate = 1e-5

In [9]:
# Hold out 20% of the corpus as the test set, stratified by label and seeded for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=41)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

train_data = BERTDataset(x_train, y_train, tokenizer, max_len, pad=True, pair=False)
test_data = BERTDataset(x_test, y_test, tokenizer, max_len, pad=True, pair=False)

# Carve 10% of the training portion off as the validation set.
# random_split draws from torch's global RNG unless a generator is supplied, so a
# seeded generator is passed to make this split as repeatable as the one above.
train_size = int(0.9 * len(train_data))
valid_size = len(train_data) - train_size
split_generator = torch.Generator().manual_seed(41)
train_, valid_ = random_split(train_data, [train_size, valid_size], generator=split_generator)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_dataloader = DataLoader(train_, batch_size=batch_size, shuffle=True, num_workers=0)
val_dataloader = DataLoader(valid_, batch_size=batch_size, shuffle=False, num_workers=0)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=0)

(41259,) (10315,) (41259,) (10315,)


In [10]:
class BERTClassifier(nn.Module):
    """Sentence classifier: an encoder, optional dropout, and one linear layer.

    Args:
        bert: encoder module; it is called with ``return_dict=False`` and must
            return a 2-tuple whose second item is the pooled sentence vector.
        hidden_size: width of the pooled vector fed to the linear layer.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled vector; when falsy
            (``None`` or ``0``) no dropout layer is created or applied.
    """

    def __init__(self, bert, hidden_size=768, num_classes=6, dr_rate=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, token_ids, attention_mask, segment_ids):
        """Return the class logits produced from the encoder's pooled output."""
        encoder_inputs = {
            'input_ids': token_ids,
            'token_type_ids': segment_ids,
            'attention_mask': attention_mask,
            'return_dict': False,
        }
        _sequence_output, pooled = self.bert(**encoder_inputs)

        classifier_input = self.dropout(pooled) if self.dr_rate else pooled
        return self.classifier(classifier_input)

def calc_accuracy(X, Y):
    """Fraction of rows in ``X`` whose arg-max along dim 1 equals the matching entry of ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: 1-D tensor of target class ids, one per row of ``X``.

    Returns:
        The number of matching rows divided by the number of rows, as a numpy scalar.
    """
    predicted = X.max(dim=1).indices
    n_correct = predicted.eq(Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [11]:
# Build the classifier on top of the pretrained encoder and move it to the chosen device.
# The output width follows the label map built during preprocessing, so the head
# always matches the number of emotion classes actually present in the corpus.
model = BERTClassifier(bertmodel, num_classes=len(label2class), dr_rate=0.5).to(device)
# The optimizer is created once, together with its scheduler, in the cell that
# follows; a second optimizer built here would only be overwritten before use.
loss_fn = nn.CrossEntropyLoss()

In [13]:
num_epochs = 3

# Parameters whose names contain one of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Use the shared `learning_rate` hyperparameter rather than repeating the literal,
# so changing it in the hyperparameter cell actually takes effect.
optimizer = optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

# One scheduler step per batch: warm up over the first `warmup_ratio` of all steps,
# then follow a cosine decay for the remainder.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

## **3. Training**

In [14]:
# Fine-tune for `num_epochs` epochs; after each epoch measure validation accuracy
# and log the epoch metrics to tensorboardX.
# Every batch is (input_ids, attention_mask, token_type_ids, label) -- the order
# BERTDataset.__getitem__ returns them in.
for e in range(num_epochs):
    train_acc = 0.0
    val_acc = 0.0
    model.train()
    lossF = 0

    for batch_id, (token_ids, attention_mask, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        # The mask must be on the same device as the model, just like the other inputs.
        attention_mask = attention_mask.to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)

        out = model(token_ids, attention_mask, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not once per epoch

        train_acc += calc_accuracy(out, label)
        lossF = loss.item()  # loss of the most recent batch

        if batch_id % log_interval == 0:
            print("Epoch {} Batch ID {} Loss {:.4f} Train Acc {:.4f}".format(
                e+1, batch_id+1, lossF, train_acc / (batch_id+1)
            ))

    print("Epoch {} Train Acc {:.4f}".format(e+1, train_acc / len(train_dataloader)))

    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, attention_mask, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
            token_ids = token_ids.long().to(device)
            attention_mask = attention_mask.to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)

            out = model(token_ids, attention_mask, segment_ids)
            val_acc += calc_accuracy(out, label)

    print("Epoch {} Validation Acc {:.4f}".format(e+1, val_acc / len(val_dataloader)))

    # Accuracies are means over batches; 'Loss' is the loss of the epoch's final training batch.
    summary.add_scalar('Train Accuracy', train_acc / len(train_dataloader), e+1)
    summary.add_scalar('Validation Accuracy', val_acc / len(val_dataloader), e+1)
    summary.add_scalar('Loss', lossF, e+1)

  0%|          | 0/581 [00:00<?, ?it/s]

Epoch 1 Batch ID 1 Loss 1.8581 Train Acc 0.1406
Epoch 1 Batch ID 201 Loss 1.7396 Train Acc 0.1842
Epoch 1 Batch ID 401 Loss 1.3542 Train Acc 0.2960
Epoch 1 Train Acc 0.3625


  0%|          | 0/65 [00:00<?, ?it/s]

Epoch 1 Validation Acc 0.5365


  0%|          | 0/581 [00:00<?, ?it/s]

Epoch 2 Batch ID 1 Loss 1.3117 Train Acc 0.6094
Epoch 2 Batch ID 201 Loss 1.3182 Train Acc 0.5520
Epoch 2 Batch ID 401 Loss 1.1044 Train Acc 0.5531
Epoch 2 Train Acc 0.5573


  0%|          | 0/65 [00:00<?, ?it/s]

Epoch 2 Validation Acc 0.5594


  0%|          | 0/581 [00:00<?, ?it/s]

Epoch 3 Batch ID 1 Loss 1.2093 Train Acc 0.6250
Epoch 3 Batch ID 201 Loss 1.1904 Train Acc 0.5836
Epoch 3 Batch ID 401 Loss 1.0119 Train Acc 0.5857
Epoch 3 Train Acc 0.5837


  0%|          | 0/65 [00:00<?, ?it/s]

Epoch 3 Validation Acc 0.5603


In [15]:
# Persist the trained model to Google Drive in three forms: the whole pickled
# module, the bare state dict, and a checkpoint that also carries the optimizer
# and scheduler state.
save_targets = [
    (model, '/content/drive/MyDrive/2024-2_비정형데이터분석/Project/model.pt'),
    (model.state_dict(), '/content/drive/MyDrive/2024-2_비정형데이터분석/Project/model_state_dict.pt'),
    (
        {
            'epoch': num_epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        },
        '/content/drive/MyDrive/2024-2_비정형데이터분석/Project/koBERT_trained.tar',
    ),
]
for payload, destination in save_targets:
    torch.save(payload, destination)

In [16]:
# Evaluate on the held-out test split: average the per-batch accuracy with
# gradients disabled and the model in eval mode.
model.eval()
test_acc = 0.0
with torch.no_grad():
    for batch_id, batch in enumerate(tqdm_notebook(test_dataloader)):
        # Batch order is (input_ids, attention_mask, token_type_ids, label).
        token_ids, attention_mask, segment_ids, label = (t.to(device) for t in batch)

        out = model(token_ids, attention_mask, segment_ids)
        test_acc += calc_accuracy(out, label)

mean_test_acc = test_acc / len(test_dataloader)
print("Test Accuracy {:.4f}".format(mean_test_acc))

  0%|          | 0/162 [00:00<?, ?it/s]

Test Accuracy 0.5618


## **4. Predict**

In [17]:
import re

def clean_text(text):
    """Reduce ``text`` to Hangul syllables separated by single spaces.

    Every character that is neither a Hangul syllable nor whitespace is removed,
    each run of whitespace is collapsed to one space, and the result is stripped.
    """
    hangul_and_space = re.sub(r'[^가-힣\s]', '', text)
    return re.sub(r'\s+', ' ', hangul_and_space).strip()

def preprocess_lyrics(sentences, tokenizer, max_len):
    """Tokenize each sentence and collect the encoder inputs as three parallel lists.

    Args:
        sentences: iterable of texts, tokenized one at a time in iteration order.
        tokenizer: callable invoked as ``tokenizer(text, max_length=max_len,
            padding='max_length', truncation=True, return_tensors='pt')``; its
            result must provide ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: forwarded to the tokenizer as ``max_length``.

    Returns:
        ``(input_ids_list, attention_mask_list, token_type_ids_list)``; every
        element is the tokenizer's tensor with dimension 0 squeezed away.
    """
    encodings = [
        tokenizer(
            text,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        for text in sentences
    ]

    def _column(key):
        # One squeezed tensor per sentence for the requested tokenizer output.
        return [enc[key].squeeze(0) for enc in encodings]

    return _column('input_ids'), _column('attention_mask'), _column('token_type_ids')

In [18]:
# Load the song-lyrics sheet ('가사' = lyrics) and normalise the text.
mydata = pd.read_excel("/content/drive/MyDrive/2024-2_비정형데이터분석/Project/비정형_노래가사데이터_멜론.xlsx")
# Series.replace only matches whole cell values; the .str accessor is needed to
# turn newlines *inside* each lyric into spaces.
mydata['가사'] = mydata['가사'].astype(str).str.replace("\n", " ", regex=False)
mydata['가사'] = mydata['가사'].apply(clean_text)

# Tokenize every lyric to fixed-length encoder inputs ...
input_ids, attention_mask, token_type_ids = preprocess_lyrics(mydata['가사'], tokenizer, max_len)

# ... and stack the per-song tensors into one tensor per input, on the model's device.
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)
token_type_ids = torch.stack(token_type_ids).to(device)

In [19]:
# Predict an emotion class id for every lyric, `batch_size` rows at a time,
# with the model in eval mode and gradients disabled.
model.eval()
predicted_labels = []

with torch.no_grad():
    for start in range(0, len(input_ids), batch_size):
        stop = start + batch_size
        outputs = model(
            input_ids[start:stop],
            attention_mask[start:stop],
            token_type_ids[start:stop],
        )
        # Index of the highest-scoring class in each row.
        predicted = torch.max(outputs, dim=1).indices
        predicted_labels.extend(predicted.cpu().numpy())

In [22]:
mydata['감정_label'] = predicted_labels

# Decode the class ids with the `class2label` mapping built from the training
# corpus during preprocessing. That mapping is the one the model was trained
# against; a hand-written copy here could silently drift out of sync with it
# (the ids are assigned by class frequency in the data).
mydata['감정'] = mydata['감정_label'].map(class2label)

mydata

Unnamed: 0,곡명,가사,감정_label,감정
0,APT.,아파트 아파트 아파트 아파트 아파트 아파트 아파트 아파트 아파트 아파트 아파트 아파...,5,당황
1,Whiplash,집중해 좀 더 이유 넌 이해 못 해 왜 이제야 흔들린 채 무리해도 어디서나 거침없어...,1,분노
2,POWER,억까 짤 퍼다 샬라샬라하다가 난 자유로워 나는 나다워서 아름다워 애들이 나보고 개꿀...,1,분노
3,UP (KARINA Solo),다들 뻔해 또 거짓말 번듯한 말 어디든 뛰어다녀 여기저기 다 가져갈게 한순간에 타올...,1,분노
4,HAPPY,그런 날이 있을까요 마냥 좋은 그런 날이요 내일 걱정 하나 없이 웃게 되는 그런 날...,5,당황
...,...,...,...,...
95,내가 S면 넌 나의 N이 되어줘,내가 면 넌 나의 이 되어줘어떤 순간에도 너를 찾을 수 있게반대가 끌리는 천만번째 ...,1,분노
96,Get A Guitar,너와 내 느낌대로 시작해 봐 발 박자를 맞추고 손은 모두 집중해 줘 너와 맞추는 눈...,5,당황
97,Super Shy,떨리는 지금도 떨리는 지금도 우리 둘이 나란히 보이지 봐 내 눈이 갑자기 빛나지 누...,0,기쁨
98,Over The Moon,너는 나의 미래 내 시공의 질서 널 안고 입 맞출 때 내 겨울이 녹아내려 데려가 줘...,5,당황


In [23]:
# Write the lyrics with their predicted emotion columns to an Excel file on
# Google Drive; index=False leaves the DataFrame index out of the sheet.
mydata.to_excel("/content/drive/MyDrive/2024-2_비정형데이터분석/Project/mydata_result.xlsx", index=False)