<a href="https://colab.research.google.com/github/iamnotwhale/24W-dialect/blob/main/dialect_to_standard_with_kobart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 필요 라이브러리 불러오기

In [1]:
!pip install transformers pandas tokenizers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m286.7/290.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.28.0


In [3]:
import json
import pandas as pd
import glob
from collections import defaultdict
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer
from tokenizers import Tokenizer
from transformers import pipeline
import warnings
import os
import torch

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Base directory on the mounted Drive; later cells derive the dataset location
# and the saved-model folder from it.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of
# the session.
dir = "/content/drive/MyDrive/daiv_audio"

## 모델 파라미터 설정

In [8]:
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer

# Folder where the fine-tuned weights are written after training
model_path=dir+'/saved_model'
# Hub id of the checkpoint that provides the initial weights (and the tokenizer)
model_name = "gogamza/kobart-base-v2"
# Folder that contains the dataset file
data_root=dir

# Branch on whether fine-tuned weights already exist in `model_path`.
# Depending on the transformers version / save settings the weights are stored
# as `model.safetensors` or as `pytorch_model.bin`; checking only the `.bin`
# name silently falls back to the pretrained checkpoint when the model was
# saved in safetensors format, so accept either file.
_WEIGHT_FILES = ('model.safetensors', 'pytorch_model.bin')
if any(os.path.exists(os.path.join(model_path, _name)) for _name in _WEIGHT_FILES):
    print("Use Customized Model")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
else:
    print("Use Pretrained Model")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# The tokenizer is always loaded from the hub id, never from `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is visible to torch.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device)

# Training Arguments
# Keyword arguments that are later unpacked into Seq2SeqTrainingArguments.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before changing it.
args = {
    'num_train_epochs': 3,
    'per_device_train_batch_size': 32,
    'per_device_eval_batch_size': 32,
    'overwrite_output_dir': True,
    'eval_steps': 10000,
    'save_steps': 10000,
    'warmup_steps': 5,
    'evaluation_strategy': "steps",
    'prediction_loss_only': True,
    'save_total_limit': 3
}

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Use Pretrained Model


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


cuda


In [9]:
from torch.utils.data import Dataset

class TextStyleTransferDataset(Dataset):
    """Map-style dataset that turns rows of a two-column frame into seq2seq features.

    For every row the first column is used as the source sentence and the
    second column as the target sentence.  The source is wrapped in the fixed
    prompt ``"표준어 말투로 변환:<source>"`` and the target gets the tokenizer's
    EOS token appended before it is tokenized into ``labels``.

    NOTE(review): the original inline comments labelled column 0 as the
    standard-language text and column 1 as the dialect text, although column 0
    is what is fed to the encoder under a "convert to standard" prompt -
    confirm the column order of the TSV.
    """

    def __init__(self, df, tokenizer, max_length=64):
        """Store the frame and tokenizer.

        Args:
            df: DataFrame whose first two columns hold (source, target) text.
            tokenizer: Hugging Face style tokenizer; must be callable on text,
                accept ``text_target=`` and expose ``eos_token``.
            max_length: Truncation length applied to both source and target.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return the encoding with a ``labels`` entry."""
        row = self.df.iloc[index]
        # Use .iloc for positional access: `row[0]` on a Series with string
        # labels relies on a deprecated positional fallback in pandas.
        source_text = row.iloc[0]
        target_text = row.iloc[1]
        target_style_name = '표준어'

        # The tokenizer comes from a Hugging Face repository, so the model
        # input format follows that repository's test code.
        encoder_text = f"{target_style_name} 말투로 변환:{source_text}"
        decoder_text = f"{target_text}{self.tokenizer.eos_token}"
        model_inputs = self.tokenizer(
            encoder_text, max_length=self.max_length, truncation=True
        )

        # Tokenize the target with the instance's own tokenizer (the original
        # accidentally used the global `tokenizer`).  `text_target=` is the
        # documented replacement for the deprecated `as_target_tokenizer()`.
        labels = self.tokenizer(
            text_target=decoder_text, max_length=self.max_length, truncation=True
        )
        model_inputs['labels'] = labels['input_ids']
        # Drop token_type_ids when the tokenizer emits them; tolerate
        # tokenizers that do not.
        model_inputs.pop('token_type_ids', None)

        return model_inputs

# Load the previously prepared dataset file and split it into two DataFrames.
def make_df(data_root, test_ratio=0.1):
    """Read ``<data_root>/data_fixed.tsv`` and split it into train/test frames.

    The split is positional and unshuffled: the leading ``test_ratio`` share of
    the rows becomes the test frame and the remaining rows the train frame.
    With the default of 0.1 this is a 9:1 train/test split.

    Args:
        data_root: Directory that contains ``data_fixed.tsv`` (tab separated).
        test_ratio: Fraction of rows, between 0 and 1, held out for testing.

    Returns:
        Tuple ``(df_train, df_test)``.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")

    df = pd.read_csv(f'{data_root}/data_fixed.tsv',sep='\t')
    # Number of leading rows reserved for the test frame.
    rate=int(len(df)*test_ratio)
    df_train,df_test = df.iloc[rate:],df.iloc[:rate]

    print(f'Train DataFrame length : {len(df_train)},Test DataFrame length : {len(df_test)}')
    return df_train,df_test

def make_dataset(df):
    """Wrap a ``(train_frame, test_frame)`` pair in TextStyleTransferDataset objects.

    Despite its name, ``df`` is a two-element sequence of DataFrames.  Both
    datasets are built with the module-level ``tokenizer``.

    Returns:
        Tuple ``(train_dataset, test_dataset)``.
    """
    train_frame, test_frame = df
    return (
        TextStyleTransferDataset(train_frame, tokenizer),
        TextStyleTransferDataset(test_frame, tokenizer),
    )

## 학습

In [10]:
from transformers import Seq2SeqTrainingArguments,Seq2SeqTrainer,\
                         DataCollatorForSeq2Seq
# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")

# make_df returns a (train, test) pair of DataFrames; make_dataset wraps each
# of them in a dataset object.
df = make_df(data_root)
train_dataset, test_dataset = make_dataset(df)

# Batch collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Shared hyper-parameters come from `args`; checkpoints go to `model_path`.
training_args = Seq2SeqTrainingArguments(output_dir=model_path, **args)

trainer = Seq2SeqTrainer(
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)


Train DataFrame length : 302736,Test DataFrame length : 33637


In [11]:
# Print the trainer's repr as a quick check that the object was constructed.
print(trainer)

<transformers.trainer_seq2seq.Seq2SeqTrainer object at 0x78cfdb77ef50>


In [12]:
# Run fine-tuning.  On failure, report the error and re-raise it so that a
# "Run All" stops here instead of continuing to save and evaluate a model that
# never finished training.
try:
    trainer.train()
except Exception as e:
    print(f"Failed to train model caused by {e}")
    raise


Step,Training Loss,Validation Loss
10000,0.053,0.080377
20000,0.0337,0.072007


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}


In [13]:
# Persist the trained model to `model_path`.  On failure, report the error and
# re-raise it so that later cells do not load missing or stale weights from
# that folder.
try:
    trainer.save_model(model_path)
    print("Model saved successfully.")
except Exception as e:
    print(f"Failed to save model caused by {e}")
    raise

Non-default generation parameters: {'forced_eos_token_id': 1}


Model saved successfully.


## Test (Validation)

AI Hub에 있는 텍스트 데이터에는 테스트용(라벨 없는 것)이 따로 없다.
그래서 앞에서는 train set의 앞쪽 10%를 떼어(9:1 split) validation용으로 대신 사용했다.
나중에 오디오 -> 텍스트(STT)를 실행하고 나면 어차피 데이터를 바꿔 끼워야 하므로, 그때 validation 데이터도 상황에 맞게 변경해야 한다.


In [14]:
from transformers import pipeline

# Build a text2text-generation pipeline: the model is loaded from the
# `model_path` folder, the tokenizer by name from `model_name`.
nlg_pipeline=pipeline('text2text-generation',model=model_path,tokenizer=model_name)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [15]:
def generate_text(pipe, text, num_return_sequences, max_length):
    """Wrap ``text`` in the conversion prompt, run ``pipe`` and collect the outputs.

    Args:
        pipe: Callable invoked as ``pipe(prompt, num_return_sequences=..., max_length=...)``
            that returns an iterable of dicts with a ``'generated_text'`` key.
        text: Sentence to convert.
        num_return_sequences: Forwarded to ``pipe``; it determines how many
            candidates come back (e.g. 3 yields a list of length 3).
        max_length: Forwarded to ``pipe``.

    Returns:
        List with the ``'generated_text'`` value of every returned candidate.
    """
    style = "표준어"
    prompt = f"{style} 말투로 변환:{text}"
    candidates = pipe(
        prompt,
        num_return_sequences=num_return_sequences,
        max_length=max_length,
    )
    generated = []
    for candidate in candidates:
        generated.append(candidate['generated_text'])
    return generated


print("Write 'q' to exit")
# Keep prompting until the user types exactly 'q'; every other line is
# converted with a single candidate and echoed back.
while (src_text := input("Dialect to translate(입력받을 사투리) : ")) != 'q':
    target_text_ko = generate_text(
        nlg_pipeline,
        src_text,
        num_return_sequences=1,
        max_length=64,
    )[0]
    print(f"Translated Standard (표준어로 출력): {target_text_ko}")

Write 'q' to exit
Dialect to translate(입력받을 사투리) : 뭐라카노
Translated Standard (표준어로 출력): 뭐라고 하지
Dialect to translate(입력받을 사투리) : 지금 시간이 몇신데
Translated Standard (표준어로 출력): 지금 시간이 몇신데
Dialect to translate(입력받을 사투리) : 정구지랑 찌짐이랑 해서
Translated Standard (표준어로 출력): 부추랑 지짐이랑 해서
Dialect to translate(입력받을 사투리) : 쫌 쫌생이 같이 굴지 마라
Translated Standard (표준어로 출력): 조금 조금생이 같이 굴지 마라
Dialect to translate(입력받을 사투리) : 먼 놈의 정구지가 이래 맛있노
Translated Standard (표준어로 출력): 먼 놈의 부추가 이래 맛있니
Dialect to translate(입력받을 사투리) : 저 할마시가 죽을 때가 됐나 와 저카노
Translated Standard (표준어로 출력): 저 할머니가 죽을 때가 됐나 왜 저러나
Dialect to translate(입력받을 사투리) : 가가 가가
Translated Standard (표준어로 출력): 걔가 걔가
Dialect to translate(입력받을 사투리) : 가가가 가
Translated Standard (표준어로 출력): 걔가 걔
Dialect to translate(입력받을 사투리) : 가가가가
Translated Standard (표준어로 출력): 걔가서
Dialect to translate(입력받을 사투리) : 끝낫제?
Translated Standard (표준어로 출력): 끝났지?
Dialect to translate(입력받을 사투리) : 오매
Translated Standard (표준어로 출력): 아주
Dialect to translate(입력받을 사투리) : 혼자 하믄 재미없다카이
Translated Stand

In [18]:
# Serialize the whole in-memory `model` object to a .pth file on Drive.
# NOTE(review): this pickles the entire module rather than its state_dict, so
# loading the file presumably requires the same model class definitions to be
# importable - confirm this is intended (trainer.save_model already wrote the
# weights to the saved-model folder).
torch.save(model, "/content/drive/MyDrive/daiv_audio/kobart_epoch3_trained.pth")