In [1]:
!pip install rouge
!pip install datasets==1.0.2
!pip install transformers==4.24.0
!pip install transformer-utils
!pip install packaging
!pip install wandb



In [2]:
import datasets
import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import itertools
import re

from rouge import Rouge
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    EarlyStoppingCallback,BartTokenizerFast

)
from datasets import Dataset
from tqdm import tqdm

In [3]:
model_checkpoints = "/aiffel/aiffel/Korean_Conversation_Summary/checkpoint/non_MLM_test/checkpoint-1750"

tokenizer = BartTokenizerFast.from_pretrained(model_checkpoints)
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints) 

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [4]:
def preprocess_sentence(sentence):
    sentence = sentence.lower() # 텍스트 소문자화
    sentence = re.sub(r'\([^)]*\)', '', sentence) # 괄호로 닫힌 문자열 (..) 제거
    #sentence = re.sub(r'[#@]+[가-힣A-Za-z#]+', ' ', sentence)
    sentence = re.sub(r'[ㄱ-ㅎㅏ-ㅣ]+[/ㄱ-ㅎㅏ-ㅣ]', '', sentence) # 여러개 자음과 모음을 삭제한다.
    sentence = re.sub("[^가-힣a-z0-9#@]", " ", sentence) # 영어 외 문자(숫자, 특수문자 등) 공백으로 변환
    sentence = re.sub(r'[" "]+', " ", sentence) # 여러개 공백을 하나의 공백으로 바꿉니다.
    sentence = sentence.strip() # 문장 양쪽 공백 제거
    # 스폐셜 토큰 적용할 거면 여기 위에 영어 외 문자 공백으로 만들때 스폐셜 토큰을 넘어갈 수 있도록 지정해주면된다.
    # 그리고 세번째 정규표현식을 지워야 할 것이다. 
    return sentence

In [5]:
train_df = pd.read_csv('data/train_total.csv')
val_df = pd.read_csv('data/val_total.csv')

In [6]:
train_Category = train_df['Category'].unique()
val_Category = val_df['Category'].unique()
def categori_ext(data, Category, tv):
    df = pd.DataFrame()
    for c in Category:
        df = pd.concat([df, data[data['Category'] == c].iloc[0:int(len(data[data['Category'] == c])*0.05)]], axis = 0)
        df.reset_index(inplace=True, drop=True)
    return df

In [7]:
train_df = categori_ext(train_df, train_Category, 'train')
val_df = categori_ext(val_df, val_Category, 'val')

In [8]:
train_df.head()

Unnamed: 0,Id,Text,Summary,Category
0,fd321028-d5b4-55f7-9e20-2eaa262f9154,"['그럼 날짜는 가격 큰 변동 없으면 6.28-7.13로 확정할까?', '우리 비행...","비행기 표 가격에 대해 이야기하며, 특가 이벤트를 기다리고 있다.",상거래(쇼핑)
1,c51be2e4-c8d0-5cea-b1ae-cde1fe8f8ab6,"['Kf마스크만 5부제 하는거지?', '응. 면마스크는 아무때나 사도될껀?', '면...",비염이 있어서 싸게 나온 일회용 부직포 마스크를 사두려고 한다.,상거래(쇼핑)
2,e90e721f-00d1-5114-aa5d-5f1061472a29,['아 근데 케이크 업체들 봤는데 중앙동쪽 거기는 맛만있고 디자인은 그냥그런것같애'...,케이크 업체 중 중앙동 쪽은 맛만 있고 디자인은 별로고 고잔동 케이크 업체는 배달도...,상거래(쇼핑)
3,b215f3a2-d647-59f9-8410-1274ee5edd97,"['칫솔사야하는데 쓱으로 살까?', '뭘 칫솔사는것까지 물어보시남ㅋㅋㅋ', '아 그...",칫솔을 3개월에 하나씩 바꿔서 왕 칫솔 사러 신세계(쓱) 가자고 했다.,상거래(쇼핑)
4,0bda61b6-1396-5a2a-a049-0b4035e40d59,"['잠도안오네ㅐ얼릉 고구마츄 먹고싶단', '그게 그렇게 맛있었어??? 아주 여보 빼...",잠도 안 와서 고구마 말랭이를 양심상 하나만 먹으려고 한다.,상거래(쇼핑)


In [9]:
# 전체 Text 데이터에 대한 전처리 (1)
train_text = []
train_summary = []

for tt in tqdm(train_df['Text']):
    train_text.append(preprocess_sentence(tt))

for ts in tqdm(train_df['Summary']):
      train_summary.append(preprocess_sentence(ts))


100%|██████████| 13994/13994 [00:00<00:00, 32835.55it/s]
100%|██████████| 13994/13994 [00:00<00:00, 86534.30it/s]


In [10]:
# 전체 Text 데이터에 대한 전처리 (1)
val_text = []
val_summary = []

for vt in tqdm(val_df['Text']):
    val_text.append(preprocess_sentence(vt))

for vs in tqdm(val_df['Summary']):
      val_summary.append(preprocess_sentence(vs))


100%|██████████| 1746/1746 [00:00<00:00, 31527.84it/s]
100%|██████████| 1746/1746 [00:00<00:00, 86434.56it/s]


In [11]:
train_df = pd.DataFrame(zip(train_text,train_summary), columns=['Text', 'Summary'])
val_df = pd.DataFrame(zip(val_text,val_summary), columns=['Text', 'Summary'])

In [12]:
train_df.head()

Unnamed: 0,Text,Summary
0,그럼 날짜는 가격 큰 변동 없으면 6 28 7 13로 확정할까 우리 비행포함 15일...,비행기 표 가격에 대해 이야기하며 특가 이벤트를 기다리고 있다
1,kf마스크만 5부제 하는거지 응 면마스크는 아무때나 사도될껀 면마스크말고 부직포 마...,비염이 있어서 싸게 나온 일회용 부직포 마스크를 사두려고 한다
2,아 근데 케이크 업체들 봤는데 중앙동쪽 거기는 맛만있고 디자인은 그냥그런것같애 그러...,케이크 업체 중 중앙동 쪽은 맛만 있고 디자인은 별로고 고잔동 케이크 업체는 배달도...
3,칫솔사야하는데 쓱으로 살까 뭘 칫솔사는것까지 물어보시남 아 그 왕칫솔 또 사려나 싶...,칫솔을 3개월에 하나씩 바꿔서 왕 칫솔 사러 신세계 가자고 했다
4,잠도안오네 얼릉 고구마츄 먹고싶단 그게 그렇게 맛있었어 아주 여보 빼이보릿 되버렸네...,잠도 안 와서 고구마 말랭이를 양심상 하나만 먹으려고 한다


In [13]:
# DF > data Set으로 전환
train_data = Dataset.from_pandas(train_df) 
val_data = Dataset.from_pandas(val_df)
test_samples = Dataset.from_pandas(val_df)

In [14]:
print(train_data)
print(val_data)
print(test_samples)

Dataset(features: {'Text': Value(dtype='string', id=None), 'Summary': Value(dtype='string', id=None)}, num_rows: 13994)
Dataset(features: {'Text': Value(dtype='string', id=None), 'Summary': Value(dtype='string', id=None)}, num_rows: 1746)
Dataset(features: {'Text': Value(dtype='string', id=None), 'Summary': Value(dtype='string', id=None)}, num_rows: 1746)


In [15]:
max_input = 128
max_target = 32
batch_size = 4
ignore_index = -100# tokenizer.pad_token_id

In [16]:
 def add_ignored_data(inputs, max_len, ignore_index):
        if len(inputs) < max_len:
            pad = [ignore_index] *(max_len - len(inputs)) # ignore_index즉 -100으로 패딩을 만들 것인데 max_len - lne(inpu)
            inputs = np.concatenate([inputs, pad])
        else:
            inputs = inputs[:max_len]

        return inputs

In [17]:
def add_padding_data(inputs, max_len):
        pad_index = tokenizer.pad_token_id
        if len(inputs) < max_len:
            pad = [pad_index] *(max_len - len(inputs))
            inputs = np.concatenate([inputs, pad])
        else:
            inputs = inputs[:max_len]

        return inputs 

In [18]:
def preprocess_data(data_to_process):
    label_id= []
    label_ids = []
    dec_input_ids = []
    input_ids = []

    for i in range(len(data_to_process['Text'])):
        input_ids.append(add_padding_data(tokenizer.encode(data_to_process['Text'][i], add_special_tokens=False), max_input))
    for i in range(len(data_to_process['Summary'])):
        label_id.append(tokenizer.encode(data_to_process['Summary'][i]))  
        label_id[i].append(tokenizer.eos_token_id)  
    for i in range(len(data_to_process['Summary'])):  
        dec_input_id = [tokenizer.eos_token_id]
        dec_input_id += label_id[i][:-1]
        dec_input_ids.append(add_padding_data(dec_input_id, max_target))  
    for i in range(len(data_to_process['Summary'])):
        label_ids.append(add_ignored_data(label_id[i], max_target, ignore_index))
   
    return {'input_ids': input_ids,
            'attention_mask' : (np.array(input_ids) != tokenizer.pad_token_id).astype(int),
            'decoder_input_ids': dec_input_ids,
            'decoder_attention_mask': (np.array(dec_input_ids) != tokenizer.pad_token_id).astype(int),
            'labels': label_ids}

In [19]:
special_words = [
                "#@주소#", "#@이모티콘#", "#@이름#", "#@URL#", "#@소속#",
                "#@기타#", "#@전번#", "#@계정#", "#@url#", "#@번호#", "#@금융#", "#@신원#",
                "#@장소#", "#@시스템#사진#", "#@시스템#동영상#", "#@시스템#기타#", "#@시스템#검색#",
                "#@시스템#지도#", "#@시스템#삭제#", "#@시스템#파일#", "#@시스템#송금#", "#@시스템#",
                ]

tokenizer.add_special_tokens({"additional_special_tokens": special_words})
model.resize_token_embeddings(len(tokenizer))

Embedding(30022, 768)

In [20]:
train_tokenize_data = train_data.map(preprocess_data, batched = True, remove_columns=['Text', 'Summary'])
val_tokenize_data = val_data.map(preprocess_data, batched = True, remove_columns=['Text', 'Summary'])

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [21]:
val_tokenize_data['decoder_input_ids'][100]

[1,
 11270,
 11042,
 12024,
 24341,
 23374,
 14036,
 26787,
 14036,
 26550,
 14075,
 14304,
 14036,
 14489,
 14455,
 15382,
 14058,
 15313,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3]

In [22]:
# # set special tokens
#from transformers import EncoderDecoderConfig
model.config.decoder_start_token_id = tokenizer.bos_token_id                                             
model.config.eos_token_id = tokenizer.eos_token_id

# sensible parameters for beam search
# set decoding params                               
model.config.max_length = 32 # 256은 쿠다 메모리 오류 생김
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 5
#model.config.suppress_tokens = [23782, 14338, 22554, 234]


In [23]:
model.config

BartConfig {
  "_name_or_path": "/aiffel/aiffel/Korean_Conversation_Summary/checkpoint/non_MLM_test/checkpoint-1750",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE

In [24]:
rouge = Rouge()
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    
    return rouge.get_scores(pred_str, label_str, avg=True)
   

In [25]:
training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoint/non_pretrained_tf_test",
    num_train_epochs=5,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,  # demo
    per_device_eval_batch_size=64,
    learning_rate=3e-05,
    warmup_steps=50,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True, # 생성기능을 사용하고 싶다고 지정한다.
    logging_dir="logs2",
    logging_steps=50,
    save_total_limit=3,
    max_steps=100

    #evaluation_strategy = "steps",# step별로 2버 loss가 오르는거 아니면 계속 반복하는듯
    #load_best_model_at_end = True,

)

In [26]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) # 데이터 일괄 처리?
"""
DataCollatorForSeq2Seq 를 사용하여 예제 배치를 생성 하십시오 . 
또한 일괄 처리에서 가장 긴 요소의 길이로 텍스트와 레이블을 동적으로 채워서 균일한 길이가 되도록 합니다.
tokenizer를 설정하여 함수 에서 텍스트를 채울 수 있지만 padding=True동적 패딩이 더 효율적입니다.
"""

'\nDataCollatorForSeq2Seq 를 사용하여 예제 배치를 생성 하십시오 . \n또한 일괄 처리에서 가장 긴 요소의 길이로 텍스트와 레이블을 동적으로 채워서 균일한 길이가 되도록 합니다.\ntokenizer를 설정하여 함수 에서 텍스트를 채울 수 있지만 padding=True동적 패딩이 더 효율적입니다.\n'

In [27]:
trainer = Seq2SeqTrainer(
    model, 
    training_args,
    train_dataset=train_tokenize_data,
    eval_dataset=val_tokenize_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
   # compute_metrics=compute_metrics,
   # callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


max_steps is given, it will override any value given in num_train_epochs


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
trainer.train()

***** Running training *****
  Num examples = 13994
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 123876864
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mjx7789[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,8.154
100,7.6847




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=7.919373474121094, metrics={'train_runtime': 34.8736, 'train_samples_per_second': 45.88, 'train_steps_per_second': 2.867, 'total_flos': 121947291648000.0, 'train_loss': 7.919373474121094, 'epoch': 0.11})

In [29]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1746
  Batch size = 64


{'eval_loss': 7.60614538192749,
 'eval_runtime': 11.1589,
 'eval_samples_per_second': 156.467,
 'eval_steps_per_second': 2.509,
 'epoch': 0.11}

In [30]:
inputs = tokenizer(
        test_samples["Text"],
        padding="max_length",
        truncation=True,
        max_length=max_target,
        return_tensors="pt",
    )
inputs

{'input_ids': tensor([[11926, 15249, 13363,  ..., 25985, 14542,  9031],
        [ 9545,  9698, 14334,  ..., 11723, 12332, 14267],
        [ 9694,  9489, 11224,  ..., 13468, 13328, 15321],
        ...,
        [11696, 15584, 10496,  ..., 15242, 13090, 28666],
        [12191, 12719, 23901,  ..., 16592, 14039,  1700],
        [17886, 12005, 29068,  ..., 14116, 10785, 14031]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [31]:
input_ids = inputs.input_ids.to(model.device)
input_ids

tensor([[11926, 15249, 13363,  ..., 25985, 14542,  9031],
        [ 9545,  9698, 14334,  ..., 11723, 12332, 14267],
        [ 9694,  9489, 11224,  ..., 13468, 13328, 15321],
        ...,
        [11696, 15584, 10496,  ..., 15242, 13090, 28666],
        [12191, 12719, 23901,  ..., 16592, 14039,  1700],
        [17886, 12005, 29068,  ..., 14116, 10785, 14031]], device='cuda:0')

In [32]:
attention_mask = inputs.attention_mask.to(model.device)
attention_mask

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')

In [33]:
model.generate

<bound method GenerationMixin.generate of BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30022, 768)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30022, 768)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

In [35]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["Text"],
        padding="max_length",
        truncation=True,
        max_length=max_target,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, num_beams=5,no_repeat_ngram_size=3,
                            attention_mask=attention_mask, 
                            pad_token_id=tokenizer.pad_token_id,
                            bos_token_id=tokenizer.bos_token_id,
                            eos_token_id=tokenizer.eos_token_id,)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)


OSError: Can't load the configuration of '/aiffel/aiffel/Korean_Conversation_Summary/checkpoint/non_MLM_test/checkpoint-1750'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/aiffel/aiffel/Korean_Conversation_Summary/checkpoint/non_MLM_test/checkpoint-1750' is the correct path to a directory containing a config.json file

In [36]:
#summaries_before_tuning = []
#for test_sample in tqdm(test_samples):
#    summaries_before_tuning.append(generate_summary(test_sample, model_before_tuning)[1])
#summaries_before_tuning = list(itertools.chain(*summaries_before_tuning))    
    
summaries_after_tuning=[]
for test_sample in tqdm(test_samples):
    summaries_after_tuning.append(generate_summary(test_sample, model)[1])
summaries_after_tuning = list(itertools.chain(*summaries_after_tuning))

100%|██████████| 1746/1746 [04:30<00:00,  6.46it/s]


In [37]:
rouge.get_scores(summaries_after_tuning, test_samples["Summary"], avg=True)

{'rouge-1': {'r': 0.018944756250867518,
  'p': 0.036500436371570424,
  'f': 0.024139580439880532},
 'rouge-2': {'r': 5.206706237634073e-05,
  'p': 9.545628102329132e-05,
  'f': 6.738090163577065e-05},
 'rouge-l': {'r': 0.018944756250867518,
  'p': 0.036500436371570424,
  'f': 0.024139580439880532}}

In [None]:
for i in range(0, len(summaries_after_tuning), 100):
    print('idx_{} '.format(i))
    #print("Summary before \n", summaries_before_tuning[i])
    print()
    print("Summary after \n", summaries_after_tuning[i])
    print()
    print("Target summary \n", test_samples["Summary"][i])
    print()
    print('Text', test_samples["Text"][i])
    print('-'*100)
    print()  