In [1]:
%config Completer.use_jedi = False
import pandas as pd
from tqdm import tqdm
import pandas as pd
from datasets import Dataset, DatasetDict

## dataset

In [2]:
import transformers
from datasets import load_dataset, load_metric, load_from_disk
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# df = pd.read_excel('new_keyword_data2.xlsx', engine = 'openpyxl')
# df

In [31]:
# selected_columns = ['abstract(ddt_bounding_box_distance)', 'new_keyword-gpt4']
# filter_df = df[selected_columns]
# f_df = filter_df.dropna(axis=0)
# f_df

In [5]:
train_df = f_df[:4000]
test_df = f_df[4000:4400]
validation_df = f_df[4400:]

In [6]:
# pandas DataFrame을 Hugging Face Dataset으로 변환
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)

In [7]:
data = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset
})

In [8]:
data

DatasetDict({
    train: Dataset({
        features: ['abstract(ddt_bounding_box_distance)', 'new_keyword-gpt4', '__index_level_0__'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['abstract(ddt_bounding_box_distance)', 'new_keyword-gpt4', '__index_level_0__'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['abstract(ddt_bounding_box_distance)', 'new_keyword-gpt4', '__index_level_0__'],
        num_rows: 125
    })
})

In [9]:
metric = load_metric('rouge')
model_checkpoints = 'facebook/bart-large-xsum'

## Data tokenization

In [10]:
max_input = 512
max_target = 128
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

In [11]:
def preprocess_data(data_to_process):
    # 'abstract(ddt_bounding_box_distance)' 텍스트를 입력으로 받음
    inputs = [abstract for abstract in data_to_process['abstract(ddt_bounding_box_distance)']]
    # 입력 텍스트 토큰화
    model_inputs = tokenizer(inputs, max_length=max_input, padding='max_length', truncation=True)

    # 'new_keyword'를 타겟으로 설정하고 토큰화
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(data_to_process['new_keyword-gpt4'], max_length=max_target, padding='max_length', truncation=True)
    
    # 토큰화된 'new_keyword'를 레이블로 설정
    model_inputs['labels'] = targets['input_ids']
    
    # 반환 값에는 입력 토큰, 어텐션 마스크, 레이블이 포함됨
    return model_inputs


In [12]:
temp_tokenize_data = data.map(preprocess_data, batched=True, remove_columns=['abstract(ddt_bounding_box_distance)', 'new_keyword-gpt4'])

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
# 각 데이터셋에서 '__index_level_0__' 열을 제거
tokenize_data = temp_tokenize_data.map(lambda x: x, batched=True, remove_columns=['__index_level_0__'])

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
tokenize_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 125
    })
})

## Training process

In [15]:
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

In [16]:
batch_size=1

In [17]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
def compute_rouge(pred):
    predictions, labels = pred
    #decode the predictions
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    #decode labels
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #compute results
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
    #get %
    res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    res['gen_len'] = np.mean(pred_lens)

    return {k: round(v, 4) for k, v in res.items()}

In [19]:
args = transformers.Seq2SeqTrainingArguments(
    'conversation-summ',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    eval_accumulation_steps=1,
    fp16=True
    )

In [20]:
from transformers import TrainerCallback

class SaveBestModelCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.best_loss = None

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # validation loss 추적
        val_loss = metrics["eval_loss"]
        if self.best_loss is None or val_loss < self.best_loss:
            self.best_loss = val_loss
            # 새로운 최적 모델 저장
            print(f"New best model found at epoch {state.epoch}. Saving model.")
            model.save_pretrained('./best_bart_deep/')
            tokenizer.save_pretrained('./best_bart_deep/')
            tokenizer.save_vocabulary('./best_bart_deep/')

In [21]:
trainer = transformers.Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge,
    callbacks=[SaveBestModelCallback()]
)

Using amp half precision backend


In [22]:
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 1250


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.479049,42.2467,19.9123,35.4025,35.4215,38.176
2,No log,0.446398,42.9221,20.2985,36.3011,36.3646,30.824
3,No log,0.456017,43.239,19.78,35.4058,35.5643,34.184
4,0.536000,0.446125,42.7596,19.4458,35.2326,35.3606,32.136
5,0.536000,0.447598,42.8173,19.373,35.3195,35.4109,30.464
6,0.536000,0.462855,44.3015,19.8598,35.8213,36.0108,32.592
7,0.536000,0.477572,44.1004,20.5887,35.5675,35.6306,33.192
8,0.201700,0.485754,43.4528,19.9241,35.1775,35.2952,32.456
9,0.201700,0.49834,43.1696,19.9694,34.634,34.7257,32.512
10,0.201700,0.501812,42.4788,19.157,34.2624,34.4006,32.584


***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
Configuration saved in ./best_bart_deep/config.json


New best model found at epoch 1.0. Saving model.


Model weights saved in ./best_bart_deep/pytorch_model.bin
tokenizer config file saved in ./best_bart_deep/tokenizer_config.json
Special tokens file saved in ./best_bart_deep/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
Configuration saved in ./best_bart_deep/config.json


New best model found at epoch 2.0. Saving model.


Model weights saved in ./best_bart_deep/pytorch_model.bin
tokenizer config file saved in ./best_bart_deep/tokenizer_config.json
Special tokens file saved in ./best_bart_deep/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
Saving model checkpoint to conversation-summ/checkpoint-500
Configuration saved in conversation-summ/checkpoint-500/config.json
Model weights saved in conversation-summ/checkpoint-500/pytorch_model.bin
tokenizer config file saved in conversation-summ/checkpoint-500/tokenizer_config.json
Special tokens file saved in conversation-summ/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
Configuration saved in ./best_bart_deep/config.json


New best model found at epoch 4.0. Saving model.


Model weights saved in ./best_bart_deep/pytorch_model.bin
tokenizer config file saved in ./best_bart_deep/tokenizer_config.json
Special tokens file saved in ./best_bart_deep/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
Saving model checkpoint to conversation-summ/checkpoint-1000
Configuration saved in conversation-summ/checkpoint-1000/config.json
Model weights saved in conversation-summ/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in conversation-summ/checkpoint-1000/tokenizer_config.json
Special tokens file saved in conversation-summ/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4
***** Running Evaluation *****
  Num examples = 125
  Batch size = 4


Trainin

TrainOutput(global_step=1250, training_loss=0.32634609069824216, metrics={'train_runtime': 9564.6327, 'train_samples_per_second': 4.182, 'train_steps_per_second': 0.131, 'total_flos': 4.334209204224e+16, 'train_loss': 0.32634609069824216, 'epoch': 10.0})

## Evaluate

In [23]:
aabs_list = []
kkey_list = []
for aabs, kkey in zip(validation_df['abstract(ddt_bounding_box_distance)'], validation_df['new_keyword-gpt4']):
    aabs_list.append(aabs)
    kkey_list.append(kkey)

In [24]:
model_inputs = tokenizer(aabs_list[0], max_length=max_input, padding='max_length', truncation=True)

In [25]:
model_inputs

{'input_ids': [0, 873, 47796, 35, 3112, 8, 28094, 910, 1182, 1848, 14770, 1722, 7948, 36, 282, 12240, 43, 946, 372, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 1437, 4363, 21543, 1437, 1437, 1437, 1437, 4198, 13, 3617, 2975, 142, 9, 49, 1337, 27115, 868, 36, 5564, 991, 44431, 1437, 1437, 49041, 43941, 3611, 11, 935, 8, 73, 368, 2472, 433, 6, 53, 49, 2228, 1236, 3358, 267, 6, 2241, 37423, 15, 10, 1810, 1186, 9, 7909, 45451, 1626, 1189, 10, 1233, 910, 1182, 42123, 10, 267, 5457, 361, 12938, 10, 1539, 4, 49, 2187, 819, 16, 22635, 30, 4249, 295, 605, 364, 321, 396, 12, 1090, 8652, 14001, 742, 434, 295, 12240, 16807, 1274, 4, 21887, 6, 14497, 4675, 9, 910, 1182, 1848, 4543, 139, 6, 295, 12240, 195, 155, 417, 14770, 366, 700, 2580, 1437, 1437, 112, 15, 10, 3143, 9, 7909, 45451, 1626, 30, 21495, 10490, 28808, 36, 5618, 43, 36, 1916, 73, 13753, 17, 27, 43, 1437, 1437, 1437, 1437, 515, 9, 5, 295, 605, 6, 8, 31345, 293, 8, 34774, 25510, 3611, 9, 5018, 595, 5018, 13171, 15, 4363, 21543, 2592, 295, 

In [26]:
raw_pred, _, _ = trainer.predict([model_inputs])

***** Running Prediction *****
  Num examples = 1
  Batch size = 4


In [27]:
raw_pred

array([[    2,  9518, 12573,  2893,  6157, 13690,  1722,  1885,     6,
        30169, 43262,  6748, 21553,     6,   289,  5906,  4203, 38665,
            6, 23124,    12, 46552, 39848,     2,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1]])

In [28]:
tokenizer.decode(raw_pred[0])

'</s>Carbon Cloth Nanowire, Atomic Layer Deposition, Heterojunction, Charge-Transfer Efficiency</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [29]:
kkey_list[0]

'Atomic Layer Deposition, Hydrothermal Growth, Versatile Substrates, SnO2 Nanowires, Catalysis'