## Load Dataset

In [75]:
from transformers import (
    AutoTokenizer,
    BartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from tokenizers import Tokenizer
from typing import Dict, List, Optional
from torch.utils.data import Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import display
from typing import Dict

In [76]:
diary = pd.read_csv("diary_cleansing.csv")

In [77]:
diary.head()

Unnamed: 0,diary,summary,hashtag
0,아침 일찍 일어나서 운동복을 입고 운동을 시작했다. 처음에는 조금 어색하고 귀찮은 ...,매일 규칙적인 운동 습관을 만들기 위해 노력한 하루였다. 다양한 운동을 시도하며 체...,#매일운동 #규칙적운동 #건강한삶 #체력향상 #운동습관
1,아침에는 건강한 식단으로 시작하기 위해 곡물과 채소가 풍부한 식사를 준비했다. 신선...,건강한 식습관을 채택하고 유지하기 위해 노력한 하루였다. 곡물과 채소를 중심으로 균...,#건강한식습관 #식단관리 #영양균형 #천천히식사 #건강한삶
2,언어 학습 앱을 다운로드하고 기초부터 차근차근 공부했다. 알파벳과 발음부터 시작하여...,새로운 언어를 배우기 위해 꾸준한 학습과 실전 연습을 통해 언어 실력을 향상시키는 ...,#새로운언어배우기 #언어학습 #꾸준한노력 #언어실력향상 #자신감키우기
3,"먼저, 관련 서적과 온라인 자료를 찾아 읽으며 기초 지식을 쌓았다. 이해가 어려운 ...",새로운 기술이나 도메인에 대해 공부하여 전문성을 향상시키는 노력을 하루 동안 기록했다.,#전문성향상 #새로운기술공부 #지식습득 #실전경험 #꾸준한학습
4,일어나서부터 긴장과 설렘이 가득한 마음으로 시험장에 도착했어요. 시험 시작 전에는 ...,자격증 취득을 통해 전문가 레벨의 업무 능력을 갖추기 위한 시험을 보았어요. 긴장과...,#자격증취득 #전문가능력 #노력과학습 #자부심 #발전


In [78]:
diary.shape

(3004, 3)

In [79]:
diary = diary.sample(frac = 1, random_state = 42) # Shuffle Data

In [80]:
from sklearn.model_selection import train_test_split

train_size = int(len(diary) * 0.9)
diary_train = diary[:train_size]
diary_test = diary[train_size:]

In [81]:
print(diary_train.shape)
print(diary_test.shape)

In [82]:
diary_train.to_csv("diary_train.csv", index = False)
diary_test.to_csv("diary_test.csv", index = False)

In [83]:
!pip install datasets accelerate transformers[torch]


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
!pip install transformers --upgrade




[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [87]:
from datasets import load_dataset, DatasetDict
diary_dataset_train = load_dataset("csv", data_files = "diary_train.csv")
diary_dataset_test = load_dataset("csv", data_files = "diary_test.csv")

In [88]:
# diary열 제거
diary_dataset_train = diary_dataset_train['train'].remove_columns('diary')
diary_dataset_test = diary_dataset_test['train'].remove_columns('diary')

In [89]:
diary_dataset = DatasetDict({
    "train": diary_dataset_train,
    "test": diary_dataset_test
})

In [90]:
diary_dataset

DatasetDict({
    train: Dataset({
        features: ['summary', 'hashtag'],
        num_rows: 2703
    })
    test: Dataset({
        features: ['summary', 'hashtag'],
        num_rows: 301
    })
})

In [91]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [92]:
device

device(type='cuda')

## Hash Tag

In [114]:
# Load Model and Tokenize
model_name = "gogamza/kobart-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [94]:
max_input_length = 150
max_target_length = 50

def preprocess_function(ex):
  model_inputs = tokenizer(ex['summary'], max_length = max_input_length, padding = 'max_length', truncation = True)

  # 타겟을 위한 토크나이저 설정
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(ex['hashtag'], max_length = max_target_length, padding = 'max_length', truncation = True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [95]:
tokenized_dataset = diary_dataset.map(preprocess_function)

Map:   0%|          | 0/2703 [00:00<?, ? examples/s]



Map:   0%|          | 0/301 [00:00<?, ? examples/s]

In [96]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['summary', 'hashtag', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2703
    })
    test: Dataset({
        features: ['summary', 'hashtag', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 301
    })
})

In [97]:
tokenized_dataset = tokenized_dataset.remove_columns(diary_dataset['train'].column_names)

In [98]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2703
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 301
    })
})

In [99]:
# Train data max input tokens
max(len(x) for x in tokenized_dataset['train']['input_ids'])

150

In [100]:
# Test data max input tokens
max(len(x) for x in tokenized_dataset['test']['input_ids'])

150

In [101]:
# Train data max output tokens
max(len(x) for x in tokenized_dataset['train']['labels'])

50

In [102]:
# Test data max output tokens
max(len(x) for x in tokenized_dataset['test']['labels'])

50

In [103]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

## Metrics : Rouge score

In [104]:
from datasets import load_metric
rouge_score = load_metric('rouge')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [105]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens = True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    rouge_output = rouge_score.compute(predictions=decoded_preds, references=decoded_labels)
    return rouge_output

## Train

In [106]:
!pip install ipywidgets


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [107]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [115]:
batch_size = 8
epochs = 50
logging_steps = len(tokenized_dataset['train']) // batch_size
model_path = "C:/Users/user/Documents/EWHA/log/log/modelling/"

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    evaluation_strategy = 'steps',
    learning_rate = 5.6e-5,
    weight_decay = 0.001,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=1000,
    warmup_steps=300,
    prediction_loss_only=True,
    predict_with_generate = True,
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub = True
    )

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0)

trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    callbacks=[early_stopping_callback],
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test'],
    compute_metrics=compute_metrics
)

In [116]:
trainer.train()

Step,Training Loss,Validation Loss
500,2.1562,0.696432
1000,0.4915,0.61866
1500,0.2739,0.709572
2000,0.1806,0.731984
2500,0.1091,0.819082


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2500, training_loss=0.6422441787719727, metrics={'train_runtime': 296.8485, 'train_samples_per_second': 455.283, 'train_steps_per_second': 56.931, 'total_flos': 1785712061952000.0, 'train_loss': 0.6422441787719727, 'epoch': 7.4})

In [117]:
evaluation_results = trainer.evaluate()

In [119]:
evaluation_results

{'eval_loss': 0.6186603903770447,
 'eval_runtime': 1.1594,
 'eval_samples_per_second': 259.612,
 'eval_steps_per_second': 32.775,
 'epoch': 7.4}

In [120]:
trainer.save_model('./hash_tag')

Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}


In [121]:
trainer.push_to_hub(commit_message = "Hash-tag Producer", tags = 'kobart-hashtag')

Non-default generation parameters: {'forced_eos_token_id': 1}


CommitInfo(commit_url='https://huggingface.co/jjae/modelling/commit/9e6c9cf4ed802adbb25c56d3348355660d416394', commit_message='Hash-tag Producer', commit_description='', oid='9e6c9cf4ed802adbb25c56d3348355660d416394', pr_url=None, pr_revision=None, pr_num=None)

In [122]:
tokenizer.push_to_hub(repo_id = 'modelling')

README.md:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/jjae/modelling/commit/ac53691df758679a91773362846c257578fd2f51', commit_message='Upload tokenizer', commit_description='', oid='ac53691df758679a91773362846c257578fd2f51', pr_url=None, pr_revision=None, pr_num=None)

In [123]:
def make_tag(text, label):
  # 입력 문장을 토큰화하여 인코딩
  input_ids = tokenizer.encode(text, return_tensors="pt").to(device)

  # 모델에 입력 전달하여 디코딩
  output = model.generate(input_ids = input_ids, bos_token_id = model.config.bos_token_id,
                          eos_token_id = model.config.eos_token_id, length_penalty = 2.0, max_length = 50, num_beams = 2)

  # 디코딩된 출력을 토크나이저를 사용하여 텍스트로 변환
  decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

  print("입력 문장:", text,'\n')
  print("모델 출력:", decoded_output,'\n')
  print("정답 레이블:", label)

In [124]:
make_tag(diary_dataset['test']['summary'][0], diary_dataset['test']['hashtag'][0])

In [125]:
make_tag(diary_dataset['test']['summary'][-1], diary_dataset['test']['hashtag'][-1])

In [126]:
make_tag(diary_dataset['test']['summary'][1], diary_dataset['test']['hashtag'][1])

In [124]:
make_tag(diary_dataset['test']['summary'][2], diary_dataset['test']['hashtag'][2])

입력 문장: 환상적인 세계에서의 일상 생활을 경험한 하루였어요. 그곳에서의 경험은 삶에 활기와 창의성을 불어넣어주었고, 현실에서도 새로운 경험을 할 수 있는 가능성을 열어주었어요. 

모델 출력: #환상적인세계 #일상적인일상 #활기찬시간 #창의성발휘 #새로운경험 

정답 레이블: #환상적인세계 #일상의흥미 #창의성과열정 #현실과환상의만남 #새로운경험
