In [11]:
pip install openpyxl



In [13]:
import ast
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import PreTrainedTokenizerFast, TFGPT2LMHeadModel

# Load the KoGPT2 model and its tokenizer.
model_name = 'skt/kogpt2-base-v2'
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    model_name,
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)
model = TFGPT2LMHeadModel.from_pretrained(model_name, from_pt=True)
print(f"Loaded model: {model.config}")

# Load the feedback spreadsheet; its two columns are named angle differences / feedback.
feed = pd.read_excel("/content/drive/MyDrive/자연어 처리 학습 피드백.xlsx", names=["각도차이", "피드백"])
# SECURITY: the cells come from an external file, so they are parsed with
# ast.literal_eval (literals only) rather than eval, which would execute any
# expression stored in the sheet.
# Assumes each cell holds the text of a list literal such as "[16, -15, 23]" -- TODO confirm.
angle_differences = feed.loc[:, "각도차이"].apply(lambda cell: ast.literal_eval(str(cell).strip()))
feedbacks = feed.loc[:, "피드백"]

# Standardize the angles with one global mean/std computed over every angle in the sheet.
# angle_differences_flat is kept at module level because generate_feedback reuses it.
angle_differences_flat = [angle for angles in angle_differences for angle in angles]
angle_mean = np.mean(angle_differences_flat)
angle_std = np.std(angle_differences_flat)
angle_differences_normalized = (np.array(angle_differences_flat) - angle_mean) / angle_std

# Same standardization applied row by row, which preserves the original per-row list structure.
angle_differences_normalized_list = [
    ((np.asarray(angles) - angle_mean) / angle_std).tolist()
    for angles in angle_differences
]

# Build prompt/completion pairs: the prompt is the space-separated, 2-decimal normalized angles.
data = []
for angles, feedback in zip(angle_differences_normalized_list, feedbacks):
    angle_str = ' '.join(f"{angle:.2f}" for angle in angles)
    data.append({
        "prompt": f"각도 차이: {angle_str}",
        "completion": feedback,
    })

# Data augmentation (simple example, currently disabled)
# augmented_data = data.copy()
# for item in data:
#     augmented_item = item.copy()
#     augmented_item['completion'] = f"다음은 자세에 대한 피드백입니다: {item['completion']}"
#     augmented_data.append(augmented_item)

# data = augmented_data

# Persist the pairs as JSON (ensure_ascii=False keeps the Korean text readable in the file).
with open('training_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Preprocessing: tokenize prompt/completion pairs and build position-aligned LM labels.
def preprocess_data(data, max_length=256):
    """Tokenize prompt/completion pairs into causal-LM training tensors.

    Each item is rendered as ``"<prompt> 피드백: <completion><eos>"`` and
    tokenized to a fixed length.  Labels are derived from the *same* token
    sequence as the inputs (not from a separate tokenization of the
    completion), so every label position lines up with an input position.

    The labels are already shifted one step to the left: ``label_ids[i, t]``
    is the token that follows ``input_ids[i, t]``.  This matches
    ``custom_loss`` in this file, which compares logits and labels at the same
    position without shifting.  Do not feed these labels to a loss that shifts
    internally.

    Args:
        data: Sequence of dicts with ``"prompt"`` and ``"completion"`` keys.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask, label_ids)``; each tensor has shape
        ``(len(data), max_length)``.  ``label_ids`` holds -100 wherever the
        target would be a prompt token or padding, so only completion tokens
        (and the end-of-sequence token) contribute to the loss.
    """
    prefixes = []
    texts = []
    for item in data:
        prefix = f"{item['prompt']} 피드백:"
        prefixes.append(prefix)
        # The EOS token is appended so the model learns where a feedback ends;
        # without it generation only stops when max_length is reached.
        texts.append(f"{prefix} {item['completion']}{tokenizer.eos_token}")

    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    # Number of leading tokens that belong to the prompt (including the "피드백:" marker).
    # NOTE(review): assumes right-side padding and that tokenizing the prefix alone
    # yields the same tokens as the prefix of the full text -- confirm for this tokenizer.
    prompt_lengths = [len(tokenizer.encode(prefix)) for prefix in prefixes]
    is_prompt = tf.sequence_mask(prompt_lengths, maxlen=max_length)
    is_padding = tf.equal(attention_mask, 0)

    ignore = tf.fill(tf.shape(input_ids), tf.constant(-100, dtype=input_ids.dtype))
    targets = tf.where(tf.logical_or(is_prompt, is_padding), ignore, input_ids)

    # Shift left by one so position t is supervised with token t+1; the final
    # position has no successor and is therefore ignored.
    label_ids = tf.concat([targets[:, 1:], ignore[:, :1]], axis=1)

    return input_ids, attention_mask, label_ids

# Reload the prompt/completion pairs that were serialized to JSON.
with open('training_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Split into train / validation / test: 10% is held out for test, then 10% of
# the remainder for validation (fixed seed for reproducibility).
train_data, test_data = train_test_split(training_data, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

train_inputs, train_attention_mask, train_labels = preprocess_data(train_data)
val_inputs, val_attention_mask, val_labels = preprocess_data(val_data)
test_inputs, test_attention_mask, test_labels = preprocess_data(test_data)


def _as_dataset(input_ids, attention_mask, labels):
    """Wrap one split's tensors as an unbatched (features, labels) tf.data.Dataset."""
    features = {"input_ids": input_ids, "attention_mask": attention_mask}
    targets = {"labels": labels}
    return tf.data.Dataset.from_tensor_slices((features, targets))


# Build the TensorFlow datasets; only the training split is shuffled.
train_dataset = (
    _as_dataset(train_inputs, train_attention_mask, train_labels)
    .shuffle(len(train_data))
    .batch(16)
)
val_dataset = _as_dataset(val_inputs, val_attention_mask, val_labels).batch(16)
test_dataset = _as_dataset(test_inputs, test_attention_mask, test_labels).batch(16)

# Learning-rate schedule (cosine decay with warm restarts) and optimizer.
lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
    initial_learning_rate=1e-4,
    first_decay_steps=1000,
    t_mul=2.0,
    m_mul=0.9,
    alpha=0.1,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Custom loss: cross-entropy averaged over the non-ignored label positions only.
def custom_loss(labels, logits):
    """Return the mean cross-entropy over positions whose label is not -100.

    Logits and labels are compared at the same position; no shifting is done
    here.

    Args:
        labels: Integer tensor of target token ids; entries equal to -100 are
            excluded from the loss.
        logits: Unnormalized scores with one extra trailing class axis
            relative to ``labels`` (presumably ``(batch, seq, vocab)`` --
            confirm against the model output).

    Returns:
        Scalar tensor: sum of per-token losses divided by the number of active
        tokens, or 0 when the batch contains no active token.
    """
    keep = tf.not_equal(labels, -100)
    kept_labels = tf.boolean_mask(labels, keep)
    kept_logits = tf.boolean_mask(logits, keep)

    # The functional form returns one loss value per kept token (no reduction),
    # so the normalization below is applied exactly once.
    token_losses = tf.keras.losses.sparse_categorical_crossentropy(
        kept_labels, kept_logits, from_logits=True
    )
    active_tokens = tf.reduce_sum(tf.cast(keep, token_losses.dtype))

    # divide_no_nan yields 0 instead of NaN for a batch with no active token.
    return tf.math.divide_no_nan(tf.reduce_sum(token_losses), active_tokens)

# Compile the model with the masked custom loss.
model.compile(loss=custom_loss, optimizer=optimizer)

# Early stopping: halt after 5 epochs without val_loss improvement and
# roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)

# Train; 200 is an upper bound on epochs, early stopping normally ends training sooner.
# (No model-checkpoint callback is used.)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=200,
    callbacks=[early_stopping],
)

# Evaluate on the held-out test split.
test_loss = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}")

# Save the model and tokenizer side by side in the Hugging Face format.
output_dir = "./fine_tuned_kogpt2_tf"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Generate feedback text for a new set of angle differences.
def generate_feedback(model, tokenizer, angle_differences, max_length=150):
    """Generate posture feedback text for one list of angle differences.

    The angles are standardized with the mean/std of the module-level
    ``angle_differences_flat`` (the training angles) and rendered in the same
    ``"각도 차이: <angles> 피드백:"`` layout that the training texts use, so the
    model sees a prompt of the form it was fine-tuned on.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode``/``decode`` and the
            ``eos_token_id``/``pad_token_id`` attributes.
        angle_differences: Sequence of raw (un-normalized) angle differences.
        max_length: Passed to ``generate`` as ``max_length``.

    Returns:
        The generated continuation with special tokens removed and
        surrounding whitespace stripped.
    """
    angles = np.asarray(angle_differences, dtype=float)
    normalized_angles = (angles - np.mean(angle_differences_flat)) / np.std(angle_differences_flat)
    angle_str = ' '.join(f"{angle:.2f}" for angle in normalized_angles)

    # Must mirror the training text layout; a differently worded prompt is
    # out of distribution for the fine-tuned model.
    input_text = f"각도 차이: {angle_str} 피드백:"
    input_ids = tokenizer.encode(input_text, return_tensors='tf')

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # The returned sequence starts with the prompt tokens; decode only what
    # follows them.  This avoids splitting on the "피드백:" marker, which would
    # truncate any feedback that happens to contain the marker itself.
    prompt_length = input_ids.shape[1]
    feedback = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

    return feedback

# Example: generate feedback for one new set of angle differences and print both.
new_angle_differences = [16, -15, 23, -16, 24, -16, 12, -18]
feedback = generate_feedback(
    model=model,
    tokenizer=tokenizer,
    angle_differences=new_angle_differences,
)
print(f"새로운 각도 차이: {new_angle_differences}")
print(f"생성된 피드백: {feedback}")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.5.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'lm_head.weight', 'transformer.h.0.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.6.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initial

Loaded model: GPT2Config {
  "_name_or_path": "skt/kogpt2-base-v2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "created_date": "2021-04-28",
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "license": "CC-BY-NC-SA 4.0",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 3,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific