In [6]:
import tensorflow as tf
from transformers import PreTrainedTokenizerFast, TFGPT2LMHeadModel
import numpy as np
import json

# KoGPT2 모델과 토크나이저 로드
model_name = 'skt/kogpt2-base-v2'
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, bos_token='</s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
model = TFGPT2LMHeadModel.from_pretrained(model_name, from_pt=True)

# 임의로 생성한 각도차이
angle_differences = [
    [16, -12, 23, -8, 5, -10, 18, -7],
    [-20, 15, -18, 10, -5, 8, -12, 6],
    [8, -6, 12, -15, 20, -18, 7, -9],
    [-14, 10, -7, 5, -22, 17, -8, 13],
    [19, -11, 25, -13, 6, -9, 14, -16],
]

feedbacks = [
    "오른쪽 팔꿈치와 오른쪽 무릎과 오른쪽 골반을 더 구부려주시고 왼쪽 팔꿈치를 더 펴주세요.",
    "오른쪽 팔꿈치와 오른쪽 무릎을 더 펴주세요. 왼쪽 팔꿈치는 더 구부려주세요.",
    "왼쪽 무릎과 왼쪽 발목을을 더 펴주세요. 오른쪽 발목은 더 구부려주세요.",
    "오른쪽 팔꿈치를 조금 더 펴주세요. 왼쪽 팔꿈치를 조금 구부려주세요 오른쪽 발목은 더 펴주시고 왼쪽 발목은 더 구부려주세요.",
    "오른쪽 팔꿈치와 오른쪽 무릎을 더 구부려주세요. 왼쪽 팔꿈치와 왼쪽 무릎과 왼쪽 골반을 조금 더 펴주세요."
]

angle_differences = np.array(angle_differences)
feedbacks = feedbacks

# 데이터 준비 및 JSON 파일로 저장
data = []
for angles, feedback in zip(angle_differences, feedbacks):
    angle_str = ' '.join([f"{angle}" for angle in angles])
    prompt = f"각도 차이: {angle_str}"
    item = {
        "prompt": prompt,
        "completion": feedback
    }
    data.append(item)

with open('training_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# 데이터 전처리 함수
def preprocess_data(data, max_length=128):
    inputs = []
    labels = []
    for item in data:
        prompt = item['prompt']
        completion = item['completion']
        text = f"{prompt} 피드백: {completion}"
        inputs.append(text)
        labels.append(completion)

    # 입력 인코딩
    input_encodings = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")
    input_ids = input_encodings['input_ids']
    attention_mask = input_encodings['attention_mask']

    # 레이블 인코딩
    label_encodings = tokenizer(labels, truncation=True, padding='max_length', max_length=max_length, return_tensors="tf")
    label_ids = label_encodings['input_ids']

    # 레이블에서 prompt 부분을 -100으로 마스킹
    prompt_lengths = [len(tokenizer.encode(input_text.split('피드백:')[0])) for input_text in inputs]
    max_prompt_length = max(prompt_lengths)

    mask = tf.sequence_mask(prompt_lengths, maxlen=max_length)
    label_ids = tf.where(mask, tf.constant(-100, shape=label_ids.shape), label_ids)

    # 패딩 토큰을 -100으로 변경
    label_ids = tf.where(label_ids == tokenizer.pad_token_id, -100, label_ids)

    return input_ids, attention_mask, label_ids

# 데이터 로드 및 전처리
with open('training_data.json', 'r', encoding='utf-8') as f:
    training_data = json.load(f)

input_ids, attention_mask, labels = preprocess_data(training_data)

# TensorFlow 데이터셋 생성
tf_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids": input_ids, "attention_mask": attention_mask},
    {"labels": labels}
))

# 데이터셋 분할 및 배치 설정
train_size = int(0.9 * len(tf_dataset))
tf_dataset = tf_dataset.shuffle(len(tf_dataset))
train_dataset = tf_dataset.take(train_size).batch(16)
val_dataset = tf_dataset.skip(train_size).batch(16)

# 모델 컴파일
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer)

#학습
def train_step(input_ids, attention_mask, labels):
    with tf.GradientTape() as tape:
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, training=True)
        loss = outputs.loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# 학습 루프
num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataset:
        inputs, labels = batch
        loss = train_step(inputs['input_ids'], inputs['attention_mask'], labels['labels'])
        total_loss += loss
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataset)}")

    # 검증
    val_loss = 0
    for batch in val_dataset:
        inputs, labels = batch
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels['labels'])
        val_loss += outputs.loss
    print(f"Validation Loss: {val_loss/len(val_dataset)}")
# 모델 저장
model.save_pretrained("./fine_tuned_kogpt2_tf")
tokenizer.save_pretrained("./fine_tuned_kogpt2_tf")

# 새로운 각도 차이에 대한 피드백 생성 함수
def generate_feedback(model, tokenizer, angle_differences, max_length=100):
    angle_str = ' '.join([f"{angle}" for angle in angle_differences])
    input_text = f"각도 차이: {angle_str} 피드백:"
    input_ids = tokenizer.encode(input_text, return_tensors='tf')

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    feedback = generated_text.split("피드백:")[1].strip()

    return feedback

# 새로운 각도 차이에 대한 피드백 생성 예시
new_angle_differences = [16, -15, 23, -16, 24, -16, 12, -18]
feedback = generate_feedback(model, tokenizer, new_angle_differences)
print(f"새로운 각도 차이: {new_angle_differences}")
print(f"생성된 피드백: {feedback}")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.4.attn.masked_bias', 'transformer.h.11.attn.masked_bias', 'lm_head.weight', 'transformer.h.8.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.2.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initial

Epoch 1/100, Loss: [12.687686]
Validation Loss: [6.5344453]
Epoch 2/100, Loss: [7.060312]
Validation Loss: [5.1393943]
Epoch 3/100, Loss: [5.1351347]
Validation Loss: [4.974843]
Epoch 4/100, Loss: [4.401457]
Validation Loss: [3.7752175]
Epoch 5/100, Loss: [3.503026]
Validation Loss: [2.9518936]
Epoch 6/100, Loss: [2.643328]
Validation Loss: [1.467156]
Epoch 7/100, Loss: [2.2506769]
Validation Loss: [1.191999]
Epoch 8/100, Loss: [1.8793784]
Validation Loss: [3.606987]
Epoch 9/100, Loss: [1.6466966]
Validation Loss: [4.020216]
Epoch 10/100, Loss: [1.9403824]
Validation Loss: [2.0689209]
Epoch 11/100, Loss: [1.6627022]
Validation Loss: [0.86616135]
Epoch 12/100, Loss: [1.5372518]
Validation Loss: [2.249643]
Epoch 13/100, Loss: [1.1770414]
Validation Loss: [2.2852886]
Epoch 14/100, Loss: [1.1431942]
Validation Loss: [0.9048082]
Epoch 15/100, Loss: [1.4785076]
Validation Loss: [1.1480452]
Epoch 16/100, Loss: [1.1645013]
Validation Loss: [0.9501813]
Epoch 17/100, Loss: [1.0247245]
Validation