In [None]:
import gc
import os
import random

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
    TrainingArguments
)
from trl import SFTTrainer

In [None]:
# One seed shared by every random number generator configured below.
seed = 42

# Environment knobs: Python hash seed, cuBLAS workspace config (set before
# deterministic algorithms are switched on further down) and the Hugging Face
# cache location.
# NOTE(review): HF_HOME / TRANSFORMERS_CACHE are assigned after `transformers`
# was imported in the first cell; presumably the library reads them at import
# time, so they may have no effect here - confirm. The model loaders in this
# notebook pass `cache_dir` explicitly.
os.environ.update({
    'PYTHONHASHSEED': str(seed),
    'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
    "HF_HOME": "/workspace/huggingface",
    "TRANSFORMERS_CACHE": "/workspace/huggingface",
})

# Seed Python's RNG and PyTorch (CPU generator plus all CUDA devices).
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN settings: no autotuning, deterministic kernels only.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)

In [None]:
# End-to-end fine-tuning pipeline
def full_training_pipeline(train_df, output_dir, base_model_path, adapter_model_path, epoch):
    """Train a LoRA adapter on an 8-bit base model and return the fp16 base with the adapter attached.

    Args:
        train_df: pandas DataFrame with an ``input`` column and an ``output``
            column; each row becomes one training prompt.
        output_dir: directory the trainer writes its checkpoints to.
        base_model_path: Hugging Face model id or local path of the base
            model. Used for the 8-bit training load, the tokenizer and the
            final float16 reload.
        adapter_model_path: directory the trained LoRA adapter is saved to and
            reloaded from.
        epoch: number of training epochs.

    Returns:
        The ``PeftModel`` produced by loading the saved adapter on top of the
        base model reloaded in float16.
    """
    cache_dir = "/workspace/huggingface"    # model/tokenizer cache on the mounted volume

    # 8-bit quantized load via bitsandbytes.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True
    )

    # Load the same checkpoint the caller asked for (previously a hard-coded id
    # was used here while `base_model_path` was only used for the reload).
    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        quantization_config=bnb_config,
        device_map={"": 0},
        cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(
        base_model_path,
        cache_dir=cache_dir
    )

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'

    train = Dataset.from_pandas(train_df)

    def prompting(query, answer):
        """Build the full training text: instruction, obfuscated input, target output."""
        # NOTE(review): the text ends right after the target with no end-of-turn
        # or EOS marker; inference later cuts the completion at "<end_of_turn>".
        prompt = (
            "<start_of_turn> Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. The number of words and letters per word must be observed.\n"
            f"Input: {query}\n"
            "<end_of_turn>\n"
            "<start_of_turn>Assistant:\n"
            f"Output: {answer}"
        )
        return prompt

    def chat_format(row):
        """Tokenize one row's prompt (truncated to 512 tokens) into ``input_ids``."""
        prompt = prompting(row["input"], row["output"])
        tokens = tokenizer.encode(prompt, truncation=True, max_length=512)
        row["input_ids"] = tokens
        return row

    train = train.map(chat_format, batched=False, num_proc=4)

    # LoRA configuration
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=[
            "q_proj", "v_proj", "k_proj", "o_proj",
            "gate_proj", "down_proj", "up_proj"
        ],
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'
    )

    # The model is already placed on GPU 0 by `device_map`, so no explicit
    # `.to(device)` follows: it would be redundant, and moving a
    # bitsandbytes-quantized model with `.to()` is not supported.
    model = get_peft_model(model, lora_config)
    model.train()

    training_args = TrainingArguments(
        seed=42,
        output_dir=output_dir,
        num_train_epochs=epoch,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        eval_strategy="no",
        logging_dir="./logs",
        logging_steps=50,
        warmup_steps=20,
        logging_strategy="steps",
        learning_rate=2e-4,
        group_by_length=True,
        save_strategy="epoch",
        fp16=True
    )

    # `model` is already a PEFT model, so `peft_config` is not passed again:
    # handing the trainer both a wrapped model and a config is redundant at
    # best and may be rejected outright by the trainer.
    # NOTE(review): the dataset already carries `input_ids`; presumably the
    # trainer treats it as pre-tokenized and never calls `formatting_func` -
    # confirm against the installed TRL version.
    trainer = SFTTrainer(
        model=model,
        train_dataset=train,
        args=training_args,
        formatting_func=lambda x: x['input_ids']
    )

    trainer.train()

    # Save the adapter, then reload it on an unquantized base model.
    trainer.model.save_pretrained(adapter_model_path)

    # Drop the 8-bit training model and the trainer (with its optimizer state)
    # before loading a second copy of the base model, otherwise both copies
    # sit in GPU memory at the same time.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

    model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map='auto',
        torch_dtype=torch.float16,
        cache_dir=cache_dir
    )

    model = PeftModel.from_pretrained(
        model,
        adapter_model_path,
        device_map='auto',
        torch_dtype=torch.float16
    )
    return model

if __name__ == "__main__":

    # Both runs fine-tune the same base checkpoint.
    base_model_path = "beomi/gemma-ko-7b"

    # One entry per fine-tuning run:
    # (label printed on completion, training CSV, trainer output dir,
    #  LoRA adapter dir, exported model dir, number of epochs)
    training_runs = [
        (
            # Second-stage decoding, fine-tuning ver1 (input: MLP)
            "2차 해독_finetuning_ver1(input:MLP)",
            '/workspace/data/finetuning_train_mlp.csv',
            "/workspace/results0221",
            "/workspace/results0221/lora-adapter-epoch3",
            "/workspace/results0221/gemma-finetuning-epoch3",
            3,
        ),
        (
            # Second-stage decoding, fine-tuning ver1 (input: original)
            "2차 해독_finetuning_ver1(input:원본)",
            '/workspace/data/finetuning_train_original.csv',
            "/workspace/results0225",
            "/workspace/results0225/lora-adapter-epoch5",
            "/workspace/results0225/gemma-finetuning-epoch5",
            5,
        ),
    ]

    for label, train_csv, run_output_dir, adapter_dir, export_dir, n_epochs in training_runs:
        train_df = pd.read_csv(train_csv)

        tuned_model = full_training_pipeline(
            train_df, run_output_dir, base_model_path, adapter_dir, epoch=n_epochs
        )
        tuned_model.save_pretrained(export_dir)
        print(f"{label} 완료")

        # The exported model is reloaded from disk for inference, so release
        # the in-memory copy now; otherwise it stays in GPU memory for the
        # whole of the next training run.
        del tuned_model
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
def restore_reviews(FINETUNE_MODEL, BASE_MODEL, test):
    """Generate a restored review for every row of ``test``.

    Args:
        FINETUNE_MODEL: path or id of the fine-tuned model to load for generation.
        BASE_MODEL: path or id the tokenizer is loaded from.
        test: pandas DataFrame with an ``input`` column holding the obfuscated reviews.

    Returns:
        A list of strings, one per row of ``test`` in row order: the stripped
        text the model generated after the prompt. Rows whose input is missing
        or empty yield ``""`` without calling the model.
    """
    # NOTE(review): no `torch_dtype` is passed, so the library's default
    # precision is used - confirm the model fits in GPU memory that way.
    finetune_model = AutoModelForCausalLM.from_pretrained(
        FINETUNE_MODEL, cache_dir="/workspace/huggingface", device_map={"": 0}
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, cache_dir="/workspace/huggingface")

    text_gen_pipeline = pipeline(
        "text-generation",
        model=finetune_model,
        tokenizer=tokenizer,
    )

    restored_reviews = []

    for raw_query in tqdm(test['input'], total=len(test), desc="Processing", unit="review"):
        # Missing cells come back from pandas as NaN and numeric-looking cells
        # as numbers; normalise to text so `len()` below is always valid.
        query = "" if pd.isna(raw_query) else str(raw_query)

        if not query:
            # Nothing to restore, and `max_new_tokens=0` is not a usable
            # generation budget; keep the output aligned with the input rows.
            restored_reviews.append("")
            continue

        prompt = (
            "<start_of_turn> Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning. The number of words and letters per word must be observed.\n"
            f"Input: {query}\n"
            "<end_of_turn>\n"
            "<start_of_turn>Assistant:\n"
            "Output:"
        )

        generated = text_gen_pipeline(
            prompt,
            num_return_sequences=1,
            temperature=0.2,
            top_p=0.9,
            # Generation budget: the input's character count.
            max_new_tokens=len(query),
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id
        )

        generated_text = generated[0]['generated_text']

        if generated_text.startswith(prompt):
            # The pipeline echoes the prompt; cutting the known prefix is exact
            # even when the review itself contains the text "Output:".
            completion = generated_text[len(prompt):]
        else:
            # Fallback: take whatever follows the first "Output:" marker, or
            # the whole text when the marker is absent.
            marker_pos = generated_text.find("Output:")
            if marker_pos != -1:
                completion = generated_text[marker_pos + len("Output:"):]
            else:
                completion = generated_text

        restored_reviews.append(completion.strip())

    return restored_reviews


if __name__ == "__main__":

    # Run 0: model exported under results0221 (3-epoch run), MLP test inputs.
    BASE_MODEL0 = "beomi/gemma-ko-7b"
    FINETUNE_MODEL0 = "/workspace/results0221/gemma-finetuning-epoch3"
    test0 = pd.read_csv('/workspace/finetuning_test_mlp.csv')
    restored_reviews0 = restore_reviews(
        FINETUNE_MODEL=FINETUNE_MODEL0,
        BASE_MODEL=BASE_MODEL0,
        test=test0,
    )

    # Run 1: model exported under results0225 (5-epoch run), original test inputs.
    BASE_MODEL1 = "beomi/gemma-ko-7b"
    FINETUNE_MODEL1 = "/workspace/results0225/gemma-finetuning-epoch5"
    test1 = pd.read_csv('/workspace/finetuning_test_original.csv')
    restored_reviews1 = restore_reviews(
        FINETUNE_MODEL=FINETUNE_MODEL1,
        BASE_MODEL=BASE_MODEL1,
        test=test1,
    )

In [None]:
# Write the run-0 generations to CSV with a single `output` column, keeping
# only the text before the first "<end_of_turn>" marker of each generation.
# The sample submission used to be read here and immediately replaced by an
# empty frame, so that read was dead code; the written file is unchanged.
# NOTE(review): if the sample submission's other columns were meant to be
# kept, start from that frame instead - confirm the intended schema.
submission = pd.DataFrame({
    'output': [review.split("<end_of_turn>")[0] for review in restored_reviews0]
})
submission.to_csv('/workspace/kogemma_0222_raw.csv', index = False, encoding = 'utf-8-sig')

In [None]:
# Write the run-1 generations to CSV with a single `output` column, keeping
# only the text before the first "<end_of_turn>" marker of each generation.
# The sample submission used to be read here and immediately replaced by an
# empty frame, so that read was dead code; the written file is unchanged.
# NOTE(review): if the sample submission's other columns were meant to be
# kept, start from that frame instead - confirm the intended schema.
submission = pd.DataFrame({
    'output': [review.split("<end_of_turn>")[0] for review in restored_reviews1]
})
submission.to_csv('/workspace/kogemma_0226_raw.csv', index = False, encoding = 'utf-8-sig')