## 1) Import

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [3]:
# Log in to the Hugging Face Hub from inside the notebook (renders an interactive widget).
# NOTE(review): presumably needed because the Llama 3 checkpoint loaded later is access-gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 2) Data Load

In [25]:
# Read the train/test CSVs; "utf-8-sig" strips a leading BOM if the files carry one.
train, test = (
    pd.read_csv(csv_path, encoding="utf-8-sig")
    for csv_path in ("./train.csv", "./test.csv")
)

In [29]:
print(train.columns)  # check the column names

In [31]:
# Display the column index of the training frame as the cell output.
train.columns

Index(['ID', 'input', 'output'], dtype='object')

In [49]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Causal-LM fine-tuning dataset of obfuscated -> restored Korean reviews.

    Every item joins the fixed instruction text, the obfuscated review (the
    ``input`` column) and the restored review (the ``output`` column) into a
    single sequence, which is tokenized and padded/truncated to ``max_length``.

    Args:
        data: DataFrame with ``input`` and ``output`` columns.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_length)`` when called
            with ``return_tensors="pt"``.
        max_length: Fixed length of every returned tensor.
    """

    # Label value skipped by the cross-entropy loss of Hugging Face causal LMs.
    IGNORE_INDEX = -100

    # Instruction prepended to every sample; built once instead of per item.
    INSTRUCTION = (
        "Obfuscated Korean Review Restoration is a technique where a given obfuscated Korean sentence is restored to its original, clear, and natural form. "
        "The goal is to make the sentence easier to understand by reversing any unnecessary alterations or distortions. "
        "Obfuscated Korean relies on the fact that some words or letters might have been altered in ways that don’t affect the overall meaning. "
        "Given the following obfuscated Korean review, identify the errors in the altered text and restore it to its natural form.\n"
    )

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``.

        ``labels`` is a copy of ``input_ids`` with padding positions replaced by
        ``IGNORE_INDEX``. A causal LM shifts the labels internally, so they must
        be position-aligned with ``input_ids``; tokenizing the response on its
        own (as was done before) pairs response tokens with instruction
        positions and trains the wrong targets.
        """
        # Positional row access; columns are looked up by name.
        row = self.data.iloc[idx]
        prompt = row["input"]
        response = row["output"]

        # Merge instruction, prompt and response into one training sequence.
        combined = f"{self.INSTRUCTION}\nPrompt: {prompt}\nResponse: {response}"

        tokens = self.tokenizer(
            combined,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop only the batch axis; a bare squeeze() would also collapse a length-1 sequence.
        input_ids = tokens["input_ids"].squeeze(0)
        attention_mask = tokens["attention_mask"].squeeze(0)

        # Mask by attention_mask rather than by token id: the pad token may be
        # the same id as EOS, and real EOS tokens must stay in the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }



# Load the tokenizer for the Llama 3 Instruct checkpoint, caching files under H:/model/.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    cache_dir="H:/model/",
)

# Reuse EOS as the padding token; the dataset pads every sample to max_length.
tokenizer.pad_token = tokenizer.eos_token

# Wrap the training frame so it can be handed to the Trainer.
dataset = CustomDataset(train, tokenizer)


In [50]:
# Size of the dataset (number of training rows).
print(len(dataset))

# Inspect the first item.
sample = dataset[0]
print(sample)

# Fetch the first item again and inspect the full tokenization result.
tokens = dataset[0]
print(tokens)

# Shapes of the model inputs.
print(tokens["input_ids"].shape)
print(tokens["attention_mask"].shape)

# Rebuild the combined text for row 0 by hand to verify the prompt layout.
additional_prompt = (
    "Obfuscated Korean Review Restoration is a technique where a given obfuscated Korean sentence is restored to its original, clear, and natural form. "
    "The goal is to make the sentence easier to understand by reversing any unnecessary alterations or distortions. "
    "Obfuscated Korean relies on the fact that some words or letters might have been altered in ways that don’t affect the overall meaning. "
    "Given the following obfuscated Korean review, identify the errors in the altered text and restore it to its natural form.\n"
)
prompt = train.iloc[0]["input"]
response = train.iloc[0]["output"]
combined = "{}\nPrompt: {}\nResponse: {}".format(additional_prompt, prompt, response)
print(combined)

# Leave the raw string as the cell's displayed value (shows the escape sequences).
combined

'Obfuscated Korean Review Restoration is a technique where a given obfuscated Korean sentence is restored to its original, clear, and natural form. The goal is to make the sentence easier to understand by reversing any unnecessary alterations or distortions. Obfuscated Korean relies on the fact that some words or letters might have been altered in ways that don’t affect the overall meaning. Given the following obfuscated Korean review, identify the errors in the altered text and restore it to its natural form.\n\nPrompt: 별 한 게토 았깝땀. 왜 싸람듯릭 펼 1캐를 쥰눈징 컥꺾폰 싸람믐롯섞 맒록 섧멍핥쟈닐 탯끎룐눈 녀뮤 퀼교... 야뭍툰 둠 변 닺씨 깍낄 싫훈 굣. 깸삥읊 20여 년 댜녁뵨 곧 중 쩨윌 귑푼 낙팠떤 곶.\nResponse: 별 한 개도 아깝다. 왜 사람들이 별 1개를 주는지 겪어본 사람으로서 말로 설명하자니 댓글로는 너무 길고... 아무튼 두 번 다시 가길 싫은 곳. 캠핑을 20여 년 다녀본 곳 중 제일 기분 나빴던 곳.'

## 3) Model Load

In [6]:
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig
from accelerate import dispatch_model
import torch

BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

# Pick the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the base checkpoint in FP16 and move it onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    cache_dir="H:/model/",
).to(device)

# LoRA adapter settings: rank-1 updates on the attention q/k/v projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=1,
    lora_alpha=16,
    lora_dropout=0.05,
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [57]:
from transformers import Trainer, TrainingArguments

# Training configuration for the LoRA fine-tune.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 x 8 = 8 sequences per optimizer step on each device
    # Was 1, which is orders of magnitude too large for AdamW fine-tuning and
    # makes the loss diverge; 2e-4 is a common starting point for LoRA adapters.
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_dir="./logs",
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints
)


# Build the Trainer around the LoRA-wrapped model and the tokenized dataset.
# NOTE(review): newer transformers releases warn that `tokenizer=` is deprecated
# in favour of `processing_class=`; kept as-is for compatibility with older versions.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Run training.
trainer.train()



  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 

## 4) Inference

## 5) Submission

In [None]:
# Start from the sample submission so the output keeps the expected layout.
submission = pd.read_csv(
    "./sample_submission.csv",
    encoding="utf-8-sig",
)

In [None]:
# Write the submission file without the DataFrame index column.
# NOTE(review): no inference cell is visible above, so the frame appears to
# still hold the sample values when it is written — confirm before submitting.
submission.to_csv(
    "./baseline_submission.csv",
    encoding="utf-8-sig",
    index=False,
)