# Generation for NLP Baseline Code

## Install Packages

## Import Necessary Libraries

In [None]:
import torch
import transformers
from ast import literal_eval
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import json
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import evaluate
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from peft import AutoPeftModelForCausalLM, LoraConfig

pd.set_option('display.max_columns', None)

In [None]:
# ÎÇúÏàò Í≥†Ï†ï
def set_seed(random_seed):
    """Seed every random number generator the notebook relies on.

    Args:
        random_seed: Integer seed handed to Python's ``random``, NumPy and
            PyTorch (CPU, current CUDA device and all CUDA devices).

    Side effects:
        Switches cuDNN to deterministic mode and turns off its benchmark
        auto-tuner.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42) # magic number :)

## EDA

### Load Data

In [None]:
 # Load the train dataset
# TODO: set the path to the train data
dataset = pd.read_csv('../../data/train.csv') 
dataset

In [None]:
# Show the passage and the raw 'problems' field of the first training row.
first_row = dataset.loc[0]
paragraph = first_row['paragraph']
problem = first_row['problems']

print(paragraph, problem, sep="\n")

In [None]:
# Print the raw 'problems' field of every row (one line of output per row).
# Note: the loop variable rebinds the notebook-level name `problem`.
for problem in dataset['problems'] :
    print(problem)

In [None]:

# Flatten the dataset: the 'problems' column stores a dict literal as text, so it
# is parsed with literal_eval and its fields are lifted next to 'id' and
# 'paragraph', giving one flat record per row.
records = []
for _, row in dataset.iterrows():
    problems = literal_eval(row['problems'])
    records.append(
        {
            'id': row['id'],
            'paragraph': row['paragraph'],
            'question': problems['question'],
            'choices': problems['choices'],
            # Optional keys fall back to None when they are absent.
            'answer': problems.get('answer', None),
            'question_plus': problems.get('question_plus', None),
        }
    )

# Convert to DataFrame
df = pd.DataFrame(records)

In [None]:
df.head()

### Print missing values

In [None]:
# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

### Basic information about the dataset

In [None]:
print("\nDataset Information:")
df.info()

### EDA on 'question' and 'choices'

In [None]:
# Missing 'question_plus' values become empty strings so they can be tested for truthiness.
df['question_plus'] = df['question_plus'].fillna('')

# 'full_question' is the question followed by a space and 'question_plus'
# when the latter is non-empty, otherwise just the question.
df['full_question'] = [
    question + ' ' + extra if extra else question
    for question, extra in zip(df['question'], df['question_plus'])
]

# Character length of each combined question
df['question_length'] = df['full_question'].map(len)

In [None]:
df['full_question']

### Question Length Distribution

In [None]:
# Histogram of combined-question lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(df['question_length'], bins=30, edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Question Lengths')
ax.set_xlabel('Question Length')
ax.set_ylabel('Frequency')
plt.show()

## Feature Engineering using TF-IDF

- TF-IDF 참고 링크 (reference link): https://ko.wikipedia.org/wiki/Tf-idf

### Initialize TF-IDF Vectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

### Fit and transform the text data

In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['full_question'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

### Display the TF-IDF features

In [None]:
print("\nTF-IDF Features:")
display(tfidf_df.head(20))

## Model Training

### Baseline Model

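- https://huggingface.co/beomi/gemma-ko-2b (original baseline model)
- https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct (model actually loaded in the cells below)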

In [None]:
# Î≥∏Ïù∏Ïùò Huggingface auth token ÏûÖÎ†•
## Jupyter labÏóêÏÑú Î°úÍ∑∏Ïù∏ ÌïòÎäî textboxÍ∞Ä ÎÇòÏò§ÏßÄ ÏïäÏùÑ Í≤ΩÏö∞, terminalÏóêÏÑú Î°úÍ∑∏Ïù∏ ÌïòÏã§ Ïàò ÏûàÏäµÎãàÎã§.
!huggingface-cli login --token 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
# from huggingface_hub import notebook_login
# notebook_login()

모델과 토크나이저를 불러옵니다. (Load the model and the tokenizer.)

In [20]:
# Hub id shared by the model and its tokenizer.
MODEL_NAME = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"

# Load the weights in bfloat16; trust_remote_code lets code from the model repo run.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Turn on gradient checkpointing for the model.
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
tokenizer.chat_template

In [None]:
test_messages = [{'role': 'system', 'content': 'ÏßÄÎ¨∏ÏùÑ ÏùΩÍ≥† ÏßàÎ¨∏Ïùò ÎãµÏùÑ Íµ¨ÌïòÏÑ∏Ïöî.'},
  {'role': 'user',
   'content': 'ÏßÄÎ¨∏:\nÏÉÅÏÜåÌïòÏó¨ ÏïÑÎ¢∞Í∏∞Î•º , ‚ÄúÏã†Ïù¥ Ï¢åÏ∞∏ Ï∞¨ ÏÜ°Ï§ÄÍ∏∏Ïù¥ Ïò¨Î¶∞ Ï∞®ÏûêÎ•º Î≥¥ÏïòÎäîÎç∞ , ÏÉÅÎ≥µ(Âñ™Êúç) Ï†àÏ∞®Ïóê ÎåÄÌïòÏó¨ ÎÖºÌïú Í≤ÉÏù¥ Ïã†Í≥ºÎäî ÌÅ∞ Ï∞®Ïù¥Í∞Ä ÏûàÏóàÏäµÎãàÎã§ . Ïû•ÏûêÎ•º ÏúÑÌïòÏó¨ 3ÎÖÑÏùÑ ÏûÖÎäî ÍπåÎã≠ÏùÄ ÏúÑÎ°ú ‚ÄòÏ†ïÏ≤¥(Ê≠£È´î)‚ÄôÍ∞Ä ÎêòÍ∏∞ ÎïåÎ¨∏Ïù¥Í≥† Îòê Ï†Ñ Ï§ë(ÂÇ≥Èáç: Ï°∞ÏÉÅÏùò Ï†úÏÇ¨ÎÇò Í∞ÄÎ¨∏Ïùò Î≤ïÌÜµÏùÑ Ï†ÑÌï®)ÌïòÍ∏∞ ÎïåÎ¨∏ÏûÖÎãàÎã§ . ‚Ä¶(Ï§ëÎûµ) ‚Ä¶ Î¨¥ÏóáÎ≥¥Îã§ Ï§ëÏöîÌïú Í≤ÉÏùÄ Ìï†ÏïÑÎ≤ÑÏßÄÏôÄ ÏïÑÎ≤ÑÏßÄÏùò Îí§Î•º Ïù¥ÏùÄ ‚ÄòÏ†ïÏ≤¥‚ÄôÏù¥ÏßÄ, Íº≠ Ï≤´Ïß∏Ïù¥Í∏∞ ÎïåÎ¨∏Ïóê Ï∞∏ Ïµú 3ÎÖÑ Î≥µÏùÑ ÏûÖÎäî Í≤ÉÏùÄ ÏïÑÎãôÎãàÎã§ .‚ÄùÎùºÍ≥† ÌïòÏòÄÎã§ .ÔºçÌòÑÏ¢ÖÏã§Î°ù Ôºç„Ñ±.Í∏∞ ÏÇ¨ÌôòÍµ≠ÏúºÎ°ú Ï†ïÍ∂åÏùÑ Ïû•ÏïÖÌïòÏòÄÎã§ .„Ñ¥.Ïù∏ Ï°∞Î∞òÏ†ïÏùÑ Ï£ºÎèÑ ÌïòÏó¨ ÏßëÍ∂åÏÑ∏Î†•Ïù¥ ÎêòÏóàÎã§ .„Ñ∑.Ï†ïÏ°∞ ÏãúÍ∏∞Ïóê ÌÉïÌèâ Ï†ïÏπòÏùò Ìïú Ï∂ïÏùÑ Ïù¥Î£®ÏóàÎã§ .„Ñπ.Ïù¥ Ïù¥ÏôÄ ÏÑ±ÌòºÏùò Î¨∏Ïù∏ÏùÑ Ï§ëÏã¨ÏúºÎ°ú ÌòïÏÑ±ÎêòÏóàÎã§.\n\nÏßàÎ¨∏:\nÏÉÅÏÜåÌïú Ïù∏Î¨ºÏù¥ ÏÜçÌïú Î∂ïÎãπÏóê ÎåÄÌïú ÏÑ§Î™ÖÏúºÎ°ú Ïò≥ÏùÄ Í≤ÉÎßåÏùÑ Î™®Îëê Í≥†Î•¥Î©¥?\n\nÏÑ†ÌÉùÏßÄ:\n1 - „Ñ±, „Ñ¥\n2 - „Ñ±, „Ñ∑\n3 - „Ñ¥, „Ñπ\n4 - „Ñ∑, „Ñπ\n\n1, 2, 3, 4, 5 Ï§ëÏóê ÌïòÎÇòÎ•º Ï†ïÎãµÏúºÎ°ú Í≥†Î•¥ÏÑ∏Ïöî.\nÏ†ïÎãµ:'},
  {'role': 'assistant', 'content': '2'}]
 
# Render the whole sample conversation with the tokenizer's chat template.
# apply_chat_template expects the list of message dicts; passing a single
# message (test_messages[0]) makes the template iterate over the dict's keys.
tokenizer.apply_chat_template(
    test_messages,
    tokenize=False,
)

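gemma-ko-2b 모델에는 chat template 이 없기 때문에 직접 입력해주어야 합니다. (The gemma-ko-2b baseline has no chat template, so one had to be supplied by hand. Note: this notebook now loads EXAONE-3.0-7.8B-Instruct, whose tokenizer already provides a chat template — see the `tokenizer.chat_template` cell above — so no manual template is needed here.)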

### Prepare LoRA

In [21]:
# LoRA hyper-parameters: rank-16 adapters (alpha 32, dropout 0.05) attached to the
# 'q_proj' and 'k_proj' modules, no bias training, causal-LM task type.
lora_settings = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=['q_proj', 'k_proj'],
    bias="none",
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)

### Data Processing

In [22]:
dataset = Dataset.from_pandas(df)

In [23]:
PROMPT_NO_QUESTION_PLUS = """ÏßÄÎ¨∏:
{paragraph}

ÏßàÎ¨∏:
{question}

ÏÑ†ÌÉùÏßÄ:
{choices}

1, 2, 3, 4, 5 Ï§ëÏóê ÌïòÎÇòÎ•º Ï†ïÎãµÏúºÎ°ú Í≥†Î•¥ÏÑ∏Ïöî. 
Ï†ïÎãµ:"""

PROMPT_QUESTION_PLUS = """ÏßÄÎ¨∏:
{paragraph}

ÏßàÎ¨∏:
{question}

<Î≥¥Í∏∞>:
{question_plus}

ÏÑ†ÌÉùÏßÄ:
{choices}

1, 2, 3, 4, 5 Ï§ëÏóê ÌïòÎÇòÎ•º Ï†ïÎãµÏúºÎ°ú Í≥†Î•¥ÏÑ∏Ïöî. 
Ï†ïÎãµ:"""

In [None]:
dataset

In [24]:
# Turn every flattened problem into a three-turn chat (system / user / assistant).
processed_dataset = []
for example in dataset:
    # Number the choices "1 - ...", "2 - ...", one per line.
    choices_string = "\n".join(
        f"{idx + 1} - {choice}" for idx, choice in enumerate(example["choices"])
    )

    shared_fields = {
        "paragraph": example["paragraph"],
        "question": example["question"],
        "choices": choices_string,
    }
    if example["question_plus"]:
        # The problem has a supplementary 'question_plus' section.
        user_message = PROMPT_QUESTION_PLUS.format(
            question_plus=example["question_plus"], **shared_fields
        )
    else:
        # No supplementary section.
        user_message = PROMPT_NO_QUESTION_PLUS.format(**shared_fields)

    # Convert to the chat-message format; the assistant turn is the gold answer as text.
    conversation = [
        {"role": "system", "content": "Ïù¥ Î¨∏Ï†úÏóê ÎåÄÌïú ÎãµÏùÑ ÎßûÌûàÎ©¥ Î≥¥ÏÉÅÏúºÎ°ú 1Ïñµ Îã¨Îü¨Î•º ÏñªÍ≤å Îê©ÎãàÎã§. ÏµúÏÑ†ÏùÑ Îã§Ìï¥ Ï†ïÌôïÌïú ÎãµÏùÑ Íµ¨ÌïòÏÑ∏Ïöî."},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": f"{example['answer']}"},
    ]
    processed_dataset.append(
        {
            "id": example["id"],
            "messages": conversation,
            "label": example["answer"],
        }
    )


In [None]:
processed_dataset[0]

In [25]:
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_dataset))
processed_dataset

Dataset({
    features: ['id', 'messages', 'label'],
    num_rows: 2031
})

In [26]:
def formatting_prompts_func(example):
    """Render each conversation of a batch into one chat-formatted string.

    Args:
        example: Batch mapping whose "messages" entry is a sequence of
            conversations (each a list of role/content message dicts).

    Returns:
        A list with one string per conversation, produced by the tokenizer's
        chat template without tokenizing.
    """
    return [
        tokenizer.apply_chat_template(conversation, tokenize=False)
        for conversation in example["messages"]
    ]

def tokenize(element):
    """Tokenize a batch of conversations after chat-template formatting.

    Args:
        element: Batch mapping accepted by ``formatting_prompts_func``.

    Returns:
        Dict with the "input_ids" and "attention_mask" lists from the tokenizer;
        no truncation or padding is applied.
    """
    encoded = tokenizer(
        formatting_prompts_func(element),
        truncation=False,
        padding=False,
        return_overflowing_tokens=False,
        return_length=False,
    )
    return {key: encoded[key] for key in ("input_ids", "attention_mask")}

# Tokenize the data: run `tokenize` over the dataset in batches with 4 worker
# processes, dropping the original columns so only the `tokenize` outputs remain.
tokenized_dataset = processed_dataset.map(
    tokenize,
    remove_columns=list(processed_dataset.features),
    batched=True,
    num_proc=4,
    load_from_cache_file=True,
    desc="Tokenizing",
)

Tokenizing (num_proc=4):   0%|          | 0/2031 [00:00<?, ? examples/s]

In [27]:
from sklearn.model_selection import KFold, train_test_split


# Split row indices 80/20 (shuffled, fixed random_state) and select the
# corresponding rows as the train and eval datasets.
dataset_indices = list(range(len(tokenized_dataset)))
train_index, test_index = train_test_split(
    dataset_indices,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

train_dataset = tokenized_dataset.select(train_index)
eval_dataset = tokenized_dataset.select(test_index)

# Decode the first two training samples, keeping special tokens visible.
for sample_idx in (0, 1):
    print(tokenizer.decode(train_dataset[sample_idx]["input_ids"], skip_special_tokens=False))

[|system|]Ïù¥ Î¨∏Ï†úÏóê ÎåÄÌïú ÎãµÏùÑ ÎßûÌûàÎ©¥ Î≥¥ÏÉÅÏúºÎ°ú 1Ïñµ Îã¨Îü¨Î•º ÏñªÍ≤å Îê©ÎãàÎã§. ÏµúÏÑ†ÏùÑ Îã§Ìï¥ Ï†ïÌôïÌïú ÎãµÏùÑ Íµ¨ÌïòÏÑ∏Ïöî.[|endofturn|]
[|user|]ÏßÄÎ¨∏:
Îã§ÏùåÎã¨ Ï¥àÍπåÏßÄ ÏÑúÏö∏ Îì± Ï†ÑÍµ≠Ïùò ÎÇÆ ÏµúÍ≥†Í∏∞Ïò®Ïù¥ 20ÎèÑÎ•º ÎÑòÎäî Îïå Ïù¥Î•∏ Ï¥àÏó¨Î¶Ñ ÎÇ†Ïî®Í∞Ä Í≥ÑÏÜçÎê† Ï†ÑÎßùÏù¥Îã§. Í∏∞ÏÉÅÏ≤≠ÏùÄ ‚ÄúÏù¥ÎèôÏÑ± Í≥†Í∏∞ÏïïÏùò ÏòÅÌñ•ÏúºÎ°ú 16ÏùºÍπåÏßÄ Ï†ÑÍµ≠Ïóê ÎßëÏùÄ ÎÇ†Ïî®Í∞Ä Ïù¥Ïñ¥ÏßÄÍ≤†Îã§‚ÄùÎ©∞ ‚ÄúÏ†ÑÍµ≠Ïùò ÎÇÆ ÏµúÍ≥†Í∏∞Ïò®Ïù¥ 20ÎèÑÎ•º ÎÑòÎäî Í≥†Ïò® ÌòÑÏÉÅÏù¥ Í≥ÑÏÜçÎê† Í≤É‚ÄùÏù¥ÎùºÍ≥† 13Ïùº ÏòàÎ≥¥ÌñàÎã§. Í∏∞ÏÉÅÏ≤≠ÏùÄ 14Ïùº ÏÑúÏö∏Ïùò ÎÇÆ ÏµúÍ≥†Í∏∞Ïò®Ïù¥ ÏµúÍ∑º 30ÎÖÑÎûò ÌèâÎÖÑÏπò(17.3ÎèÑ)Î•º ÏõÉÎèÑÎäî 23ÎèÑÎ•º Í∏∞Î°ùÌïòÍ≥†, ÎåÄÍµ¨ÏôÄ Ï∞ΩÏõêÏùò Í≤ΩÏö∞ ÌèâÎÖÑÏπò(19~20ÎèÑ)Î•º ÏõÉÎèÑÎäî 27ÎèÑÍπåÏßÄ Ïò§Î•¥Í≤†Îã§Í≥† Î∞ùÌòîÎã§.Î™©ÏöîÏùºÏù∏ 17ÏùºÎ∂ÄÌÑ∞Îäî ÏÑúÏ™ΩÏóêÏÑú Îã§Í∞ÄÏò§Îäî Í∏∞ÏïïÍ≥®Ïùò ÏòÅÌñ•ÏúºÎ°ú Ï†ÑÍµ≠Ïóê ÎπÑÍ∞Ä ÎÇ¥Î¶¨Î©¥ÏÑú Í≥†Ïò® ÌòÑÏÉÅÏù¥ Ïû†Ïãú Ï£ºÏ∂§Ìï† Ï†ÑÎßùÏù¥Îã§. Îã§Îßå 18Ïùº ÎπÑÍ∞Ä Í∑∏Ïπú Îí§ Ï£ºÎßêÎ∂ÄÌÑ∞Îäî ÎòêÎã§Ïãú Í≥†Ïò® ÌòÑÏÉÅÏù¥ Ïù¥Ïñ¥ÏßÑÎã§.Í∏∞ÏÉÅÏ≤≠

In [None]:
# Token count of every training sample, followed by its max / min / mean.
train_dataset_token_lengths = [len(token_ids) for token_ids in train_dataset["input_ids"]]

longest = max(train_dataset_token_lengths)
shortest = min(train_dataset_token_lengths)
print(f"max token length: {longest}")
print(f"min token length: {shortest}")
print(f"avg token length: {np.mean(train_dataset_token_lengths)}")

In [None]:
print(tokenizer.chat_template)

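Completion 부분만 학습하기 위한 data collator 설정 (Data collator setup so that only the completion part is trained on)

- 텍스트 중 response_template 까지는 ignore_index 로 loss 계산에서 제외 (text up to and including response_template is set to ignore_index and excluded from the loss)
- 텍스트 중 response_template 이후는 학습에 포함 (정답 + eos 토큰) (text after response_template is included in training: answer + eos token)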

In [28]:
# Marker that opens the assistant turn in the chat-formatted text. The collator
# uses it to locate where the completion starts, so that (per the notes above)
# only the text after it contributes to the loss.
response_template = "[|assistant|]"
data_collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
)

### Metric 설정 (Metric setup)

In [29]:
# Î™®Îç∏Ïùò logits Î•º Ï°∞Ï†ïÌïòÏó¨ Ï†ïÎãµ ÌÜ†ÌÅ∞ Î∂ÄÎ∂ÑÎßå Ï∂úÎ†•ÌïòÎèÑÎ°ù ÏÑ§Ï†ï
def preprocess_logits_for_metrics(logits, labels):
    """Keep only the logits of the five answer-digit tokens at one sequence position.

    Args:
        logits: Model logits, or a tuple whose first element is the logits.
        labels: Unused; present because the Trainer hook passes it.

    Returns:
        Logits sliced to ``[:, -2, ids]`` where ``ids`` are the vocabulary ids of
        the tokens "1".."5".
    """
    if isinstance(logits, tuple):
        logits = logits[0]

    answer_token_ids = [tokenizer.vocab[digit] for digit in ("1", "2", "3", "4", "5")]

    # The original author treats position -2 as the answer token and -1 as the
    # EOS token. NOTE(review): this indexing presumably relies on unpadded
    # sequences (eval batch size 1) and on which position's logits predict the
    # answer for this chat template — confirm before changing the batch size.
    return logits[:, -2, answer_token_ids]

# Load the metric
acc_metric = evaluate.load("accuracy")

# Map each answer digit (as text) to its class index: "1" -> 0 ... "5" -> 4
int_output_map = {str(digit): digit - 1 for digit in range(1, 6)}

# Metric computation hook
def compute_metrics(evaluation_result):
    """Compute accuracy from the pre-sliced answer logits and the label ids.

    Args:
        evaluation_result: Pair ``(logits, labels)``; ``logits`` are the outputs
            of ``preprocess_logits_for_metrics`` and ``labels`` the label ids
            with -100 at ignored positions.

    Returns:
        The dict returned by the accuracy metric.

    Raises:
        KeyError: If a decoded label is not one of "1".."5".
    """
    logits, labels = evaluation_result

    # Decode the label ids: ignored (-100) positions become pad tokens first,
    # then the text before "[|endofturn|]" is taken as the answer digit.
    label_ids = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    references = [
        int_output_map[text.split("[|endofturn|]")[0].strip()]
        for text in decoded_labels
    ]

    # Softmax over the five answer logits, then pick the most probable class.
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
    predictions = np.argmax(probs, axis=-1)

    # Accuracy
    return acc_metric.compute(predictions=predictions, references=references)

### Train

In [30]:
# pad token ÏÑ§Ï†ï
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.special_tokens_map

{'bos_token': '[BOS]',
 'eos_token': '[|endofturn|]',
 'unk_token': '[UNK]',
 'pad_token': '[|endofturn|]'}

In [31]:
%%time
from sklearn.model_selection import KFold, train_test_split

tokenizer.padding_side = 'right'

dataset_indices = list(range(len(tokenized_dataset)))

# Îç∞Ïù¥ÌÑ∞ Î∂ÑÌï†
train_index, test_index = train_test_split(
    dataset_indices, test_size=0.2, shuffle=True, random_state=42
)

train_dataset = tokenized_dataset.select(train_index)
eval_dataset = tokenized_dataset.select(test_index)    


current_output_dir = "outputs_exaone"
    
sft_config = SFTConfig(
    do_train=True,
    do_eval=True,
    lr_scheduler_type="cosine_with_restarts",
    max_seq_length=1024,
    output_dir=current_output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=1.5e-5,
    weight_decay=0.01,
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    save_only_model=True,
    report_to="none",
    fp16=True,
    fp16_full_eval=True,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    peft_config=peft_config,
    args=sft_config,
)



Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


KeyboardInterrupt: 

In [None]:
# Train the model
trainer.train()

# Evaluate the model on the eval dataset and keep the returned metrics
metrics = trainer.evaluate()

## Inference

In [33]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [32]:
# TODO: set the path to the trained checkpoint
# NOTE(review): training writes to "outputs_exaone" relative to the working
# directory, while this path points under ../../data — confirm the checkpoint
# location.
checkpoint_path = "../../data/outputs_exaone/checkpoint-1218"

# Load the PEFT checkpoint in bfloat16 and move it to the GPU.
model = AutoPeftModelForCausalLM.from_pretrained(
    checkpoint_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).to('cuda')

# The tokenizer is loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [34]:
# Disable gradients for every parameter whose name contains "transformer.h.0".
# NOTE(review): the inference loop below runs under torch.inference_mode(), so
# this presumably has no effect on the predictions — confirm whether this cell
# is still needed.
for name, param in model.named_parameters():
    if "transformer.h.0" in name:  # e.g. freeze the initial layer
        param.requires_grad = False

In [35]:
# Load the test dataset
# TODO: set the path to the test data
test_df = pd.read_csv('../../data/test.csv')

# Flatten the dataset: the 'problems' column stores a dict literal as text, so it
# is parsed with literal_eval and its fields are lifted next to 'id' and
# 'paragraph', giving one flat record per row.
records = []
for _, row in test_df.iterrows():
    problems = literal_eval(row['problems'])
    records.append(
        {
            'id': row['id'],
            'paragraph': row['paragraph'],
            'question': problems['question'],
            'choices': problems['choices'],
            # Optional keys fall back to None when they are absent.
            'answer': problems.get('answer', None),
            'question_plus': problems.get('question_plus', None),
        }
    )

# Convert to DataFrame
test_df = pd.DataFrame(records)

In [36]:
# Build the inference prompts: one system + user conversation per test problem,
# plus the number of choices so only valid answer digits are scored later.
# NOTE(review): the system prompt used here differs from the one in the training
# data cell — confirm which prompt the loaded checkpoint was trained with.
test_dataset = []
for _, row in test_df.iterrows():
    # Number the choices "1 - ...", "2 - ...", one per line.
    choices_string = "\n".join(
        f"{idx + 1} - {choice}" for idx, choice in enumerate(row["choices"])
    )

    shared_fields = {
        "paragraph": row["paragraph"],
        "question": row["question"],
        "choices": choices_string,
    }
    if row["question_plus"]:
        # The problem has a supplementary 'question_plus' section.
        user_message = PROMPT_QUESTION_PLUS.format(
            question_plus=row["question_plus"], **shared_fields
        )
    else:
        # No supplementary section.
        user_message = PROMPT_NO_QUESTION_PLUS.format(**shared_fields)

    conversation = [
        {"role": "system", "content": "ÏßÄÎ¨∏ÏùÑ ÏùΩÍ≥† ÏßàÎ¨∏Ïùò ÎãµÏùÑ Íµ¨ÌïòÏÑ∏Ïöî."},
        {"role": "user", "content": user_message},
    ]
    test_dataset.append(
        {
            "id": row["id"],
            "messages": conversation,
            "label": row["answer"],
            "len_choices": len(row["choices"]),
        }
    )

In [None]:
test_dataset

In [37]:
import os

# Sets the PyTorch CUDA allocator option max_split_size_mb through the environment.
# NOTE(review): this runs after the model has already been moved to 'cuda' in an
# earlier cell; the allocator presumably reads this variable only when CUDA is
# first initialised, so the setting may have no effect here — confirm, and move
# it before the first CUDA use if it is needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:4"

In [38]:
%%time

infer_results = []
pred_choices_map = {0: "1", 1: "2", 2: "3", 3: "4", 4: "5"}
model.eval()

batch_size = 1  # Ï†ÅÏ†àÌïú Î∞∞Ïπò ÌÅ¨Í∏∞Î•º ÏÑ§Ï†ïÌïòÏÑ∏Ïöî
with torch.inference_mode():
    for i in tqdm(range(0, len(test_dataset), batch_size)):
        batch = test_dataset[i:i+batch_size]
        batch_ids = [data["id"] for data in batch]
        batch_messages = [data["messages"] for data in batch]
        batch_len_choices = [data["len_choices"] for data in batch]

        inputs = tokenizer.apply_chat_template(
            batch_messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            padding=True,
        ).to("cuda")

        outputs = model(inputs)

        for j, (_id, len_choices) in enumerate(zip(batch_ids, batch_len_choices)):
            logits = outputs.logits[j, -1].flatten().cpu()
            target_logit_list = [logits[tokenizer.vocab[str(i + 1)]] for i in range(len_choices)]
            probs = torch.nn.functional.softmax(torch.tensor(target_logit_list, dtype=torch.float32)).detach().cpu().numpy()
            predict_value = pred_choices_map[np.argmax(probs, axis=-1)]
            infer_results.append({"id": _id, "answer": predict_value})

        # Î©îÎ™®Î¶¨ Ï†ïÎ¶¨
        del inputs, outputs
        torch.cuda.empty_cache()

  0%|          | 0/869 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 869/869 [21:13<00:00,  1.47s/it]

CPU times: user 15min 12s, sys: 6min, total: 21min 12s
Wall time: 21min 13s





In [None]:
"""%%time

infer_results = []

pred_choices_map = {0: "1", 1: "2", 2: "3", 3: "4", 4: "5"}

model.eval()
with torch.inference_mode():
    for data in tqdm(test_dataset):
        _id = data["id"]
        messages = data["messages"]
        len_choices = data["len_choices"]

        outputs = model(
            tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to("cuda")
        )

        logits = outputs.logits[:, -1].flatten().cpu()

        target_logit_list = [logits[tokenizer.vocab[str(i + 1)]] for i in range(len_choices)]

        probs = (
            torch.nn.functional.softmax(
                torch.tensor(target_logit_list, dtype=torch.float32)
            )
            .detach()
            .cpu()
            .numpy()
        )

        predict_value = pred_choices_map[np.argmax(probs, axis=-1)]
        infer_results.append({"id": _id, "answer": predict_value})"""

In [39]:
pd.DataFrame(infer_results).to_csv("output_exaone_2.csv", index=False)

In [None]:
pd.DataFrame(infer_results)