In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from datetime import datetime

from transformers import XGLMTokenizer, XGLMForCausalLM,AutoTokenizer,AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset,Dataset
# from datasets import 
from tqdm import tqdm
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import random
import wandb

# Authenticate with Weights & Biases without embedding the API key in the notebook.
# The key is read from the WANDB_API_KEY environment variable (on Kaggle, export it
# from a notebook secret before running this cell). When the variable is unset,
# key=None lets wandb fall back to its own credential lookup.
# SECURITY: a key that was previously committed in this cell must be treated as
# leaked and revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(project="nlp-onet-finetuning")

In [None]:
def prepare_reasoning_dataset(df):
    """Turn a frame of prompts and reasoning traces into a training Dataset.

    Args:
        df: DataFrame providing a 'prompt' and a 'reasoning' column.

    Returns:
        A Dataset with two columns per row: "text" (the prompt immediately
        followed by its reasoning, with no separator inserted) and "prompt"
        (the prompt on its own, kept so the prompt part can be located later).
    """
    records = [
        {"text": prompt + reasoning, "prompt": prompt}
        for prompt, reasoning in zip(df['prompt'], df['reasoning'])
    ]
    return Dataset.from_pandas(pd.DataFrame(records))
    
def tokenize(example):
    """Encode example["text"] with the module-level tokenizer.

    Every example is padded or truncated to exactly 1136 tokens, so all rows
    of the mapped dataset end up with the same sequence length.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 1136,
    }
    return tokenizer(example["text"], **encode_options)

def getLabels(example):
    """Build causal-LM labels so the loss covers only the tokens after the prompt.

    Args:
        example: mapping with 'prompt' (str), 'input_ids' (token ids of the
            full text) and, when available, 'attention_mask' (1 for real
            tokens, 0 for padding).

    Returns:
        {'labels': list[int]} with the same length as 'input_ids'. Prompt
        tokens and padding positions are set to -100 (the index ignored by
        the loss); all other positions keep their token id. The input
        'input_ids' list is not modified.

    Notes:
        The prompt length is measured by tokenizing the prompt on its own, so
        it can differ by a token from its length inside the full text if the
        tokenizer merges pieces across the prompt/answer boundary.
        If truncation leaves nothing after the prompt, every label is -100.
    """
    # Tokenize WITHOUT return_tensors: a "pt" result has shape (1, seq_len), so
    # len() of it is always 1 and only the first token would be masked.
    prompt_len = len(tokenizer(example['prompt']).input_ids)

    input_ids = example['input_ids']
    if 'attention_mask' in example:
        attention_mask = example['attention_mask']
    else:
        # No mask available: treat every position as a real token.
        attention_mask = [1] * len(input_ids)

    labels = []
    prompt_left = prompt_len
    for token_id, is_real in zip(input_ids, attention_mask):
        if not is_real:
            # Padding never contributes to the loss (works for left or right padding).
            labels.append(-100)
        elif prompt_left > 0:
            # Still inside the prompt: the first prompt_len real tokens are masked.
            labels.append(-100)
            prompt_left -= 1
        else:
            labels.append(token_id)
    return {'labels': labels}
    
def genPrompt(data,task="\nกรุณาอธิบายเหตุผลอย่างเป็นขั้นตอนและระบุคำตอบที่ถูกต้องที่ท้ายคำอธิบาย (ในรูปแบบ: คำตอบคือ: b) ans: ",random_order=False):
    """Render a five-option multiple-choice question as a text prompt.

    Args:
        data: mapping/row with 'question' and 'choice1' .. 'choice5'.
        task: instruction text appended after the list of options.
        random_order: when truthy, the (letter, option) pairs are shuffled
            with the global `random` state before rendering; each option keeps
            the letter of its original position.

    Returns:
        The question, a "ตัวเลือก:" header, one "<letter>. <option>" line per
        option, and finally `task`.
    """
    # Pair each option with the letter of its original position (A..E).
    labelled_choices = [
        (letter, data[f'choice{position}'])
        for position, letter in enumerate('ABCDE', start=1)
    ]

    if random_order:
        random.shuffle(labelled_choices)

    header = data['question'] + "\nตัวเลือก:\n"
    option_lines = "".join(
        f"{letter}. {option}\n" for letter, option in labelled_choices
    )
    return header + option_lines + task

def genPromptV2(data):
    """Build the English "think step by step" variant of the prompt.

    The question and options are rendered by genPrompt with an empty task,
    then a fixed English instruction block asking for <think>...</think>
    reasoning followed by the chosen option is appended.
    """
    question_and_choices = genPrompt(data,"")
    instructions = "\nYour task is to answer the given multiple choice mathamatical question by thinking step by step.\n You must add <think> when the thinking started and </think> when the thinking has ended, then answer the choice in which you think is correct in this example format (answer: A.). To answer "
    return question_and_choices + instructions

def genPromptNoChoice(data):
    """Return only the question text followed by an "ans:" cue (no options listed)."""
    question = data['question']
    answer_cue = "\nans:"
    return question + answer_cue

def genAns(input_text,max_new_tokens = 200):
    """Generate a continuation of `input_text` with the module-level model.

    Decoding is deterministic: beam search with 3 beams, no sampling, early
    stopping.

    Args:
        input_text: prompt string to continue.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded first returned sequence with special tokens stripped.
        # NOTE(review): for this decoder-only model the decoded text is
        # expected to include the prompt itself - confirm if callers parse it.
    """
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Resolve the padding id locally instead of overwriting tokenizer.pad_token.
    # Mutating the shared tokenizer here changed which id later code (e.g. a
    # data collator built from the same tokenizer) treats as padding, while the
    # already-tokenized dataset still contained the original pad id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Tokenizers without a pad token: fall back to EOS for generation only.
        pad_token_id = tokenizer.eos_token_id

    generated_ids = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # no sampling, so outputs are deterministic
        pad_token_id=pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        num_beams=3,  # beam search
        early_stopping=True
    )

    # Decode the first returned sequence back to text.
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

def genSamples(df,n=20):
    """Print model generations next to the reference answers for the first rows of `df`.

    Args:
        df: DataFrame whose rows carry the fields genPrompt reads plus a
            'choice_ans' column with the correct answer.
        n: maximum number of rows to sample; clamped to len(df) so a short
            frame no longer raises IndexError.
    """
    for i in range(min(n, len(df))):
        row = df.iloc[i]  # look the row up once and reuse it
        print(genAns(genPrompt(row)))
        print('correct ans:',row['choice_ans'])
        print("-"*50)

# Setup

In [None]:
# Base checkpoint to fine-tune. The smaller variants are kept as quick alternatives.
# model_name = "facebook/xglm-564M"
# model_name = "facebook/xglm-1.7B"
model_name = "facebook/xglm-2.9B"

# Each run gets its own timestamped output directory, e.g. "facebook_xglm-2.9B_<timestamp>".
safe_model_name = model_name.replace("/", "_")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = safe_model_name + "_" + timestamp

tokenizer = XGLMTokenizer.from_pretrained(model_name)
model = XGLMForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use half precision to save memory
    device_map="auto"  # Automatically distribute model across available GPUs
)

# LoRA adapter: only low-rank updates on the attention projections are trained.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    # bias="none",
    target_modules=["q_proj", "v_proj","k_proj","out_proj"]  # module names the adapter is attached to
)

model = get_peft_model(model, peft_config)
# print_trainable_parameters() prints its own summary and returns None, so wrapping
# it in print() only added a stray "None" line to the cell output.
model.print_trainable_parameters()

In [None]:
# Load the prepared problems, then build the training dataset in one pipeline:
# format prompt+reasoning text -> tokenize -> derive labels.
df = pd.read_csv("/kaggle/input/project-nlp-onet-math/preped_data.csv")
dataset = (
    prepare_reasoning_dataset(df)
    .map(tokenize)
    .map(getLabels)
)
# Fixed seed so the 80/20 train/test split is the same on every run.
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

# Before

In [None]:
# Baseline before fine-tuning: print generations for the first 20 rows of df
# (genSamples' default n) together with each row's 'choice_ans'.
genSamples(df)

# Train

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_accumulation_steps=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better (made explicit)
    learning_rate=5e-5,
    num_train_epochs=50,
    weight_decay=0.01,
    report_to="wandb",  # track the run with Weights & Biases
    logging_dir="./logs",
    logging_steps=10,
    # NOTE(review): the base model is loaded with torch_dtype=float16 in the setup
    # cell; whether fp16 mixed precision works on top of that depends on the
    # trainable adapter weights being kept in float32 - confirm for the installed
    # peft/transformers versions.
    fp16=True,  # FP16 mixed-precision training
    label_names=["labels"],
    gradient_accumulation_steps=8  # accumulate gradients over 8 steps (2 x 8 = 16 examples per device per update)
)

# Standard causal-LM collator (no masked language modeling). On its own it
# REBUILDS `labels` from `input_ids` (masking only pad-token ids), which would
# throw away any -100 positions already stored in the dataset's `labels` column.
lm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're not doing masked language modeling
)


def data_collator(features):
    """Collate a batch while keeping the label masking computed during preprocessing.

    Args:
        features: list of per-example dicts (input_ids, attention_mask and,
            normally, labels), all padded to the same length.

    Returns:
        The batch produced by the language-modeling collator, with labels set
        to -100 wherever the dataset's own labels were -100 and wherever the
        attention mask is 0 (padding), independent of the tokenizer's current
        pad token.
    """
    if not all("labels" in feature for feature in features):
        # Nothing to preserve: behave exactly like the plain LM collator.
        return lm_collator(features)

    # Remember which positions preprocessing marked as ignored.
    # Assumes every example has the same length (fixed-length padding upstream).
    ignore_mask = torch.stack(
        [torch.as_tensor(feature["labels"]) == -100 for feature in features]
    )

    # Let the LM collator build the batch (and fresh labels) from the other fields.
    batch = lm_collator(
        [{key: value for key, value in feature.items() if key != "labels"} for feature in features]
    )

    batch["labels"][ignore_mask] = -100
    if "attention_mask" in batch:
        # Padding must never contribute to the loss.
        batch["labels"][batch["attention_mask"] == 0] = -100
    return batch


# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
)

In [None]:
# Start training
# Runs the full schedule configured in training_args; load_best_model_at_end=True
# there means the trainer holds the checkpoint with the best eval_loss afterwards.
print("Starting training...")
trainer.train()

# Save the model
# The model and the tokenizer are written to the same run directory so they can
# be reloaded together from output_dir.
# NOTE(review): the model was wrapped by get_peft_model, so this presumably
# stores the LoRA adapter rather than full model weights - confirm.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# After

In [None]:
# Reload the tokenizer and model from the directory written after training,
# replacing the module-level `tokenizer` and `model` that genAns/genSamples use.
# NOTE(review): output_dir presumably holds only the LoRA adapter; loading it via
# XGLMForCausalLM.from_pretrained relies on the transformers PEFT integration
# resolving the base model - confirm with the installed versions.
# NOTE(review): `trainer` still references the previous model object, so both
# copies may stay in GPU memory - verify there is enough headroom.
tokenizer = XGLMTokenizer.from_pretrained(output_dir)
model = XGLMForCausalLM.from_pretrained(
    output_dir,
    torch_dtype=torch.float16,  # Use half precision to save memory
    device_map="auto"  # Automatically distribute model across available GPUs
)

In [None]:
# After fine-tuning: print generations for the same first 20 rows of df, now
# using the model and tokenizer reloaded from output_dir.
genSamples(df)