# Vionous Korean Language LoRA Trainer

Train a LoRA adapter to teach Qwen2.5-7B-Instruct about the Korean language using the Vionous Korean Language knowledge package.

**Requirements:**
- Google Colab with T4 GPU (free tier works)
- ~1-2 hours runtime

**Output:**
- LoRA adapter files (adapter_model.safetensors, adapter_config.json)
- Downloadable as zip

---
**Before running:** Go to Runtime → Change runtime type → Select T4 GPU

In [None]:
# Cell 1: Install dependencies
!pip install -q transformers>=4.36.0
!pip install -q peft>=0.7.0
!pip install -q datasets>=2.14.0
!pip install -q bitsandbytes>=0.41.0
!pip install -q trl>=0.7.0
!pip install -q accelerate>=0.25.0
!pip install -q scipy

print("\n" + "="*50)
print("Dependencies installed successfully!")
print("="*50)

In [None]:
# Cell 2: Clone vionous repo and load training data
import os
import json
from datasets import Dataset

# Clone the repository
if not os.path.exists('vionous'):
    !git clone https://github.com/larro1991/vionous.git
    print("Repository cloned!")
else:
    print("Repository already exists")

# Load training data
DATA_PATH = "vionous/packages/korean/training-data"

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped rather than passed to ``json.loads``, which
    would raise on them.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Parse the training split first, then the validation split
train_data, val_data = (
    load_jsonl(f"{DATA_PATH}/train.jsonl"),
    load_jsonl(f"{DATA_PATH}/val.jsonl"),
)

# Report how many records each split contains
print(
    f"\nLoaded {len(train_data):,} training examples",
    f"Loaded {len(val_data):,} validation examples",
    sep="\n",
)

# Convert to chat format for Qwen
def format_for_training(example):
    """Render one question/answer record as a single training string.

    The question is wrapped as the user turn and the answer as the assistant
    turn, using the ``<|im_start|>`` / ``<|im_end|>`` chat markers.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        A dict with a single ``"text"`` key holding the formatted string.

    Raises:
        KeyError: If ``example`` lacks ``'question'`` or ``'answer'``.
    """
    text = f"<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n{example['answer']}<|im_end|>"
    return dict(text=text)

# Apply the chat formatting to every record of both splits
train_formatted = list(map(format_for_training, train_data))
val_formatted = list(map(format_for_training, val_data))

# Wrap the formatted records in Dataset objects for the trainer
train_dataset, val_dataset = (
    Dataset.from_list(train_formatted),
    Dataset.from_list(val_formatted),
)

# Confirm creation and preview the first 300 characters of the first example
print(
    f"\nDatasets created!",
    f"Sample:\n{train_formatted[0]['text'][:300]}...",
    sep="\n",
)

In [None]:
# Cell 3: Load base model with 4-bit quantization
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# 4-bit quantization config: NF4 weights with double quantization and a
# float16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print(f"Loading {MODEL_ID} with 4-bit quantization...")
print("This may take a few minutes...\n")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Only fall back to EOS when the tokenizer ships without a pad token.
# Overwriting an existing pad token with EOS makes padding and end-of-turn
# share one token id; language-modeling collators that mask pad positions out
# of the labels then also mask every end-of-turn token, so the adapter gets no
# training signal for where an answer should stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with the quantization config; device placement is automatic
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# The KV cache is switched off for training
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-specific config field —
# confirm it has any effect for this model
model.config.pretraining_tp = 1

print("\n" + "="*50)
print("Model loaded successfully!")
print(f"Model dtype: {model.dtype}")
print(f"Device: {model.device}")
print("="*50)

In [None]:
# Cell 4: Configure LoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Run PEFT's k-bit training preparation on the quantized base model
model = prepare_model_for_kbit_training(model)

# Adapter hyperparameters: rank 16, alpha 32, applied to the q/k/v/o
# projection modules, no bias training
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Attach the LoRA adapter to the prepared model
model = get_peft_model(model, lora_config)

# Print trainable parameters
def print_trainable_parameters(model):
    """Print how many of the model's parameter elements require gradients.

    Two lines are written to stdout: the trainable element count with its
    share of the total as a percentage, then the total element count.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and ``requires_grad``.

    Raises:
        ZeroDivisionError: If the model reports no parameter elements.
    """
    # Walk the parameters once, keeping only what the totals need
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)

    share = 100 * trainable_params / all_param
    print(f"Trainable params: {trainable_params:,} ({share:.2f}%)")
    print(f"All params: {all_param:,}")

# Summarize the adapter settings, leave a blank line, then report the counts
print("\nLoRA Configuration:")
print(
    f"  r: {lora_config.r}",
    f"  alpha: {lora_config.lora_alpha}",
    f"  dropout: {lora_config.lora_dropout}",
    f"  target_modules: {lora_config.target_modules}",
    "",
    sep="\n",
)
print_trainable_parameters(model)

In [None]:
# Cell 5: Set up trainer
from transformers import TrainingArguments
from trl import SFTTrainer

OUTPUT_DIR = "./vionous-korean-lora"

# Training arguments, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir=OUTPUT_DIR,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # Schedule length and batch sizing
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=0.3,
    # Mixed precision
    fp16=True,
    # Evaluation cadence, logging cadence, no external reporting
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    report_to="none",
)

# Build the supervised fine-tuning trainer over the "text" column of both
# datasets, with sequences capped at 512 tokens and packing enabled.
# NOTE(review): tokenizer / dataset_text_field / max_seq_length / packing are
# passed straight to SFTTrainer here; confirm the installed TRL release still
# accepts them as constructor arguments.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    dataset_text_field="text",
    packing=True,
    max_seq_length=512,
)

# Echo the key settings; effective batch size is the per-device batch size
# multiplied by the gradient accumulation steps
print(
    "Trainer configured!",
    f"\nTraining settings:",
    f"  Epochs: {training_args.num_train_epochs}",
    f"  Batch size: {training_args.per_device_train_batch_size}",
    f"  Gradient accumulation: {training_args.gradient_accumulation_steps}",
    f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}",
    f"  Learning rate: {training_args.learning_rate}",
    sep="\n",
)

In [None]:
# Cell 6: Train!
# Opening banner and runtime expectation
print("="*50, "STARTING TRAINING", "="*50, sep="\n")
print(
    "\nThis will take approximately 1-2 hours on a T4 GPU.",
    "You can monitor progress below.\n",
    sep="\n",
)

# Run the training loop configured in the previous cell
trainer.train()

# Closing banner
print("\n" + "="*50, "TRAINING COMPLETE!", "="*50, sep="\n")

In [None]:
# Cell 7: Save adapter
ADAPTER_DIR = "./vionous-korean-adapter"

# Save the LoRA adapter
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

print(f"Adapter saved to: {ADAPTER_DIR}")
print("\nFiles created:")
!ls -la {ADAPTER_DIR}

In [None]:
# Cell 8: Test with Korean language questions
print("="*50, "TESTING THE TRAINED MODEL", "="*50 + "\n", sep="\n")

# Sample prompts sent to the fine-tuned model, in the order they are asked
test_questions = [
    "How do I learn Hangul?", "What are honorifics?", "How do I use particles?",
    "What's the difference between 은/는 and 이/가?", "How do I conjugate verbs?",
]

def generate_response(question):
    """Sample an answer to one question from the module-level ``model``.

    The question is wrapped in the same ``<|im_start|>`` / ``<|im_end|>``
    chat markup used for the training text, ending with an open assistant
    turn for the model to complete.

    Args:
        question: The user question as plain text.

    Returns:
        The generated answer with special tokens removed and surrounding
        whitespace stripped. Only newly generated tokens are decoded, so the
        prompt is never echoed back.
    """
    prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # The model may still be in training mode after the training cell, which
    # would keep LoRA dropout active while sampling. Switch to eval mode for
    # generation and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                # model.config.use_cache was set to False for training;
                # re-enable the KV cache for this call so decoding does not
                # recompute the whole sequence at every step.
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        if was_training:
            model.train()

    # The returned sequence starts with the prompt tokens; keep only the rest
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Print each question before generating so progress is visible, then print
# the answer followed by a divider
for q in test_questions:
    print(f"Q: {q}")
    answer = generate_response(q)
    print(f"A: {answer}", "-" * 40 + "\n", sep="\n")

In [None]:
# Cell 9: Download adapter as zip
import shutil
from google.colab import files

ZIP_NAME = "vionous-korean-lora-adapter"

# Pack the contents of the adapter directory into <ZIP_NAME>.zip
shutil.make_archive(base_name=ZIP_NAME, format='zip', root_dir=ADAPTER_DIR)

print(f"Created: {ZIP_NAME}.zip", "\nDownloading...", sep="\n")

# Trigger the Colab file download for the archive
files.download(f"{ZIP_NAME}.zip")

# Closing banner
print(
    "\n" + "="*50,
    "ALL DONE!",
    "="*50,
    "\nYour Korean Language LoRA adapter has been downloaded.",
    sep="\n",
)

---
**Vionous Knowledge Package** | Korean Language | CC-BY-SA 4.0 | Source: Stack Exchange