## Main Idea
> How Korean benchmark of 1B model will change?

- Use LoRA technique (training small, efficient adapter layers)
- Large Korean pre-training corpus as the dataset



In [1]:
!pip install transformers datasets accelerate peft bitsandbytes trl ijson

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.19.0-py3-none-any.whl.metadata (10 kB)
Collecting ijson
  Downloading ijson-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_

In [2]:
import os
from huggingface_hub import login
# login(token="")
login()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import torch
import ijson
import json
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset

In [5]:
model_id = "meta-llama/Llama-3.2-1B"

# 4-bit quantization for memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Use the data loader we designed for the pre-training corpus
def load_pretraining_corpus(path, debug_sample_limit=None):
    print("--- Loading Korean Pre-training Corpus ---")
    all_texts = []
    for dirpath, _, filenames in os.walk(path):
        for filename in sorted(filenames):
            if not filename.endswith(".json"): continue
            file_path = os.path.join(dirpath, filename)
            try:
                with open(file_path, 'rb') as f:
                    items_iterator = ijson.items(f, 'data_info.item')
                    count = 0
                    for item in items_iterator:
                        if debug_sample_limit is not None and count >= debug_sample_limit:
                            break
                        content = item.get('contents')
                        if content:
                            all_texts.append({"text": content})
                        count += 1
            except Exception as e:
                print(f"    -> Could not process file {file_path}: {e}")
    print(f"\nLoaded {len(all_texts)} documents for pre-training.")
    return Dataset.from_list(all_texts)

# Groups texts into chunks of a fixed size
def group_texts(examples, block_size=1024):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [6]:
corpus_base_path = '/content/drive/MyDrive/datasets/한국어_성능이_개선된_초거대AI_언어모델_개발_및_데이터/Training/'

# For a quick debug run, use a small limit. For a real run, set to None.
pretraining_dataset = load_pretraining_corpus(corpus_base_path, debug_sample_limit=100)

# Tokenize and chunk the dataset
tokenized_dataset = pretraining_dataset.map(lambda examples: tokenizer(examples["text"]), batched=True, remove_columns=["text"])
lm_dataset = tokenized_dataset.map(group_texts, batched=True)


# --- Training ---
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./korean-adapted-llama-1b",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    save_steps=1000,
    logging_steps=100,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator,
)

# Start the adaptation process
print("--- Starting LoRA Adaptation on Korean Corpus ---")
trainer.train()

# Save the trained LoRA adapters
trainer.save_model("./korean-adapted-llama1b")
print("--- Adaptation complete. LoRA adapters saved. ---")


--- Loading Korean Pre-training Corpus ---

Loaded 36618 documents for pre-training.


Map:   0%|          | 0/36618 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (289335 > 131072). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/36618 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


--- Starting LoRA Adaptation on Korean Corpus ---


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mleezion-git[0m ([33mzion-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
100,2.8844
200,2.7437
300,2.6761
400,2.6516
500,2.6252
600,2.5938
700,2.593
800,2.5805
900,2.556
1000,2.5601


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


--- Adaptation complete. LoRA adapters saved. ---


# Executing Log (T4)

```bash
--- Loading Korean Pre-training Corpus ---

Loaded 36618 documents for pre-training.
Map: 100%
 36618/36618 [07:26<00:00, 143.81 examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (289335 > 131072). Running this sequence through the model will result in indexing errors
Map: 100%
 36618/36618 [56:58<00:00, 18.87 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
--- Starting LoRA Adaptation on Korean Corpus ---
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter: ··········
wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publicly.
wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.
wandb: No netrc file found, creating one.
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
wandb: Currently logged in as: - (-) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.20.1
Run data is saved locally in /content/wandb/run-20250623_091711-spofet26
Syncing run ./korean-adapted-llama-1b to Weights & Biases (docs)
View project at https://wandb.ai/-/huggingface
View run at https://wandb.ai/-/huggingface/runs/spofet26
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
 [ 281/12613 1:37:22 < 71:43:56, 0.05 it/s, Epoch 0.02/1]
Step	Training Loss
100	2.885000
200	2.744400
```

## Executing Log (A100)

--- Loading Korean Pre-training Corpus ---
```bash
Loaded 36618 documents for pre-training.
Map: 100%
 36618/36618 [05:55<00:00, 179.73 examples/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (289335 > 131072). Running this sequence through the model will result in indexing errors
Map: 100%
 36618/36618 [50:52<00:00, 21.16 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
--- Starting LoRA Adaptation on Korean Corpus ---
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter: ··········
wandb: WARNING If you're specifying your api key in code, ensure this code is not shared publicly.
wandb: WARNING Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.
wandb: No netrc file found, creating one.
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
wandb: Currently logged in as: - (-) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.20.1
Run data is saved locally in /content/wandb/run-20250623_121435-vv8s2vxt
Syncing run ./korean-adapted-llama-1b to Weights & Biases (docs)
View project at https://wandb.ai/-/huggingface
View run at https://wandb.ai/-/huggingface/runs/vv8s2vxt
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
 [ 2952/12613 2:37:32 < 8:35:56, 0.31 it/s, Epoch 0.23/1]
Step	Training Loss
100	2.884400
200	2.743700
300	2.676100
400	2.651600
500	2.625200
600	2.593800
700	2.593000
800	2.580500
900	2.556000
1000	2.560100
1100	2.549600
1200	2.528400
1300	2.530500
1400	2.536800
1500	2.523100
1600	2.513400
1700	2.518100
1800	2.505200
1900	2.505100
2000	2.499000
2100	2.495000
2200	2.490700
2300	2.469000
2400	2.491900
2500	2.485400
2600	2.492400
2700	2.479300
2800	2.477600
2900	2.470100
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
/usr/local/lib/python3.11/dist-packages/torch/_dynamo/eval_frame.py:745: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  return fn(*args, **kwargs)
```

In [None]:
final_model_path = "/content/drive/MyDrive/Llama3_Korean_Finetune/final-korean-adapted-llama1b"

# Use the save_model command to save to Drive
trainer.save_model(final_model_path)
print(f"Training complete and final LoRA adapters saved to: {final_model_path}")

Training complete and final LoRA adapters saved to: /content/drive/MyDrive/Llama3_Korean_Finetune/final-korean-adapted-adapters
