# Importing Libraries

In [1]:
import os
from dotenv import load_dotenv
import random
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass

# PyTorch
import torch

# Huggingface
import huggingface_hub
from transformers import TextStreamer
from datasets import load_dataset

# Weights & Biases
import wandb

# Unsloth
from unsloth import FastLanguageModel, is_bf16_supported, UnslothTrainer, UnslothTrainingArguments

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Configuration

In [2]:
@dataclass
class CONFIG:
    """Notebook-wide settings for continued pre-training with Unsloth.

    The notebook reads and writes these values as class attributes
    (e.g. ``CONFIG.seed``, ``CONFIG.device = ...``); it never instantiates
    the class.
    """

    # When True, later cells train for one epoch and skip W&B, saving and pushing.
    debug: bool = False

    # Model
    model_id: str = "meta-llama/Llama-3.2-3B"

    # HuggingFace Hub
    username: str = "PathFinderKR"
    model_name: str = "KHU-Llama-3.2-3B"

    # Data
    dataset_id: str = "Khudanlp/KHUrious_pretraining_data"

    # Training
    ## Paths
    output_dir: str = "./results"
    logging_dir: str = "./logs"
    save_strategy: str = "epoch"
    logging_strategy: str = "steps"
    logging_steps: int = 1
    save_total_limit: int = 1
    # Use the string "none" to disable reporting: TrainingArguments treats a
    # literal None as "report to all installed integrations".
    # NOTE: evaluated once, when the class body runs; flipping CONFIG.debug
    # afterwards does not update this value.
    report_to: str = "wandb" if not debug else "none"
    ## Hyperparameters
    num_train_epochs: int = 2
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 2
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16: bool = not is_bf16_supported()
    bf16: bool = is_bf16_supported()
    dtype: torch.dtype = torch.bfloat16 if is_bf16_supported() else torch.float16
    load_in_4bit: bool = True
    learning_rate: float = 5e-4
    # Separate learning rate passed to UnslothTrainingArguments as embedding_learning_rate.
    embedding_learning_rate: float = 1e-4
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    optim: str = "adamw_8bit"
    weight_decay: float = 0.01
    max_seq_length: int = 12800
    dataset_num_proc: int = 2
    packing: bool = True
    ### LoRA
    lora: bool = True
    # The attributes below only exist when `lora` is True at class-creation time;
    # every cell that reads them is guarded by `if CONFIG.lora`.
    if lora:
        r: int = 128
        # Left unannotated on purpose: @dataclass rejects a mutable list default
        # on an annotated field, so this stays a plain class attribute.
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"]
        lora_alpha: int = 32
        lora_dropout: float = 0
        bias: str = "none"
        use_gradient_checkpointing: str = "unsloth"
        use_rslora: bool = True
        loftq_config: str = None
        save_method: str = "merged_16bit"

    # Inference
    max_new_tokens: int = 2048
    do_sample: bool = True
    temperature: float = 0.7
    top_p: float = 0.9
    repetition_penalty: float = 1.1

    # Device (filled in later by `CONFIG.device = configure_device()`)
    device: torch.device = None

    # Seed
    seed: int = 42

## Reproducibility

In [3]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator.

    Side effects:
        Sets the ``PYTHONHASHSEED`` environment variable, mutates global RNG
        state, changes the cuDNN flags, and prints the seed that was applied.
    """
    # Only influences interpreters started after this point (e.g. worker
    # processes); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and can pick different
    # algorithms each time, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False
    print(f"Seed: {seed}")
    
# Apply the configured seed once, up front, before the model or data are touched.
set_seed(CONFIG.seed)

Seed: 42


## Device

In [4]:
def configure_device():
    """Pick the torch device to run on, announce the choice, and return it.

    Preference order is CUDA, then MPS, then CPU.

    Returns:
        torch.device: The selected device.
    """
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print("> Running on GPU", end=' | ')
        print("Num of GPUs: ", gpu_count)
        return torch.device("cuda")

    if torch.backends.mps.is_available():
        print("> Running on MPS")
        return torch.device("mps")

    # Neither accelerator backend is available.
    print("> Running on CPU")
    return torch.device("cpu")

# Store the selected device on CONFIG so later cells (e.g. generate_text) can read it.
CONFIG.device = configure_device()

> Running on GPU | Num of GPUs:  1


## Debugging

In [5]:
# Debug runs train for a single epoch instead of the configured number.
if CONFIG.debug:
    CONFIG.num_train_epochs = 1

## HuggingFace

In [6]:
# Load variables from a .env file, then log in to the HuggingFace Hub with the
# token read from HUGGINGFACE_TOKEN (never hard-code the token here).
# NOTE(review): os.getenv returns None when the variable is missing; confirm
# how huggingface_hub.login should behave in that case for unattended runs.
load_dotenv()
huggingface_hub.login(
    token=os.getenv("HUGGINGFACE_TOKEN"),
    add_to_git_credential=True
)

## Weights & Biases

In [7]:
# Experiment tracking is skipped entirely in debug runs.
if not CONFIG.debug:
    # The API key is read from the environment (load_dotenv() ran in the HuggingFace cell).
    wandb.login(
        key=os.getenv("WANDB_API_KEY")
    )
    # The W&B project is named after the model being trained.
    wandb.init(
        project=CONFIG.model_name
    )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc


# Utility Functions

In [8]:
# Sample a completion for one prompt with the notebook-level model
def generate_text(prompt):
    """Generate a continuation of `prompt` using the global model and tokenizer.

    The model is first passed through `FastLanguageModel.for_inference`, the
    prompt is tokenized as a one-element batch and moved to `CONFIG.device`,
    and a `TextStreamer` is attached to the generation call.

    Args:
        prompt: Text to continue.

    Returns:
        The output of `tokenizer.batch_decode` on the generated ids, with
        special tokens kept.
    """
    FastLanguageModel.for_inference(model)

    encoded = tokenizer([prompt], return_tensors="pt").to(CONFIG.device)

    # Sampling settings all come from CONFIG.
    sampling_kwargs = dict(
        max_new_tokens=CONFIG.max_new_tokens,
        do_sample=CONFIG.do_sample,
        temperature=CONFIG.temperature,
        top_p=CONFIG.top_p,
        repetition_penalty=CONFIG.repetition_penalty,
    )
    generated_ids = model.generate(
        **encoded,
        **sampling_kwargs,
        use_cache=True,
        streamer=TextStreamer(tokenizer),
    )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=False)

In [9]:
# Template for one training document: first slot is the title, second is the body.
khu_prompt = """Kyung Hee University
### Title: {}

### Information:
{}
"""


def formatting_func(examples):
    """Turn a batch of raw rows into prompt-formatted training strings.

    Args:
        examples: Batched mapping with parallel sequences under the keys
            "meta" (used as the title) and "TEXT" (used as the body).

    Returns:
        dict: {"text": [...]} with one string per row, built from
        `khu_prompt` and terminated with the tokenizer's EOS token.
    """
    return {
        "text": [
            khu_prompt.format(title, body) + tokenizer.eos_token
            for title, body in zip(examples["meta"], examples["TEXT"])
        ]
    }

In [10]:
def plot_token_length(fields):
    """Plot a histogram and print summary statistics of token counts per field.

    Reads the global `dataset` and `tokenizer`. Rows whose value for the field
    is the empty string are ignored.

    Args:
        fields: Iterable of dataset column names to analyse.
    """
    for field in fields:
        # Token count of every non-empty value in this column.
        token_lengths = []
        for example in dataset:
            value = example[field]
            if value == "":
                continue
            token_lengths.append(len(tokenizer.encode(value)))

        label = field.capitalize()
        plt.figure(figsize=(10, 5))
        plt.hist(token_lengths, bins=50, color='skyblue', edgecolor='black')
        plt.xlabel(f'{label} Length')
        plt.ylabel('Frequency')
        plt.title(f'{label} Token Length Distribution')
        plt.show()

        longest = max(token_lengths)
        shortest = min(token_lengths)
        print(f"Max {field} token length: {longest}")
        print(f"Min {field} token length: {shortest}")
        print(f"Mean {field} token length: {np.mean(token_lengths):.2f}")
        print(f"Standard deviation of {field} token length: {np.std(token_lengths):.2f}")

# Model

In [11]:
# Load the base model and its tokenizer through Unsloth using the CONFIG settings.
# 4-bit loading follows CONFIG.load_in_4bit only when LoRA is enabled; without
# LoRA it is forced off.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CONFIG.model_id,
    max_seq_length=CONFIG.max_seq_length,
    dtype=CONFIG.dtype,
    load_in_4bit=CONFIG.load_in_4bit if CONFIG.lora else False
)

==((====))==  Unsloth 2024.11.8: Fast Llama patching. Transformers = 4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4080 SUPER. Max memory: 15.992 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




In [12]:
# Report the tokenizer's vocab_size and its special tokens.
# NOTE(review): the recorded output shows 128000 here while the model's
# embedding has 128256 rows — presumably vocab_size excludes added tokens; confirm.
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.all_special_tokens}")

Vocabulary size: 128000
Special tokens: ['<|begin_of_text|>', '<|end_of_text|>', '<|finetune_right_pad_id|>']


In [13]:
# Print the module tree, then the parameter count expressed in billions.
print(model)
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f}B")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Llam

In [14]:
# Debug only: generate from the model before any LoRA adapters or training are
# applied, using the same prompt layout as the training data.
if CONFIG.debug:
    sample_text = """Kyung Hee University
### Title: Department of Computer Science and Engineering

### Information:
"""
    sample_response = generate_text(sample_text)
    print(sample_response)

# Dataset

In [15]:
# Load the "train" split of the dataset named in CONFIG.dataset_id.
dataset = load_dataset(CONFIG.dataset_id, split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [16]:
# Display the dataset summary (features and row count) before preprocessing.
dataset

Dataset({
    features: ['TEXT', 'meta'],
    num_rows: 1690
})

In [17]:
# Debug only: show the raw TEXT field of the first row.
if CONFIG.debug:
    print(dataset[0]["TEXT"])

## Preprocessing

In [18]:
# Apply formatting_func in batches; it returns a "text" entry holding the
# prompt-formatted training string for each row.
dataset = dataset.map(formatting_func, batched=True)

In [19]:
# Debug only: show the first formatted example and how it tokenizes.
if CONFIG.debug:
    print(dataset[0]["text"])
    print(tokenizer.tokenize(dataset[0]["text"]))

In [20]:
# Display the dataset summary again after the formatting step.
dataset

Dataset({
    features: ['TEXT', 'meta', 'text'],
    num_rows: 1690
})

In [21]:
# Debug only: plot and summarise the token-length distribution of the "text" column.
if CONFIG.debug:
    plot_token_length(["text"])

# Continued Pre-Training (LoRA)

In [22]:
# Wrap the loaded model with PEFT/LoRA adapters; every adapter setting comes
# from CONFIG, and the global seed is reused as the adapter random_state.
if CONFIG.lora:
    model = FastLanguageModel.get_peft_model(
        model,
        r=CONFIG.r,
        target_modules=CONFIG.target_modules,
        lora_alpha=CONFIG.lora_alpha,
        lora_dropout=CONFIG.lora_dropout,
        bias=CONFIG.bias,
        use_gradient_checkpointing=CONFIG.use_gradient_checkpointing,
        use_rslora=CONFIG.use_rslora,
        loftq_config=CONFIG.loftq_config,
        random_state=CONFIG.seed
    )

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.11.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [23]:
# Report how many parameters are trainable after the LoRA wrapping.
if CONFIG.lora:
    model.print_trainable_parameters()

trainable params: 982,515,712 || all params: 4,589,267,968 || trainable%: 21.4090


In [24]:
# Build the Unsloth trainer over the formatted "text" column.
# All data-handling and optimisation settings are taken from CONFIG.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=CONFIG.max_seq_length,
    dataset_num_proc=CONFIG.dataset_num_proc,
    packing=CONFIG.packing,
    args=UnslothTrainingArguments(
        # Output / logging
        output_dir=CONFIG.output_dir,
        logging_dir=CONFIG.logging_dir,
        save_strategy=CONFIG.save_strategy,
        logging_strategy=CONFIG.logging_strategy,
        logging_steps=CONFIG.logging_steps,
        save_total_limit=CONFIG.save_total_limit,
        report_to=CONFIG.report_to,
        # Schedule / batch geometry
        num_train_epochs=CONFIG.num_train_epochs,
        per_device_train_batch_size=CONFIG.per_device_train_batch_size,
        gradient_accumulation_steps=CONFIG.gradient_accumulation_steps,
        # Precision
        fp16=CONFIG.fp16,
        bf16=CONFIG.bf16,
        # Optimisation; embedding_learning_rate is a separate rate that the
        # recorded training log shows being applied to embed_tokens and lm_head.
        learning_rate=CONFIG.learning_rate,
        embedding_learning_rate=CONFIG.embedding_learning_rate,
        lr_scheduler_type=CONFIG.lr_scheduler_type,
        warmup_ratio=CONFIG.warmup_ratio,
        optim=CONFIG.optim,
        weight_decay=CONFIG.weight_decay,
        # Tie the trainer's RNG to the notebook seed so changing CONFIG.seed
        # actually changes (and reproduces) the training run.
        seed=CONFIG.seed
    )
)

In [25]:
# Run training; the returned TrainOutput is shown as the cell result.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 93 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 46
 "-____-"     Number of trainable parameters = 982,515,712


Unsloth: Setting lr = 1.00e-04 instead of 5.00e-04 for embed_tokens.
Unsloth: Setting lr = 1.00e-04 instead of 5.00e-04 for lm_head.




Step,Training Loss
1,2.4764
2,2.4537
3,2.0301
4,2.0356
5,1.8847
6,1.7667
7,2.009
8,1.6682
9,1.9064
10,1.9065


TrainOutput(global_step=46, training_loss=1.5811909437179565, metrics={'train_runtime': 1006.0047, 'train_samples_per_second': 0.185, 'train_steps_per_second': 0.046, 'total_flos': 5.34244721688576e+16, 'train_loss': 1.5811909437179565, 'epoch': 1.9574468085106385})

In [26]:
# After a non-debug run: close the W&B run, then save the model and tokenizer locally.
if not CONFIG.debug:
    wandb.finish()
    # LoRA runs are saved under "<model_name>-LoRA"; otherwise the plain
    # model name is used as the directory.
    save_dir = (CONFIG.model_name + "-LoRA") if CONFIG.lora else CONFIG.model_name
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▅█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▅▇███████▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁
train/loss,██▆▆▅▄▆▅▅▄▄▄▄▄▄▄▃▄▃▃▆▁▃▂▂▂▂▃▂▁▁▂▂▂▂▂▂▂▂▂

0,1
total_flos,5.34244721688576e+16
train/epoch,1.95745
train/global_step,46.0
train/grad_norm,0.37885
train/learning_rate,0.0
train/loss,1.2603
train_loss,1.58119
train_runtime,1006.0047
train_samples_per_second,0.185
train_steps_per_second,0.046


# Inference

In [27]:
# Post-training check: generate a completion for a prompt laid out like the
# training data. The decoded result is kept in sample_response.
sample_text = """Kyung Hee University
### Title: Department of Computer Science and Engineering

### Information:
"""
sample_response = generate_text(sample_text)

<|begin_of_text|>Kyung Hee University
### Title: Department of Computer Science and Engineering

### Information:
The document outlines the curriculum for the undergraduate program in computer engineering at Kyung Hee University, known as the Software Convergence Department. It includes a detailed description of the educational objectives, the structure of the curriculum, and specific courses offered.The first section introduces the department, stating that it is part of the College of Electronic Information but operates independently under the name "Software Convergence." The second section details the educational goals. These include nurturing creative talents who can lead future industries, fostering scientific thinking through interdisciplinary learning, and providing practical training to meet industry needs. To achieve these goals, the department aims to enhance students' problem-solving skills by combining theory with practice, emphasize communication and teamwork, and provide o

# Save

In [28]:
# Publish the trained model (skipped in debug runs).
if not CONFIG.debug:
    # Fully-qualified Hub repository ("<username>/<model_name>") shared by both
    # branches, so the upload target never depends on which account the
    # active token happens to belong to.
    repo_id = CONFIG.username + "/" + CONFIG.model_name
    if CONFIG.lora:
        # Write a merged copy to a local directory named after the model.
        model.save_pretrained_merged(
            CONFIG.model_name,
            tokenizer,
            save_method=CONFIG.save_method
        )
        # Push the merged model to the Hub under the explicit repo id; the
        # bare model name previously produced a "huggingface.co/None/..." link
        # in the recorded output.
        model.push_to_hub_merged(
            repo_id,
            tokenizer,
            save_method=CONFIG.save_method
        )
    else:
        model.push_to_hub(
            repo_id=repo_id,
            use_temp_dir=False
        )
        tokenizer.push_to_hub(
            repo_id=repo_id,
            use_temp_dir=False
        )

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 0.0 out of 15.48 RAM for saving.


 86%|████████▌ | 24/28 [00:00<00:00, 44.53it/s]We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:02<00:00, 13.45it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 0.0 out of 15.48 RAM for saving.


100%|██████████| 28/28 [00:02<00:00, 12.16it/s]


Unsloth: Saving to organization with address PathFinderKR/KHU-Llama-3.2-3B
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving to organization with address PathFinderKR/KHU-Llama-3.2-3B
Unsloth: Uploading all files... Please wait...


model-00002-of-00002.safetensors:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/KHU-Llama-3.2-3B
