In [1]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import AutoTokenizer
from collections import Counter
from tqdm import tqdm
import re
import os
from datasets import load_dataset
import re
import unicodedata
import os
import sys

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
#adding the fvt repo to the default path
sys.path.append(os.path.abspath("../fast-vocabulary-transfer"))
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [3]:

max_seq_length = 512
dtype = None
load_in_4bit = True
fourbit_models = [
    'unsloth/gemma-2-9b-bnb-4bit',
    'unsloth/gemma-2-27b-bnb-4bit'
]

model,tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/gemma-2-9b-bnb-4bit',max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla V100-SXM2-32GB. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
import os
from datasets import load_dataset, disable_progress_bar

# Disable hf_transfer in multiple ways
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = "0"
os.environ['USE_TORCH'] = "1"  # Force using torch for downloads

# Optional: Disable progress bars if they're causing issues
disable_progress_bar()

# Set longer timeout and chunk size
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = "500"  # 500 seconds timeout
os.environ['HF_HUB_DOWNLOAD_CHUNK_SIZE'] = "10485760"  # 10MB chunks

# Specify cache directory
cache_dir = "/root/gemma2_finetuning/cache"  # Replace with your path

# Load dataset with more conservative settings
iriis_train = load_dataset(
    'IRIISNEPAL/Nepali-Text-Corpus', 
    split='train',
    revision='main',
    streaming=True,
    download_mode='force_redownload',

)



def preprocess_nepali(values_dict):
    
    text = values_dict['Article']
    # Handle empty or non-string input
    if not text or not isinstance(text, str):
        return ""
        
    # This step adds spaces before common Nepali suffixes (postpositions) that attach to words
    # ले = ergative case marker, को = possessive, मा = locative, बाट = ablative, देखि = from, सम्म = until
    # Example: "घरमा" -> "घर मा" (meaning "in the house")
    text = re.sub(r'(ले|को|मा|बाट|देखि|सम्म)$', r' \1', text)

    # First remove non-Devanagari characters except word separators, whitespace and numbers
    text = re.sub(r'[^\u0900-\u097F\s।0-9]', '', text)
    
    # Then add spaces after word separators | and । in a single step
    text = re.sub(r'([।])(\S)', r'\1 \2', text)
    
    # Remove extra spaces including newlines and tabs
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [5]:
iriis_tokenizer_dataset = iriis_train.take(100000)
#loading the tokenizer dataset
mapped_tokenizer_iterator = map(preprocess_nepali, iriis_tokenizer_dataset)

In [6]:
# Add Nepali-specific normalizers

#preparing the tokenizer for training
new_tokenizer = tokenizer.train_new_from_iterator(
    mapped_tokenizer_iterator, 
    vocab_size=6400, 
    show_progress=True,
)






In [7]:
new_tokenizer.tokenize('बिहीबार दिउँसो खोला तर्ने क्रममा एक जना बगेर बेपत्ता भएको प्रहरीले जनाएको छ। शिवतासक्षी नगरपालिका–१० सुकुम्वासी बस्तीका ४३ वर्षीय साहेब सरदार माइ खोला तर्ने क्रममा दिउँसो १ बजेदेखि बेपत्ता भएका हुन्। खोला तर्ने क्रममा सरदार बगेको देखेपछि स्थानीयले प्रहरीलाई खबर गरेका थिए । इलाका प्रहरी कार्यालय झिलभिले, प्रहरी चौकी माइधार र स्थानीयवासीले बेपत्ता सरदारको खाजीकार्य गरिरहेको जिल्ला प्रहरी कार्यालय झापाका प्रवक्ता महेन्द्रकुमार श्रेष्ठले जानकारी दिए ।')[:20]

['बिहीबार▁',
 'दिउँसो▁',
 'खोला▁',
 'तर्',
 'ने▁',
 'क्रममा▁',
 'एक▁जना▁',
 'ब',
 'ग',
 'ेर▁',
 'बेपत्ता▁',
 'भएको▁',
 'प्रहरीले▁',
 'जनाएको▁छ।▁',
 'शिव',
 'ता',
 'स',
 'क्ष',
 'ी▁',
 'नगरपालिका']

In [8]:
from fvt.fvt import FastVocabularyTransfer

# let's create a new model withe new mapped embedding table
fvt = FastVocabularyTransfer()
new_model = fvt.transfer(
    in_tokenizer=new_tokenizer,
    gen_tokenizer=tokenizer,
    gen_model=model
)

In [9]:
from transformers import Trainer,  TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
print(f"Using {n_gpus} GPUs")



Using 1 GPUs


In [10]:
from peft import LoraConfig, get_peft_model

def custom_prepare_model_for_kbit_training(model):
    # 1. Enable input gradients - necessary for training
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    
    # 2. Disable gradient checkpointing to avoid memory issues with Gemma
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_disable()
    
    # 3. Disable KV caching which can interfere with training
    model.config.use_cache = False
    
    # 4. Convert 1D parameters (like bias terms and LayerNorm) to float32
    # This is crucial because these parameters need higher precision
    for name, param in model.named_parameters():
        if param.ndim == 1:  # bias or LayerNorm parameters
            param.data = param.data.to(torch.float16)
    
    return model

In [11]:
model = new_model
model = model.to(device)

# Prepare model for LoRA training
model = custom_prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=2,
    lora_alpha=4,
    target_modules=['q_proj','v_proj'],
    # target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,118,208 || all params: 8,348,257,792 || trainable%: 0.0134


In [12]:
#creating a dataset:
from torch.utils.data import IterableDataset

class NepaliIterableDataset(IterableDataset):
    def __init__(self, dataset, tokenizer, max_length=2048):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Set a fixed number of steps for the scheduler
        self.num_examples = 1000000  # Adjust this based on how many examples you want to train on

    def __iter__(self):
        for item in self.dataset:
            text = preprocess_nepali(item)
            if text:
                encodings = self.tokenizer(
                    text, 
                    truncation=True,
                    padding='max_length',
                    max_length=self.max_length,
                    return_tensors="pt"
                )
                yield {
                    "input_ids": encodings["input_ids"][0],
                    "attention_mask": encodings["attention_mask"][0],
                    "labels": encodings["input_ids"][0].clone()
                }

    def __len__(self):
        return self.num_examples

# Create data collator with padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=new_tokenizer, 
    mlm=False,
    pad_to_multiple_of=8  # Helps with GPU efficiency
)


train_dataset = NepaliIterableDataset(iriis_train, new_tokenizer)


In [13]:
import os
os.environ['CC'] = 'gcc'

In [14]:

# Update training arguments with device settings
training_args = TrainingArguments(
    output_dir='./nepali_lora_weights',
    num_train_epochs=3,
    warmup_steps=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    save_steps=100,
    logging_steps=5,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_32bit",
    save_total_limit=2,
    no_cuda=False,  # Enable CUDA
)


# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=new_tokenizer, 
    mlm=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your dataset
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the LoRA weights
model.save_pretrained("./nepali_lora_weights")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000,000 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 16
\        /    Total batch size = 32 | Total steps = 93,750
 "-____-"     Number of trainable parameters = 1,118,208
AUTOTUNE bmm(32x2048x2048, 32x2048x256)
  bmm 1.0537 ms 100.0% 
  triton_bmm_70 1.8504 ms 56.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_62 2.0777 ms 50.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_72 2.2528 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=8
  triton_bmm_75 2.3634 ms 44.6% ACC_TYPE='tl.float32', ALLOW

OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 180.69 MiB is free. Process 1430676 has 31.55 GiB memory in use. Of the allocated memory 28.97 GiB is allocated by PyTorch, and 659.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)