In [1]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import AutoTokenizer
from collections import Counter
from tqdm import tqdm
import re
import os
import unicodedata
import sys

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
#adding the fvt repo to the default path
sys.path.append(os.path.abspath("../fast-vocabulary-transfer"))
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:

max_seq_length = 512
dtype = None
load_in_4bit = True
fourbit_models = [
    'unsloth/gemma-2-9b-bnb-4bit',
    'unsloth/gemma-2-27b-bnb-4bit'
]

model,tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/gemma-2-9b-bnb-4bit',max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla V100-SXM2-32GB. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
print(model)

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (post_attention_laye

In [5]:
model.model.layers[0].self_attn.q_proj

Linear4bit(in_features=3584, out_features=4096, bias=False)

In [6]:
import os
from datasets import load_dataset, disable_progress_bar

# Disable hf_transfer in multiple ways
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = "0"
os.environ['USE_TORCH'] = "1"  # Force using torch for downloads

# Optional: Disable progress bars if they're causing issues
disable_progress_bar()

# Set longer timeout and chunk size
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = "500"  # 500 seconds timeout
os.environ['HF_HUB_DOWNLOAD_CHUNK_SIZE'] = "10485760"  # 10MB chunks

In [7]:
#downloading the sentiment dataset for evaluation
sentiment_dataset = load_dataset(
    'Prazzwal07/sentiment_analysis_nepglue'
    )

In [8]:
sentiment_dataset['test'][0]


{'label': 0,
 'sentence_nepali': "दिनको कष्टकर अन्त्य जस्तो, 'कोलेटरल डेमेज' ले श्वार्जनेगरलाई एक दुःखद पात्रको रूपमा प्रस्तुत गर्दछ।"}

In [9]:
GEMMA_PROMPT="""
<start_of_turn>user
{}<end_of_turn>
<start_of_turn>model
"""

# def eval_iterator(hf_dataset):
#     input_val =   GEMMA_PROMPT.format(hf_dataset[])  

In [10]:


# Specify cache directory
cache_dir = "/root/gemma2_finetuning/cache"  # Replace with your path

# Load dataset with more conservative settings
iriis_train = load_dataset(
    'IRIISNEPAL/Nepali-Text-Corpus', 
    split='train',
    revision='main',
    streaming=True,
    download_mode='force_redownload',
)



def preprocess_nepali(values_dict):
    
    text = values_dict['Article']
    # Handle empty or non-string input
    if not text or not isinstance(text, str):
        return ""
        
    # This step adds spaces before common Nepali suffixes (postpositions) that attach to words
    # ले = ergative case marker, को = possessive, मा = locative, बाट = ablative, देखि = from, सम्म = until
    # Example: "घरमा" -> "घर मा" (meaning "in the house")
    text = re.sub(r'(ले|को|मा|बाट|देखि|सम्म)$', r' \1', text)

    # First remove non-Devanagari characters except word separators, whitespace and numbers
    # text = re.sub(r'[^\u0900-\u097F\s।0-9]', '', text)
    
    # Then add spaces after word separators | and । in a single step
    text = re.sub(r'([।])(\S)', r'\1 \2', text)
    
    # Remove extra spaces including newlines and tabs
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [11]:
dummy_input = preprocess_nepali({'Article': GEMMA_PROMPT.format('Classify this sentiment as positive(+1) negative(-1) or neutral(0) -- बिहीबार दिउँसो खोला तर्ने क्रममा एक जना बगेर बेपत्ता भएको प्रहरीले जनाएको छ।')})

In [12]:
# Set CUDA debugging flags
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from IPython.core.debugger import set_trace


# Prepare inputs with position IDs

max_length = 512  # Match your model's configuration
inputs = tokenizer(
    dummy_input, 
    return_tensors='pt',
    max_length=max_length,
    truncation=True,
    padding='max_length',
    padding_side='right',
).to('cuda')
# 

print(inputs)

{'input_ids': tensor([[     2,    106,   1645,  51110,   4739,    736,  25627,    685,   6222,
          38567, 235274, 235275,   8322,   7026, 235274, 235275,    689,  17120,
         235278, 235276, 235275,   3297,  63730,  17293, 197703, 230668, 236351,
         237835, 190854,  20973, 235631,  17285,  10321,   8592,   9396, 210414,
          21595,  15848,   7516,  10993,   7320,  60546, 235462,  75314, 235619,
         185523,  12076, 143593,  12218, 235571,  19105,  19346,   7516,  10993,
         143593,  33905, 235940, 235248,    107, 235248,    106,   2516,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

In [13]:
inputs

{'input_ids': tensor([[     2,    106,   1645,  51110,   4739,    736,  25627,    685,   6222,
          38567, 235274, 235275,   8322,   7026, 235274, 235275,    689,  17120,
         235278, 235276, 235275,   3297,  63730,  17293, 197703, 230668, 236351,
         237835, 190854,  20973, 235631,  17285,  10321,   8592,   9396, 210414,
          21595,  15848,   7516,  10993,   7320,  60546, 235462,  75314, 235619,
         185523,  12076, 143593,  12218, 235571,  19105,  19346,   7516,  10993,
         143593,  33905, 235940, 235248,    107, 235248,    106,   2516,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

In [14]:
 # %%
# Add position IDs explicitly
position_ids = torch.arange(
        0, 
        inputs['input_ids'].shape[1], 
        dtype=torch.long, 
        device='cuda'
    )
inputs['position_ids'] = position_ids

print(f"position_ids: {inputs['position_ids']}")
# Update generation config
generation_config = model.generation_config
generation_config.use_cache = False
generation_config.max_length = max_length

# Generate model = model.to('cuda')
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    generation_config=generation_config,
    pad_token_id=tokenizer.pad_token_id
)

# Decode
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

position_ids: tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 17

--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/conda/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/conda/lib/python3.11/site-packages/traitlets/config/application.p

RuntimeError: The size of tensor a (16) must match the size of tensor b (513) at non-singleton dimension 1

In [5]:
iriis_tokenizer_dataset = iriis_train.take(100000)
#loading the tokenizer dataset
mapped_tokenizer_iterator = map(preprocess_nepali, iriis_tokenizer_dataset)

In [None]:
# Add Nepali-specific normalizers

#preparing the tokenizer for training
new_tokenizer = tokenizer.train_new_from_iterator(
    mapped_tokenizer_iterator, 
    vocab_size=6400, 
    show_progress=True,
)

In [None]:
new_tokenizer.tokenize('बिहीबार दिउँसो खोला तर्ने क्रममा एक जना बगेर बेपत्ता भएको प्रहरीले जनाएको छ। शिवतासक्षी नगरपालिका–१० सुकुम्वासी बस्तीका ४३ वर्षीय साहेब सरदार माइ खोला तर्ने क्रममा दिउँसो १ बजेदेखि बेपत्ता भएका हुन्। खोला तर्ने क्रममा सरदार बगेको देखेपछि स्थानीयले प्रहरीलाई खबर गरेका थिए । इलाका प्रहरी कार्यालय झिलभिले, प्रहरी चौकी माइधार र स्थानीयवासीले बेपत्ता सरदारको खाजीकार्य गरिरहेको जिल्ला प्रहरी कार्यालय झापाका प्रवक्ता महेन्द्रकुमार श्रेष्ठले जानकारी दिए ।')[:20]

In [8]:
from fvt.fvt import FastVocabularyTransfer

# let's create a new model withe new mapped embedding table
fvt = FastVocabularyTransfer()
new_model = fvt.transfer(
    in_tokenizer=new_tokenizer,
    gen_tokenizer=tokenizer,
    gen_model=model
)

In [None]:
from transformers import Trainer,  TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
print(f"Using {n_gpus} GPUs")



In [10]:
from peft import LoraConfig, get_peft_model

def custom_prepare_model_for_kbit_training(model):
    # 1. Enable input gradients - necessary for training
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    
    # 2. Disable gradient checkpointing to avoid memory issues with Gemma
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_disable()
    
    # 3. Disable KV caching which can interfere with training
    model.config.use_cache = False
    
    # 4. Convert 1D parameters (like bias terms and LayerNorm) to float32
    # This is crucial because these parameters need higher precision
    for name, param in model.named_parameters():
        if param.ndim == 1:  # bias or LayerNorm parameters
            param.data = param.data.to(torch.float16)
    
    return model

In [None]:
model = new_model
model = model.to(device)

# Prepare model for LoRA training
model = custom_prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=2,
    lora_alpha=4,
    target_modules=['q_proj','v_proj'],
    # target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


In [12]:
#creating a dataset:
from torch.utils.data import IterableDataset

class NepaliIterableDataset(IterableDataset):
    def __init__(self, dataset, tokenizer, max_length=2048):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Set a fixed number of steps for the scheduler
        self.num_examples = 1000000  # Adjust this based on how many examples you want to train on

    def __iter__(self):
        for item in self.dataset:
            text = preprocess_nepali(item)
            if text:
                encodings = self.tokenizer(
                    text, 
                    truncation=True,
                    padding='max_length',
                    max_length=self.max_length,
                    return_tensors="pt"
                )
                yield {
                    "input_ids": encodings["input_ids"][0],
                    "attention_mask": encodings["attention_mask"][0],
                    "labels": encodings["input_ids"][0].clone()
                }

    def __len__(self):
        return self.num_examples

# Create data collator with padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=new_tokenizer, 
    mlm=False,
    pad_to_multiple_of=8  # Helps with GPU efficiency
)


train_dataset = NepaliIterableDataset(iriis_train, new_tokenizer)


In [13]:
import os
os.environ['CC'] = 'gcc'

In [None]:

# Update training arguments with device settings
training_args = TrainingArguments(
    output_dir='./nepali_lora_weights_ver2',
    num_train_epochs=3,
    warmup_steps=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    save_steps=500,
    logging_steps=5,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_32bit",
    save_total_limit=2,
    no_cuda=False,  # Enable CUDA
)


# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=new_tokenizer, 
    mlm=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your dataset
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the LoRA weights
model.save_pretrained("./nepali_lora_weights")

In [3]:
import openai
import os
from openai import OpenAI

client = OpenAI(
    api_key='',  # This is the default and can be omitted
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- कांही नभएको जात्रा हाँडी गाउँमा. It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

In [None]:
chat_completion

In [5]:
chat_completion2 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- कसैलाई के धन्दा घरज्वाईंलाई खानको धन्दा. It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

In [None]:
print(chat_completion2)

In [None]:
chat_completion3 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- कच्चा वैद्यको मात्रा यमपुरीको यात्रा It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

print(chat_completion3)

In [None]:
chat_completion3 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- औंला दिंदा डुँडुल्ना निल्ने	Try really hard to understand the context of the usage from previous examples in the internet. It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

print(chat_completion3)

In [None]:
chat_completion4 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- ओरालो लागेको मृगलाई बाच्छाले पनि खेद्छ		Try really hard to understand the context of the usage from previous examples in the internet. It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

print(chat_completion4)

In [None]:
chat_completion5 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- एक हातले तालि बज्दैन		Try really hard to understand the context of the usage from previous examples in the internet. It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

print(chat_completion5)

In [None]:
chat_completion6 = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Use this nepali idiom in a good sentence -- एक कान दुई कान मैदान		Try really hard to understand the context of the usage from previous examples in the internet. It should be structured such that user 1 says something and user 2 says this idiom in the response sentence",
        }
    ],
    model="gpt-4o",
)

print(chat_completion6)