In [2]:
!pip install datasets --quiet
import json
from datasets import Dataset

# Upload the JSON file from the local machine.
# files.upload() takes a target *directory*, not a file name: passing
# "content_cleaned.json" there stored the upload inside a folder of that name
# ("content_cleaned.json/content_cleaned.json"). Upload into the working
# directory instead.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # The upload widget was dismissed without choosing a file.
    raise RuntimeError("No file was uploaded; re-run this cell and select the JSON file.")

# Only the first uploaded file is used.
json_filename = next(iter(uploaded))

# The returned mapping holds each file's raw bytes, so parse those directly
# rather than depending on the exact path the file was saved under.
raw_data = json.loads(uploaded[json_filename].decode('utf-8'))

def prepare_example(ex):
    """Convert one raw record into the instruction/response pair used for training.

    Args:
        ex: Mapping with string values under ``'instruction'`` and ``'output'``,
            and an optional ``'input'`` string carrying extra context.

    Returns:
        dict with two keys:
            ``"instruction"``: the stripped instruction, followed by the
            stripped input on its own line when a non-blank input is present.
            ``"response"``: the stripped output.

    Raises:
        KeyError: if ``'instruction'`` or ``'output'`` is missing.
    """
    instruction = ex['instruction'].strip()
    # `or ''` covers both a missing key and an explicit null in the JSON.
    extra = (ex.get('input') or '').strip()
    # Keep a line break between the two parts; plain concatenation fused the
    # last word of the instruction with the first word of the input.
    prompt = f"{instruction}\n{extra}" if extra else instruction
    return {
        "instruction": prompt,
        "response": ex['output'].strip()
    }

# Normalise every raw record, wrap the result in a Hugging Face Dataset and
# write it to disk so the fine-tuning cell can reload it.
prepared_examples = list(map(prepare_example, raw_data))
dataset = Dataset.from_list(prepared_examples)
dataset.save_to_disk('/content/telugu_stories_dataset')
print("Dataset saved!")


Saving content_cleaned.json to content_cleaned.json/content_cleaned.json


Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

Dataset saved!


In [None]:

import os

# Configure PyTorch's CUDA allocator through its environment variable (this
# runs before torch is imported further down) and echo the applied value.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})
print("PYTORCH_CUDA_ALLOC_CONF set to expandable_segments:True")
# Install dependencies (run in Colab)
!pip install transformers datasets accelerate peft bitsandbytes torch --quiet

import torch
from datasets import load_from_disk
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (  # DataCollatorForLanguageModeling was removed on purpose
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# Base checkpoint to fine-tune.
model_name = "sarvamai/sarvam-1"

# Reload the dataset that the data-preparation cell wrote to disk.
dataset = load_from_disk('/content/telugu_stories_dataset')

# Tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 8-bit bitsandbytes quantization with fp32 CPU offload enabled.
bnb_config = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    load_in_8bit=True,
)

# Load the quantized model and let the loader choose device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Tokenize function with explicit padding, truncation and label masking
def tokenize_function(example, max_length=512):
    """Tokenize a batch of instruction/response pairs with the chat template.

    Meant to be used with ``Dataset.map(..., batched=True)``; relies on the
    module-level ``tokenizer``.

    Args:
        example: Batch mapping with parallel lists under ``'instruction'`` and
            ``'response'``. If an ``'input'`` column exists, each non-empty
            entry is appended to its instruction on a new line.
        max_length: Fixed sequence length every example is padded or truncated
            to (defaults to 512, the value previously hard-coded).

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) with an
        added ``labels`` entry: a copy of ``input_ids`` in which padding
        positions are set to -100 so the loss skips them.
    """
    extra_inputs = example.get('input')

    # Build one user/assistant conversation per row of the batch.
    batched_messages = []
    for i, instruction in enumerate(example['instruction']):
        user_message = instruction
        if extra_inputs and extra_inputs[i]:
            # Separate the parts with a line break instead of fusing them.
            user_message = f"{user_message}\n{extra_inputs[i]}"

        batched_messages.append([
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": example['response'][i]},
        ])

    tokenized_output = tokenizer.apply_chat_template(
        batched_messages,
        # The conversations already end with the assistant reply; asking for a
        # generation prompt would append an empty assistant header to every
        # training example.
        add_generation_prompt=False,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Causal-LM labels are the inputs themselves, except on padding: the pad
    # token is the EOS token here, so unmasked padding would dominate the loss.
    labels = tokenized_output["input_ids"].clone()
    labels[tokenized_output["attention_mask"] == 0] = -100
    tokenized_output["labels"] = labels

    return tokenized_output


# Run the tokenizer over the dataset in batches, so each call receives
# lists of rows rather than a single row.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)

# PEFT LoRA config: rank-16 adapters on the attention query/value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# The base model is loaded in 8-bit, so run PEFT's preparation step for
# quantized models before attaching the adapters.
# NOTE(review): by default this also enables gradient checkpointing — pass
# use_gradient_checkpointing=False if that trade-off is not wanted.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Hyper-parameters for the run, gathered in one mapping and expanded below.
training_config = dict(
    output_dir="./models/sarvam_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # small batch size
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    eval_strategy="no",  # no evaluation pass during training
    load_best_model_at_end=False,  # left off because eval_strategy is "no"
    report_to="none",  # no external experiment tracker (e.g. W&B)
)
training_args = TrainingArguments(**training_config)

# No data_collator is passed: every example is already padded to a fixed
# length inside tokenize_function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated for Trainer (it triggers a FutureWarning and
    # is removed in Transformers v5); `processing_class` is its replacement.
    processing_class=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()

# Save the model first, then the tokenizer, into the same output folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./models/sarvam_finetuned")
print("Fine-tuning complete!")

PYTORCH_CUDA_ALLOC_CONF set to expandable_segments:True
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/279M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/193 [00:00<?, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  trainer = Trainer(
