In [None]:
!pip install datasets --quiet
import json
from datasets import Dataset

# Upload the JSON training file through the Colab file picker.
# files.upload() takes an optional *target directory*, not a filename: passing
# "content_cleaned.json" made Colab store the upload inside a folder of that
# name (see the "Saving ... to content_cleaned.json/content_cleaned (1).json"
# output), so no argument is passed here.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and pick the JSON file.")
json_filename = next(iter(uploaded))

# Parse the bytes returned by the upload widget instead of re-opening a file by
# name: Colab may rename the file on disk (e.g. "name (1).json") when an older
# copy already exists, so a path derived from the key can be stale or missing.
raw_data = json.loads(uploaded[json_filename].decode('utf-8'))

def prepare_example(ex):
    """Convert one raw record into an instruction/response training pair.

    Args:
        ex: Mapping with an 'instruction' string, an 'output' string and an
            optional 'input' string (may be missing, None or empty).

    Returns:
        dict with two keys:
          - "instruction": the stripped instruction, followed by the stripped
            input on a new line when a non-blank input is present.
          - "response": the stripped output.

    Raises:
        KeyError: if 'instruction' or 'output' is missing from ``ex``.
    """
    instruction = ex['instruction'].strip()
    extra_input = (ex.get('input') or '').strip()
    # Join with a newline so the last word of the instruction and the first
    # word of the input do not run together into a single token sequence.
    prompt = "\n".join(part for part in (instruction, extra_input) if part)
    return {
        "instruction": prompt,
        "response": ex['output'].strip()
    }

# Normalise every raw record, wrap the result in a Dataset and persist it to
# disk so the fine-tuning cell can reload it.
examples = list(map(prepare_example, raw_data))
dataset = Dataset.from_list(examples)
dataset.save_to_disk('/content/telugu_stories_dataset')
print("Dataset saved!")


Saving content_cleaned.json to content_cleaned.json/content_cleaned (1).json


Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

Dataset saved!


In [None]:
import os

# Configure the PyTorch CUDA allocator through its environment variable.
# This cell runs before the cell that imports torch.
ALLOC_CONF_VAR = "PYTORCH_CUDA_ALLOC_CONF"
os.environ[ALLOC_CONF_VAR] = "expandable_segments:True"
print(f"{ALLOC_CONF_VAR} set to {os.environ[ALLOC_CONF_VAR]}")

PYTORCH_CUDA_ALLOC_CONF set to expandable_segments:True


In [None]:
# Install dependencies (run in Colab)
!pip install transformers datasets accelerate peft bitsandbytes torch --quiet

import torch
from datasets import load_from_disk
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments  # Removed DataCollatorForLanguageModeling

# Base checkpoint to fine-tune
model_name = "sarvamai/sarvam-1"

# Reload the dataset that the data-preparation cell saved to disk
dataset = load_from_disk('/content/telugu_stories_dataset')

# Tokenizer for the checkpoint; the EOS token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 8-bit quantization settings, with CPU offload enabled for 32-bit modules
quantization_kwargs = {
    "load_in_8bit": True,
    "llm_int8_enable_fp32_cpu_offload": True,
}
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

# Quantized model, with layer placement chosen by the "auto" device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Tokenize function with explicit padding, truncation and padding-masked labels
def tokenize_function(example, max_length=512):
    """Render a batch of instruction/response pairs as chat transcripts and tokenize them.

    Args:
        example: Batched mapping of columns (as produced by ``dataset.map(...,
            batched=True)``) with 'instruction' and 'response' lists and an
            optional 'input' list.
        max_length: Fixed sequence length every transcript is padded or
            truncated to. Defaults to 512.

    Returns:
        The tokenizer output ('input_ids', 'attention_mask') plus 'labels',
        a copy of 'input_ids' in which padded positions are replaced by -100.
    """
    instructions = example['instruction']
    responses = example['response']
    # The 'input' column is optional; treat a missing column as "no extra input".
    extra_inputs = example.get('input') or [None] * len(instructions)

    batched_messages = []
    for instruction, extra, response in zip(instructions, extra_inputs, responses):
        user_message = instruction
        if extra is not None:
            # Separate instruction and input so their words do not fuse together.
            user_message = f"{user_message}\n{extra}"
        batched_messages.append([
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": response},
        ])

    tokenized_output = tokenizer.apply_chat_template(
        batched_messages,  # list of message lists, one per example
        # The transcript already ends with the assistant reply; a generation
        # prompt would append an extra, empty assistant header to every sample.
        add_generation_prompt=False,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Labels for causal language modeling: copy input_ids, then set padded
    # positions to -100 (the index ignored by the loss). The attention mask is
    # used rather than the pad token id because pad == EOS here, and genuine
    # EOS tokens must stay in the loss.
    labels = tokenized_output["input_ids"].clone()
    labels[tokenized_output["attention_mask"] == 0] = -100
    tokenized_output["labels"] = labels

    return tokenized_output


# Tokenize in batches and drop the raw text columns so only model inputs
# (input_ids, attention_mask, labels) remain in the training set.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# PEFT LoRA config: rank-16 adapters on the attention query/value projections
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# The base model is loaded in 8-bit; run PEFT's preparation step for quantized
# models before attaching the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Single source of truth for where checkpoints and the final model are written
output_dir = "./models/sarvam_finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,  # Reduced batch size
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    eval_strategy="no",
    load_best_model_at_end=False,  # No evaluation runs, so there is no "best" model to reload
    report_to="none",  # Disable W&B logging
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    # No explicit data_collator: sequences are already padded in tokenize_function
)

trainer.train()

# Persist the fine-tuned model and tokenizer where the Streamlit app expects them
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Fine-tuning complete!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
# Streamlit powers the demo UI that is started with `streamlit run app.py`.
!pip install streamlit --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# NOTE(review): pyngrok is installed here but no visible cell imports or uses it —
# presumably intended for tunnelling the Streamlit port; confirm or remove.
!pip install pyngrok --quiet

In [None]:
# Run the Streamlit app in the background on port 8501, discarding its output.
# NOTE(review): no visible cell writes app.py; the Streamlit script in the following
# cell presumably has to be saved as app.py (e.g. with %%writefile) before this
# command runs — confirm.
!streamlit run app.py --server.port 8501 &>/dev/null &

In [None]:
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory the fine-tuning cell passes to save_pretrained() for both the model and the tokenizer.
MODEL_PATH = "./models/sarvam_finetuned"

@st.cache_resource(show_spinner=True)
def load_model():
    """Load the tokenizer and model stored under MODEL_PATH.

    Both are read with ``local_files_only=True``. The model is moved to CUDA
    when it is available and to the CPU otherwise. The function is wrapped in
    ``st.cache_resource``.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        "cuda" or "cpu".
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    load_kwargs = {"local_files_only": True}
    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, **load_kwargs)
    loaded_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **load_kwargs)
    loaded_model.to(target_device)
    return loaded_tokenizer, loaded_model, target_device

# Load (or fetch from Streamlit's resource cache) the tokenizer, model and device
tokenizer, model, device = load_model()

st.title("TeluguGPT - Fine-tuned Sarvam LLM")

prompt = st.text_area("Enter your prompt (Telugu or English):")

if st.button("Generate"):
    if not prompt.strip():
        st.warning("Please enter a prompt.")
    else:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly: pad and EOS share one id, so generate()
            # cannot infer which positions are padding on its own.
            attention_mask=inputs["attention_mask"],
            # Budget for *new* tokens only; max_length=200 also counted the
            # prompt, so a long prompt left no room for any generated text.
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
        # NOTE(review): training wrapped examples in the chat template while this
        # sends the raw prompt — presumably the template should be applied here
        # too; confirm the tokenizer ships one.
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.write(result)
