In [None]:
# !pip install -q transformers datasets accelerate peft bitsandbytes soundfile librosa
!pip install snac
# Force update the three libraries that must work together
# !pip install -U bitsandbytes accelerate transformers -q

In [1]:
# BLOCK 2: Import Libraries
# ============================================================================
# WHY: We need to load all the tools we just installed
# WHAT IT DOES: Brings in all the functions and classes we'll use
# Think of this like opening your toolbox before starting work
import librosa
import torch  # PyTorch - the foundation for deep learning
import soundfile as sf  # For saving audio files
import numpy as np  # For working with numbers and arrays
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoFeatureExtractor,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset, Audio  # For loading your dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
)
import warnings
warnings.filterwarnings('ignore')
from dataclasses import dataclass
from typing import Dict, List, Union
import os

print("‚úÖ Libraries imported!")


‚úÖ Libraries imported!


In [2]:
# BLOCK 3: Configuration Settings
# ============================================================================
# Central settings dictionary for the run.
#
# NOTE: later cells in this notebook read only "model_name", "dataset_name",
# "output_dir", "num_train_epochs", "audio_sample_rate" (printed only),
# "use_lora", "lora_r" and "lora_alpha". The batch / learning-rate / warmup /
# logging / save / length keys below are NOT consumed: the TrainingArguments
# cell hard-codes its own values (batch size 1, accumulation 16, lr 5e-5,
# warmup 100), so editing those keys here does not change training.

CONFIG = {
    # Model - snorbyte/snorTTS-Indic-v0
    "model_name": "snorbyte/snorTTS-Indic-v0",
    
    # Your Malayalam dataset
    "dataset_name": "neuralmaverick47/FULL_ML_DATASET-2024-25-reset",
    
    # Where to save the fine-tuned model
    "output_dir": "./snortts-indic-malayalam-finetuned",
    
    # Training duration
    "num_train_epochs": 15,
    
    # Batch settings (not read by the TrainingArguments cell, see note above)
    "batch_size": 2,
    "gradient_accumulation_steps": 8,
    
    # Learning rate (not read by the TrainingArguments cell, see note above)
    "learning_rate": 1e-4,
    
    # Logging and saving (not read by the TrainingArguments cell, see note above)
    "warmup_steps": 50,
    "logging_steps": 10,
    "save_steps": 200,
    "eval_steps": 200,
    
    # Audio settings
    "max_text_length": 200,
    "audio_sample_rate": 24000,  # 24 kHz: the rate the SNAC codec cell resamples to
    "max_audio_length": 30,  # seconds
    
    # LoRA settings
    "use_lora": True,
    "lora_r": 16,
    "lora_alpha": 32,
}

# Echo the settings that identify this run.
print("‚úÖ Configuration set!")
print(f"üìä Model: {CONFIG['model_name']}")
print(f"üìä Dataset: {CONFIG['dataset_name']}")
print(f"üíæ Output: {CONFIG['output_dir']}")
print(f"üéµ Sample rate: {CONFIG['audio_sample_rate']} Hz")

‚úÖ Configuration set!
üìä Model: snorbyte/snorTTS-Indic-v0
üìä Dataset: neuralmaverick47/FULL_ML_DATASET-2024-25-reset
üíæ Output: ./snortts-indic-malayalam-finetuned
üéµ Sample rate: 24000 Hz


In [3]:
# BLOCK 4: Load Dataset
# ============================================================================
# Fetch the dataset named in CONFIG and settle on a train / eval pair:
#   * no 'train' split       -> train on the first split found, no eval data
#   * 'validation' available -> use it as eval data
#   * 'test' available       -> use it as eval data
#   * otherwise              -> carve a reproducible 5% hold-out (seed=42)

print("\n" + "="*60, "üìä LOADING DATASET", "="*60, sep="\n")

dataset = load_dataset(CONFIG["dataset_name"])

# Names of the splits the Hub repository actually provides
available_splits = list(dataset)
print(f"‚úÖ Available splits: {available_splits}")

if 'train' not in available_splits:
    # Fallback: whatever comes first becomes the training set
    train_data = dataset[available_splits[0]]
    eval_data = None
    print(f"‚ö†Ô∏è Using '{available_splits[0]}' for training only")
elif 'validation' in available_splits:
    train_data = dataset['train']
    eval_data = dataset['validation']
    print("‚úÖ Using existing train/validation splits")
elif 'test' in available_splits:
    train_data = dataset['train']
    eval_data = dataset['test']
    print("‚úÖ Using train/test splits")
else:
    # Only 'train' exists: hold out 5% of it for validation
    split_dataset = dataset['train'].train_test_split(test_size=0.05, seed=42)
    train_data = split_dataset['train']
    eval_data = split_dataset['test']
    print("‚úÖ Created 95/5 train/validation split")

# Summary of what will be used downstream
print(f"\nüìä Training examples: {len(train_data)}")
if eval_data:
    print(f"üìä Validation examples: {len(eval_data)}")

print("\nüìã Dataset structure:")
print(train_data)
print("\nüìÑ Sample entry:")
print(train_data[0])
print(f"\nüîç Column names: {train_data.column_names}")



üìä LOADING DATASET


Using custom data configuration neuralmaverick47--FULL_ML_DATASET-2024-25-reset-e24e113030060bc5
Reusing dataset parquet (/root/.cache/huggingface/datasets/neuralmaverick47___parquet/neuralmaverick47--FULL_ML_DATASET-2024-25-reset-e24e113030060bc5/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/neuralmaverick47___parquet/neuralmaverick47--FULL_ML_DATASET-2024-25-reset-e24e113030060bc5/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-ed7d0aa01e5f518d.arrow and /root/.cache/huggingface/datasets/neuralmaverick47___parquet/neuralmaverick47--FULL_ML_DATASET-2024-25-reset-e24e113030060bc5/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-55f98666d11cd286.arrow


‚úÖ Available splits: ['train']
‚úÖ Created 95/5 train/validation split

üìä Training examples: 3021
üìä Validation examples: 159

üìã Dataset structure:
Dataset({
    features: ['index', 'audio', 'transcription'],
    num_rows: 3021
})

üìÑ Sample entry:
{'index': 2557, 'audio': {'path': 'chunk_21.wav', 'array': array([-0.00015259, -0.00015259, -0.00018311, ..., -0.00180054,
       -0.00088501,  0.00036621]), 'sampling_rate': 16000}, 'transcription': '‡¥à ‡¥á‡¥™‡µç‡¥™‡µã‡µæ ‡¥ö‡µÜ‡¥Ø‡µç‡¥§ ‡¥ï‡¥æ‡¥∞‡µç‡¥Ø‡¥ô‡µç‡¥ô‡µæ ‡¥í‡¥ï‡µç‡¥ï‡µÜ ‡¥é‡¥≤‡µç‡¥≤‡¥æ‡¥∞‡µÅ‡¥Ç ‡¥í‡¥®‡µç‡¥®‡µç ‡¥ü‡µç‡¥∞‡µà ‡¥ö‡µÜ‡¥Ø‡µç‡¥§‡µç ‡¥®‡µã‡¥ï‡µç‡¥ï‡¥£‡¥Ç.'}

üîç Column names: ['index', 'audio', 'transcription']


In [4]:
# BLOCK 5: Load Tokenizer and Feature Extractor
# ============================================================================
# Loads the text tokenizer that ships with the model, makes sure it has a pad
# token, tries (best effort) to load a feature extractor, and prints a small
# tokenization sample as a sanity check.

print("\n" + "="*60, "üìù LOADING TOKENIZER AND FEATURE EXTRACTOR", "="*60, sep="\n")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"], trust_remote_code=True)

# Fall back to EOS for padding only when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("‚úÖ Tokenizer loaded!")
print(f"   Vocabulary size: {len(tokenizer)}")
print(f"   Pad token: {tokenizer.pad_token}")

# The feature extractor is optional: stay on None when the repo has none
print("\nLoading feature extractor...")
feature_extractor = None
try:
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        CONFIG["model_name"],
        trust_remote_code=True
    )
except Exception as e:
    print(f"‚ö†Ô∏è Feature extractor not available: {e}")
    print("   Using default audio processing settings")
else:
    print("‚úÖ Feature extractor loaded!")

# Sanity check: tokenize one short Malayalam word
test_text = "‡¥®‡¥Æ‡¥∏‡µç‡¥ï‡¥æ‡¥∞‡¥Ç"
test_tokens = tokenizer(test_text, return_tensors="pt")
print("\nüß™ Test tokenization:")
print(f"   Input: '{test_text}'")
print(f"   Token IDs shape: {test_tokens['input_ids'].shape}")
print(f"   First 10 tokens: {test_tokens['input_ids'][0][:10].tolist()}")


üìù LOADING TOKENIZER AND FEATURE EXTRACTOR
Loading tokenizer...
‚úÖ Tokenizer loaded!
   Vocabulary size: 156940
   Pad token: <custom_token_7>

Loading feature extractor...
‚ö†Ô∏è Feature extractor not available: Can't load feature extractor for 'snorbyte/snorTTS-Indic-v0'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'snorbyte/snorTTS-Indic-v0' is the correct path to a directory containing a preprocessor_config.json file
   Using default audio processing settings

üß™ Test tokenization:
   Input: '‡¥®‡¥Æ‡¥∏‡µç‡¥ï‡¥æ‡¥∞‡¥Ç'
   Token IDs shape: torch.Size([1, 15])
   First 10 tokens: [128000, 34839, 101, 34839, 106, 34839, 116, 85805, 243, 34839]


In [5]:
# BLOCK 6: Prepare Audio Data
# ============================================================================
# Informational cell only: nothing is resampled or cast here. The actual
# resampling happens per example inside the preprocessing function.

print("\n" + "="*60, "üéµ PREPARING AUDIO DATA", "="*60, sep="\n")

# Report the target rate taken from CONFIG
print(f"Setting audio sample rate to {CONFIG['audio_sample_rate']} Hz...")

print("‚úÖ Audio will be processed during dataset preparation")



üéµ PREPARING AUDIO DATA
Setting audio sample rate to 24000 Hz...
‚úÖ Audio will be processed during dataset preparation


In [6]:
import torch
import librosa
import numpy as np
from snac import SNAC  # Use the correct class name

# 1. Load the pretrained 24 kHz SNAC codec onto the GPU in inference mode.
# `preprocess_function` below reads this module-level `snac_model`.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda").eval()

# Token layout shared with the decoding code in this notebook: every SNAC frame
# becomes 7 consecutive token ids, and slot k of a frame is shifted by
# k * 4096 on top of the audio base id, so each slot owns a disjoint id range.
_SNAC_AUDIO_OFFSET = 128266
_SNAC_CODEBOOK_SIZE = 4096
_SNAC_TOKENS_PER_FRAME = 7
_MAX_SEQUENCE_TOKENS = 2048


def _interleave_snac_codes(codes):
    """Flatten the three SNAC code levels into the 7-tokens-per-frame id stream.

    Frame ``f`` is emitted as
    ``[L0[f], L1[2f], L2[4f], L2[4f+1], L1[2f+1], L2[4f+2], L2[4f+3]]`` and slot
    ``k`` is offset by ``_SNAC_AUDIO_OFFSET + k * _SNAC_CODEBOOK_SIZE``. This is
    the exact inverse of the frame unpacking done by the generation function in
    this notebook.

    Args:
        codes: sequence of exactly three integer tensors (coarse to fine), one
            per SNAC level; each is flattened, so a leading batch dim of 1 is fine.

    Returns:
        list[int]: token ids, length ``7 * number_of_complete_frames``.

    Raises:
        ValueError: if ``codes`` does not contain exactly three levels.
    """
    level0, level1, level2 = (level.reshape(-1).cpu().tolist() for level in codes)
    # Only complete frames are emitted (1 coarse, 2 mid, 4 fine codes each)
    num_frames = min(len(level0), len(level1) // 2, len(level2) // 4)
    slot_base = [
        _SNAC_AUDIO_OFFSET + slot * _SNAC_CODEBOOK_SIZE
        for slot in range(_SNAC_TOKENS_PER_FRAME)
    ]

    tokens = []
    for f in range(num_frames):
        frame = (
            level0[f],
            level1[2 * f],
            level2[4 * f],
            level2[4 * f + 1],
            level1[2 * f + 1],
            level2[4 * f + 2],
            level2[4 * f + 3],
        )
        tokens.extend(base + int(code) for base, code in zip(slot_base, frame))
    return tokens


def preprocess_function(examples):
    """Turn a batch of (text, audio) rows into token-id training sequences.

    For each row the transcription is tokenized, the audio is resampled to
    24 kHz if needed, encoded with the module-level ``snac_model`` and the codes
    are interleaved into 7-token frames (see ``_interleave_snac_codes``). The
    result is ``text tokens + audio tokens``, capped at 2048 tokens with the
    audio part cut on a frame boundary.

    Rows that fail are skipped, so the output may contain fewer rows than the
    input batch; the first failures and a count are printed per batch.

    NOTE(review): the generation cell in this notebook prompts the model with a
    ``<custom_token_3>...<custom_token_1>`` wrapper and stops on token id 128258.
    The sequences built here contain neither the wrapper nor that stop id -
    confirm the expected training format against the model card.

    Args:
        examples: batched mapping of column name -> list; needs an ``audio``
            column of ``{"array", "sampling_rate"}`` dicts and one of the text
            columns ``transcription`` / ``text`` / ``sentence``.

    Returns:
        dict with ``input_ids`` and ``labels`` (the same list of id lists).
        Both are empty when no text column is found.
    """
    # Identify transcription column
    text_col = next((c for c in ['transcription', 'text', 'sentence'] if c in examples), None)

    if text_col is None:
        print("‚ùå Error: Could not find text column in dataset.")
        return {"input_ids": [], "labels": []}

    # Encode on whatever device the SNAC model already lives on
    device = next(snac_model.parameters()).device

    all_input_ids = []
    failures = []  # (batch position, message) for every skipped row

    for i, text in enumerate(examples[text_col]):
        try:
            # A. Text to tokens
            text_tokens = tokenizer.encode(text, add_special_tokens=True)

            # B. Audio: SNAC requires 24 kHz input
            audio_data = examples['audio'][i]
            waveform = np.array(audio_data['array'])
            sr = audio_data['sampling_rate']
            if sr != 24000:
                waveform = librosa.resample(waveform, orig_sr=sr, target_sr=24000)

            # C. Encode; the two unsqueeze calls add batch and channel dims
            audio_tensor = torch.from_numpy(waveform).float().unsqueeze(0).unsqueeze(0).to(device)
            with torch.no_grad():
                codes = snac_model.encode(audio_tensor)

            # D. Interleave into 7-token frames with per-slot offsets
            audio_tokens = _interleave_snac_codes(codes)

            # E. Concatenate; drop whole frames (never part of one) to fit the cap
            room = max(0, _MAX_SEQUENCE_TOKENS - len(text_tokens))
            whole_frames = room // _SNAC_TOKENS_PER_FRAME
            audio_tokens = audio_tokens[:whole_frames * _SNAC_TOKENS_PER_FRAME]
            all_input_ids.append((text_tokens + audio_tokens)[:_MAX_SEQUENCE_TOKENS])

        except Exception as e:
            # Best effort: skip the row but remember why
            failures.append((i, str(e)))

    # Show the first few problems of this batch plus the total that were skipped
    for i, e in failures[:3]:
        print(f"‚ö†Ô∏è Error at index {i}: {e}")
    if failures:
        print(f"Skipped {len(failures)} of {len(examples[text_col])} examples in this batch")

    return {"input_ids": all_input_ids, "labels": all_input_ids}

# 2. Execute
# Runs the preprocessing over the training split in batches of 32 rows and
# drops every original column, leaving only what the function returns.
# Tip: switch to batch_size=1 when the result comes back empty, so the
# per-row error messages become visible.
processed_dataset = train_data.map(
    function=preprocess_function,
    batched=True,
    batch_size=32,
    remove_columns=train_data.column_names,
    desc="üöÄ GPU-Accelerated SNAC Encoding",
)

print(f"\nüìä Final Processed Dataset Count: {len(processed_dataset)}")



üöÄ GPU-Accelerated SNAC Encoding:   0%|          | 0/95 [00:00<?, ?ba/s]


üìä Final Processed Dataset Count: 3021


In [7]:
# ============================================================================
# BLOCK 9: Data Collator (causal language modeling)
# ============================================================================
from transformers import DataCollatorForLanguageModeling

print("\n" + "="*60, "üì¶ INITIALIZING DATA COLLATOR", "="*60, sep="\n")

# Text and audio are both plain token ids at this point, so batches are built
# exactly as for any causal LM: mlm=False selects next-token prediction rather
# than BERT-style masking, and batches are padded to a multiple of 8 tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

print("‚úÖ Data Collator initialized using standard CLM logic.")
print("üí° No audio padding needed; the collator now pads integer tokens.")


üì¶ INITIALIZING DATA COLLATOR
‚úÖ Data Collator initialized using standard CLM logic.
üí° No audio padding needed; the collator now pads integer tokens.


In [None]:
# ============================================================================
# BLOCK 10: Load snorTTS (4-bit quantized base model)
# ============================================================================
import os

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

print("\n" + "="*60)
print("ü§ñ LOADING SNORTTS-INDIC (FINAL STABLE MODE)")
print("="*60)

# Folder handed to from_pretrained as `offload_folder`
offload_dir = "model_offload_cache"
os.makedirs(offload_dir, exist_ok=True)

# Hub token comes from the environment (export HF_TOKEN=...) instead of being
# pasted into the notebook; None means "no explicit token".
hf_token = os.environ.get("HF_TOKEN") or None

# 1. Quantization config: 4-bit NF4 weights, fp16 compute, double quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# 2. Load the model. On failure, print the hints and then re-raise: every later
# cell needs `model`, so continuing would only produce a confusing NameError.
try:
    model = AutoModelForCausalLM.from_pretrained(
        CONFIG["model_name"],
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        token=hf_token,
        offload_folder=offload_dir,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
except Exception as e:
    print(f"‚ùå Load failed: {e}")
    print("\nüí° Troubleshooting:")
    print("If the error persists, try removing device_map='auto' and use device_map={'': 0}")
    raise

# 3. Config values are set only after the model has been loaded
model.config.use_cache = False

# Align the model's pad id with the tokenizer (EOS when the tokenizer has none)
if tokenizer.pad_token_id is not None:
    model.config.pad_token_id = tokenizer.pad_token_id
else:
    model.config.pad_token_id = tokenizer.eos_token_id

print(f"‚úÖ Model loaded successfully!")
print(f"üìç Real Device: {model.device}")


ü§ñ LOADING SNORTTS-INDIC (FINAL STABLE MODE)


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

‚úÖ Model loaded successfully!
üìç Real Device: cuda:0


In [10]:
# ============================================================================
# BLOCK 11: Apply LoRA for Llama-3.2 (Causal LM Architecture)
# ============================================================================
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

if CONFIG["use_lora"]:
    print("\n" + "="*60, "üéØ APPLYING LORA FOR SNORTTS (LLAMA-3.2 BACKBONE)", "="*60, sep="\n")

    # Step 1: make the quantized base model trainable with adapters
    model = prepare_model_for_kbit_training(model)

    # Step 2: adapters go on the attention projections and the MLP projections
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"] + ["gate_proj", "up_proj", "down_proj"]

    # Step 3: adapter hyper-parameters; rank and alpha come from CONFIG
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=CONFIG.get("lora_r", 16),
        lora_alpha=CONFIG.get("lora_alpha", 32),
        lora_dropout=0.05,
        bias="none",
        target_modules=target_modules,
    )

    # Step 4: wrap the model so that only the adapter weights are trained
    model = get_peft_model(model, lora_config)

    # Gradient checkpointing is switched on for the wrapped model
    model.gradient_checkpointing_enable()

    print("‚úÖ LoRA Configured for Llama-3.2 Decoder-Only Architecture.")
    print(f"üìä Targeted Modules: {target_modules}")
    model.print_trainable_parameters()


üéØ APPLYING LORA FOR SNORTTS (LLAMA-3.2 BACKBONE)
‚úÖ LoRA Configured for Llama-3.2 Decoder-Only Architecture.
üìä Targeted Modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
trainable params: 24,313,856 || all params: 3,325,180,928 || trainable%: 0.7312


In [None]:
# ============================================================================
# BLOCK 12: Training Configuration
# ============================================================================
from transformers import TrainingArguments

print("\n" + "="*60, "‚öôÔ∏è CONFIGURING TRAINING SETTINGS", "="*60, sep="\n")

# Only the output directory and epoch count are taken from CONFIG. Batch size,
# accumulation, learning rate and warmup are fixed here and differ from the
# same-named CONFIG entries.
training_args = TrainingArguments(
    # --- where / how long ---
    output_dir=CONFIG["output_dir"],
    num_train_epochs=CONFIG["num_train_epochs"],
    # --- batching: 1 sequence per step, 16 accumulated -> effective batch 16 ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # --- optimisation ---
    optim="paged_adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    # --- precision / memory ---
    fp16=True,
    gradient_checkpointing=True,
    # --- bookkeeping ---
    logging_steps=10,
    save_steps=200,
    remove_unused_columns=False,
    report_to="none",
)

print("‚úÖ Training configuration complete!")

In [None]:
# ============================================================================
# BLOCK 13 & 14: Trainer Initialization & Execution
# ============================================================================
from transformers import Trainer

# Training only: no eval_dataset is passed, so the evaluation split created in
# the dataset cell is not used here.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,      # causal-LM collator from Block 9
    train_dataset=processed_dataset,  # SNAC-encoded token sequences
)

print("\nüöÄ STARTING FINE-TUNING (MALAYALAM SPEECH)", "="*60, sep="\n")
trainer.train()
print("\nüéâ TRAINING COMPLETE!")

In [None]:
# Smoke test for the fine-tuned voice. The synthesis helper in this notebook is
# `generate_malayalam_audio` (defined in the inference cell further down); the
# previously called `generate_malayalam_speech` is not defined anywhere in the
# notebook and raised NameError. Looking the function up keeps a top-to-bottom
# run from aborting here when the inference cell has not been executed yet.
speech_fn = globals().get("generate_malayalam_audio")
if speech_fn is None:
    print("generate_malayalam_audio is not defined yet - run the inference cell first.")
else:
    speech_fn("‡¥®‡¥Æ‡¥∏‡µç‡¥ï‡¥æ‡¥∞‡¥Ç, ‡¥∏‡µÅ‡¥ñ‡¥Æ‡¥æ‡¥£‡µã?")

In [None]:
import os

# CUDA debugging switches, kept ahead of the torch import as in the original cell
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

import numpy as np
import soundfile as sf
import torch
from loguru import logger
from peft import PeftModel
from snac import SNAC
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


# ============================================================================
# 1. CONFIGURATION
# ============================================================================
# Base model the LoRA adapters were trained on
BASE_MODEL_ID = 'snorbyte/snorTTS-Indic-v0'
# Local checkpoint folder holding the fine-tuned Malayalam adapters
CHECKPOINT_PATH = 'snortts-indic-malayalam-finetuned/checkpoint-2600'
# Hub token is read from the environment (export HF_TOKEN=...) rather than
# stored in the notebook; None means "no explicit token".
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or None

# Token-id constants used by the generation function below
AUDIO_START_ID = 128266      # first id of the audio-code range
END_OF_SPEECH_ID = 128258    # id that stops generation
MAX_SEQ_LENGTH = 2048

# ============================================================================
# 2. SEQUENTIAL MODEL LOADING
# ============================================================================
# Restart Kernel before running this!
logger.info("üì¶ Loading model in BF16 stability mode...")

# 1. The tokenizer is read from the checkpoint folder; no quantization config
#    is used in this cell.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)

# 2. Base model in bfloat16 (16-bit instead of the 4-bit used for training)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    token=HUGGINGFACE_TOKEN,
    trust_remote_code=True
)

# 3. Attach the Malayalam adapters exactly once. The cell used to call
#    PeftModel.from_pretrained a second time on the same base model, which
#    repeated the adapter load for no benefit.
model = PeftModel.from_pretrained(base_model, CHECKPOINT_PATH)
model.eval()
logger.success("‚úÖ Model loaded successfully in BF16!")

# Load SNAC decoder (24kHz)
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda").eval()

# ============================================================================
# 3. GENERATION FUNCTION
# ============================================================================
def generate_malayalam_audio(text, speaker_id=189, max_new_tokens=1024, temperature=0.5):
    """Synthesize speech for ``text`` and return the waveform as a NumPy array.

    Builds the speaker-tagged prompt, samples audio tokens from the module-level
    ``model`` until ``END_OF_SPEECH_ID`` (or ``max_new_tokens``), regroups the
    generated audio ids into 7-token frames, splits them into the three SNAC
    code levels and decodes them with the module-level ``snac_model``.

    Args:
        text: sentence to speak.
        speaker_id: number appended to ``malayalam`` in the prompt's speaker tag.
        max_new_tokens: upper bound on generated tokens (7 tokens per frame).
        temperature: sampling temperature passed to ``model.generate``.

    Returns:
        numpy.ndarray with the decoded waveform (squeezed), or ``None`` when
        fewer than 7 audio tokens were generated.
    """
    prompt = f"<custom_token_3><|begin_of_text|>malayalam{speaker_id}: {text} <|eot_id|><custom_token_4><custom_token_5><custom_token_1>"

    # Follow the model's own device instead of assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            eos_token_id=END_OF_SPEECH_ID,
            pad_token_id=tokenizer.eos_token_id
        )

    # 1. Keep only newly generated ids that fall in the audio-code range
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output[0][prompt_len:].cpu().tolist()
    audio_ids = [tid for tid in generated_ids if tid >= AUDIO_START_ID]

    num_frames = len(audio_ids) // 7
    if num_frames == 0:
        return None

    # 2. One row per frame; slot k carries an extra k * 4096 offset. After
    #    removing it every code is clamped into the valid range 0..4095 so the
    #    decoder never sees an out-of-range index.
    frames = torch.tensor(audio_ids[:num_frames * 7], dtype=torch.long).view(num_frames, 7)
    slot_offsets = AUDIO_START_ID + 4096 * torch.arange(7)
    frames = (frames - slot_offsets).clamp_(0, 4095)

    # 3. Slot 0 -> level 1, slots 1 and 4 -> level 2, slots 2, 3, 5, 6 -> level 3
    #    (row-major reshape keeps the per-frame order)
    snac_device = next(snac_model.parameters()).device
    codes = [
        frames[:, slots].reshape(1, -1).to(snac_device)
        for slots in ([0], [1, 4], [2, 3, 5, 6])
    ]

    # 4. Decode the code levels back into a waveform
    with torch.inference_mode():
        audio_waveform = snac_model.decode(codes)

    return audio_waveform.detach().squeeze().cpu().numpy()

# ============================================================================
# 4. RUN
# ============================================================================
# Synthesize one demo sentence and write it to disk as a 24 kHz WAV file.
text = "‡¥®‡¥Æ‡¥∏‡µç‡¥ï‡¥æ‡¥∞‡¥Ç, ‡¥á‡¥§‡µç ‡¥í‡¥∞‡µÅ ‡¥∏‡µÜ‡¥ï‡µç‡¥µ‡µª‡¥∑‡µç‡¥Ø‡µΩ ‡¥´‡µà‡µª ‡¥ü‡µç‡¥Ø‡µÇ‡µ∫‡¥°‡µç ‡¥Æ‡µã‡¥°‡¥≤‡¥æ‡¥£‡µç."
audio = generate_malayalam_audio(text)
import soundfile as sf
import numpy as np

if audio is None:
    # The generator returns None when it produced no complete audio frame
    print("‚ùå Audio generation failed, no file was saved.")
else:
    # Relative path: the file lands in the current working directory
    output_path = "malayalam_final_output.wav"

    # Written as float32 samples at 24000 Hz
    sf.write(output_path, audio.astype(np.float32), 24000)

    print(f"‚úÖ Successfully saved audio locally to: {os.path.abspath(output_path)}")


[32m2026-01-13 10:17:48.469[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1müì¶ Loading model in BF16 stability mode...[0m


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[32m2026-01-13 10:18:05.476[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m52[0m - [32m[1m‚úÖ Model loaded successfully in BF16![0m
[32m2026-01-13 10:18:16.592[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m58[0m - [32m[1m‚úÖ Sequential model loaded successfully.[0m


‚úÖ Successfully saved audio locally to: /workspace/malayalam_xtts_training/malayalam_tts_output/29-DEC-TTS/snorTTS-Indic-v0/malayalam_final_output.wav
