## Fine-tune (Nepali language – till best)

In [4]:
# !python -V
# !nvidia-smi

In [1]:
import torch, sys
# Report the interpreter and torch build, then the GPU (if any) the kernel sees.
print("python:", sys.version.splitlines()[0])
print("torch:", getattr(torch, "__version__", "not installed"))

cuda_ready = torch.cuda.is_available()
print("cuda available:", cuda_ready)

if cuda_ready:
    # Device 0 is the only device queried.
    prop = torch.cuda.get_device_properties(0)
    print("cuda device name:", torch.cuda.get_device_name(0))
    # total_memory is divided by 1024**3 to express it in GB.
    print("total device memory (GB):", round(prop.total_memory / 1024**3, 2))


python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
torch: 2.6.0+cu124
cuda available: True
cuda device name: Tesla T4
total device memory (GB): 14.74


In [7]:
# core packages to install
# upgrade pip, install core libs (quiet)
# !pip install -q --upgrade pip

# core TTS and NLP tooling
# !pip install -q TTS transformers datasets huggingface_hub

# PEFT (LoRA) and accelerate
# !pip install -q peft accelerate

# audio helpers
# !pip install -q librosa soundfile

# ffmpeg (system tool) - necessary for some audio ops
# !apt-get update -y && apt-get install -y ffmpeg


## Preparing data

In [60]:
import os
import pandas as pd
import librosa
import soundfile as sf

In [61]:
import os
import pandas as pd

# -------------------------------
# Set paths for all datasets
# -------------------------------
# Each root is the `ne-NP` folder of one corpus release mounted under
# /kaggle/input. The first two roots used to start with "//"; they are
# normalised to a single leading slash so all six are spelled the same way.
data_root1 = "/kaggle/input/nepali-speech2/cv-corpus-21.0-2025-03-14/ne-NP/"
data_root2 = "/kaggle/input/nepali-speech/cv-corpus-22.0-2025-06-20/ne-NP/"
data_root3 = "/kaggle/input/dataset-3/cv-corpus-12.0-2022-12-07/ne-NP/"
data_root4 = "/kaggle/input/nepali-speech4/cv-corpus-20.0-2024-12-06/ne-NP/"
data_root5 = "/kaggle/input/speech56/cv-corpus-13.0-2023-03-09/cv-corpus-13.0-2023-03-09/ne-NP/"
data_root6 = "/kaggle/input/speech56/cv-corpus-14.0-2023-06-23/cv-corpus-14.0-2023-06-23/ne-NP/"

_all_roots = (data_root1, data_root2, data_root3, data_root4, data_root5, data_root6)

# Datasets 1-6: every root has the same layout (clips/, train.tsv, test.tsv),
# so the per-dataset names are derived in one pass instead of six copies.
clips_dir1, clips_dir2, clips_dir3, clips_dir4, clips_dir5, clips_dir6 = (
    os.path.join(root, "clips") for root in _all_roots
)
train_tsv1, train_tsv2, train_tsv3, train_tsv4, train_tsv5, train_tsv6 = (
    os.path.join(root, "train.tsv") for root in _all_roots
)
test_tsv1, test_tsv2, test_tsv3, test_tsv4, test_tsv5, test_tsv6 = (
    os.path.join(root, "test.tsv") for root in _all_roots
)

# -------------------------------
# Load all TSV files
# -------------------------------
# The df numbering is kept exactly as before:
#   df1-df4  -> train split of datasets 1-4
#   df5-df8  -> test split of datasets 1-4
#   df9-df10 -> train split of datasets 5-6
#   df11-df12 -> test split of datasets 5-6
df1, df2, df3, df4 = (
    pd.read_csv(tsv, sep="\t") for tsv in (train_tsv1, train_tsv2, train_tsv3, train_tsv4)
)
df5, df6, df7, df8 = (
    pd.read_csv(tsv, sep="\t") for tsv in (test_tsv1, test_tsv2, test_tsv3, test_tsv4)
)
df9, df10 = (pd.read_csv(tsv, sep="\t") for tsv in (train_tsv5, train_tsv6))
df11, df12 = (pd.read_csv(tsv, sep="\t") for tsv in (test_tsv5, test_tsv6))

# Inspect first 2 rows of the first dataset
print("Dataset 1 sample:\n", df1.head(2))

# Check columns and number of samples for every frame that was loaded.
# (Previously only df1..df4 were summarised, so eight frames went unchecked.)
_loaded_frames = {
    "Dataset 1 (train)": df1,
    "Dataset 2 (train)": df2,
    "Dataset 3 (train)": df3,
    "Dataset 4 (train)": df4,
    "Dataset 1 (test)": df5,
    "Dataset 2 (test)": df6,
    "Dataset 3 (test)": df7,
    "Dataset 4 (test)": df8,
    "Dataset 5 (train)": df9,
    "Dataset 6 (train)": df10,
    "Dataset 5 (test)": df11,
    "Dataset 6 (test)": df12,
}
for _label, _frame in _loaded_frames.items():
    print(f"{_label} Columns: {_frame.columns.tolist()}")
    print(f"{_label} Total samples: {len(_frame)}\n")


Dataset 1 sample:
                                            client_id  \
0  008ec3f33e366622a460d146d5486d7b192c8ddc438c64...   
1  008ec3f33e366622a460d146d5486d7b192c8ddc438c64...   

                              path  \
0  common_voice_ne-NP_36285450.mp3   
1  common_voice_ne-NP_36285453.mp3   

                                         sentence_id  \
0  cac68e3d9c9e3d739f2ff039d4998f411a3d45dc191d11...   
1  af44ec5b32a98f98ec6845ab7b0deb6dfb48821ef26c41...   

                                        sentence  sentence_domain  up_votes  \
0  अहिलेलाई एक हजारवटा वाक्यहरु जम्मा गर्ने हो ।              NaN         2   
1                त्यतिबेलाका मित्रले सोधेका थिए।              NaN         2   

   down_votes       age          gender accents  variant locale  segment  
0           0  twenties  male_masculine     NaN      NaN  ne-NP      NaN  
1           1  twenties  male_masculine     NaN      NaN  ne-NP      NaN  
Dataset 1 Columns: ['client_id', 'path', 'sentence_id', 'sentence

In [62]:
import os
import pandas as pd
import librosa
import soundfile as sf

# Paths for all 6 datasets (single leading slash on every root)
data_roots = [
    "/kaggle/input/nepali-speech2/cv-corpus-21.0-2025-03-14/ne-NP/",
    "/kaggle/input/nepali-speech/cv-corpus-22.0-2025-06-20/ne-NP/",
    "/kaggle/input/dataset-3/cv-corpus-12.0-2022-12-07/ne-NP/",
    "/kaggle/input/nepali-speech4/cv-corpus-20.0-2024-12-06/ne-NP/",
    "/kaggle/input/speech56/cv-corpus-13.0-2023-03-09/cv-corpus-13.0-2023-03-09/ne-NP/",
    "/kaggle/input/speech56/cv-corpus-14.0-2023-06-23/cv-corpus-14.0-2023-06-23/ne-NP/"
]
# NOTE(review): the six roots look like successive releases of the same corpus,
# so the same clip may appear in several of them — confirm whether the manifest
# should be de-duplicated by clip file name.

# Output directory for processed audio
output_dir = "/kaggle/working/audio_files"
os.makedirs(output_dir, exist_ok=True)

# Every clip is resampled to this rate before being written as WAV
TARGET_SAMPLE_RATE = 16000

# One TSV (train, then test) per dataset, each paired with that dataset's
# clips directory so the two lists stay aligned by construction.
tsv_files = []
clips_dirs = []
for root in data_roots:
    for split_name in ("train.tsv", "test.tsv"):
        tsv_files.append(os.path.join(root, split_name))
        clips_dirs.append(os.path.join(root, "clips"))

dfs = [pd.read_csv(f, sep="\t") for f in tsv_files]

# Initialize manifest
manifest = []

# Process all datasets. `counter` numbers the WAV files that are actually written.
counter = 0
for df_idx, (df, clips_dir) in enumerate(zip(dfs, clips_dirs)):
    n_rows = len(df)
    # Iterate by position so the loop does not depend on the frame's index labels.
    for row_pos, (clip_name, raw_sentence) in enumerate(zip(df["path"], df["sentence"])):
        # A missing sentence is read by pandas as NaN (a float), which has no
        # .strip(); treat it like an empty sentence instead of crashing.
        sentence = raw_sentence.strip() if isinstance(raw_sentence, str) else ""

        # Validate the text first so no WAV is written for a row that will
        # never appear in the manifest.
        if sentence:
            audio_path = os.path.join(clips_dir, clip_name)
            out_path = os.path.join(output_dir, f"audio_{counter:06d}.wav")

            # Load, resample, save as 16k
            y, sr = librosa.load(audio_path, sr=TARGET_SAMPLE_RATE)
            sf.write(out_path, y, TARGET_SAMPLE_RATE)

            manifest.append({"path": out_path, "sentence": sentence})
            counter += 1

        # Progress print: every 50 rows and on the last row of each TSV
        if (row_pos + 1) % 50 == 0 or row_pos == n_rows - 1:
            print(f"Processed {row_pos+1}/{n_rows} files from dataset {df_idx}, total count {counter}")

# Save combined manifest
df_manifest = pd.DataFrame(manifest)
manifest_csv = "/kaggle/working/manifest.csv"
df_manifest.to_csv(manifest_csv, index=False)
print(f"Full manifest saved to {manifest_csv}")


Processed 50/365 files from dataset 0, total count 50
Processed 100/365 files from dataset 0, total count 100
Processed 150/365 files from dataset 0, total count 150
Processed 200/365 files from dataset 0, total count 200
Processed 250/365 files from dataset 0, total count 250
Processed 300/365 files from dataset 0, total count 300
Processed 350/365 files from dataset 0, total count 350
Processed 365/365 files from dataset 0, total count 365
Processed 50/272 files from dataset 1, total count 415
Processed 100/272 files from dataset 1, total count 465
Processed 150/272 files from dataset 1, total count 515
Processed 200/272 files from dataset 1, total count 565
Processed 250/272 files from dataset 1, total count 615
Processed 272/272 files from dataset 1, total count 637
Processed 50/353 files from dataset 2, total count 687
Processed 100/353 files from dataset 2, total count 737
Processed 150/353 files from dataset 2, total count 787
Processed 200/353 files from dataset 2, total count 

## Model and Tokenizer setup

In [63]:
from torch.utils.data import Dataset

In [64]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import VitsModel, AutoProcessor
from torch.utils.data import DataLoader
import torchaudio

import warnings
warnings.filterwarnings("ignore")

## training

In [103]:
from transformers import VitsModel, AutoTokenizer
import torch
from IPython.display import Audio


# Baseline check: synthesise one sentence with the published checkpoint
# before any fine-tuning is applied.
BASELINE_MODEL_ID = "procit001/nepali_male_v1"
model = VitsModel.from_pretrained(BASELINE_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(BASELINE_MODEL_ID)

text = "म पनि जान्छु है त अहिले लाई"
inputs = tokenizer(text, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(**inputs).waveform

# Last expression of the cell: renders an audio player for the waveform.
Audio(output, rate=model.config.sampling_rate)


In [21]:
from IPython.display import Audio, display


## Better

In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, VitsModel
import os
import numpy as np
from scipy.io.wavfile import write
import matplotlib.pyplot as plt


In [66]:
# Checkpoint that is fine-tuned in the rest of the notebook.
MODEL_NAME = "tuskbyte/nepali_male_v1"

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(MODEL_NAME)
# .to() returns the module itself, so loading and moving can be chained.
model = VitsModel.from_pretrained(MODEL_NAME).to(device)
print("Model loaded...")

Model loaded...


In [67]:
# Create directories for checkpoints and samples
# Training writes checkpoints and audio samples into these two folders
# (relative to the working directory); create them once up front.
for _folder in ("checkpoints", "samples"):
    os.makedirs(_folder, exist_ok=True)

In [68]:
class NepaliTTSDataset(Dataset):
    """Text/audio pairs read from a manifest CSV with `path` and `sentence` columns.

    Each item is a dict with keys `input_ids`, `attention_mask`, `waveform`
    (1-D tensor), `text` and `audio_path`. When loading or tokenising a row
    fails, a placeholder item whose `audio_path` is the string "dummy" is
    returned instead, so the caller can recognise and drop it.

    Args:
        manifest_path: CSV file with one row per utterance.
        processor: callable tokenizer/processor; invoked with `text=...`,
            `return_tensors="pt"`, `padding="max_length"`, `truncation=True`
            and `max_length=...`, and expected to return a mapping with
            `input_ids` and `attention_mask`.
        max_length: token length every text is padded/truncated to.
        max_chars: rows whose sentence has this many characters or more are
            dropped (default 200, the previously hard-coded limit).
        sample_rate: rate audio is resampled to; also the length of the
            placeholder waveform (default 16000, the previously hard-coded value).
    """

    # Text used for the placeholder item returned on failure
    _DUMMY_TEXT = "नमस्ते"

    def __init__(self, manifest_path, processor, max_length=512, max_chars=200, sample_rate=16000):
        manifest = pd.read_csv(manifest_path)
        self.processor = processor
        self.max_length = max_length
        self.max_chars = max_chars
        self.sample_rate = sample_rate

        # Filter out samples that are too long. A missing sentence has a NaN
        # length, which compares False here, so such rows are dropped as well.
        short_enough = manifest['sentence'].str.len() < max_chars
        self.df = manifest[short_enough].reset_index(drop=True)
        print(f"Dataset size after filtering: {len(self.df)}")

    def __len__(self):
        return len(self.df)

    def _tokenize(self, text):
        """Tokenise `text`, padded/truncated to `self.max_length` tokens."""
        return self.processor(
            text=text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )

    def _load_waveform(self, audio_path):
        """Load `audio_path` as a peak-normalised 1-D tensor at `self.sample_rate`."""
        waveform, sr = torchaudio.load(audio_path)

        # Average the channels so the result is always 1-D. The previous
        # squeeze(0) only removed the channel axis for mono files; a stereo
        # file stayed 2-D and was later padded along the wrong axis.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)

        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sample_rate)

        # Peak-normalise; the epsilon keeps an all-zero clip from dividing by zero.
        return waveform / (torch.max(torch.abs(waveform)) + 1e-9)

    def _dummy_sample(self):
        """Placeholder item (silent waveform, fixed text) marked with audio_path == "dummy"."""
        dummy_inputs = self._tokenize(self._DUMMY_TEXT)
        return {
            "input_ids": dummy_inputs["input_ids"].squeeze(0),
            "attention_mask": dummy_inputs["attention_mask"].squeeze(0),
            "waveform": torch.zeros(self.sample_rate),
            "text": self._DUMMY_TEXT,
            "audio_path": "dummy"
        }

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row['sentence']
        audio_path = row['path']

        try:
            waveform = self._load_waveform(audio_path)
            inputs = self._tokenize(text)
        except Exception as e:
            # Deliberate best-effort: one unreadable row must not abort an
            # epoch. The failure is reported and a placeholder is returned.
            print(f"Error loading {audio_path}: {e}")
            return self._dummy_sample()

        return {
            # squeeze(0) drops the batch axis added by return_tensors="pt"
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "waveform": waveform,
            "text": text,
            "audio_path": audio_path
        }

In [69]:
def collate_fn(batch):
    """Merge dataset items into one batch, dropping placeholder items.

    Items whose `audio_path` equals "dummy" are discarded. Returns None when
    nothing is left; otherwise a dict with stacked `input_ids` and
    `attention_mask`, `waveforms` zero-padded on the right to the longest
    waveform in the batch, and the list of `texts`.
    """
    real_items = [sample for sample in batch if sample["audio_path"] != "dummy"]
    if len(real_items) == 0:
        return None

    # Right-pad every waveform with zeros up to the longest one and stack
    # them as (batch, samples).
    stacked_audio = torch.nn.utils.rnn.pad_sequence(
        [sample["waveform"] for sample in real_items],
        batch_first=True,
    )

    return {
        "input_ids": torch.stack([sample["input_ids"] for sample in real_items]),
        "attention_mask": torch.stack([sample["attention_mask"] for sample in real_items]),
        "waveforms": stacked_audio,
        "texts": [sample["text"] for sample in real_items]
    }


In [70]:
# Custom loss function that's more suitable for TTS
class TTSLoss(nn.Module):
    """Waveform loss: L1 plus half the MSE between prediction and target.

    Both inputs are cut to the shorter of the two along the last axis before
    the two terms are computed.
    """

    def __init__(self):
        super().__init__()
        self.l1_loss = nn.L1Loss()
        self.mse_loss = nn.MSELoss()

    def forward(self, pred_waveform, target_waveform):
        # Compare only the samples both tensors have.
        shared_len = min(pred_waveform.shape[-1], target_waveform.shape[-1])
        clipped_pred = pred_waveform.narrow(-1, 0, shared_len)
        clipped_target = target_waveform.narrow(-1, 0, shared_len)

        absolute_term = self.l1_loss(clipped_pred, clipped_target)
        squared_term = self.mse_loss(clipped_pred, clipped_target)

        # The MSE term carries a fixed weight of 0.5.
        return absolute_term + 0.5 * squared_term

In [71]:
def save_checkpoint(model, optimizer, scheduler, epoch, loss, filepath):
    """Write a training checkpoint to `filepath` with torch.save.

    The saved dict holds `epoch`, `loss`, and the state dicts of the model,
    optimizer and scheduler under `model_state_dict`, `optimizer_state_dict`
    and `scheduler_state_dict`.
    """
    state = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        scheduler_state_dict=scheduler.state_dict(),
        loss=loss,
    )
    torch.save(state, filepath)
    print(f"Checkpoint saved: {filepath}")

In [72]:
def load_checkpoint(model, optimizer, scheduler, filepath):
    """Restore model, optimizer and scheduler state from `filepath`.

    Returns the epoch to resume from (the stored epoch plus one), or 0 when
    `filepath` does not exist. Tensors are mapped onto the notebook-level
    `device`.
    """
    # Nothing to resume from: start at the first epoch.
    if not os.path.exists(filepath):
        return 0

    state = torch.load(filepath, map_location=device)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    scheduler.load_state_dict(state['scheduler_state_dict'])

    # The checkpoint stores the epoch that finished, so continue with the next one.
    start_epoch = state['epoch'] + 1
    print(f"Checkpoint loaded: {filepath}, resuming from epoch {start_epoch}")
    return start_epoch

In [73]:
# def generate_sample(model, processor, text, filepath, device):
#     """Generate and save audio sample during training"""
#     model.eval()
#     with torch.no_grad():
#         try:
#             inputs = processor(text=text, return_tensors="pt").to(device)
#             with torch.no_grad():
#                 outputs = model(**inputs)
            
#             waveform = outputs.waveform.squeeze().cpu().numpy()
            
#             # Normalize and save
#             waveform = waveform / (np.max(np.abs(waveform)) + 1e-9) * 0.8
#             write(filepath, 16000, (waveform * 32767).astype(np.int16))
#             print(f"Sample saved: {filepath}")
            
#         except Exception as e:
#             print(f"Error generating sample: {e}")
#     model.train()


import numpy as np
from scipy.io.wavfile import write
from IPython.display import Audio, display

def generate_sample(model, processor, text, filepath, device):
    """Generate, save, and play one audio sample for `text`.

    The waveform is peak-normalised to 0.8, written to `filepath` as 16-bit
    PCM and shown as an inline audio player. Failures are reported with a
    printed message rather than raised, so a bad sample does not stop training.

    Args:
        model: model whose output exposes `.waveform`. Its
            `config.sampling_rate` is used for the file and the player
            (16000 is assumed when the attribute is absent).
        processor: callable invoked as `processor(text=..., return_tensors="pt")`;
            the result must support `.to(device)` and `**` unpacking.
        text: sentence to synthesise.
        filepath: destination WAV path.
        device: device the inputs are moved to.

    The model is returned to whichever mode (train/eval) it was in on entry.
    """
    # Use the model's own rate so the file header matches the generated audio.
    sampling_rate = getattr(getattr(model, "config", None), "sampling_rate", 16000)

    # Remember the caller's mode: previously the model was always forced into
    # train mode on exit, even when it had been in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            inputs = processor(text=text, return_tensors="pt").to(device)
            outputs = model(**inputs)

        waveform = outputs.waveform.squeeze().cpu().numpy()

        # Normalize waveform (epsilon guards against an all-zero output)
        waveform = waveform / (np.max(np.abs(waveform)) + 1e-9) * 0.8

        # Save audio as int16 PCM
        write(filepath, sampling_rate, (waveform * 32767).astype(np.int16))
        print(f"✅ Sample saved: {filepath}")

        # Play audio in notebook
        display(Audio(waveform, rate=sampling_rate))

    except Exception as e:
        # Deliberate best-effort: report and carry on.
        print(f"⚠️ Error generating sample: {e}")
    finally:
        model.train(was_training)


In [74]:
# Setup dataset and dataloader
# Dataset over the manifest written by the preprocessing step.
dataset = NepaliTTSDataset("/kaggle/working/manifest.csv", processor)

# batch_size=4 is a reduced batch size chosen for stability; the last
# incomplete batch is dropped, and collate_fn builds each batch.
_loader_options = dict(
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)
dataloader = DataLoader(dataset, **_loader_options)

Dataset size after filtering: 2592


In [75]:
# Selective fine-tuning - be more conservative
# Name fragments of the parameters left trainable; everything else is frozen.
_TRAINABLE_FRAGMENTS = (
    'text_encoder.encoder.layers.4',  # Second to last encoder layer
    'text_encoder.encoder.layers.5',  # Last encoder layer
    'text_encoder.project',           # Text encoder projection layer
)

trainable_layers = []
for name, param in model.named_parameters():
    # A parameter is trainable when its name contains any of the fragments.
    keep_trainable = any(fragment in name for fragment in _TRAINABLE_FRAGMENTS)
    param.requires_grad = keep_trainable
    if keep_trainable:
        trainable_layers.append(name)

print(f"Trainable parameters: {trainable_layers}")

# Element counts, for reporting the trainable share
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"\nTraining {trainable_params}/{total_params} parameters ({trainable_params/total_params*100:.2f}%)")


Trainable parameters: ['text_encoder.encoder.layers.4.attention.emb_rel_k', 'text_encoder.encoder.layers.4.attention.emb_rel_v', 'text_encoder.encoder.layers.4.attention.k_proj.weight', 'text_encoder.encoder.layers.4.attention.k_proj.bias', 'text_encoder.encoder.layers.4.attention.v_proj.weight', 'text_encoder.encoder.layers.4.attention.v_proj.bias', 'text_encoder.encoder.layers.4.attention.q_proj.weight', 'text_encoder.encoder.layers.4.attention.q_proj.bias', 'text_encoder.encoder.layers.4.attention.out_proj.weight', 'text_encoder.encoder.layers.4.attention.out_proj.bias', 'text_encoder.encoder.layers.4.layer_norm.weight', 'text_encoder.encoder.layers.4.layer_norm.bias', 'text_encoder.encoder.layers.4.feed_forward.conv_1.weight', 'text_encoder.encoder.layers.4.feed_forward.conv_1.bias', 'text_encoder.encoder.layers.4.feed_forward.conv_2.weight', 'text_encoder.encoder.layers.4.feed_forward.conv_2.bias', 'text_encoder.encoder.layers.4.final_layer_norm.weight', 'text_encoder.encoder.laye

In [76]:
# Setup training with conservative settings
# Only parameters that are still trainable are handed to the optimizer.
_params_to_update = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.AdamW(
    _params_to_update,
    lr=1e-6,  # Much smaller learning rate
    weight_decay=0.01,
)
# Cosine decay towards eta_min over T_max scheduler steps
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-8)
loss_fn = TTSLoss()

# Sample texts for testing during training
test_texts = [
    "कृपया मद्दत गर्नुहोस्, घरमा आगलागी भएको छ।",
    "एउटा मानिस घाइते भएका छन्, तुरुन्त एम्बुलेन्स पठाउनुहोस्।",
    "सडक दुर्घटना भएको छ, हामीलाई तत्काल सहयोग चाहिन्छ।",
    "भूकम्प आएको छ, सबैलाई सुरक्षित स्थानमा जान अनुरोध छ।"
]

# Training parameters
EPOCHS = 1               # number of epochs to run
CHECKPOINT_INTERVAL = 2  # write a checkpoint every this many epochs
SAMPLE_INTERVAL = 1      # generate audio samples every this many epochs
best_loss = float('inf')

In [77]:
# Load checkpoint if exists
# start_epoch = load_checkpoint(model, optimizer, scheduler, "checkpoints/latest_checkpoint.pth")


In [78]:




print("Starting training...")
model.train()

# `start_epoch` is only defined when the optional resume cell (load_checkpoint)
# has been run. Fall back to a fresh start instead of failing with NameError
# on a restarted kernel.
start_epoch = globals().get("start_epoch", 0)

# Stays None when no epoch produces a valid batch (or no epoch runs at all),
# so the final save below never hits an undefined name.
avg_loss = None

for epoch in range(start_epoch, EPOCHS):
    epoch_loss = 0
    valid_batches = 0

    for batch_idx, batch in enumerate(dataloader):
        if batch is None:  # collate_fn returns None when every item was a placeholder
            continue

        try:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            target_waveforms = batch["waveforms"].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            pred_waveforms = outputs.waveform

            # Calculate loss
            loss = loss_fn(pred_waveforms, target_waveforms)

            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # Conservative clipping
            optimizer.step()

            epoch_loss += loss.item()
            valid_batches += 1

            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}/{EPOCHS}, Batch {batch_idx}, Loss: {loss.item():.6f}")

        except Exception as e:
            # Deliberate best-effort: a failing batch is reported and skipped.
            print(f"Error in batch {batch_idx}: {e}")
            continue

    if valid_batches == 0:
        print(f"No valid batches in epoch {epoch+1}, skipping...")
        continue

    avg_loss = epoch_loss / valid_batches
    scheduler.step()

    print(f"Epoch {epoch+1}/{EPOCHS} completed | Avg Loss: {avg_loss:.6f}")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.2e}")

    # Periodic checkpoint
    if (epoch + 1) % CHECKPOINT_INTERVAL == 0:
        checkpoint_path = f"checkpoints/checkpoint_epoch_{epoch+1}.pth"
        save_checkpoint(model, optimizer, scheduler, epoch, avg_loss, checkpoint_path)

        # Save as latest checkpoint too
        save_checkpoint(model, optimizer, scheduler, epoch, avg_loss, "checkpoints/latest_checkpoint.pth")

    # Best model is checked every epoch. It used to sit inside the checkpoint
    # interval above, so an improvement on a non-checkpoint epoch was never
    # saved (and with EPOCHS < CHECKPOINT_INTERVAL it could never be saved).
    if avg_loss < best_loss:
        best_loss = avg_loss
        save_checkpoint(model, optimizer, scheduler, epoch, avg_loss, "checkpoints/best_model.pth")
        print(f"New best model saved with loss: {best_loss:.6f}")

    # Generate sample audio
    if (epoch + 1) % SAMPLE_INTERVAL == 0:
        for i, text in enumerate(test_texts):
            sample_path = f"samples/epoch_{epoch+1}_sample_{i+1}.wav"
            generate_sample(model, processor, text, sample_path, device)

    print("-" * 60)

print("Training completed!")

# Save final model (loss is None when no epoch produced a valid batch)
final_checkpoint = "checkpoints/final_model.pth"
save_checkpoint(model, optimizer, scheduler, EPOCHS-1, avg_loss, final_checkpoint)

# Generate final samples
print("Generating final samples...")
for i, text in enumerate(test_texts):
    sample_path = f"samples/final_sample_{i+1}.wav"
    generate_sample(model, processor, text, sample_path, device)

print("Training and sampling completed!")

Starting training...
Epoch 1/1, Batch 0, Loss: 0.110949
Epoch 1/1, Batch 10, Loss: 0.081283
Epoch 1/1, Batch 20, Loss: 0.047387
Epoch 1/1, Batch 30, Loss: 0.083695
Epoch 1/1, Batch 40, Loss: 0.078355
Epoch 1/1, Batch 50, Loss: 0.076087
Epoch 1/1, Batch 60, Loss: 0.091401
Epoch 1/1, Batch 70, Loss: 0.094361
Epoch 1/1, Batch 80, Loss: 0.069162
Epoch 1/1, Batch 90, Loss: 0.046177
Epoch 1/1, Batch 100, Loss: 0.083606
Epoch 1/1, Batch 110, Loss: 0.122788
Epoch 1/1, Batch 120, Loss: 0.050469
Epoch 1/1, Batch 130, Loss: 0.049380
Epoch 1/1, Batch 140, Loss: 0.052222
Epoch 1/1, Batch 150, Loss: 0.102833
Epoch 1/1, Batch 160, Loss: 0.086657
Epoch 1/1, Batch 170, Loss: 0.075529
Epoch 1/1, Batch 180, Loss: 0.088576
Epoch 1/1, Batch 190, Loss: 0.074226
Epoch 1/1, Batch 200, Loss: 0.090171
Epoch 1/1, Batch 210, Loss: 0.070092
Epoch 1/1, Batch 220, Loss: 0.088272
Epoch 1/1, Batch 230, Loss: 0.053004
Epoch 1/1, Batch 240, Loss: 0.054021
Epoch 1/1, Batch 250, Loss: 0.089170
Epoch 1/1, Batch 260, Loss: 

✅ Sample saved: samples/epoch_1_sample_2.wav


✅ Sample saved: samples/epoch_1_sample_3.wav


✅ Sample saved: samples/epoch_1_sample_4.wav


------------------------------------------------------------
Training completed!
Checkpoint saved: checkpoints/final_model.pth
Generating final samples...
✅ Sample saved: samples/final_sample_1.wav


✅ Sample saved: samples/final_sample_2.wav


✅ Sample saved: samples/final_sample_3.wav


✅ Sample saved: samples/final_sample_4.wav


Training and sampling completed!


In [79]:

# Directory to save your fine-tuned model
# Directory to save your fine-tuned model (relative to the working directory)
save_dir = "./fine_tuned_nepali_vits_v5.2"
os.makedirs(save_dir, exist_ok=True)

# Write the model first, then the processor, into the same folder so the
# pair can be reloaded together with from_pretrained(save_dir).
for _artifact in (model, processor):
    _artifact.save_pretrained(save_dir)

print(f"Final Model and processor saved to {save_dir}")


Final Model and processor saved to ./fine_tuned_nepali_vits_v5.2


In [80]:
# Reload the fine-tuned model and its processor from disk to check that the
# saved artefacts work on their own.
_FINE_TUNED_DIR = "./fine_tuned_nepali_vits_v5.2"
ft_model = VitsModel.from_pretrained(_FINE_TUNED_DIR).to(device)
processor = AutoProcessor.from_pretrained(_FINE_TUNED_DIR)

# Your input text
text = "म छु, तिमी छौ"
# text = "अहिलेलाई एक हजारवटा वाक्यहरु जम्मा गर्ने हो"

# Inputs must live on the same device as the model.
inputs = processor(text, return_tensors="pt").to(device)

# Inference without gradient tracking; the waveform is brought back to the
# CPU so it can be converted to numpy for playback.
with torch.no_grad():
    output = ft_model(**inputs).waveform
output = output.cpu()

# Play audio
Audio(output.numpy(), rate=ft_model.config.sampling_rate)


In [81]:
import shutil

# Path to the saved model
save_dir = "/kaggle/working/fine_tuned_nepali_vits_v5.2"
zip_path = "/kaggle/working/fine_tuned_nepali_vits_v5.2.zip"

# make_archive appends the ".zip" extension itself, so it needs the path
# without it. Only a trailing ".zip" is removed; str.replace would also
# strip any ".zip" occurring elsewhere in the path.
_suffix = ".zip"
_archive_base = zip_path[:-len(_suffix)] if zip_path.endswith(_suffix) else zip_path

# Create zip; keep the path make_archive actually wrote so the message below
# reports the real file.
zip_path = shutil.make_archive(base_name=_archive_base, format='zip', root_dir=save_dir)

print(f"✅ Model zipped at: {zip_path}")


✅ Model zipped at: /kaggle/working/fine_tuned_nepali_vits_v5.2.zip
