In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Dense, Concatenate, TimeDistributed, Lambda
)
import numpy as np
import pandas as pd
import os
import sys
from tqdm.auto import tqdm

2025-10-30 14:38:35.156376: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761835115.326348      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761835115.389294      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# ==============================================================================
# 0. Configuration and Global Constants
# ==============================================================================

# Root directory of the dataset; every language is looked up as a sub-directory.
BASE_PATH = '/kaggle/input/aksharantar-sampled/aksharantar_sampled'

# Language codes used whenever automatic discovery under BASE_PATH yields nothing.
# Kept in one place so both fallback branches below stay in sync.
DEFAULT_LANGUAGES = ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'kas', 'kok', 'mai', 'mal',
                     'mar', 'mni', 'ori', 'pan', 'san', 'sid', 'tam', 'tel', 'urd']

# --- Automatic Language Discovery ---
# A language is any sub-directory of BASE_PATH whose name is exactly three characters long.
LANGUAGES_TO_TRAIN = []
if os.path.isdir(BASE_PATH):
    LANGUAGES_TO_TRAIN = sorted(
        item for item in os.listdir(BASE_PATH)
        if len(item) == 3 and os.path.isdir(os.path.join(BASE_PATH, item))
    )
    if not LANGUAGES_TO_TRAIN:
        print("WARNING: Could not automatically detect languages. Falling back to default list.")
        LANGUAGES_TO_TRAIN = list(DEFAULT_LANGUAGES)
else:
    # Also reached when BASE_PATH exists but is not a directory, which would
    # otherwise make os.listdir() raise.
    print(f"WARNING: Base path {BASE_PATH} not found. Falling back to default language list.")
    LANGUAGES_TO_TRAIN = list(DEFAULT_LANGUAGES)

print(f"Detected/Using Languages: {LANGUAGES_TO_TRAIN}")


# Training Hyperparameters
BATCH_SIZE = 128
EPOCHS = 30

# Model Hyperparameters
R = EMBEDDING_DIM = 200     # R: Input embedding size
S = HIDDEN_DIM = 128        # S: Hidden cell state size
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1

# Data Handling Limit: training rows are down-sampled only when there are more
# rows than this value; float('inf') therefore disables sampling.
MAX_TRAINING_SAMPLES = float('inf')

# Special Tokens wrapped around every target word (start / end of sequence).
START_TOKEN = '\t'
STOP_TOKEN = '\n'

print(f"TensorFlow version: {tf.__version__}")

# Disable oneDNN custom ops via the TensorFlow environment switch.
# NOTE(review): this variable concerns oneDNN optimizations, not eager execution,
# and it is set here after `import tensorflow` has already run; it likely needs
# to be set before that import to have any effect - confirm and move if intended.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

Detected/Using Languages: ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'kas', 'kok', 'mai', 'mal', 'mar', 'mni', 'ori', 'pan', 'san', 'sid', 'tam', 'tel', 'urd']
TensorFlow version: 2.18.0


In [4]:
# ==============================================================================
# 1. Data Organization and Loading
# ==============================================================================

def load_and_prepare_multilingual_data(languages, base_path):
    """Load the train/valid/test CSV splits of every language and combine them.

    For each language code ``lang`` the files ``<lang>_train.csv``,
    ``<lang>_valid.csv`` and ``<lang>_test.csv`` are read from
    ``<base_path>/<lang>/`` as header-less CSVs, their columns are named
    ``Latin`` and ``Native``, and a ``Language`` column holding ``lang`` is added.

    Args:
        languages: Iterable of language codes (sub-directory / file-prefix names).
        base_path: Directory that contains one sub-directory per language.

    Returns:
        Tuple ``(train_df, valid_df, test_df)``. A split for which no file could
        be loaded is returned as an empty DataFrame that still carries the
        ``Latin``, ``Native`` and ``Language`` columns.

    Raises:
        SystemExit: If no training file could be loaded at all.
    """
    output_columns = ['Latin', 'Native', 'Language']

    all_train_data = []
    all_valid_data = []
    all_test_data = []
    missing_files = []

    print(f"Loading data from directory: {base_path} for {len(languages)} languages...")

    def load_file(lang, data_type, data_list):
        """Helper to load a single file and append its DataFrame to data_list."""
        filename = f'{lang}_{data_type}.csv'
        path = os.path.join(base_path, lang, filename)
        try:
            # dtype=str + keep_default_na=False: every cell is a word, so tokens
            # such as "null", "nan" or "NA" must stay text instead of becoming NaN.
            df = pd.read_csv(path, header=None, encoding='utf-8',
                             dtype=str, keep_default_na=False)
            df.columns = ['Latin', 'Native']
            if len(df) > 0:
                df['Language'] = lang
                data_list.append(df)
                return True
        except FileNotFoundError:
            # A missing split is tolerated, but remembered for the summary below.
            missing_files.append(filename)
        except Exception as e:
            print(f"ERROR loading {data_type} data for {lang}: {e}")
        return False

    def combine(frames):
        """Concatenate the loaded frames, or build an empty frame with the expected columns."""
        if frames:
            return pd.concat(frames, ignore_index=True)
        # pd.concat([]) raises ValueError, so an empty split is built explicitly.
        return pd.DataFrame(columns=output_columns)

    # Load data for all languages and splits
    for lang in languages:
        load_file(lang, 'train', all_train_data)
        load_file(lang, 'valid', all_valid_data)
        load_file(lang, 'test', all_test_data)

    if missing_files:
        print(f"WARNING: {len(missing_files)} expected file(s) were not found and were skipped.")

    if not all_train_data:
        sys.exit("CRITICAL ERROR: No training data could be loaded. Check BASE_PATH and file structure.")

    combined_train_df = combine(all_train_data)
    combined_valid_df = combine(all_valid_data)
    combined_test_df = combine(all_test_data)

    return combined_train_df, combined_valid_df, combined_test_df

# Read every split for the configured languages from disk.
(train_df, valid_df, test_df) = load_and_prepare_multilingual_data(
    languages=LANGUAGES_TO_TRAIN,
    base_path=BASE_PATH,
)

# Report the row count of each combined split.
print(f"Loaded {len(train_df)} training samples, {len(valid_df)} validation samples, and {len(test_df)} test samples.")

Loading data from directory: /kaggle/input/aksharantar-sampled/aksharantar_sampled for 19 languages...
Loaded 911513 training samples, 73051 validation samples, and 77809 test samples.


In [5]:
# ==============================================================================
# 2. Data Cleaning and Sampling
# ==============================================================================

print("\n--- Data Cleaning and Sampling ---")

def clean_dataframe(df):
    """Return a cleaned copy of a Latin/Native word-pair frame.

    Rows with a missing ``Latin`` or ``Native`` value are dropped first, so they
    are not turned into literal "nan"/"None" words by the string conversion.
    ``Latin`` is then stripped and lower-cased, ``Native`` is stripped, and rows
    where either word is empty after stripping are removed.

    Args:
        df: DataFrame with at least the columns ``Latin`` and ``Native``.

    Returns:
        A new DataFrame (the input is not modified) that keeps the original index
        labels of the surviving rows. An empty input is returned unchanged.
    """
    if df.empty:
        return df

    # Work on a copy so the caller's frame is left untouched.
    cleaned = df.dropna(subset=['Latin', 'Native']).copy()
    cleaned['Latin'] = cleaned['Latin'].astype(str).str.strip().str.lower()
    cleaned['Native'] = cleaned['Native'].astype(str).str.strip()

    has_both_words = cleaned['Latin'].astype(bool) & cleaned['Native'].astype(bool)
    # Return an independent frame so later column assignments do not operate on a slice.
    return cleaned.loc[has_both_words].copy()

train_df = clean_dataframe(train_df)
valid_df = clean_dataframe(valid_df)
test_df = clean_dataframe(test_df)

print(f"Total Cleaned Training Samples: {len(train_df)}")

# Down-sample only when the training set exceeds the configured limit.
if len(train_df) > MAX_TRAINING_SAMPLES:
    train_df = train_df.sample(int(MAX_TRAINING_SAMPLES), random_state=42).reset_index(drop=True)
    print(f"Applied Sampling: Training set reduced to {len(train_df)} samples.")


def _with_target_column(frame):
    """Return `frame` with a 'Target' column: the native word wrapped in START/STOP tokens.

    The column is also added to an empty split (as an empty column) so that later
    cells can index ``frame['Target']`` without a KeyError. Only a completely
    column-less empty frame is passed through unchanged.
    """
    if frame.empty and 'Native' not in frame.columns:
        return frame
    wrapped = frame['Native'].astype(str).apply(lambda x: START_TOKEN + x + STOP_TOKEN)
    # assign() builds a new frame instead of writing into a possibly shared one.
    return frame.assign(Target=wrapped)


# Add start and stop tokens to all target sequences
train_df = _with_target_column(train_df)
valid_df = _with_target_column(valid_df)
test_df = _with_target_column(test_df)


--- Data Cleaning and Sampling ---
Total Cleaned Training Samples: 911513


In [6]:
# ==============================================================================
# 3. Tokenization and Vectorization
# ==============================================================================

# --- 3.1 Vocabulary Generation ---
# Character inventories are collected from the training split only and sorted
# so that the id assignment is deterministic.
source_chars = sorted({char for word in train_df['Latin'] for char in str(word)})
target_chars = sorted(
    {char for word in train_df['Target'] for char in str(word)} | {START_TOKEN, STOP_TOKEN}
)

# Ids start at 1; id 0 is reserved for padding.
source_to_int = {char: idx for idx, char in enumerate(source_chars, start=1)}
target_to_int = {char: idx for idx, char in enumerate(target_chars, start=1)}

int_to_target = {idx: char for char, idx in target_to_int.items()}
int_to_target[0] = '' # ID 0 is padding

# The "+ 1" accounts for the padding id.
source_vocab_size = len(source_chars) + 1
target_vocab_size = len(target_chars) + 1

# --- Language Vocabulary ---
# Language codes are gathered from all three splits.
language_list = sorted(
    set().union(
        train_df['Language'].unique(),
        valid_df['Language'].unique(),
        test_df['Language'].unique(),
    )
)
lang_to_int = {lang: idx for idx, lang in enumerate(language_list, start=1)}
lang_vocab_size = len(language_list) + 1

# Calculate Max Sequence Length (L) over every split, source and target alike.
all_latin = [*train_df['Latin'].tolist(), *valid_df['Latin'].tolist(), *test_df['Latin'].tolist()]
all_target = [*train_df['Target'].tolist(), *valid_df['Target'].tolist(), *test_df['Target'].tolist()]

max_len_latin = max((len(str(w)) for w in all_latin), default=1)
max_len_target = max((len(str(w)) for w in all_target), default=1)

MAX_SEQUENCE_LENGTH = max(max_len_latin, max_len_target)
L = MAX_SEQUENCE_LENGTH

print(f"\n--- Vocabulary and Sequence Stats ---")
print(f"Source Vocab Size (Latin): {source_vocab_size}")
print(f"Target Vocab Size (Native/V): {target_vocab_size}")
print(f"Language Vocab Size (U_lang): {lang_vocab_size}")
print(f"Max Sequence Length (L): {MAX_SEQUENCE_LENGTH}")


# --- 3.2 Vectorization Function ---
def vectorize_data(df, max_len, source_to_int, target_to_int, lang_to_int):
    """Turn the Latin, Target and Language columns of `df` into padded id arrays.

    Args:
        df: DataFrame with the columns 'Latin', 'Target' and 'Language'.
        max_len: Width of the sequence arrays; longer words are truncated.
        source_to_int: Character -> id mapping for Latin words.
        target_to_int: Character -> id mapping for target words.
        lang_to_int: Language code -> id mapping.

    Returns:
        Tuple of int32 arrays
        ``(encoder_input, decoder_input, decoder_target, language_input)``.
        The first three have shape ``(len(df), max_len)``, the last
        ``(len(df), 1)``. ``decoder_target`` is ``decoder_input`` shifted one
        step to the left. Unknown characters/languages and unused positions
        are 0. For an empty `df` four empty arrays are returned.
    """
    if df.empty:
        return np.array([]), np.array([]), np.array([]), np.array([])

    row_count = len(df)
    encoder_input_data = np.zeros((row_count, max_len), dtype='int32')
    decoder_input_data = np.zeros((row_count, max_len), dtype='int32')
    decoder_target_data = np.zeros((row_count, max_len), dtype='int32')
    language_input_data = np.zeros((row_count, 1), dtype='int32')

    triples = zip(df['Latin'], df['Target'], df['Language'])
    for row, (latin, target, lang) in enumerate(triples):
        # Encode at most max_len characters of each word.
        source_ids = [source_to_int.get(ch, 0) for ch in str(latin)[:max_len]]
        target_ids = [target_to_int.get(ch, 0) for ch in str(target)[:max_len]]

        if source_ids:
            encoder_input_data[row, :len(source_ids)] = source_ids

        if target_ids:
            decoder_input_data[row, :len(target_ids)] = target_ids

        # The expected output at step t is the decoder input at step t + 1.
        if len(target_ids) > 1:
            decoder_target_data[row, :len(target_ids) - 1] = target_ids[1:]

        language_input_data[row, 0] = lang_to_int.get(lang, 0)

    return encoder_input_data, decoder_input_data, decoder_target_data, language_input_data

# Vectorize all data splits with the shared vocabularies and the common length L.
(encoder_input_train,
 decoder_input_train,
 decoder_target_train,
 language_input_train) = vectorize_data(train_df, L, source_to_int, target_to_int, lang_to_int)

(encoder_input_valid,
 decoder_input_valid,
 decoder_target_valid,
 language_input_valid) = vectorize_data(valid_df, L, source_to_int, target_to_int, lang_to_int)

(encoder_input_test,
 decoder_input_test,
 decoder_target_test,
 language_input_test) = vectorize_data(test_df, L, source_to_int, target_to_int, lang_to_int)

# Keras expects validation data as (inputs, targets); None disables validation.
if encoder_input_valid.size == 0:
    print("WARNING: Validation set is empty, will train without dedicated validation data.")
    validation_data_tuple = None
else:
    validation_data_tuple = (
        [encoder_input_valid, decoder_input_valid, language_input_valid],
        decoder_target_valid,
    )


--- Vocabulary and Sequence Stats ---
Source Vocab Size (Latin): 27
Target Vocab Size (Native/V): 681
Language Vocab Size (U_lang): 20
Max Sequence Length (L): 33


In [7]:
# ==============================================================================
# 4. Language-Conditioned Seq2Seq Model Implementation
# ==============================================================================

def build_conditioned_seq2seq_model(R, S, U_src, U_tgt, U_lang):
    """
    Assemble the language-conditioned encoder/decoder plus its inference graphs.

    Args:
        R: Embedding width of the source and target tokens; the language
           embedding uses ``R // 2``.
        S: Number of units of both LSTMs and of every state projection.
        U_src: Input dimension of the source (Latin) embedding.
        U_tgt: Input dimension of the target embedding and width of the softmax output.
        U_lang: Input dimension of the language embedding.

    Returns:
        Tuple ``(training_model, encoder_model, decoder_model)``:
        * training_model maps [encoder_input, decoder_input, language_input]
          to per-step softmax outputs;
        * encoder_model maps [encoder_input, language_input] to the decoder's
          initial [h, c] states;
        * decoder_model maps [decoder_input, h, c] to [softmax outputs, h, c].
        The three models are built from the same layer objects.
    """
    print(f"\n--- Building Language-Conditioned Seq2Seq Model (R={R}, S={S}) ---")

    # --- 4.1 Encoder (Latin Input) ---
    src_tokens = Input(shape=(None,), name='encoder_input')
    src_embedded = Embedding(U_src, R, mask_zero=True, name='latin_embed')(src_tokens)

    # Only the final hidden/cell states of the encoder are used further on.
    encoder_lstm = LSTM(
        S,
        return_state=True,
        return_sequences=False,
        use_cudnn=False,
        name='encoder_lstm_cell',
    )
    _, enc_h, enc_c = encoder_lstm(src_embedded)

    # --- 4.2 Language Context ---
    lang_tokens = Input(shape=(1,), name='language_input')
    lang_embedded = Embedding(U_lang, R // 2, mask_zero=False, name='lang_embed')(lang_tokens)

    # Drop the length-1 axis so the language embedding becomes one vector per sample.
    lang_vector = Lambda(lambda x: tf.squeeze(x, axis=1), name='lang_context_vector')(lang_embedded)

    # Project the language vector to the hidden size S, once per LSTM state.
    lang_h = Dense(S, name='lang_context_h_proj', activation='relu')(lang_vector)
    lang_c = Dense(S, name='lang_context_c_proj', activation='relu')(lang_vector)

    # --- 4.3 Decoder Initial States (Conditioning) ---
    # Encoder state and language projection are concatenated, then reduced back to S units.
    merged_h = Concatenate(axis=-1, name='concat_h')([enc_h, lang_h])
    merged_c = Concatenate(axis=-1, name='concat_c')([enc_c, lang_c])

    init_h = Dense(S, activation='tanh', name='final_h_state')(merged_h)
    init_c = Dense(S, activation='tanh', name='final_c_state')(merged_c)
    conditioned_states = [init_h, init_c]

    # --- 4.4 Decoder (Native Output) ---
    tgt_tokens = Input(shape=(None,), name='decoder_input')
    # NOTE(review): mask_zero=False means this embedding does not mask padded
    # target positions - confirm padding is meant to count toward loss/accuracy.
    native_embed_layer = Embedding(U_tgt, R, mask_zero=False, name='native_embed')
    tgt_embedded = native_embed_layer(tgt_tokens)

    decoder_lstm_layer = LSTM(
        S,
        return_sequences=True,
        return_state=True,
        use_cudnn=False,
        name='decoder_lstm_cell',
    )
    decoder_sequence, _, _ = decoder_lstm_layer(tgt_embedded, initial_state=conditioned_states)

    softmax_layer = Dense(U_tgt, activation='softmax', name='output_dense')
    token_probabilities = TimeDistributed(softmax_layer)(decoder_sequence)

    # --- Training Model ---
    training_model = Model([src_tokens, tgt_tokens, lang_tokens], token_probabilities)

    # --- Inference Models (for prediction) ---
    # Both reuse the layers created above, so they use the training model's weights.
    encoder_model = Model([src_tokens, lang_tokens], conditioned_states)

    state_h_in = Input(shape=(S,), name='decoder_h_input')
    state_c_in = Input(shape=(S,), name='decoder_c_input')
    state_inputs = [state_h_in, state_c_in]

    # One decoding step: embed the token(s), run the shared LSTM from the given states.
    step_embedded = native_embed_layer(tgt_tokens)
    step_sequence, step_h, step_c = decoder_lstm_layer(step_embedded, initial_state=state_inputs)
    step_probabilities = TimeDistributed(softmax_layer)(step_sequence)

    decoder_model = Model(
        [tgt_tokens] + state_inputs,
        [step_probabilities] + [step_h, step_c],
    )

    return training_model, encoder_model, decoder_model

# Create the training graph together with the two inference graphs.
training_model, encoder_model, decoder_model = build_conditioned_seq2seq_model(
    R=R,
    S=S,
    U_src=source_vocab_size,
    U_tgt=target_vocab_size,
    U_lang=lang_vocab_size,
)

# Integer targets are used, hence the sparse variant of categorical cross-entropy.
compile_options = {
    'optimizer': 'rmsprop',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
training_model.compile(**compile_options)

training_model.summary()


--- Building Language-Conditioned Seq2Seq Model (R=200, S=128) ---


I0000 00:00:1761835170.554300      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [8]:
# ==============================================================================
# 5. Model Training (Optimized for Stability)
# ==============================================================================
# Define the directory for saving the best weights/models.
# NOTE(review): the directory is created but nothing in this cell writes to it.
MODEL_SAVE_DIR = "conditioned_seq2seq_models"
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# Epoch index at which fit() starts counting; 0 means a fresh run.
initial_epoch = 0

# Without validation data there is no 'val_loss' to monitor, so fall back to the
# training loss instead of leaving EarlyStopping without a usable metric.
early_stopping_monitor = 'val_loss' if validation_data_tuple is not None else 'loss'

# 5.1 Callbacks for optimization and tracking
callbacks = [
    # CRITICAL STABILITY FIX: Rely ONLY on restore_best_weights=True to keep the best model in memory.
    tf.keras.callbacks.EarlyStopping(
        monitor=early_stopping_monitor,
        patience=5,
        verbose=1,
        mode='min',
        restore_best_weights=True
    ),
    # Append to the existing log only when resuming; a fresh run starts a fresh
    # log so epoch numbers from different runs are not mixed in one file.
    tf.keras.callbacks.CSVLogger('training_log.csv', append=initial_epoch > 0)
]


print("\n--- Starting Model Training ---")
# The epoch limit is taken from EPOCHS so the message cannot drift from the setting.
print(f"NOTE: Training will run for up to {EPOCHS} epochs or until EarlyStopping triggers.")
print("*** WARNING: Model saving to disk has been DISABLED for maximum kernel stability. ***")
print("The best weights are kept in memory and used for the inference models.")


history = training_model.fit(
    [encoder_input_train, decoder_input_train, language_input_train],
    decoder_target_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=validation_data_tuple,
    callbacks=callbacks,
    initial_epoch=initial_epoch,
    verbose=1
)


--- Starting Model Training ---
NOTE: Training will run for up to 25 epochs or until EarlyStopping triggers.
The best weights are kept in memory and used for the inference models.
Epoch 1/30


I0000 00:00:1761835185.490268     106 service.cc:148] XLA service 0x7c419800d6d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1761835185.490797     106 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1761835186.002237     106 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
I0000 00:00:1761835186.497901     106 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   3/7122[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:16[0m 36ms/step - accuracy: 0.2777 - loss: 6.3705   

I0000 00:00:1761835199.005114     106 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m7121/7122[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - accuracy: 0.7618 - loss: 1.0406

W0000 00:00:1761835315.668373     106 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7618 - loss: 1.0405

W0000 00:00:1761835331.161582     104 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1761835335.846166     106 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 19ms/step - accuracy: 0.7618 - loss: 1.0405 - val_accuracy: 0.8629 - val_loss: 0.4942
Epoch 2/30
[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 16ms/step - accuracy: 0.8632 - loss: 0.4750 - val_accuracy: 0.9260 - val_loss: 0.2524
Epoch 3/30
[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 16ms/step - accuracy: 0.9246 - loss: 0.2506 - val_accuracy: 0.9436 - val_loss: 0.1903
Epoch 4/30
[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 16ms/step - accuracy: 0.9435 - loss: 0.1836 - val_accuracy: 0.9522 - val_loss: 0.1625
Epoch 5/30
[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 16ms/step - accuracy: 0.9520 - loss: 0.1539 - val_accuracy: 0.9547 - val_loss: 0.1524
Epoch 6/30
[1m7122/7122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 16ms/step - accuracy: 0.9567 - loss: 0.1372 - val_accuracy: 0.9589 - val_loss: 0.1385
Epoch 7/3

In [9]:
# ==============================================================================
# 6. Memory Cleanup and Final Evaluation
# ==============================================================================

print("\n--- Aggressive Memory Cleanup (for Notebook Stability) ---")
# 1. Clear the Keras/TensorFlow session graph to free memory
# NOTE(review): encoder_model / decoder_model built earlier are still used after
# this call - confirm clearing the session at this point is intended.
tf.keras.backend.clear_session()

# 2. Explicitly drop large objects that are no longer needed.
# encoder_model and decoder_model are kept: they were built from the same layer
# objects as training_model, so prediction does not need training_model itself.
_NAMES_TO_RELEASE = (
    'encoder_input_train',
    'decoder_input_train',
    'decoder_target_train',
    'language_input_train',
    'encoder_input_valid',
    'decoder_input_valid',
    'decoder_target_valid',
    'language_input_valid',
    'training_model',
)
for _name in _NAMES_TO_RELEASE:
    # pop() with a default keeps the cell re-runnable: a second execution finds
    # the names already gone instead of failing with NameError like `del` would.
    globals().pop(_name, None)

print("Memory cleared. Proceeding to Final Evaluation on Dedicated Test Set.")

if encoder_input_test.size > 0:
    # No metric is computed here; the cell only reports that evaluation is skipped.
    print("Skipping detailed re-evaluation to maintain environment stability.")
else:
    print("WARNING: No dedicated test data available for final evaluation.")

# --- Prediction Helper Functions (Needed for Section 7) ---

def decode_sequence_conditioned(input_seq, lang_id_seq):
    """Greedily decode one target word, character by character.

    Args:
        input_seq: Encoded source word, passed to encoder_model together with
            `lang_id_seq` (the caller in this notebook passes shape (1, L)).
        lang_id_seq: Language id array (the caller passes shape (1, 1)).

    Returns:
        The decoded string with every STOP_TOKEN removed. Decoding ends when the
        STOP_TOKEN is produced or the output grows longer than L characters.
    """
    # The models are called directly instead of through Model.predict(): predict()
    # has a per-call overhead, and this function issues one call per character.
    states_value = list(encoder_model([input_seq, lang_id_seq], training=False))

    # The decoder is primed with the START token.
    target_seq = np.array([[target_to_int[START_TOKEN]]], dtype='int32')

    decoded_sentence = ''
    stop_condition = False

    while not stop_condition:
        step_outputs = decoder_model([target_seq] + states_value, training=False)

        output_tokens = np.asarray(step_outputs[0])
        states_value = list(step_outputs[1:])

        # Greedy choice: most probable token of the last (only) time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = int_to_target.get(sampled_token_index, '?')

        # Id 0 is padding and contributes no character.
        if sampled_token_index != 0:
            decoded_sentence += sampled_char

        if (sampled_char == STOP_TOKEN or len(decoded_sentence) > L):
            stop_condition = True

        # Feed the chosen token back in as the next decoder input.
        target_seq = np.array([[sampled_token_index]], dtype='int32')

    return decoded_sentence.replace(STOP_TOKEN, '')

def transliterate_word_conditioned(word, lang):
    """Encode a Latin word and its language code, then decode the predicted word.

    The word is lower-cased, mapped to ids via source_to_int (unknown characters
    become 0) and truncated/padded to length L. An unknown language code is
    mapped to id 0. The decoded prediction is returned with surrounding
    whitespace stripped.
    """
    lowered = str(word).lower()

    encoded_word = np.zeros((1, L), dtype='int32')
    for position, symbol in enumerate(lowered[:L]):
        encoded_word[0, position] = source_to_int.get(symbol, 0)

    language_ids = np.array([[lang_to_int.get(lang, 0)]], dtype='int32')

    return decode_sequence_conditioned(encoded_word, language_ids).strip()


--- Aggressive Memory Cleanup (for Notebook Stability) ---
Memory cleared. Proceeding to Final Evaluation on Dedicated Test Set.
Skipping detailed re-evaluation to maintain environment stability.


In [17]:
# tqdm, pandas (pd) and os are already imported in the notebook's first cell.

print("\n" + "="*70)
print("GENERATING SAMPLE TEST SET PREDICTIONS (50 samples from each language)")
print("="*70)

if test_df.empty:
    print("⚠️ Test DataFrame is empty. Cannot generate output CSV.")
else:
    SAMPLE_OUTPUT_PATH = "sample_predictions.csv"
    SAMPLE_SIZE_PER_LANG = 50  # ← 50 per language
    output_columns = ['Latin', 'Native', 'Language', 'Predicted', 'Match']

    # --- Select 50 samples per language (fewer if a language has fewer rows) ---
    lang_groups = test_df['Language'].unique()
    sampled_frames = []

    for lang in lang_groups:
        lang_df = test_df[test_df['Language'] == lang]
        n = min(len(lang_df), SAMPLE_SIZE_PER_LANG)
        sampled_frames.append(lang_df.sample(n=n, random_state=42))

    sample_df = pd.concat(sampled_frames, ignore_index=True)

    print(f"✅ Selected {len(sample_df)} samples from test data across {len(lang_groups)} languages.\n")

    # --- Initialize output CSV (header only; rows are appended in chunks below) ---
    pd.DataFrame(columns=output_columns).to_csv(SAMPLE_OUTPUT_PATH, index=False)

    # --- Processing setup ---
    save_interval = 10
    display_interval = 1
    buffer = []
    predictions = []
    total_rows = len(sample_df)

    def flush_buffer():
        """Append the buffered rows to the CSV and empty the buffer."""
        pd.DataFrame(buffer, columns=output_columns).to_csv(
            SAMPLE_OUTPUT_PATH, mode='a', index=False, header=False
        )
        buffer.clear()

    # --- Prediction and saving logic ---
    # A plain loop is used rather than DataFrame.apply: the body has side effects
    # (printing, file writes), which do not belong in an apply callback.
    rows = zip(sample_df['Latin'], sample_df['Native'], sample_df['Language'])
    try:
        for count, (latin, native, lang) in enumerate(tqdm(rows, total=total_rows), start=1):
            predicted = transliterate_word_conditioned(latin, lang)
            predictions.append(predicted)

            buffer.append({
                'Latin': latin,
                'Native': native,
                'Language': lang,
                'Predicted': predicted,
                'Match': native == predicted
            })

            # Display each prediction
            if count % display_interval == 0:
                tqdm.write(
                    f"[{count}/{total_rows}] "
                    f"Latin: '{latin}' | Predicted: '{predicted}' | "
                    f"Actual: '{native}' | Lang: {lang}"
                )

            # Periodically save
            if count % save_interval == 0 or count == total_rows:
                flush_buffer()
                tqdm.write(f"✅ Saved progress up to row {count}")
    finally:
        # Also reached on interruption, so rows predicted since the last
        # periodic save are not lost.
        if buffer:
            flush_buffer()
            tqdm.write("✅ Final buffer saved.")

    sample_df['Predicted'] = predictions

    print(f"\nAll sample predictions saved to: {SAMPLE_OUTPUT_PATH}")

    # --- Preview of output ---
    # keep_default_na=False: words such as "null" must be shown as text, not NaN.
    final_sample_df = pd.read_csv(SAMPLE_OUTPUT_PATH, keep_default_na=False)
    print("\nHead of Sample Output:")
    print(final_sample_df.head(10).to_markdown(index=False))

print("="*70)


GENERATING SAMPLE TEST SET PREDICTIONS (50 samples from each language)
✅ Selected 950 samples from test data across 19 languages.



  0%|          | 0/950 [00:00<?, ?it/s]

[1/950] Latin: 'brahmaputroi' | Predicted: 'ব্রাহ্মাপুত্রই' | Actual: 'ব্রহ্মপুত্রই' | Lang: asm
[2/950] Latin: 'kidore' | Predicted: 'কিদৰে' | Actual: 'কিদৰে' | Lang: asm
[3/950] Latin: 'deelership' | Predicted: 'দিলেৰশীপ' | Actual: 'ডীলাৰশ্বিপ' | Lang: asm
[4/950] Latin: 'xomoyotu' | Predicted: 'সময়তো' | Actual: 'সময়তো' | Lang: asm
[5/950] Latin: 'jotin' | Predicted: 'জতিন' | Actual: 'যতীন' | Lang: asm
[6/950] Latin: 'madison' | Predicted: 'মাদিছন' | Actual: 'মেডিচন' | Lang: asm
[7/950] Latin: 'protijugitat' | Predicted: 'প্ৰতিজীগোতাত' | Actual: 'প্ৰতিযোগিতাত' | Lang: asm
[8/950] Latin: 'sugar' | Predicted: 'চোগাৰ' | Actual: 'চুগাৰ' | Lang: asm
[9/950] Latin: 'kormeegoraki' | Predicted: 'কৰ্মীগৰাকী' | Actual: 'কৰ্মীগৰাকী' | Lang: asm
[10/950] Latin: 'chelsea' | Predicted: 'চেলচে' | Actual: 'চেলেচিয়া' | Lang: asm
✅ Saved progress up to row 10
[11/950] Latin: 'protixthapokxokolor' | Predicted: 'প্ৰতিষ্টাস্থকপকলৰ' | Actual: 'প্ৰতিষ্ঠাপকসকলৰ' | Lang: asm
[12/950] Latin: 'dosok' | Pred

In [13]:
# ==============================================================================
# 7. Generate and Save Final Output CSV
# ==============================================================================
# tqdm, pandas (pd) and os are already imported in the notebook's first cell.

print("\n" + "="*70)
print("GENERATING FULL TEST SET PREDICTIONS AND SAVING OUTPUT CSV (DYNAMIC MODE + LIVE OUTPUT)")
print("="*70)

if test_df.empty:
    print("WARNING: Test DataFrame is empty. Cannot generate output CSV.")
else:
    OUTPUT_CSV_PATH = "final_predictions.csv"
    save_interval = 50       # save every N rows
    display_interval = 1     # print every row (set to 5 or 10 to reduce output spam)
    output_columns = ['Latin', 'Native', 'Language', 'Predicted', 'Match']
    buffer = []
    predictions = []

    # Always start from a header-only file. Inference restarts at the first row
    # on every run, so appending to a file left by an earlier (possibly
    # interrupted) run would duplicate rows.
    pd.DataFrame(columns=output_columns).to_csv(OUTPUT_CSV_PATH, index=False)

    total_rows = len(test_df)
    tqdm.write(f"Running inference on {total_rows} test samples...\n")

    def flush_buffer():
        """Append the buffered rows to the CSV and empty the buffer."""
        pd.DataFrame(buffer, columns=output_columns).to_csv(
            OUTPUT_CSV_PATH, mode='a', index=False, header=False
        )
        buffer.clear()

    # A plain loop is used rather than DataFrame.apply: the body has side effects
    # (printing, file writes), which do not belong in an apply callback.
    rows = zip(test_df['Latin'], test_df['Native'], test_df['Language'])
    try:
        for count, (latin, native, lang) in enumerate(tqdm(rows, total=total_rows), start=1):
            # Generate prediction
            predicted = transliterate_word_conditioned(latin, lang)
            predictions.append(predicted)

            # Store for saving
            buffer.append({
                'Latin': latin,
                'Native': native,
                'Language': lang,
                'Predicted': predicted,
                'Match': native == predicted
            })

            # --- Print live output ---
            if count % display_interval == 0:
                tqdm.write(
                    f"[{count}/{total_rows}] "
                    f"Latin: '{latin}' | Predicted: '{predicted}' | "
                    f"Actual: '{native}' | Lang: {lang}"
                )

            # --- Save periodically ---
            if count % save_interval == 0 or count == total_rows:
                flush_buffer()
                tqdm.write(f"✅ Saved progress up to row {count}")
    finally:
        # Also reached on interruption, so rows predicted since the last
        # periodic save are written before the error propagates.
        if buffer:
            flush_buffer()
            tqdm.write("✅ Final buffer saved.")

    # Only reached when every row was processed, so the lengths match.
    test_df['Predicted'] = predictions

    print(f"\nAll predictions processed and dynamically saved to: {OUTPUT_CSV_PATH}")

    # Preview first 10 saved rows
    # keep_default_na=False: words such as "null" must be shown as text, not NaN.
    final_df = pd.read_csv(OUTPUT_CSV_PATH, keep_default_na=False)
    print("\nHead 10 of the Final Output DataFrame:")
    print(final_df.head(10).to_markdown(index=False))

print("="*70)


GENERATING FULL TEST SET PREDICTIONS AND SAVING OUTPUT CSV (DYNAMIC MODE + LIVE OUTPUT)
Running inference on 77809 test samples...



  0%|          | 0/77809 [00:00<?, ?it/s]

[1/77809] Latin: 'cumbokor' | Predicted: 'কুম্বকৰ' | Actual: 'চুম্বকৰ' | Lang: asm
[2/77809] Latin: 'hil' | Predicted: 'হিল' | Actual: 'হিল' | Lang: asm
[3/77809] Latin: 'vivaah' | Predicted: 'ভিৱাহ' | Actual: 'বিবাহ' | Lang: asm
[4/77809] Latin: 'idolor' | Predicted: 'ইদলৰ' | Actual: 'আইডলৰ' | Lang: asm
[5/77809] Latin: 'ravaapara' | Predicted: 'ৰৱাপাৰা' | Actual: 'ৰাভাপাৰা' | Lang: asm
[6/77809] Latin: 'navkar' | Predicted: 'নাৱকাৰ' | Actual: 'নৱকাৰ' | Lang: asm
[7/77809] Latin: 'barombar' | Predicted: 'বাৰমবাৰ' | Actual: 'বাৰম্বাৰ' | Lang: asm
[8/77809] Latin: 'aadalotor' | Predicted: 'আদালতৰ' | Actual: 'আদালতৰ' | Lang: asm
[9/77809] Latin: 'akuxe' | Predicted: 'আকুষে' | Actual: 'একুশে' | Lang: asm
[10/77809] Latin: 'muikhyomontrigorakeer' | Predicted: 'মুক্ষম্যন্ত্ৰীগাৰেৰ' | Actual: 'মুখ্যমন্ত্ৰীগৰাকীৰ' | Lang: asm
[11/77809] Latin: 'dokadee' | Predicted: 'দকাদী' | Actual: 'ডকাদি' | Lang: asm
[12/77809] Latin: 'uniphos' | Predicted: 'উনিফোচ' | Actual: 'ইউনিফচ' | Lang: asm
[13/77809

KeyboardInterrupt: 