In [1]:
import json
import pandas as pd

# Function to load JSON lines from a file
def load_json_lines(file_path):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data.append(json.loads(line.strip()))  # Convert each line to a dictionary
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
    return data

# Function to extract Kannada and English word pairs
def extract_pairs(json_data):
    kannada_words = [entry["native word"] for entry in json_data]
    english_words = [entry["english word"] for entry in json_data]
    return kannada_words, english_words

# Function to load, clean, and sample JSON data as CSV
def load_clean_and_sample_json(file_path, output_csv, sample_size):
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line.strip())
                data.append({"Kannada": obj["native word"], "English": obj["english word"]})
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
    
    # Convert to Pandas DataFrame
    df = pd.DataFrame(data)
    
    # Reduce dataset size
    df = df.sample(n=min(sample_size, len(df)), random_state=42)
    
    # Save cleaned and reduced dataset
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"✅ Reduced dataset saved: {output_csv}")
    
    return df

# Define dataset paths
train_json = r"/home/gwl/Desktop/test/kan_train.json"
valid_json = r"/home/gwl/Desktop/test/kan_valid.json"
test_json = r"/home/gwl/Desktop/test/kan_test.json"

# Load, clean, and sample datasets
train_df = load_clean_and_sample_json(train_json, "kan_train_reduced.csv", 3000)
valid_df = load_clean_and_sample_json(valid_json, "kan_valid_reduced.csv", 600)
test_df = load_clean_and_sample_json(test_json, "kan_test_reduced.csv", 300)

# Load reduced datasets
train_df = pd.read_csv("kan_train_reduced.csv")
valid_df = pd.read_csv("kan_valid_reduced.csv")
test_df = pd.read_csv("kan_test_reduced.csv")

# Print dataset sizes
print(f"Train Data: {len(train_df)} entries")
print(f"Validation Data: {len(valid_df)} entries")
print(f"Test Data: {len(test_df)} entries")

# Print first 5 rows for verification
print("\nSample Train Data:\n", train_df.head())
print("\nSample Validation Data:\n", valid_df.head())
print("\nSample Test Data:\n", test_df.head())

✅ Reduced dataset saved: kan_train_reduced.csv
✅ Reduced dataset saved: kan_valid_reduced.csv
✅ Reduced dataset saved: kan_test_reduced.csv
Train Data: 3000 entries
Validation Data: 600 entries
Test Data: 300 entries

Sample Train Data:
        Kannada         English
0    ಆಶ್ರಿತೇಷು      ashriteshu
1  ಪ್ರಸ್ತಭೂಮಿಯ  prastabhoomiya
2      ಮನೆವಾಡ್         manewad
3       ವಿಹ್ವಲ         vihwala
4    ಭಾವನೆಯಲ್ಲ    bhavaneyalla

Sample Validation Data:
          Kannada            English
0  ಭಾವಗೀತೆಗಳನ್ನು  bhavagithegalannu
1           ಸದೃಶ           sadrusha
2     ಸೆಂಟಮೀಟರ್ನ       centameterna
3      ನಾವೆಲ್ಲರೂ          navellaru
4   ಸಮ್ಮೇಳನವನ್ನು     sammelanawannu

Sample Test Data:
               Kannada               English
0               ಸಮದರ್              samaddar
1  ಸಾವರಿಸಿಕೊಳ್ಳುತ್ತಲೇ  saavarisikolluttalae
2            ತಿರಸ್ಕಾರ            tiraskaara
3          ಇತಿಹಾಸತಜ್ಞ         itihaasatajna
4          ಸೆರ್ವಿಲ್ಲೆ            sayreville


In [2]:
import torch
from transformers import AutoTokenizer
import pandas as pd

# Load reduced dataset
train_df = pd.read_csv("kan_train_reduced.csv")
valid_df = pd.read_csv("kan_valid_reduced.csv")
test_df = pd.read_csv("kan_test_reduced.csv")

# Use a pre-trained tokenizer (IndicTrans) with trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True)

# Ensure all values are strings and handle NaNs
for df in [train_df, valid_df, test_df]:
    df["Kannada"] = df["Kannada"].astype(str).fillna("")
    df["English"] = df["English"].astype(str).fillna("")

# Tokenize Kannada & English
for df in [train_df, valid_df, test_df]:
    df["Kannada_tokens"] = df["Kannada"].apply(lambda x: tokenizer(x, return_tensors="pt")["input_ids"][0].tolist())
    df["English_tokens"] = df["English"].apply(lambda x: tokenizer(x, return_tensors="pt")["input_ids"][0].tolist())

# Save tokenized datasets
train_df.to_csv("kan_train_tokenized.csv", index=False, encoding="utf-8")
valid_df.to_csv("kan_valid_tokenized.csv", index=False, encoding="utf-8")
test_df.to_csv("kan_test_tokenized.csv", index=False, encoding="utf-8")

# Print dataset sizes
print(f"Training Data: {len(train_df)} entries")
print(f"Validation Data: {len(valid_df)} entries")
print(f"Test Data: {len(test_df)} entries")

# Print sample tokenized data
print("\nSample Training Data:\n", train_df.head())
print("\nSample Validation Data:\n", valid_df.head())
print("\nSample Test Data:\n", test_df.head())

Training Data: 3000 entries
Validation Data: 600 entries
Test Data: 300 entries

Sample Training Data:
        Kannada         English  \
0    ಆಶ್ರಿತೇಷು      ashriteshu   
1  ಪ್ರಸ್ತಭೂಮಿಯ  prastabhoomiya   
2      ಮನೆವಾಡ್         manewad   
3       ವಿಹ್ವಲ         vihwala   
4    ಭಾವನೆಯಲ್ಲ    bhavaneyalla   

                                      Kannada_tokens  \
0  [277, 28083, 29001, 24793, 26372, 25795, 27088...   
1  [277, 27796, 24793, 26372, 27673, 24793, 27088...   
2  [277, 27034, 26466, 27090, 27320, 26657, 27425...   
3  [277, 27320, 25795, 28116, 24793, 27320, 27062...   
4  [277, 29170, 26657, 27320, 26466, 27090, 27449...   

                       English_tokens  
0          [5624, 3851, 1483, 491, 2]  
1  [7152, 1842, 4692, 10526, 4908, 2]  
2                 [320, 2952, 666, 2]  
3               [1496, 6778, 8274, 2]  
4       [12129, 6578, 3263, 10334, 2]  

Sample Validation Data:
          Kannada            English  \
0  ಭಾವಗೀತೆಗಳನ್ನು  bhavagithegalannu   
1         

In [3]:
train_df = pd.read_csv("kan_train_tokenized.csv")
print(train_df.columns)

Index(['Kannada', 'English', 'Kannada_tokens', 'English_tokens'], dtype='object')


In [4]:
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset Class
import ast

class TransliterationDataset(Dataset):
    def __init__(self, dataframe):
        # Convert string representation of lists to actual lists if needed
        self.kannada = dataframe["Kannada_tokens"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x).tolist()
        self.english = dataframe["English_tokens"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x).tolist()

    def __len__(self):
        return len(self.kannada)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.kannada[idx], dtype=torch.long),
            "labels": torch.tensor(self.english[idx], dtype=torch.long)
        }



# Define collate function (Move outside class)
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad sequences using tokenizer.pad_token_id
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {"input_ids": input_ids_padded, "labels": labels_padded}

# Create Dataset
train_dataset = TransliterationDataset(train_df)
valid_dataset = TransliterationDataset(valid_df)
test_dataset = TransliterationDataset(test_df)

# Create DataLoader with correct collate function
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM

# Load Pre-trained IndicTrans Model
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True)

# Move model to CPU
device = torch.device("cpu")
model.to(device)

print("Model loaded on CPU")

Model loaded on CPU


In [6]:
import torch
from torch.optim import AdamW
from nltk.translate.bleu_score import sentence_bleu
from Levenshtein import distance as levenshtein_distance

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop with Validation Loss, Test Accuracy, BLEU Score, and CER
def train_model(model, train_loader, valid_loader, test_loader, epochs=3):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            # Create attention mask
            attention_mask = (input_ids != tokenizer.pad_token_id).to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Calculate validation loss
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in valid_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                attention_mask = (input_ids != tokenizer.pad_token_id).to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()

        # Calculate test accuracy, BLEU Score, and CER
        correct = 0
        total = 0
        total_bleu = 0
        total_cer = 0

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                labels = batch["labels"].to(device)
                attention_mask = (input_ids != tokenizer.pad_token_id).to(device)

                outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=50)

                predicted_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                actual_texts = [tokenizer.decode(label[label != tokenizer.pad_token_id], skip_special_tokens=True) for label in labels]

                # Compute exact match accuracy
                correct += sum(1 for p, a in zip(predicted_texts, actual_texts) if p.strip().lower() == a.strip().lower())
                total += len(actual_texts)

                # Compute BLEU Score and CER
                for p, a in zip(predicted_texts, actual_texts):
                    reference = [list(a)]  # BLEU expects a list of references
                    candidate = list(p)  # Convert prediction into a list of characters
                    total_bleu += sentence_bleu(reference, candidate)

                    total_cer += levenshtein_distance(p, a) / max(len(a), 1)  # Normalize by actual text length

        test_accuracy = (correct / total) * 100
        avg_bleu = total_bleu / total
        avg_cer = total_cer / total

        # Print sample predictions for all epochs
        print("\n🔍 Sample Predictions:")
        for p, a in zip(predicted_texts[:5], actual_texts[:5]):
            print(f"Predicted: {p} | Actual: {a}")

        # Print metrics for each epoch
        print(f"Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader):.4f}, "
              f"Validation Loss: {val_loss/len(valid_loader):.4f}, "
              f"Test Accuracy: {test_accuracy:.2f}%, "
              f"BLEU Score: {avg_bleu:.4f}, "
              f"Character Error Rate (CER): {avg_cer:.4f}")

# Train the model
train_model(model, train_loader, valid_loader, test_loader, epochs=6)


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



🔍 Sample Predictions:
Predicted:   aas | Actual:   ortiz
Predicted:   chavavannu | Actual:   cheluvamba
Predicted:   jagaligal | Actual:   jateyagi
Predicted:   sasollu | Actual:   sangharshavu
Predicted:   madagidda | Actual:   madalagiddu
Epoch 1, Training Loss: 6.2469, Validation Loss: 2.7696, Test Accuracy: 0.33%, BLEU Score: 0.1724, Character Error Rate (CER): 0.4978

🔍 Sample Predictions:
Predicted:   a | Actual:   ortiz
Predicted:   chalavana | Actual:   cheluvamba
Predicted:   jateyagi | Actual:   jateyagi
Predicted:   sasharavu | Actual:   sangharshavu
Predicted:   matadalaagiddu | Actual:   madalagiddu
Epoch 2, Training Loss: 2.4052, Validation Loss: 1.9587, Test Accuracy: 2.00%, BLEU Score: 0.3733, Character Error Rate (CER): 0.3419

🔍 Sample Predictions:
Predicted:   oerj | Actual:   ortiz
Predicted:   chaluvemba | Actual:   cheluvamba
Predicted:   jateyagi | Actual:   jateyagi
Predicted:   sambaryavu | Actual:   sangharshavu
Predicted:   madalagiddu | Actual:   madalagidd

In [7]:
# Define the save path
model_save_path = "kan_transliteration_model.pth"

# Save the model state dictionary
torch.save(model.state_dict(), model_save_path)

print(f"✅ Model saved successfully at {model_save_path}")


✅ Model saved successfully at kan_transliteration_model.pth


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True)

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True)

# Load the saved model weights
model.load_state_dict(torch.load("kan_transliteration_model.pth", map_location=torch.device("cpu")))

# Move model to CPU
device = torch.device("cpu")
model.to(device)

print("✅ Model loaded successfully!")





A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/gwl/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/gwl/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/gwl/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/home/gwl/anaconda3/

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/gwl/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/gwl/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/gwl/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/home/gwl/anaconda3/

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



✅ Model loaded successfully!


In [2]:
def transliterate_kannada_to_english(kannada_text):
    model.eval()  # Set model to evaluation mode
    
    # Tokenize input Kannada text
    input_tokens = tokenizer(kannada_text, return_tensors="pt").to(device)

    # Generate prediction
    with torch.no_grad():
        output_tokens = model.generate(**input_tokens, max_length=50)

    # Decode generated tokens to English text
    english_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    
    return english_text





In [3]:
sample_kannada_words = ["ನಮಸ್ಕಾರ", "ಶಿಕ್ಷಣ", "ಪ್ರೀತಿ", "ನಂಬಿಕೆ", "ಪದ್ಯ"]

for word in sample_kannada_words:
    transliterated_word = transliterate_kannada_to_english(word)
    print(f"Kannada: {word} -> English: {transliterated_word}")




Kannada: ನಮಸ್ಕಾರ -> English:   namkaara
Kannada: ಶಿಕ್ಷಣ -> English:   shikhana
Kannada: ಪ್ರೀತಿ -> English:   priati
Kannada: ನಂಬಿಕೆ -> English:   nambike
Kannada: ಪದ್ಯ -> English:   padya
