In [None]:
!pip install -U -q pydrive google-colab

In [None]:
from google.colab import drive

# Authenticate and mount Google Drive at /content/drive. The dataset and
# result files used by the cells below live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
!pip install jiwer


In [None]:
!pip install evaluate


In [None]:
!pip install transformers evaluate

# Test Set 01

In [None]:
import pandas as pd

# Transliteration mapping (from Latin to Sinhala).
#
# Every key must appear only once: a dict literal silently keeps the LAST
# value of a repeated key, which previously replaced 'da', 'cha', 'Na' and
# 'La' with the wrong glyphs. Those four entries now use the keys their rows
# call for ('D', 'Cha', 'Ni', 'Li').
#
# NOTE(review): transliterate() lower-cases its candidates before the lookup,
# so keys containing an upper-case letter are currently unreachable.
# NOTE(review): a few long-vowel entries repeat the glyph of another vowel
# (e.g. 'muu' == 'mu', 'rei' == 'rie', 'wei' == 'wie', 'nie' == 'ni'). They
# are left unchanged here - confirm whether that is intentional.
p = {
    # Vowel sounds
    'a': 'අ', 'aa': 'ආ', 'A': 'ඇ', 'Aa': 'ඈ', 'i': 'ඉ', 'ie': 'ඊ',
    'u': 'උ', 'uu': 'ඌ', 'e': 'එ', 'ea': 'ඒ', 'I': 'ඓ', 'o': 'ඔ',

    '-A': 'ැ', '-i': 'ි', '-u': 'ු', '-e': 'ෙ', '-o': 'ො', '-I': 'ෛ',

    # Vowel sounds - long
    '-aa': 'ා', '-Aa': 'ෑ', '-ie': 'ී', '-ei': 'ේ', '-oe': 'ෝ',
    '-uu': 'ූ', '-au': 'ෞ', '\\n': 'ං', '\\h': 'ඃ', '\\N': 'ඞ',

    # Consonants - Common (with inherent 'a')
    # NOTE(review): 'ra': 'ර' was listed here but was always overridden by
    # the 'ra' entry under "Special sounds"; that effective value is kept.
    # Confirm which mapping is intended.
    'ka': 'ක', 'ga': 'ග', 'ma': 'ම', 'ya': 'ය', 'ba': 'බ',
    'cha': 'ච', 'ja': 'ජ', 'ta': 'ට', 'la': 'ල', 'Da': 'ඩ', 'wa': 'ව',
    'tha': 'ත', 'sa': 'ස', 'da': 'ද', 'ha': 'හ', 'na': 'න', 'pa': 'ප',
    'Na': 'ණ', 'La': 'ළ',

    # Consonants - Common (bare letter)
    'k': 'ක්', 'g': 'ග', 'm': 'ම', 'y': 'ය', 'r': 'ර', 'b': 'බ',
    'ch': 'ච', 'j': 'ජ', 't': 'ට', 'l': 'ල', 'D': 'ඩ', 'w': 'ව',
    'th': 'ත', 's': 'ස', 'd': 'ද', 'h': 'හ', 'n': 'න්', 'p': 'ප',
    'N': 'ණ', 'L': 'ළ',

    # Consonants - Aspirated
    # NOTE(review): 'Tha' was listed twice ('ඨ', then 'ථ'); only the second
    # value ever took effect, so 'ඨ' currently has no key. Confirm the
    # intended key for 'ඨ'.
    'Ka': 'ඛ', 'Ga': 'ඝ', 'Cha': 'ඡ', 'Dha': 'ඪ',
    'Tha': 'ථ', 'dha': 'ධ', 'Pa': 'ඵ', 'bha': 'භ',

    # Consonants - Special
    'Ba': 'ඹ', 'Sa': 'ශ', 'sha': 'ෂ', 'fa': 'ෆ', 'GNa': 'ඥ',
    'KNa': 'ඤ', 'jha': 'ඣ',

    # Special sounds. The conjunct forms contain a Zero Width Joiner
    # (U+200D); it is written as an escape because the character is invisible.
    '-R': '\u0dbb\u0dca\u200d',  # ර + ් + ZWJ
    'Ya': '\u0dca\u200d\u0dba',  # ් + ZWJ + ය
    'ra': '\u0dca\u200d\u0dbb',  # ් + ZWJ + ර
    '-': '්',

    # Remaining vowel forms
    '-ru': 'ෘ', 'au': 'ඖ',

    # Consonant + vowel combinations
    'ki': 'කි', 'ku': 'කු', 'ke': 'කෙ', 'ko': 'කො',
    'kaa': 'කා', 'kAa': 'කෑ', 'kie': 'කී', 'kei': 'කේ',
    'koe': 'කෝ', 'kuu': 'කූ', 'kau': 'කෞ',

    'gi': 'ගි', 'gu': 'ගු', 'ge': 'ගෙ', 'go': 'ගො',
    'gaa': 'ගා', 'gAa': 'ගෑ', 'gie': 'ගී', 'gei': 'ගේ',
    'goe': 'ගෝ', 'guu': 'ගූ', 'gau': 'ගෞ',

    'mi': 'මි', 'mu': 'මු', 'me': 'මෙ', 'mo': 'මො',
    'maa': 'මා', 'mAa': 'මෑ', 'mie': 'මී', 'mei': 'මේ',
    'moe': 'මෝ', 'muu': 'මු', 'mau': 'මෞ',

    'yi': 'යි', 'yu': 'යු', 'ye': 'යේ', 'yo': 'යෝ',
    'yaa': 'යා', 'yAa': 'යෑ', 'yie': 'යී', 'yei': 'යේ',
    'yoe': 'යෝ', 'yuu': 'යූ', 'yau': 'යෞ',

    'ri': 'රි', 'ru': 'රු', 're': 'රෙ', 'ro': 'රො',
    'raa': 'රා', 'rAa': 'රෑ', 'rie': 'රී', 'rei': 'රී',
    'roe': 'රෝ', 'ruu': 'රූ', 'rau': 'රෞ',

    'bi': 'බි', 'bu': 'බු', 'be': 'බෙ', 'bo': 'බො',
    'baa': 'බා', 'bAa': 'බෑ', 'bie': 'බී', 'bei': 'බේ',
    'boe': 'බෝ', 'buu': 'බූ', 'bau': 'බෞ',

    'ci': 'චි', 'cu': 'චු', 'ce': 'චෙ', 'co': 'චො',
    'caa': 'චා', 'cAa': 'චෑ', 'cie': 'චී', 'cei': 'චේ',
    'coe': 'චෝ', 'cuu': 'චූ', 'cau': 'චෞ',

    'chi': 'චි', 'chu': 'චු', 'che': 'චෙ', 'cho': 'චො',
    'chaa': 'චා', 'chAa': 'චෑ', 'chie': 'චී', 'chei': 'චේ',
    'choe': 'චෝ', 'chuu': 'චූ', 'chau': 'චෞ',

    'ji': 'ජි', 'ju': 'ජු', 'je': 'ජෙ', 'jo': 'ජො',
    'jaa': 'ජා', 'jAa': 'ජෑ', 'jie': 'ජී', 'jei': 'ජේ',
    'joe': 'ජෝ', 'juu': 'ජූ', 'jau': 'ජෞ',

    'ti': 'ටි', 'tu': 'ටු', 'te': 'ටෙ', 'to': 'ටො',
    'taa': 'ටා', 'tAa': 'ටෑ', 'tie': 'ටී', 'tei': 'ටේ',
    'toe': 'ටෝ', 'tuu': 'ටූ', 'tau': 'ටෞ',

    'li': 'ලි', 'lu': 'ළු', 'le': 'ලෙ', 'lo': 'ලො',
    'laa': 'ලා', 'lAa': 'ලෑ', 'lie': 'ලී', 'lei': 'ලේ',
    'loe': 'ලෝ', 'luu': 'ළු', 'lau': 'ලෞ',

    'Di': 'ඩි', 'Du': 'ඩු', 'De': 'ඩෙ', 'Do': 'ඩො',
    'Daa': 'ඩා', 'DAa': 'ඩෑ', 'Die': 'ඩී', 'Dei': 'ඩේ',
    'Doe': 'ඩෝ', 'Duu': 'ඩූ', 'Dau': 'ඩෞ',

    'wi': 'වි', 'wu': 'වු', 'we': 'වේ', 'wo': 'වෝ',
    'waa': 'වා', 'wAa': 'වා', 'wie': 'වී', 'wei': 'වී',
    'woe': 'වෝ', 'wuu': 'වු', 'wau': 'වෞ',

    'thi': 'ති', 'thu': 'තු', 'the': 'තෙ', 'tho': 'තො',
    'thaa': 'තා', 'thAa': 'තා', 'thie': 'තී', 'thei': 'තේ',
    'thoe': 'තෝ', 'thuu': 'තු', 'thau': 'තෞ',

    'si': 'සි', 'su': 'සු', 'se': 'සෙ', 'so': 'සො',
    'saa': 'සා', 'sAa': 'සෑ', 'sie': 'සී', 'sei': 'සේ',
    'soe': 'සෝ', 'suu': 'සූ', 'sau': 'සෞ',

    'di': 'දි', 'du': 'දු', 'de': 'දෙ', 'do': 'දො',
    'daa': 'දා', 'dAa': 'දෑ', 'die': 'දී', 'dei': 'දේ',
    'doe': 'දෝ', 'duu': 'දූ', 'dau': 'දෞ',

    'hi': 'හි', 'hu': 'හු', 'he': 'හෙ', 'ho': 'හො',
    'haa': 'හා', 'hAa': 'හා', 'hie': 'හී', 'hei': 'හේ',
    'hoe': 'හෝ', 'huu': 'හු', 'hau': 'හෞ',

    'ni': 'නි', 'nu': 'නු', 'ne': 'නෙ', 'no': 'නො',
    'naa': 'නා', 'nAa': 'නෑ', 'nie': 'නි', 'nei': 'නී',
    'noe': 'නෝ', 'nuu': 'නු', 'nau': 'නෞ',

    'pi': 'පි', 'pu': 'පු', 'pe': 'පෙ', 'po': 'පො',
    'paa': 'පා', 'pAa': 'පෑ', 'pie': 'පී', 'pei': 'පේ',
    'poe': 'පෝ', 'puu': 'පූ', 'pau': 'පෞ',

    'Ni': 'ණි', 'Nu': 'ණු', 'Ne': 'ණෙ', 'No': 'ණො',
    'Naa': 'ණා', 'NAa': 'ණෑ', 'Nie': 'ණී', 'Nei': 'ණේ',
    'Noe': 'ණෝ', 'Nuu': 'ණූ', 'Nau': 'ණෞ',

    'Li': 'ළි', 'Lu': 'ළු', 'Le': 'ළෙ', 'Lo': 'ළො',
    'Laa': 'ළා', 'LAa': 'ළෑ', 'Lie': 'ළී', 'Lei': 'ළේ',
    'Loe': 'ළෝ', 'Luu': 'ළූ', 'Lau': 'ළෞ',

    'bhu': 'භු', 'sh': 'ශ',
}
# Function to transliterate Latin text to Sinhala
def transliterate(word, mapping=None):
    """Transliterate romanized (Latin-script) text into Sinhala script.

    The text is scanned left to right. At each position the longest key of
    the mapping that matches the lower-cased input is replaced by its value;
    a character that starts no key is copied through unchanged.

    Args:
        word: Text to transliterate. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are returned unchanged.
        mapping: Optional Latin -> Sinhala lookup table. Defaults to the
            module-level table ``p``.

    Returns:
        The transliterated text with surrounding whitespace removed, or the
        input itself when it is not a string.
    """
    if not isinstance(word, str):
        return word

    table = p if mapping is None else mapping
    word = word.strip()

    # Nothing to transliterate when the text has no ASCII letter at all.
    # (str.isalpha() alone is also true for Sinhala letters.)
    if not any(ch.isascii() and ch.isalpha() for ch in word):
        return word

    # Derive the window from the table so that 4-letter keys such as 'chaa'
    # are reachable instead of being split into 'cha' + 'a'.
    longest_key = max(map(len, table), default=0)

    pieces = []
    pos = 0
    while pos < len(word):
        for length in range(min(longest_key, len(word) - pos), 0, -1):
            # NOTE: candidates are lower-cased for case-insensitive matching,
            # so table keys containing upper-case letters never match.
            candidate = word[pos:pos + length].lower()
            if candidate in table:
                pieces.append(table[candidate])
                pos += length
                break
        else:
            # No key starts here: keep the character as-is
            pieces.append(word[pos])
            pos += 1

    return ''.join(pieces)

# Function to process the CSV file
def process_csv(
    file_path,
    output_path='/content/drive/MyDrive/IndoNLPWorkshop_2025/Final_Result/Sinhala-Test-set-1-rulebase.csv',
):
    """Transliterate a test-set CSV and save the result next to a report.

    Reads ``file_path``, transliterates ``Column1`` into a new
    ``Transliterated`` column, writes the frame to ``output_path`` and prints
    the input, expected (``Column2``) and transliterated text for every row.

    Args:
        file_path: CSV file containing ``Column1`` and ``Column2``.
        output_path: Destination CSV. Defaults to the Test Set 1 result file.

    Raises:
        KeyError: If ``Column1`` or ``Column2`` is missing from the input.
    """
    df = pd.read_csv(file_path)

    # Check both columns up front so nothing is written when the report
    # below could not be printed anyway.
    missing = [name for name in ('Column1', 'Column2') if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Add a new column for transliterated text
    df['Transliterated'] = df['Column1'].apply(transliterate)

    # Save the dataframe to a new CSV file
    df.to_csv(output_path, index=False)
    print(f"File saved to {output_path}")

    # Print results: input text, expected output, and transliterated output
    for _, row in df.iterrows():
        print(f"Input Text: {row['Column1']}")
        print(f"Expected Output: {row['Column2']}")
        print(f"Transliterated Output: {row['Transliterated']}")
        print("-" * 50)

# Entry point: transliterate Test Set 1
def main():
    """Run the rule-based transliteration over the Test Set 1 CSV."""
    process_csv('/content/drive/MyDrive/IndoNLPWorkshop_2025/Sinhala-Test-set-1.csv')


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import evaluate
from string import punctuation
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

def compute_metrics(ref_str, pred_str, find_wer=True, find_cer=True, find_bleu=True, do_normalize_text=False):
    """
    Compute evaluation metrics WER, CER, and BLEU for given reference and predicted strings.

    Both strings are trimmed of surrounding whitespace and ASCII punctuation
    (after passing through the module-level ``normalizer`` when
    ``do_normalize_text`` is set). Scoring uses the module-level
    ``wer_metric``, ``cer_metric`` and ``bleu_metric`` objects.

    Args:
        ref_str: Expected text. ``None``/NaN is treated as an empty string.
        pred_str: Predicted text. ``None``/NaN is treated as an empty string.
        find_wer: Compute WER when true, otherwise return ``None`` for it.
        find_cer: Compute CER when true, otherwise return ``None`` for it.
        find_bleu: Compute BLEU when true, otherwise return ``None`` for it.
        do_normalize_text: Apply ``normalizer`` before trimming.

    Returns:
        A ``(wer, cer, bleu)`` tuple. When either cleaned string is empty the
        requested metrics take their worst values (1.0, 1.0, 0.0).
    """
    def _clean(text):
        # An empty CSV cell is read back by pandas as NaN (a float), which
        # has no .strip(); treat missing values as empty text.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
        if do_normalize_text:
            text = normalizer(text)
        return text.strip().strip(punctuation).strip()

    pred_str = _clean(pred_str)
    ref_str = _clean(ref_str)

    if not (ref_str and pred_str):
        # Worst-case defaults for empty predictions or references, honouring
        # the find_* switches like the scored branch does.
        return (
            1.0 if find_wer else None,
            1.0 if find_cer else None,
            0.0 if find_bleu else None,
        )

    wer = wer_metric.compute(predictions=[pred_str], references=[ref_str]) if find_wer else None
    cer = cer_metric.compute(predictions=[pred_str], references=[ref_str]) if find_cer else None
    bleu = bleu_metric.compute(predictions=[pred_str], references=[ref_str])["bleu"] if find_bleu else None

    return wer, cer, bleu

# Scorers and text normalizer used by compute_metrics
wer_metric, cer_metric, bleu_metric = (
    evaluate.load(metric_name) for metric_name in ("wer", "cer", "bleu")
)
normalizer = BasicTextNormalizer()

# Rule-based transliteration results for Test Set 1
file_path = '/content/drive/MyDrive/IndoNLPWorkshop_2025/Final_Result/Sinhala-Test-set-1-rulebase.csv'
df = pd.read_csv(file_path)

# Score each (expected, transliterated) pair, one (wer, cer, bleu) per row
row_scores = [
    compute_metrics(expected, predicted)
    for expected, predicted in zip(df["Column2"], df["Transliterated"])
]
wer_list = [scores[0] for scores in row_scores]
cer_list = [scores[1] for scores in row_scores]
bleu_list = [scores[2] for scores in row_scores]

# Attach the per-row metrics as new columns
df["WER"], df["CER"], df["BLEU"] = wer_list, cer_list, bleu_list

# Write the scored rows to their own CSV file
output_path = '/content/drive/MyDrive/IndoNLPWorkshop_2025/Final_Result/Sinhala-Test-set-1-rulebase-with-metrics.csv'
df.to_csv(output_path, index=False)
print(f"File with metrics saved to {output_path}")


In [None]:
# Mean of every per-row metric over the whole test set
average_wer, average_cer, average_bleu = (
    df[column].mean() for column in ("WER", "CER", "BLEU")
)

# Report the averages to four decimal places
print("Overall Averages:")
for label, value in (("WER", average_wer), ("CER", average_cer), ("BLEU", average_bleu)):
    print(f"{label}: {value:.4f}")


# Test Set 02

In [None]:
import pandas as pd

# Transliteration mapping (from Latin to Sinhala).
#
# Every key must appear only once: a dict literal silently keeps the LAST
# value of a repeated key, which previously replaced 'da', 'cha', 'Na' and
# 'La' with the wrong glyphs. Those four entries now use the keys their rows
# call for ('D', 'Cha', 'Ni', 'Li').
#
# NOTE(review): transliterate() lower-cases its candidates before the lookup,
# so keys containing an upper-case letter are currently unreachable.
# NOTE(review): a few long-vowel entries repeat the glyph of another vowel
# (e.g. 'muu' == 'mu', 'rei' == 'rie', 'wei' == 'wie', 'nie' == 'ni'). They
# are left unchanged here - confirm whether that is intentional.
p = {
    # Vowel sounds
    'a': 'අ', 'aa': 'ආ', 'A': 'ඇ', 'Aa': 'ඈ', 'i': 'ඉ', 'ie': 'ඊ',
    'u': 'උ', 'uu': 'ඌ', 'e': 'එ', 'ea': 'ඒ', 'I': 'ඓ', 'o': 'ඔ',

    '-A': 'ැ', '-i': 'ි', '-u': 'ු', '-e': 'ෙ', '-o': 'ො', '-I': 'ෛ',

    # Vowel sounds - long
    '-aa': 'ා', '-Aa': 'ෑ', '-ie': 'ී', '-ei': 'ේ', '-oe': 'ෝ',
    '-uu': 'ූ', '-au': 'ෞ', '\\n': 'ං', '\\h': 'ඃ', '\\N': 'ඞ',

    # Consonants - Common (with inherent 'a')
    # NOTE(review): 'ra': 'ර' was listed here but was always overridden by
    # the 'ra' entry under "Special sounds"; that effective value is kept.
    # Confirm which mapping is intended.
    'ka': 'ක', 'ga': 'ග', 'ma': 'ම', 'ya': 'ය', 'ba': 'බ',
    'cha': 'ච', 'ja': 'ජ', 'ta': 'ට', 'la': 'ල', 'Da': 'ඩ', 'wa': 'ව',
    'tha': 'ත', 'sa': 'ස', 'da': 'ද', 'ha': 'හ', 'na': 'න', 'pa': 'ප',
    'Na': 'ණ', 'La': 'ළ',

    # Consonants - Common (bare letter)
    'k': 'ක්', 'g': 'ග', 'm': 'ම', 'y': 'ය', 'r': 'ර', 'b': 'බ',
    'ch': 'ච', 'j': 'ජ', 't': 'ට', 'l': 'ල', 'D': 'ඩ', 'w': 'ව',
    'th': 'ත', 's': 'ස', 'd': 'ද', 'h': 'හ', 'n': 'න්', 'p': 'ප',
    'N': 'ණ', 'L': 'ළ',

    # Consonants - Aspirated
    # NOTE(review): 'Tha' was listed twice ('ඨ', then 'ථ'); only the second
    # value ever took effect, so 'ඨ' currently has no key. Confirm the
    # intended key for 'ඨ'.
    'Ka': 'ඛ', 'Ga': 'ඝ', 'Cha': 'ඡ', 'Dha': 'ඪ',
    'Tha': 'ථ', 'dha': 'ධ', 'Pa': 'ඵ', 'bha': 'භ',

    # Consonants - Special
    'Ba': 'ඹ', 'Sa': 'ශ', 'sha': 'ෂ', 'fa': 'ෆ', 'GNa': 'ඥ',
    'KNa': 'ඤ', 'jha': 'ඣ',

    # Special sounds. The conjunct forms contain a Zero Width Joiner
    # (U+200D); it is written as an escape because the character is invisible.
    '-R': '\u0dbb\u0dca\u200d',  # ර + ් + ZWJ
    'Ya': '\u0dca\u200d\u0dba',  # ් + ZWJ + ය
    'ra': '\u0dca\u200d\u0dbb',  # ් + ZWJ + ර
    '-': '්',

    # Remaining vowel forms
    '-ru': 'ෘ', 'au': 'ඖ',

    # Consonant + vowel combinations
    'ki': 'කි', 'ku': 'කු', 'ke': 'කෙ', 'ko': 'කො',
    'kaa': 'කා', 'kAa': 'කෑ', 'kie': 'කී', 'kei': 'කේ',
    'koe': 'කෝ', 'kuu': 'කූ', 'kau': 'කෞ',

    'gi': 'ගි', 'gu': 'ගු', 'ge': 'ගෙ', 'go': 'ගො',
    'gaa': 'ගා', 'gAa': 'ගෑ', 'gie': 'ගී', 'gei': 'ගේ',
    'goe': 'ගෝ', 'guu': 'ගූ', 'gau': 'ගෞ',

    'mi': 'මි', 'mu': 'මු', 'me': 'මෙ', 'mo': 'මො',
    'maa': 'මා', 'mAa': 'මෑ', 'mie': 'මී', 'mei': 'මේ',
    'moe': 'මෝ', 'muu': 'මු', 'mau': 'මෞ',

    'yi': 'යි', 'yu': 'යු', 'ye': 'යේ', 'yo': 'යෝ',
    'yaa': 'යා', 'yAa': 'යෑ', 'yie': 'යී', 'yei': 'යේ',
    'yoe': 'යෝ', 'yuu': 'යූ', 'yau': 'යෞ',

    'ri': 'රි', 'ru': 'රු', 're': 'රෙ', 'ro': 'රො',
    'raa': 'රා', 'rAa': 'රෑ', 'rie': 'රී', 'rei': 'රී',
    'roe': 'රෝ', 'ruu': 'රූ', 'rau': 'රෞ',

    'bi': 'බි', 'bu': 'බු', 'be': 'බෙ', 'bo': 'බො',
    'baa': 'බා', 'bAa': 'බෑ', 'bie': 'බී', 'bei': 'බේ',
    'boe': 'බෝ', 'buu': 'බූ', 'bau': 'බෞ',

    'ci': 'චි', 'cu': 'චු', 'ce': 'චෙ', 'co': 'චො',
    'caa': 'චා', 'cAa': 'චෑ', 'cie': 'චී', 'cei': 'චේ',
    'coe': 'චෝ', 'cuu': 'චූ', 'cau': 'චෞ',

    'chi': 'චි', 'chu': 'චු', 'che': 'චෙ', 'cho': 'චො',
    'chaa': 'චා', 'chAa': 'චෑ', 'chie': 'චී', 'chei': 'චේ',
    'choe': 'චෝ', 'chuu': 'චූ', 'chau': 'චෞ',

    'ji': 'ජි', 'ju': 'ජු', 'je': 'ජෙ', 'jo': 'ජො',
    'jaa': 'ජා', 'jAa': 'ජෑ', 'jie': 'ජී', 'jei': 'ජේ',
    'joe': 'ජෝ', 'juu': 'ජූ', 'jau': 'ජෞ',

    'ti': 'ටි', 'tu': 'ටු', 'te': 'ටෙ', 'to': 'ටො',
    'taa': 'ටා', 'tAa': 'ටෑ', 'tie': 'ටී', 'tei': 'ටේ',
    'toe': 'ටෝ', 'tuu': 'ටූ', 'tau': 'ටෞ',

    'li': 'ලි', 'lu': 'ළු', 'le': 'ලෙ', 'lo': 'ලො',
    'laa': 'ලා', 'lAa': 'ලෑ', 'lie': 'ලී', 'lei': 'ලේ',
    'loe': 'ලෝ', 'luu': 'ළු', 'lau': 'ලෞ',

    'Di': 'ඩි', 'Du': 'ඩු', 'De': 'ඩෙ', 'Do': 'ඩො',
    'Daa': 'ඩා', 'DAa': 'ඩෑ', 'Die': 'ඩී', 'Dei': 'ඩේ',
    'Doe': 'ඩෝ', 'Duu': 'ඩූ', 'Dau': 'ඩෞ',

    'wi': 'වි', 'wu': 'වු', 'we': 'වේ', 'wo': 'වෝ',
    'waa': 'වා', 'wAa': 'වා', 'wie': 'වී', 'wei': 'වී',
    'woe': 'වෝ', 'wuu': 'වු', 'wau': 'වෞ',

    'thi': 'ති', 'thu': 'තු', 'the': 'තෙ', 'tho': 'තො',
    'thaa': 'තා', 'thAa': 'තා', 'thie': 'තී', 'thei': 'තේ',
    'thoe': 'තෝ', 'thuu': 'තු', 'thau': 'තෞ',

    'si': 'සි', 'su': 'සු', 'se': 'සෙ', 'so': 'සො',
    'saa': 'සා', 'sAa': 'සෑ', 'sie': 'සී', 'sei': 'සේ',
    'soe': 'සෝ', 'suu': 'සූ', 'sau': 'සෞ',

    'di': 'දි', 'du': 'දු', 'de': 'දෙ', 'do': 'දො',
    'daa': 'දා', 'dAa': 'දෑ', 'die': 'දී', 'dei': 'දේ',
    'doe': 'දෝ', 'duu': 'දූ', 'dau': 'දෞ',

    'hi': 'හි', 'hu': 'හු', 'he': 'හෙ', 'ho': 'හො',
    'haa': 'හා', 'hAa': 'හා', 'hie': 'හී', 'hei': 'හේ',
    'hoe': 'හෝ', 'huu': 'හු', 'hau': 'හෞ',

    'ni': 'නි', 'nu': 'නු', 'ne': 'නෙ', 'no': 'නො',
    'naa': 'නා', 'nAa': 'නෑ', 'nie': 'නි', 'nei': 'නී',
    'noe': 'නෝ', 'nuu': 'නු', 'nau': 'නෞ',

    'pi': 'පි', 'pu': 'පු', 'pe': 'පෙ', 'po': 'පො',
    'paa': 'පා', 'pAa': 'පෑ', 'pie': 'පී', 'pei': 'පේ',
    'poe': 'පෝ', 'puu': 'පූ', 'pau': 'පෞ',

    'Ni': 'ණි', 'Nu': 'ණු', 'Ne': 'ණෙ', 'No': 'ණො',
    'Naa': 'ණා', 'NAa': 'ණෑ', 'Nie': 'ණී', 'Nei': 'ණේ',
    'Noe': 'ණෝ', 'Nuu': 'ණූ', 'Nau': 'ණෞ',

    'Li': 'ළි', 'Lu': 'ළු', 'Le': 'ළෙ', 'Lo': 'ළො',
    'Laa': 'ළා', 'LAa': 'ළෑ', 'Lie': 'ළී', 'Lei': 'ළේ',
    'Loe': 'ළෝ', 'Luu': 'ළූ', 'Lau': 'ළෞ',

    'bhu': 'භු', 'sh': 'ශ',
}
# Function to transliterate Latin text to Sinhala
def transliterate(word, mapping=None):
    """Transliterate romanized (Latin-script) text into Sinhala script.

    The text is scanned left to right. At each position the longest key of
    the mapping that matches the lower-cased input is replaced by its value;
    a character that starts no key is copied through unchanged.

    Args:
        word: Text to transliterate. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are returned unchanged.
        mapping: Optional Latin -> Sinhala lookup table. Defaults to the
            module-level table ``p``.

    Returns:
        The transliterated text with surrounding whitespace removed, or the
        input itself when it is not a string.
    """
    if not isinstance(word, str):
        return word

    table = p if mapping is None else mapping
    word = word.strip()

    # Nothing to transliterate when the text has no ASCII letter at all.
    # (str.isalpha() alone is also true for Sinhala letters.)
    if not any(ch.isascii() and ch.isalpha() for ch in word):
        return word

    # Derive the window from the table so that 4-letter keys such as 'chaa'
    # are reachable instead of being split into 'cha' + 'a'.
    longest_key = max(map(len, table), default=0)

    pieces = []
    pos = 0
    while pos < len(word):
        for length in range(min(longest_key, len(word) - pos), 0, -1):
            # NOTE: candidates are lower-cased for case-insensitive matching,
            # so table keys containing upper-case letters never match.
            candidate = word[pos:pos + length].lower()
            if candidate in table:
                pieces.append(table[candidate])
                pos += length
                break
        else:
            # No key starts here: keep the character as-is
            pieces.append(word[pos])
            pos += 1

    return ''.join(pieces)

# Function to process the CSV file
def process_csv(
    file_path,
    output_path='/content/drive/MyDrive/IndoNLPWorkshop_2025/Final_Result/Sinhala-Test-set-2-rulebase.csv',
):
    """Transliterate a test-set CSV and save the result next to a report.

    Reads ``file_path``, transliterates ``Column1`` into a new
    ``Transliterated`` column, writes the frame to ``output_path`` and prints
    the input, expected (``Column2``) and transliterated text for every row.

    Args:
        file_path: CSV file containing ``Column1`` and ``Column2``.
        output_path: Destination CSV. Defaults to the Test Set 2 result file.

    Raises:
        KeyError: If ``Column1`` or ``Column2`` is missing from the input.
    """
    df = pd.read_csv(file_path)

    # Check both columns up front so nothing is written when the report
    # below could not be printed anyway.
    missing = [name for name in ('Column1', 'Column2') if name not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Add a new column for transliterated text
    df['Transliterated'] = df['Column1'].apply(transliterate)

    # Save the dataframe to a new CSV file
    df.to_csv(output_path, index=False)
    print(f"File saved to {output_path}")

    # Print results: input text, expected output, and transliterated output
    for _, row in df.iterrows():
        print(f"Input Text: {row['Column1']}")
        print(f"Expected Output: {row['Column2']}")
        print(f"Transliterated Output: {row['Transliterated']}")
        print("-" * 50)

# Entry point: transliterate Test Set 2
def main():
    """Run the rule-based transliteration over the Test Set 2 CSV."""
    process_csv('/content/drive/MyDrive/IndoNLPWorkshop_2025/Sinhala-Test-set-2.csv')


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import evaluate
from string import punctuation
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

def compute_metrics(ref_str, pred_str, find_wer=True, find_cer=True, find_bleu=True, do_normalize_text=False):
    """
    Compute evaluation metrics WER, CER, and BLEU for given reference and predicted strings.

    Both strings are trimmed of surrounding whitespace and ASCII punctuation
    (after passing through the module-level ``normalizer`` when
    ``do_normalize_text`` is set). Scoring uses the module-level
    ``wer_metric``, ``cer_metric`` and ``bleu_metric`` objects.

    Args:
        ref_str: Expected text. ``None``/NaN is treated as an empty string.
        pred_str: Predicted text. ``None``/NaN is treated as an empty string.
        find_wer: Compute WER when true, otherwise return ``None`` for it.
        find_cer: Compute CER when true, otherwise return ``None`` for it.
        find_bleu: Compute BLEU when true, otherwise return ``None`` for it.
        do_normalize_text: Apply ``normalizer`` before trimming.

    Returns:
        A ``(wer, cer, bleu)`` tuple. When either cleaned string is empty the
        requested metrics take their worst values (1.0, 1.0, 0.0).
    """
    def _clean(text):
        # An empty CSV cell is read back by pandas as NaN (a float), which
        # has no .strip(); treat missing values as empty text.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
        if do_normalize_text:
            text = normalizer(text)
        return text.strip().strip(punctuation).strip()

    pred_str = _clean(pred_str)
    ref_str = _clean(ref_str)

    if not (ref_str and pred_str):
        # Worst-case defaults for empty predictions or references, honouring
        # the find_* switches like the scored branch does.
        return (
            1.0 if find_wer else None,
            1.0 if find_cer else None,
            0.0 if find_bleu else None,
        )

    wer = wer_metric.compute(predictions=[pred_str], references=[ref_str]) if find_wer else None
    cer = cer_metric.compute(predictions=[pred_str], references=[ref_str]) if find_cer else None
    bleu = bleu_metric.compute(predictions=[pred_str], references=[ref_str])["bleu"] if find_bleu else None

    return wer, cer, bleu

# Scorers and text normalizer used by compute_metrics
wer_metric, cer_metric, bleu_metric = (
    evaluate.load(metric_name) for metric_name in ("wer", "cer", "bleu")
)
normalizer = BasicTextNormalizer()

# Rule-based transliteration results for Test Set 2
file_path = '/content/drive/MyDrive/IndoNLPWorkshop_2025/Final_Result/Sinhala-Test-set-2-rulebase.csv'
df = pd.read_csv(file_path)

# Score each (expected, transliterated) pair, one (wer, cer, bleu) per row
row_scores = [
    compute_metrics(expected, predicted)
    for expected, predicted in zip(df["Column2"], df["Transliterated"])
]
wer_list = [scores[0] for scores in row_scores]
cer_list = [scores[1] for scores in row_scores]
bleu_list = [scores[2] for scores in row_scores]

# Attach the per-row metrics as new columns
df["WER"], df["CER"], df["BLEU"] = wer_list, cer_list, bleu_list

# Write the scored rows to their own CSV file
output_path = '/content/drive/MyDrive/IndoNLPWorkshop_2025/Final_Result/Sinhala-Test-set-2-rulebase-with-metrics.csv'
df.to_csv(output_path, index=False)
print(f"File with metrics saved to {output_path}")

In [None]:
# Mean of every per-row metric over the whole test set
average_wer, average_cer, average_bleu = (
    df[column].mean() for column in ("WER", "CER", "BLEU")
)

# Report the averages to four decimal places
print("Overall Averages:")
for label, value in (("WER", average_wer), ("CER", average_cer), ("BLEU", average_bleu)):
    print(f"{label}: {value:.4f}")