In [9]:
import re
from google.colab import files

# Tamil phonetic patterns mapping
phonetic_mapping = {
    'க்': 'k', 'க': 'k', 'கி': 'ki', 'கீ': 'kee', 'கு': 'koo', 'கூ': 'koo', 'கெ': 'ke', 'கே': 'ke',
    'கை': 'kai', 'கொ': 'ko', 'கோ': 'ko', 'கௌ': 'kau', 'ங': 'ng', 'ச': 'ch', 'சி': 'chi', 'சீ': 'chee',
    'சு': 'chu', 'சூ': 'choo', 'செ': 'che', 'சே': 'che', 'சை': 'chai', 'சொ': 'cho', 'சோ': 'cho', 'ஞ': 'nya',
    'ஜ': 'j', 'ஜி': 'ji', 'ஜீ': 'jee', 'ஜு': 'ju', 'ஜூ': 'joo', 'ஜெ': 'je', 'ஜே': 'je', 'ஜொ': 'jo',
    'ஜோ': 'jo', 'த': 'th', 'தி': 'thi', 'தீ': 'thee', 'து': 'thu', 'தூ': 'thoo', 'தெ': 'the', 'தே': 'the',
    'தை': 'thai', 'தொ': 'tho', 'தோ': 'tho', 'ந': 'n', 'நி': 'ni', 'நீ': 'nee', 'நு': 'nu', 'நூ': 'noo',
    'நெ': 'ne', 'நே': 'ne', 'நை': 'nai', 'நொ': 'no', 'நோ': 'no', 'ப': 'p', 'பி': 'pi', 'பீ': 'pee', 'பு': 'pu',
    'பூ': 'poo', 'பெ': 'pe', 'பே': 'pe', 'பை': 'pai', 'பொ': 'po', 'போ': 'po', 'ம': 'm', 'மி': 'mi', 'மீ': 'mee',
    'மு': 'mu', 'மூ': 'moo', 'மெ': 'me', 'மே': 'me', 'மை': 'mai', 'மொ': 'mo', 'மோ': 'mo', 'ய': 'y', 'யி': 'yi',
    'யீ': 'yee', 'யு': 'yu', 'யூ': 'yoo', 'யெ': 'ye', 'யே': 'ye', 'யை': 'yai', 'யொ': 'yo', 'யோ': 'yo',
    'ர': 'r', 'ரி': 'ri', 'ரீ': 'ree', 'ரு': 'ru', 'ரூ': 'roo', 'ரெ': 're', 'ரே': 're', 'ரை': 'rai', 'ரொ': 'ro',
    'ரோ': 'ro', 'ல': 'l', 'லி': 'li', 'லீ': 'lee', 'லு': 'lu', 'லூ': 'loo', 'லெ': 'le', 'லே': 'le', 'லை': 'lai',
    'லொ': 'lo', 'லோ': 'lo', 'ள': 'L', 'ளி': 'Li', 'ளீ': 'Lee', 'ளு': 'Lu', 'ளூ': 'Loo', 'ளெ': 'Le', 'ளே': 'Le',
    'ளை': 'Lai', 'ளொ': 'Lo', 'ளோ': 'Lo', 'வ': 'v', 'வி': 'vi', 'வீ': 'vee', 'வு': 'vu', 'வூ': 'voo', 'வெ': 've',
    'வே': 've', 'வை': 'vai', 'வொ': 'vo', 'வோ': 'vo', 'ழ': 'zh', 'ழி': 'zhi', 'ழீ': 'zhee', 'ழு': 'zhu', 'ழூ': 'zhoo',
    'ழெ': 'zhe', 'ழே': 'zhe', 'ழை': 'zhai', 'ழொ': 'zho', 'ழோ': 'zho', 'ன': 'n', 'ணி': 'ni', 'ணீ': 'nee', 'ணு': 'nu',
    'ணூ': 'noo', 'ணெ': 'ne', 'ணே': 'ne', 'ணை': 'nai', 'ணொ': 'no', 'ணோ': 'no'
}

# Function to convert Tamil words to phonetic codes
def tamil_to_phonetic(word):
    phonetic_code = ""

    # Iterate over the word, matching Tamil characters
    while word:
        matched = False
        for key in phonetic_mapping:
            if word.startswith(key):
                phonetic_code += phonetic_mapping[key]
                word = word[len(key):]  # Remove the matched part
                matched = True
                break
        if not matched:
            # If no match is found, add the first character as is
            phonetic_code += word[0]
            word = word[1:]

    return phonetic_code

# Function to match partial or substring phonetic similarity
def phonetic_match(word1, word2):
    phonetic_code1 = tamil_to_phonetic(word1)
    phonetic_code2 = tamil_to_phonetic(word2)

    # Allow partial or substring matching (adjust this threshold as needed)
    if phonetic_code1 in phonetic_code2 or phonetic_code2 in phonetic_code1:
        return True
    return False

# Function to load dataset from a file
def load_dataset(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # Read all lines and remove extra spaces or newlines
            dataset = [line.strip() for line in file.readlines()]
        return dataset
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return []

# Function to get suggestions for phonetically similar words from dataset
def get_phonetic_suggestions(input_word, dataset):
    suggestions = []
    for word in dataset:
        if phonetic_match(input_word, word):
            suggestions.append(word)
    return suggestions

# Main function to process input sentence and suggest corrections
def correct_sentence(input_sentence, dataset):
    # Split the sentence into words
    words = input_sentence.split()

    # For each word in the input sentence, find phonetic suggestions
    corrections = []
    for word in words:
        suggestions = get_phonetic_suggestions(word, dataset)
        if suggestions:
            corrections.append(f"Suggestions for '{word}': {', '.join(suggestions)}")
        else:
            corrections.append(f"No suggestions for '{word}'")

    return corrections

# Upload and read the dataset file
uploaded = files.upload()

# Read the uploaded file
file_path = list(uploaded.keys())[0]

# Load the dataset
dataset = load_dataset(file_path)

if dataset:
    # paragraph to check
    has_paragraph = input("Do you have a paragraph to check? (yes/no): ").lower()

    while has_paragraph == 'yes':
        # Input sentence
        input_sentence = input("Enter a Tamil sentence: ")

        # Get corrections and suggestions
        corrections = correct_sentence(input_sentence, dataset)

        # Output the results
        for correction in corrections:
            print(correction)

        # Ask again if the user has more paragraphs
        has_paragraph = input("Do you have another paragraph to check? (yes/no): ").lower()

    print("Process finished.")
else:
    print("Dataset could not be loaded.")


Saving dataset.txt to dataset (8).txt
Do you have a paragraph to check? (yes/no): yes
Enter a Tamil sentence: சபை எளிமை பின்பற்றுங்கள
Suggestions for 'சபை': சபை, 
Suggestions for 'எளிமை': எளிமை, 
Suggestions for 'பின்பற்றுங்கள': பின்பற்றுங்கள், , பற்று
Do you have another paragraph to check? (yes/no): yes
Enter a Tamil sentence: சிறுவர்கள  சிந்தனை சார்ந்தவர்கள்
Suggestions for 'சிறுவர்கள': சிறுவர்கள், சிறுவர், 
Suggestions for 'சிந்தனை': சிந்தனை செய், சிந்தனை, 
Suggestions for 'சார்ந்தவர்கள்': சார்ந்தவர்கள், 
Do you have another paragraph to check? (yes/no): yes
Enter a Tamil sentence: நகைச்சவையான கலந்தாய்வு ஊக்கமளிக்கும்
Suggestions for 'நகைச்சவையான': நகை, 
Suggestions for 'கலந்தாய்வு': , கலந்தாய்வு
Suggestions for 'ஊக்கமளிக்கும்': ஊக்கமளிக்கும், 
Do you have another paragraph to check? (yes/no): yes
Enter a Tamil sentence: வரி சேமிக்கவும
Suggestions for 'வரி': முகவரி, வரி, 
Suggestions for 'சேமிக்கவும': சேமிக்கவும், 
Do you have another paragraph to check? (yes/no): yes
Enter a Tamil