##### This is for extracting rhyme chains from a poem. It can be used for feature engineering.

In [None]:
import re

# ==============================================================================
# The Final, Grammatically Precise Tokenizer for Clean Syllables
# This is used in Step 2 of the main function below.


# ==============================================================================
# The Main "Syllable Splitter" Function
# ==============================================================================
def custom_syllable_splitter(text: str) -> list:
    """
    Split text into syllable tokens, breaking up stacked consonants first.

    The work is done in two passes so that stacked words such as
    'နက္ခတ္တ' and 'ဥက္ကဋ္ဌ' come out as one token per syllable.

    Args:
        text: The text to split.

    Returns:
        The tokens in reading order. Tokens that are empty or contain only
        whitespace are dropped. Nothing is written to stdout.
    """
    # --- Step 1: un-stack consonants ---
    # Rewrite "<consonant>[်]္<consonant>" as "<consonant>်<consonant>".
    # re.sub only replaces non-overlapping matches, so a chained stack needs
    # more than one pass; keep going until no stack is left.
    stacked_consonant_pattern = r'([က-အ])(်?္)([က-အ])'
    processed_text = text
    while re.search(stacked_consonant_pattern, processed_text):
        processed_text = re.sub(stacked_consonant_pattern, r'\1်' + r'\3', processed_text)

    # --- Step 2: mark syllable boundaries ---
    # Append a space after every match: an ASCII alphanumeric run, a Burmese
    # syllable, or (as a fallback) any single character.
    processed_text = re.sub(r"(([A-Za-z0-9]+)|[က-အ|ဥ|ဦ](င်္|[က-အ|ဥ][ှ]*[့း]*[်]|္[က-အ]|[ါ-ှႏꩻ][ꩻ]*){0,}|.)", r"\1 ", processed_text)

    # Splitting on the inserted spaces leaves empty strings (trailing space,
    # original spaces); filter those out.
    return [token for token in processed_text.split(" ") if token.strip()]

In [8]:
# --- Constants ---
# All Burmese consonant letters (including 'အ'). _get_onset_length uses this
# to decide whether a word starts with a consonant, i.e. whether it has an
# onset at all.
BURMESE_CONSONANTS = "ကခဂဃငစဆဇဈညဋဌဍဎဏတထဒဓနပဖဗဘမယရလဝသဟဠအ"
# Consonantal medials. When they directly follow the initial consonant they
# are counted as part of the onset, not the rime.
CONSONANTAL_MEDIALS = "ျြှွ" 

# --- The Rhyme Group Normalization Map ---
# Maps a complete spelled rime (everything after the onset) to the canonical
# rime used when two syllables are compared. The lookup is an exact match on
# the whole rime; a rime that is not listed here is compared as spelled.
# Key: orthographic (spelled) rime
# Value: normalized rime used for comparison
RIME_NORMALIZATION_MAP = {
    # Bare stop finals တ် / ပ် are folded into က်.
    # NOTE(review): this puts the -at rimes in the same group as -et (က်);
    # confirm that is the intended poetic grouping.
    "တ်": "က်", "ပ်": "က်",  
    # -it sound: ိတ် / ိပ် / ိစ် all compare as ိက်
    "ိတ်": "ိက်", "ိပ်": "ိက်", "ိစ်": "ိက်",
    # -ut sound: ုတ် compares as ုပ်
    "ုတ်": "ုပ်",
    # -et sound: identity entry, listed only to make the canonical form explicit
    "က်": "က်",
    
    # -an sound: မ် compares as န်
    "မ်": "န်",
    # -in sound: ိမ် compares as ိန်
    "ိမ်": "ိန်",
    # NOTE(review): this maps the -un rime onto the -in group, whereas the
    # other vowel-bearing entries keep their vowel sign; confirm it is intended.
    "ုန်": "ိန်", # This can sometimes be phonetic, though less common
    
    # Special finals
    "ဉ်": "န်", # Sounds like -an
    "ည်": "ယ်", # Sounds like -ay

    # ယ့် compares as ဲ့
    "ယ့်":"ဲ့"
}


def _get_onset_length(word: str) -> int:
    """Finds the length of the initial consonant cluster (onset)."""
    if not word or word[0] not in BURMESE_CONSONANTS:
        return 0
    
    onset_len = 1
    # Greedily consume any following consonantal medials
    while onset_len < len(word) and word[onset_len] in CONSONANTAL_MEDIALS:
        onset_len += 1
    return onset_len

# --- The Main Rhyme Function ---
def get_rhyme_group(word: str) -> str:
    """
    Return the normalized rhyme group of a Burmese word.

    The rime is whatever follows the onset reported by _get_onset_length.
    'ါ' is folded into 'ာ' so both spellings compare equal, and the result
    is then looked up in RIME_NORMALIZATION_MAP.

    Returns:
        The mapped rime when the spelled rime is a key of the map, the spelled
        rime itself otherwise, and "" when nothing follows the onset.
    """
    # 1. Drop the onset and unify the two spellings of the long 'a' vowel.
    spelled_rime = word[_get_onset_length(word):].replace('ါ', 'ာ')

    if spelled_rime == "":
        return ""

    # 2. Use the canonical form when one is registered for this rime.
    if spelled_rime in RIME_NORMALIZATION_MAP:
        return RIME_NORMALIZATION_MAP[spelled_rime]
    return spelled_rime



def get_poetic_rime(word: str) -> str:
    """Return the normalized rime of a Burmese word, or "" if it has none.

    The onset (see _get_onset_length) is stripped, 'ါ' is rewritten as 'ာ',
    and the result is replaced by its RIME_NORMALIZATION_MAP entry when one
    exists.
    """
    tail = word[_get_onset_length(word):]
    if tail == "":
        return ""

    # Vowel normalization first, then the whole-rime lookup.
    unified = tail.replace('ါ', 'ာ')
    try:
        return RIME_NORMALIZATION_MAP[unified]
    except KeyError:
        return unified



"၁။ ခိုးသားထားပြ၊ ဟူတုံက၊ မုချကြောက်အပ်စွာ။

၂။ ကိုယ်တွင်းသူခိုး၊ မြင်သူခိုး၊ နှစ်မျိုးမှတ်ကြရာ။

၃။ ကိုယ်တွင်းထားပြ၊ ပြင်ထားပြ၊ နှစ်ဝရှိသည်သာ။

၄။ ကိုယ်တွင်းရန်သူ၊ ပြင်ရန်သူ၊ နှစ်မူခွဲရှုရာ။

၅။ ပြင်ပလူထက်၊ ကိုယ်တွင်းခက်၊ ဆက်ဆက်သိအပ်စွာ။

၆။ ကိုယ်တွင်းခိုးသား၊ လက်ခံထား၊ ပြင်ခိုးသားတွေ မွှေလိမ့်မည်။

၇။ ကိုယ်တွင်းထားပြ၊ လက်ခံကြ၊ ပြင်ကထားပြ-ချေလိမ့်မည်။

၈။ ကိုယ်တွင်းရန်သူ၊ လက်ခံမူ၊ ပြင်ရန်သူကြောင့်- သေလိမ့်မည်။ "


In [9]:
def _collect_rhyme_chains(syllables_by_line: list) -> list:
    """Return every rhyme chain that starts on the first line of a stanza.

    A chain is a tuple of syllables, one per consecutive line, in which each
    syllable has the same get_poetic_rime value as the previous one. A chain
    is recorded when it reaches the last line or when the next line has no
    syllable that continues it. Chains are returned in depth-first discovery
    order and may contain duplicates and single-syllable chains.
    """
    chains = []
    last_line_index = len(syllables_by_line) - 1

    def extend(path, line_index):
        # The path already ends on the last line: nothing left to link to.
        if line_index >= last_line_index:
            chains.append(path)
            return

        rime_to_match = get_poetic_rime(path[-1])
        linked = False
        for candidate in syllables_by_line[line_index + 1]:
            if get_poetic_rime(candidate) == rime_to_match:
                linked = True
                extend(path + (candidate,), line_index + 1)

        # Dead end: keep the (shorter) chain found so far.
        if not linked:
            chains.append(path)

    if syllables_by_line:
        for start_word in syllables_by_line[0]:
            extend((start_word,), 0)
    return chains


def find_all_rhyme_chains(poem_string: str):
    """
    Print every consecutive rhyme chain found in each stanza of a poem.

    Stanzas are separated by '။' and lines within a stanza by '၊'. Each line
    is split with custom_syllable_splitter. Chains always start on the first
    line of the stanza; only chains of two or more syllables are reported,
    each one once, longest first. Chains of equal length are printed in the
    order they were discovered, so the output is the same on every run.

    Args:
        poem_string: The poem text.

    Returns:
        None. The report is written to stdout.
    """
    print("=" * 50)
    print("📜 Pathfinding Analysis: All Consecutive Rhyme Chains")
    print("=" * 50)

    clean_poem = poem_string.strip()
    stanzas = [s.strip() for s in clean_poem.split('။') if s.strip()]

    for stanza_number, stanza_text in enumerate(stanzas, start=1):
        print(f"\n--- Stanza {stanza_number} ---")

        # One list of syllables per line of the stanza.
        lines = [line.strip() for line in stanza_text.split('၊') if line.strip()]
        syllables_by_line = [custom_syllable_splitter(line) for line in lines]

        # Keep only real chains (two or more syllables).
        valid_chains = [
            chain for chain in _collect_rhyme_chains(syllables_by_line) if len(chain) > 1
        ]

        # dict.fromkeys de-duplicates while keeping discovery order (a set
        # would make the order depend on string hash randomisation); sorted()
        # is stable, so equal-length chains keep that order.
        unique_chains = sorted(dict.fromkeys(valid_chains), key=len, reverse=True)

        if not unique_chains:
            print("No consecutive rhyme chains found.")
        else:
            print("Found the following rhyme chains:")
            for chain in unique_chains:
                print(f"  - {' -> '.join(chain)}")

    print("\n" + "=" * 50)

# --- Example poem ---
# A single stanza (terminated by '။') made of three lines separated by '၊'.
poem_string = "ခိုးသားထားပြ၊ ဟူတုံက၊ မုချကြောက်အပ်စွာ။"





# Print the rhyme-chain report for the example poem when run as a script.
if __name__ == "__main__":
    find_all_rhyme_chains(poem_string)


📜 Pathfinding Analysis: All Consecutive Rhyme Chains

--- Stanza 1 ---



Found the following rhyme chains:
  - ပြ -> က -> ချ



In [10]:
def _first_rhyme_per_line(syllables_by_line: list, rime_to_match: str) -> list:
    """Follow one rime downwards from the second line of a stanza.

    For each line after the first, the first syllable whose get_poetic_rime
    equals `rime_to_match` is taken. The walk stops at the first line that
    has no such syllable.

    Returns:
        A list of (syllable, 1-based position in its line) tuples, one per
        line that continued the chain.
    """
    chain = []
    for line in syllables_by_line[1:]:
        for position, word in enumerate(line, start=1):
            if get_poetic_rime(word) == rime_to_match:
                chain.append((word, position))
                break
        else:
            # No syllable in this line carries the rime: the chain is broken.
            break
    return chain


def find_dominant_scheme_in_stanza(stanza_text: str, stanza_number: int):
    """
    Print the longest consecutive rhyme chain of a stanza and its positions.

    The stanza is split into lines on '၊' and each line into syllables with
    custom_syllable_splitter. Every syllable of the first line that has a
    non-empty rime is tried as a starting point; its rime is then followed
    through the next lines, taking the first matching syllable of each line,
    until a line has no match. The longest chain wins (the earliest starting
    syllable on ties) and is reported as a dash-separated list of 1-based
    syllable positions, e.g. "4-3-2".

    Args:
        stanza_text: Text of one stanza.
        stanza_number: Number shown in the stanza header.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n--- Stanza {stanza_number} ---")

    lines = [line.strip() for line in stanza_text.split('၊') if line.strip()]
    if len(lines) < 2:
        print("Not enough lines to determine a scheme.")
        return

    syllables_by_line = [custom_syllable_splitter(line) for line in lines]
    best_chain = []

    # Try every syllable of the first line as the head of a chain.
    for start_position, start_word in enumerate(syllables_by_line[0], start=1):
        rime = get_poetic_rime(start_word)
        if not rime:
            continue

        # A single linear walk replaces the former doubly recursive search,
        # whose result never depended on which matching syllable was picked.
        current_chain = [(start_word, start_position)]
        current_chain += _first_rhyme_per_line(syllables_by_line, rime)

        # Strictly longer only, so the earliest start wins on ties.
        if len(current_chain) > len(best_chain):
            best_chain = current_chain

    # Report the results
    if len(best_chain) >= 2:
        positions = [pos for _, pos in best_chain]
        words = [word for word, _ in best_chain]
        scheme_str = '-'.join(map(str, positions))

        print(f"✅ Found a rhyme scheme: ({scheme_str})")
        print(f"   - Rime Group: '{get_poetic_rime(words[0])}'")
        print(f"   - Rhyming Words: {', '.join(words)}")
    else:
        print("❌ No dominant consecutive rhyme scheme found.")

def analyze_poem_for_scheme(poem_string: str):
    """Print the positional rhyme scheme of every stanza in a poem.

    The poem is split into stanzas on '။'; blank stanzas are skipped and the
    remaining ones are numbered from 1 and passed, stripped, to
    find_dominant_scheme_in_stanza. The report is framed by banner lines.
    """
    banner = "=" * 50
    print(banner)
    print("📜 Positional Rhyme Scheme Analysis")
    print(banner)

    stanza_number = 0
    for raw_stanza in poem_string.strip().split('။'):
        stanza_text = raw_stanza.strip()
        if not stanza_text:
            continue
        stanza_number += 1
        find_dominant_scheme_in_stanza(stanza_text, stanza_number)

    print("\n" + banner)




# --- Second test case: a longer poem for the positional scheme analysis ---
# Each text line is meant to be one stanza: '၊' separates its lines and '။'
# ends it.
# NOTE(review): the 9th and 10th text lines below have no closing '။', and
# analyze_poem_for_scheme splits stanzas on '။', so the last three text lines
# are analysed together as one stanza — confirm whether that is intended.
positional_poem = """တောင်ပံကွန့်မြူး၊ ပိုးစုန်းကြူး၊ အထူးအဆန်းပင်။
အရောင်တွဲ့တွဲ့၊ ထွန်းလင်းခဲ့၊ မီးရဲ့ပမာသွင်။
ပျင်းရိလျက်သာ၊ အိပ်နေပါ၊ ရောင်ဝါမပေါ်လွင်။
ထိုနည်းတူစွာ၊ လူတို့မှာ၊ ပမာနှိုင်းဆင်ခြင်။
ကိုယ်နှင့်ဆိုင်ဘိ၊ အလုပ်ရှိ၊ လျှံငြိဂုဏ်ရောင်ထင်။
ကမ္ဘာတိုက်ဘောင်၊ အလုပ်ရောင်၊ ထွန်းပြောင်အမြဲပင်။
ပျင်းပျင်းရိရိ၊ အိပ်နေဘိ၊ မရှိအလုပ်ခွင်။
မှေးတေးမှိန်တိမ်၊ သေးသိမ်သိမ်၊ အရှိန်ကင်းလိမ့်ပင်။
လူဖြစ်ပါခဲ့၊ အရှိန်မဲ့၊ တကယ့်နွားအသွင်
အပျင်းရောဂါ၊ စွဲကပ်လာ၊ မြန်စွာကုဖို့ပြင်
မကုစားဘဲ အပျင်းစွဲ၊ ငမွဲငနုံပင်။ """

# Print the positional rhyme scheme report when run as a script.
if __name__ == "__main__":
 
    analyze_poem_for_scheme(positional_poem)

📜 Positional Rhyme Scheme Analysis

--- Stanza 1 ---



✅ Found a rhyme scheme: (4-3-2)
   - Rime Group: 'ူး'
   - Rhyming Words: မြူး, ကြူး, ထူး

--- Stanza 2 ---



✅ Found a rhyme scheme: (3-3-2)
   - Rime Group: 'ဲ့'
   - Rhyming Words: တွဲ့, ခဲ့, ရဲ့

--- Stanza 3 ---



✅ Found a rhyme scheme: (4-3-2)
   - Rime Group: 'ာ'
   - Rhyming Words: သာ, ပါ, ဝါ

--- Stanza 4 ---



✅ Found a rhyme scheme: (4-3-2)
   - Rime Group: 'ာ'
   - Rhyming Words: စွာ, မှာ, မာ

--- Stanza 5 ---



✅ Found a rhyme scheme: (4-3-2)
   - Rime Group: 'ိ'
   - Rhyming Words: ဘိ, ရှိ, ငြိ

--- Stanza 6 ---



✅ Found a rhyme scheme: (4-3-2)
   - Rime Group: 'ောင်'
   - Rhyming Words: ဘောင်, ရောင်, ပြောင်

--- Stanza 7 ---



✅ Found a rhyme scheme: (3-3-2)
   - Rime Group: 'ိ'
   - Rhyming Words: ရိ, ဘိ, ရှိ

--- Stanza 8 ---



✅ Found a rhyme scheme: (3-2-2)
   - Rime Group: 'ိန်'
   - Rhyming Words: မှိန်, သိမ်, ရှိန်

--- Stanza 9 ---






✅ Found a rhyme scheme: (4-3-2)
   - Rime Group: 'ဲ့'
   - Rhy