In [808]:
# -*- coding: utf-8 -*-
import os
import re
import pandas as pd
import graphviz


## Phonological inventory
# Tone-marked vowel characters, one set per tone label.
h_tone = {*"áéíóú"}   # high tone (H)
l_tone = {*"àèìòù"}   # low tone (L)
f_tone = {*"âêîôû"}   # falling contour (F)
r_tone = {*"ǎěǐǒǔ"}   # R contour
# Unmarked vowels: used as the second half of a long vowel
# (â is a short vowel with fall tone; âa is a long F).
untoned = {*"aeiou"}

# Every character that counts as a vowel, marked or not.
vowels = set().union(h_tone, l_tone, r_tone, f_tone, untoned)

# Coda weight setting: 1 if a coda carries a mora, else 0.
moraic_coda = 1

# Level tones (H and L) in one set.
tones = h_tone.union(l_tone)
# Contour ("special") tones (R and F) in one set.
special_tones = r_tone.union(f_tone)
         
# The four helpers below are plain module-level functions. They were
# previously bound to module scope through ``global`` declarations inside the
# class body; their names and signatures are unchanged.

def check_coda(syl):
    """Return the mora contributed by a coda in the syllable ``syl``.

    A coda is detected as any non-vowel character that immediately follows a
    vowel character anywhere in ``syl``.

    Returns:
        The module-level ``moraic_coda`` setting if such a pair exists,
        otherwise 0.
    """
    for previous, current in zip(syl, syl[1:]):
        if previous in vowels and current not in vowels:
            return moraic_coda
    return 0


def vowel_count(syl):
    """Return the number of vowel characters in the syllable ``syl``.

    Every character found in ``vowels`` (or ``tones``) counts as one.
    """
    # NOTE: an earlier branch added 2 for an F/R-marked vowel followed by a
    # non-vowel. It could never run, because every member of
    # ``special_tones`` is also a member of ``vowels``; it has been dropped
    # without changing any result.
    return sum(1 for ch in syl if ch in vowels or ch in tones)


def mora_count(str):
    """Count the moras of a "."-separated string of syllables.

    The parameter keeps its historical name ``str`` so that keyword calls
    continue to work.

    Returns:
        A tuple ``(total, per_syllable)`` where ``per_syllable`` lists the
        weight of each syllable (``check_coda`` + ``vowel_count``) and
        ``total`` is their sum.
    """
    mora_list = [check_coda(syl) + vowel_count(syl) for syl in str.split(".")]
    return sum(mora_list), mora_list


def contour_count(s):
    """Return how many characters of ``s`` carry a contour tone (F or R)."""
    return sum(1 for ch in s if ch in special_tones)


class Autorep:
    """Tone / mora / syllable association built from a tone-marked word.

    Attributes:
        word: The input word, syllables separated by ".".
        tone: One letter (H, L, F or R) per tone-marked character of ``word``,
            in order of appearance.
        mel: Melody with one letter per mora, contours split into their
            halves (F -> HL, R -> LH).
        ocp_mel: ``mel`` with every run of identical letters collapsed to one.
        mora: The ``mora`` argument, stored unchanged.
        assoc: List of ``(tone_index, mora_index, syllable_index)`` triples,
            all 1-based; ``tone_index`` points into ``ocp_mel``.
    """

    def __init__(self, word="", tone='', mora=0, ocp_mel="", assoc=None):
        """
        Initialize an Autorep object.

        Parameters:
        - word (str): The word with tone markers, syllables separated by ".".
        - tone (str): Accepted for interface compatibility; the value is
                      ignored and ``self.tone`` is recomputed from ``word``.
        - mora (int): Stored as ``self.mora``; not otherwise used here.
        - ocp_mel (str): Initial value only; overwritten with the melody
                      computed from ``word``.
        - assoc (list): Optional list that the computed triples are appended
                      to (the list object itself is mutated). A fresh list is
                      used when omitted.

        Tone letters are matched to syllables by position, so the word is
        expected to carry exactly one tone-marked vowel per syllable.

        Raises:
        - IndexError: if a mora-bearing syllable has no tone letter at its
                      position in ``self.tone``.
        """
        self.word = word
        self.tone = ''
        self.mel = ''
        self.ocp_mel = ocp_mel
        self.mora = mora
        self.assoc = assoc if assoc is not None else []

        self.tone_labels = {'H': h_tone, 'L': l_tone, 'F': f_tone, 'R': r_tone}
        self.tone = ''.join(
            next((label for label, chars in self.tone_labels.items() if seg in chars), '')
            for seg in self.word
        )

        # Weight (mora count) of every syllable, computed once and reused.
        weights = mora_count(self.word)[1]

        # Associate each mora with its syllable's tone letter.
        mora_idx = 0
        for k, weight in enumerate(weights):
            for j in range(weight):
                self.assoc.append((self.tone[k], mora_idx + j + 1, k + 1))
            mora_idx += weight

        # Split contours: the first mora of the syllable takes the first half,
        # every later mora takes the second half. A one-mora contour syllable
        # therefore only keeps the first half.
        contour_halves = {'F': ('H', 'L'), 'R': ('L', 'H')}
        for idx, (t, m, s) in enumerate(self.assoc):
            halves = contour_halves.get(self.tone[s - 1])
            if halves is not None:
                # Moras belonging to the syllables before syllable s. Summing
                # per-syllable weights keeps syllable boundaries intact;
                # measuring the concatenated syllables would let an onset
                # after a vowel be miscounted as a coda.
                moras_before = sum(weights[:s - 1])
                t = halves[0] if m - moras_before == 1 else halves[1]
            self.assoc[idx] = (t, m, s)

        self.mel = ''.join(letter for letter, _, _ in self.assoc)
        # OCP: collapse each run of identical adjacent tones into one.
        self.ocp_mel = re.sub(r"(.)\1+", r"\1", self.mel)

        # Replace each tone letter by the 1-based position of its run in
        # ocp_mel, walking both sequences in step.
        assoc_pos = 0
        mel_pos = 0
        while assoc_pos < len(self.assoc) and mel_pos < len(self.ocp_mel):
            if self.assoc[assoc_pos][0] == self.ocp_mel[mel_pos]:
                _, m, s = self.assoc[assoc_pos]
                self.assoc[assoc_pos] = (mel_pos + 1, m, s)
                assoc_pos += 1
            else:
                mel_pos += 1

    def check_empty(self):
        """Return True when word, assoc, mel and ocp_mel are all empty."""
        return (self.word == "" and self.assoc == [] and self.mel == "" and self.ocp_mel == "")


In [809]:
# Build representations for three sample words and show, one per line,
# the OCP melody followed by the association triples.
a, b, c = map(Autorep, ("dú.hùu", "fà.dá.màa", "gáa.jì.màa.rée"))
print("\n".join(f"{rep.ocp_mel} {rep.assoc}" for rep in (a, b, c)))

# Last expression of the cell: the notebook displays its value.
a.check_empty()


HL [(1, 1, 1), (2, 2, 2), (2, 3, 2)]
LHL [(1, 1, 1), (2, 2, 2), (3, 3, 3), (3, 4, 3)]
HLH [(1, 1, 1), (1, 2, 1), (2, 3, 2), (2, 4, 3), (2, 5, 3), (3, 6, 4), (3, 7, 4)]


False