In [35]:
# -*- coding: utf-8 -*-
import os
import re
import pandas as pd
import graphviz

# Phonology sets
h_tone = set("áéíóú")
l_tone = set("àèìòù")
f_tone = set("âêîôû")
r_tone = set("ǎěǐǒǔ")
untoned = set("aeiou")  # For long vowels (â indicates a short vowel with fall tone; âa a long F)
vowels = h_tone | l_tone | r_tone | f_tone | untoned
moraic_coda = 1  # 1 if coda carries mora, else 0

tones = h_tone | l_tone  # Combine high and low tones
special_tones = r_tone | f_tone  # Special tones (F, R)



class Autorep:
    def __init__(self, word="", ocp_mel="", assoc=None):
        """
        Initialize an Autorep object.

        Parameters:
        - word (str): The word with tone markers.
        - tone (str): The tone markers directly extracted from the word (HFLR).
        - mel (str): The melody (F -> HL and R -> LH) before OCP.
        - ocp_mel (str): The OCP-applied tone representation of the word.
        - assoc (list): A list of tuples (j, k) indicating the association 
                        between tone (indexed by j), mora (indexed by i), 
                        and syllable (indexed by k).
        """
        self.word = word
        self.tone = ""
        self.mel = ""
        self.ocp_mel = ocp_mel
        
        self.assoc = assoc if assoc is not None else []

        self.tone_labels = {"H": h_tone, "L": l_tone, "F": f_tone, "R": r_tone}
        if self.word:
            self._process_word()

    def _process_word(self):
        """Process the word to extract tones, assign associations, and apply OCP."""
        syllables = self.word.split(".")
        self.tone = "".join(
            next((k for k, v in self.tone_labels.items() if seg in v), "") 
            for seg in self.word
        )

        mora_idx = 0
        for i, syl in enumerate(syllables):
            syl_weight = self.check_coda(syl) + self.vowel_count(syl)
            for j in range(syl_weight):
                self.assoc.append((self.tone[i], j + 1, i + 1))
            #mora_idx += syl_weight

        self._flatten_tones(syllables)  # convert F and R into HL and LH
        self.mel = "".join(tone for tone, _, _ in self.assoc)
        self.ocp_mel = re.sub(r"(.)\1+", r"\1", self.mel)
        self._update_tone_indices()

    def _flatten_tones(self, syllables):
        """Flatten F and R tones into corresponding H/L patterns."""
        for i, (t, m, s) in enumerate(self.assoc):
            if self.tone[s - 1] in {"F", "R"}:
                mora_sum = sum(
                    self.check_coda(syl) + self.vowel_count(syl)
                    for syl in syllables[: s - 1]
                )
                if m - mora_sum == 1:
                    t = "H" if self.tone[s - 1] == "F" else "L"
                else:
                    t = "L" if self.tone[s - 1] == "F" else "H"
                self.assoc[i] = (t, m, s)

    def _update_tone_indices(self):
        """Update tone indices in association list to match OCP melody."""
        j, i = 0, 0
        while j < len(self.assoc) and i < len(self.ocp_mel):
            if self.assoc[j][0] == self.ocp_mel[i]:
                t, m, s = self.assoc[j]
                self.assoc[j] = (i + 1, m, s)  # Update tone index
                j += 1
            else:
                i += 1

    def get_rightmost(self, target):
        index_map = {'t': 0, 'm': 1, 's': 2}
        if target in index_map:
            return max(
                (item[index_map[target]] for item in self.assoc if item[index_map[target]] is not None), 
                default=0
            )
        return 0  # Return 0 for invalid target




    @staticmethod
    def check_coda(syl):
        """Check if a syllable contains a coda."""
        for i in range(1, len(syl)):
            if syl[i] not in vowels and syl[i - 1] in vowels:
                return moraic_coda
        return 0

    @staticmethod
    def vowel_count(syl):
        """Count the number of vowels and adjust for special tones."""
        count = 0
        for i, char in enumerate(syl):
            if char in vowels or char in tones:
                count += 1
            elif (
                char in special_tones
                and i + 1 < len(syl)
                and syl[i + 1] not in vowels
            ):
                count += 2
        return count

    @staticmethod
    def mora_count(string):
        """Count the number of mora in a string."""
        mora_count = 0
        mora_list = []
        syllables = string.split(".")
        for syl in syllables:
            syl_weight = Autorep.check_coda(syl) + Autorep.vowel_count(syl)
            mora_list.append(syl_weight)
            mora_count += syl_weight
        return mora_count, mora_list

    @staticmethod
    def contour_count(s):
        """Count the number of contour tones in a string."""
        return sum(1 for char in s if char in special_tones)

    @staticmethod
    def index_reset(lst):
        """Reset indices of the association list to start from 1."""
        if not lst:
            return []
        t_shift, m_shift, s_shift = lst[0]
        return [
            (t - t_shift + 1, m - m_shift + 1, s - s_shift + 1) for t, m, s in lst
        ]

    def check_empty(self):
        """Check if the object is empty."""
        return not (self.word or self.assoc or self.mel or self.ocp_mel)

    def check_contain(self, ar):
        """Check if the Autorep object contains another Autorep object."""
        if ar.check_empty():
            return True
        if self.check_empty() or len(ar.assoc) > len(self.assoc):
            return False

        if ar.ocp_mel in self.ocp_mel:
            match_positions = [
                m.start() for m in re.finditer(f"(?={ar.ocp_mel})", self.ocp_mel)
            ]
            for pos in match_positions:
                start_idx = [
                    idx
                    for idx, tup in enumerate(self.assoc)
                    if tup[0] == pos + 1
                ]
                for idx in start_idx:
                    subset = self.index_reset(self.assoc[idx : idx + len(ar.assoc)])
                    if subset == ar.index_reset(ar.assoc):
                        return True
        return False
    
    
    def add_tone(self):
        """
        Add an unassociated tone in the AR by updating the melody and the association list.
        
        - A new tone ('H' or 'L') is added to the melody.
        - A new association (j, None, None) is added, where:
            - j is one-unit higher than the previous tone's number or 1 if starting fresh.
            - 'None' indicates the syllable is not associated with any tone unit.
        """
        # Copy the existing associations to avoid modifying the original
        new_assoc = self.assoc.copy()
        
        # Determine the next tone to add
        if not self.ocp_mel:  # Empty string case
            return [
                Autorep(ocp_mel='H', assoc=new_assoc + [(1, None, None)]),
                Autorep(ocp_mel='L', assoc=new_assoc + [(1, None, None)])
            ]

        # Alternate tone addition ('H' -> 'L', 'L' -> 'H')
        next_tone = 'H' if self.ocp_mel[-1] == 'L' else 'L'
        next_tone_index = self.get_rightmost('t') + 1
        
        # Create the updated autorep
        return Autorep(
            ocp_mel=self.ocp_mel + next_tone,
            assoc=new_assoc + [(next_tone_index, None, None)]
        )
    
    def add_syl(self):
        new_assoc = self.assoc.copy()
        next_syl_index = self.get_rightmost('s') + 1 if self.get_rightmost('s') is not None else 1
        return Autorep(ocp_mel= self.ocp_mel,assoc = new_assoc + [(None, None, next_syl_index)])
    
    def add_mora(self):
        new_assoc = self.assoc.copy()
        next_mora_index = self.get_rightmost('m') + 1 if self.get_rightmost('s') is not None else 1
        return Autorep(ocp_mel= self.ocp_mel,assoc = new_assoc + [(None, next_mora_index,None )])

    
    def check_float(self,target):
        index_map = {'t': 0, 'm': 1, 's': 2}
        if target in index_map:
            for item in self.assoc:
                if item[index_map[target]] is None:
                    return item
        return 0
    
    def float_tone_to_syl(self):
            """
            Associate the first floating tone to the last syllable
            e.g LH [(1,1), (2,None)] -> [(1,1), (2,1)]
            """
            if self.check_float("t"):
                doubly_linked_pair = [(j,_,k) for (j,_,k)in self.assoc if j is not None and k is not None]
                if doubly_linked_pair:
                    last_valid_tuple = max(doubly_linked_pair)
                    t,_,s = last_valid_tuple
                    first_float_tone = min(((j,_,k) for (j,_,k)in self.assoc if k is None), default=(float('inf'), None))
                    k,_,_= first_float_tone
                    first_float_tone_index = self.assoc.index(first_float_tone)
                    new_assoc = self.assoc[:]
                    new_assoc[first_float_tone_index] = (k,_,s)
                    return new_assoc
    
    def show(self):
        return(self.ocp_mel,self.assoc) 


    def __eq__(self, other):
        return self.ocp_mel == other.ocp_mel and set(self.assoc) == set(other.assoc)

    
    

In [18]:
# Creating Autorep objects
a = Autorep("dú.hùu")  
b = Autorep("fà.dá.màa")  
c = Autorep("gáa.jì.màa.rée")  
e = Autorep("")
# Testing containment
print("c contain a:",c.check_contain(a)) # False
print("c contain b:",c.check_contain(b)) # False
print("b contain a:",b.check_contain(a)) #   True
print("b contain c:",a.check_contain(c)) #  false
print("a contain b:",a.check_contain(b)) #  false
print("a contain c:",a.check_contain(c)) #  false




c contain a: False
c contain b: False
b contain a: True
b contain c: False
a contain b: False
a contain c: False


In [36]:
a = Autorep("dú.hùu")  
b = Autorep("fà.dá.màa")  
c = Autorep("gáa.jì.màa.rée")  
aa = a.add_tone()
aa.show()

('HLH', [(1, 1, 1), (2, 1, 2), (2, 2, 2), (3, None, None)])

In [44]:
aa.assoc = aa.float_tone_to_syl()
aa.assoc

TypeError: 'NoneType' object is not iterable