In [None]:
def get_tuples(filename):
    """Read a label file into a list of ``(start, end, label)`` tuples.

    Every useful line carries three whitespace-separated fields: two
    integers followed by a label.  Lines that do not split into exactly
    three fields (blank lines, malformed rows) are skipped.

    Args:
        filename: Path of the label file to read.

    Returns:
        A list of ``(int, int, str)`` tuples in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a three-field line has a non-integer first or
            second field.
    """
    words = []
    with open(filename) as wf:
        # Iterate the file object directly instead of materialising
        # every line with readlines().
        for line in wf:
            # split() without an argument collapses runs of spaces and
            # tabs, so a tab or doubled space between fields no longer
            # causes the row to be dropped by the length check below.
            parts = line.split()
            if len(parts) != 3:
                continue
            words.append((int(parts[0]), int(parts[1]), parts[2]))
    return words

def filter_junk(phones):
    """Remove non-speech segments and fold ``ax-h`` into ``ax``.

    Segments are dropped when their label (index 2) ends in ``cl`` or is
    one of ``epi``, ``pau`` or ``h#``.  A segment labelled ``ax-h`` is
    replaced by a new tuple with the same bounds and the label ``ax``;
    every other segment is passed through unchanged.

    Args:
        phones: Iterable of ``(start, end, label)`` segments.

    Returns:
        A new list with the surviving segments in their original order.
    """
    skipped_labels = ["epi", "pau", "h#"]
    kept = []
    for segment in phones:
        label = segment[2]
        if label.endswith("cl") or label in skipped_labels:
            continue
        kept.append((segment[0], segment[1], "ax") if label == "ax-h" else segment)
    return kept

def get_phonetic_words(filename):
    """Group the phones of one utterance under the words that contain them.

    ``filename`` may name either the ``.WRD`` or the ``.PHN`` file; the
    sibling file is located by swapping the extension.  Both files are read
    with get_tuples() and the phone list is cleaned with filter_junk().

    Args:
        filename: Path ending in ``.WRD`` or ``.PHN``.

    Returns:
        ``None`` when ``filename`` has neither extension.  Otherwise a list
        with one dict per word, in file order, with the keys ``start``,
        ``end``, ``word`` and ``phones`` (the list of phone labels whose
        span lies entirely inside the word's span).
    """
    # Swap only the trailing extension.  str.replace() would also rewrite
    # any ".WRD"/".PHN" that happens to occur earlier in the path.
    if filename.endswith(".WRD"):
        wordfile = filename
        phonfile = filename[:-len(".WRD")] + ".PHN"
    elif filename.endswith(".PHN"):
        phonfile = filename
        wordfile = filename[:-len(".PHN")] + ".WRD"
    else:
        return None

    words = get_tuples(wordfile)
    phones = filter_junk(get_tuples(phonfile))

    merged = []
    j = 0  # index of the next phone not yet consumed; never moves backwards
    for start, end, label in words:
        current = {
            "start": start,
            "end": end,
            "word": label,
            "phones": []
        }
        while j < len(phones):
            p_start, p_end, p_label = phones[j]
            if p_start >= start and p_end <= end:
                # Phone lies fully inside the word.
                current["phones"].append(p_label)
            elif p_start >= end:
                # Phone starts at or after the word end: leave it for the
                # next word.
                break
            # Otherwise the phone begins before this word ends but is not
            # fully contained in it (it starts before the word or runs past
            # its end); it is dropped and not attached to any word.
            j += 1
        merged.append(current)

    return merged

In [None]:
# Test the fixed function
# Smoke test on a single utterance: print every word followed by the phone
# labels that get_phonetic_words() grouped under it.  The path is hard-coded
# to the dataset location used elsewhere in this file.
result = get_phonetic_words("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data/TRAIN/DR1/FCJF0/SA1.WRD")
for item in result:
    print(f"{item['word']}: {' '.join(item['phones'])}")

In [None]:
import glob
from collections import defaultdict

# Collect all pronunciations from the corpus
# BASE_PATH is the root that is searched recursively for *.WRD files.
BASE_PATH = "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data"
# lower-cased word -> list of phone tuples, one entry per occurrence
# (duplicates are kept, so the list length is the occurrence count).
word_pronunciations = defaultdict(list)

for wrd_file in glob.glob(f"{BASE_PATH}/**/*.WRD", recursive=True):
    phonetic_words = get_phonetic_words(wrd_file)
    # get_phonetic_words() may return None or an empty list; both are skipped.
    if phonetic_words:
        for item in phonetic_words:
            word = item["word"].lower()
            # Stored as a tuple so pronunciations are hashable and immutable.
            phones = tuple(item["phones"])
            word_pronunciations[word].append(phones)

print(f"Collected pronunciations for {len(word_pronunciations)} unique words")

In [None]:
# Load TIMIT dictionary
def load_timit_dict(filename, strip_stress=False):
    """Parse a TIMITDIC.TXT-style pronunciation dictionary.

    Expected line format: ``word /p r o n u n c i a t i o n/``.  Blank
    lines, lines starting with ``;`` and lines without any ``/`` are
    ignored.  When a word appears on several lines the last one wins.

    Args:
        filename: Path of the dictionary file.
        strip_stress: When true, trailing digits are removed from every
            phone (``iy1`` -> ``iy``).  Defaults to ``False``, which keeps
            the phones exactly as written in the file.
            NOTE(review): dictionary entries may carry stress digits that
            the phone labels in the transcription files do not - confirm
            against the data and enable this if so.

    Returns:
        A dict mapping the lower-cased word to a tuple of phone strings.

    Raises:
        OSError: If the file cannot be opened (callers in this file rely
            on ``FileNotFoundError`` to probe several locations).
    """
    timit_dict = {}
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(";"):
                continue
            # Format: word /p r o n u n c i a t i o n/
            if "/" not in line:
                continue
            word_part, _, rest = line.partition("/")
            # Keep only the text up to the closing slash so that anything
            # following it on the line cannot leak into the phone list.
            phones = rest.partition("/")[0].split()
            if strip_stress:
                phones = [p.rstrip("0123456789") for p in phones]
                phones = [p for p in phones if p]
            timit_dict[word_part.strip().lower()] = tuple(phones)
    return timit_dict

# Try common locations for the dictionary
# Candidates are probed in order; the first one that opens wins.
dict_paths = [
    f"{BASE_PATH}/TIMITDIC.TXT",
    f"{BASE_PATH}/../TIMITDIC.TXT",
    "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/TIMITDIC.TXT",
]

# Stays None when no candidate exists; later cells test `if timit_dict:`
# before using it, so an empty dictionary is treated like a missing one there.
timit_dict = None
for path in dict_paths:
    try:
        timit_dict = load_timit_dict(path)
        print(f"Loaded dictionary from {path}: {len(timit_dict)} entries")
        break
    except FileNotFoundError:
        # Missing file at this location: try the next candidate.
        continue

if timit_dict is None:
    # Help the user locate the file by listing the dataset's top level.
    print("TIMIT dictionary not found - listing available files...")
    import os
    for item in os.listdir("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/"):
        print(f"  {item}")

In [None]:
def align_sequences(ref, hyp):
    """
    Compute a minimum-edit alignment between two phone sequences.

    Uses unit costs for substitution, deletion and insertion.  The result is
    a list of operations in left-to-right order, each one of:
    ('match', r, h), ('sub', r, h), ('del', r, None), ('ins', None, h)
    where r comes from ``ref`` and h from ``hyp``.
    """
    n_ref, n_hyp = len(ref), len(hyp)

    # cost[r][h] is the edit distance between ref[:r] and hyp[:h].
    cost = [list(range(n_hyp + 1))]
    for r in range(1, n_ref + 1):
        above = cost[r - 1]
        row = [r]
        for h in range(1, n_hyp + 1):
            if ref[r - 1] == hyp[h - 1]:
                row.append(above[h - 1])
            else:
                row.append(1 + min(above[h], row[h - 1], above[h - 1]))
        cost.append(row)

    # Walk back from the bottom-right corner.  Preference order when several
    # moves are possible: match, substitution, deletion, insertion.
    trace = []
    r, h = n_ref, n_hyp
    while r or h:
        diagonal_ok = r > 0 and h > 0
        if diagonal_ok and ref[r - 1] == hyp[h - 1]:
            trace.append(('match', ref[r - 1], hyp[h - 1]))
            r, h = r - 1, h - 1
        elif diagonal_ok and cost[r][h] == cost[r - 1][h - 1] + 1:
            trace.append(('sub', ref[r - 1], hyp[h - 1]))
            r, h = r - 1, h - 1
        elif r > 0 and cost[r][h] == cost[r - 1][h] + 1:
            trace.append(('del', ref[r - 1], None))
            r -= 1
        else:
            trace.append(('ins', None, hyp[h - 1]))
            h -= 1

    trace.reverse()
    return trace

# Test alignment
# Small worked example: the hypothesis differs from the reference by one
# changed vowel and one missing 't', so the printed alignment is expected to
# contain one 'sub' and one 'del' among the 'match' operations.
ref = ('dh', 'ax', 's', 't', 'ao', 'r')
hyp = ('dh', 'ix', 's', 'ao', 'r')
print("Reference:", ref)
print("Hypothesis:", hyp)
print("Alignment:")
for op in align_sequences(ref, hyp):
    print(f"  {op}")

In [None]:
# Extract transformations across the corpus
from collections import Counter

# Corpus-wide tallies, filled by aligning each observed pronunciation
# against the dictionary pronunciation of the same word.
substitutions = Counter()  # (ref_phone, actual_phone) -> count
deletions = Counter()      # ref_phone -> count
insertions = Counter()     # actual_phone -> count
matches = Counter()        # phone -> count (for computing rates)

words_analyzed = 0      # number of pronunciation tokens aligned (not unique words)
words_not_in_dict = 0   # number of unique words skipped because the dictionary lacks them

if timit_dict:
    for word, pronunciations in word_pronunciations.items():
        if word not in timit_dict:
            words_not_in_dict += 1
            continue
        
        dict_pron = timit_dict[word]
        
        for actual_pron in pronunciations:
            words_analyzed += 1
            # Dictionary form is the reference, observed form the hypothesis.
            alignment = align_sequences(dict_pron, actual_pron)
            
            for op, ref_phone, actual_phone in alignment:
                if op == 'match':
                    matches[ref_phone] += 1
                elif op == 'sub':
                    substitutions[(ref_phone, actual_phone)] += 1
                elif op == 'del':
                    deletions[ref_phone] += 1
                elif op == 'ins':
                    insertions[actual_phone] += 1

    print(f"Words analyzed: {words_analyzed}")
    print(f"Words not in dictionary: {words_not_in_dict}")
    print(f"\nUnique substitution types: {len(substitutions)}")
    print(f"Unique deletions: {len(deletions)}")
    print(f"Unique insertions: {len(insertions)}")
else:
    # timit_dict is None or empty: the counters above stay empty.
    print("Cannot analyze - dictionary not loaded")

In [None]:
# Define phonetic feature classes for context-sensitive rules
VOWELS = {'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ay', 'eh', 'er', 'ey', 
          'ih', 'ix', 'iy', 'ow', 'oy', 'uh', 'uw', 'ux', 'axr'}
# The syllabic consonants em/en/eng/el are members of NASALS / LIQUIDS below,
# so they are listed here too.  Without a 'C' tag, context_to_str() finds
# neither 'V' nor 'C' for them and renders such a neighbour as '#', which is
# indistinguishable from a real word boundary.
CONSONANTS = {'b', 'ch', 'd', 'dh', 'dx', 'f', 'g', 'hh', 'hv', 'jh', 
              'k', 'l', 'm', 'n', 'ng', 'nx', 'p', 'r', 's', 'sh', 
              't', 'th', 'v', 'w', 'y', 'z', 'zh', 'q',
              'em', 'en', 'eng', 'el'}
# Finer-grained manner classes; each one is a subset of CONSONANTS.
STOPS = {'b', 'p', 'd', 't', 'g', 'k', 'dx', 'q'}
FRICATIVES = {'f', 'v', 'th', 'dh', 's', 'z', 'sh', 'zh', 'hh', 'hv'}
NASALS = {'m', 'n', 'ng', 'nx', 'em', 'en', 'eng'}
LIQUIDS = {'l', 'r', 'el'}
GLIDES = {'w', 'y'}
# Phones tagged 'syllabic'; 'axr' is classed as a vowel, the rest as consonants.
SYLLABICS = {'em', 'en', 'eng', 'el', 'axr'}

def get_phone_class(phone):
    """Collect the feature tags that apply to ``phone``.

    The tags are 'V', 'C', 'stop', 'fric', 'nasal', 'liquid', 'glide' and
    'syllabic', one per module-level phone set that contains the phone.
    A phone found in none of the sets yields an empty set.
    """
    tagged_sets = (
        ('V', VOWELS),
        ('C', CONSONANTS),
        ('stop', STOPS),
        ('fric', FRICATIVES),
        ('nasal', NASALS),
        ('liquid', LIQUIDS),
        ('glide', GLIDES),
        ('syllabic', SYLLABICS),
    )
    return {tag for tag, members in tagged_sets if phone in members}

def get_context(phones, idx):
    """
    Describe the neighbours of the phone at position ``idx``.

    Returns a ``(left, right)`` pair of feature sets obtained from
    get_phone_class().  A missing neighbour (start or end of the sequence)
    is represented by the set ``{'#'}``.
    """
    if idx > 0:
        left = get_phone_class(phones[idx - 1])
    else:
        left = {'#'}  # nothing before the first phone: word boundary

    if idx < len(phones) - 1:
        right = get_phone_class(phones[idx + 1])
    else:
        right = {'#'}  # nothing after the last phone: word boundary

    return (left, right)

def context_to_str(left, right):
    """Render a (left, right) context as a string such as 'V_V' or 'C_#'.

    Each side becomes 'V' when its feature set contains 'V', otherwise 'C'
    when it contains 'C', otherwise '#'.
    """
    def side_symbol(features):
        if 'V' in features:
            return 'V'
        if 'C' in features:
            return 'C'
        return '#'

    return f"{side_symbol(left)}_{side_symbol(right)}"

# Quick sanity print: sizes of the two top-level classes and the members of
# the syllabic set.
print("Phone classes defined:")
print(f"  Vowels: {len(VOWELS)}")
print(f"  Consonants: {len(CONSONANTS)}")
print(f"  Syllabics: {SYLLABICS}")

In [None]:
# TIMIT dialect regions
# Maps the region code (the "DRn" directory name that get_dialect_from_path()
# extracts from a file path) to a human-readable label used in the reports.
DIALECT_REGIONS = {
    'DR1': 'New England',
    'DR2': 'Northern',
    'DR3': 'North Midland', 
    'DR4': 'South Midland',
    'DR5': 'Southern',
    'DR6': 'New York City',
    'DR7': 'Western',
    'DR8': 'Army Brat (moved around)',
}

def get_dialect_from_path(filepath):
    """Extract the dialect region code from a TIMIT file path.

    Looks for a path component of the form ``DR<digit>`` delimited by path
    separators on both sides.  Both ``/`` and ``\\`` are accepted as
    separators and the component is matched case-insensitively, so paths
    such as ``train\\dr3\\...`` are recognised as well.

    Args:
        filepath: Path string to inspect.

    Returns:
        The region code in upper case (e.g. ``'DR1'``), or ``None`` when
        the path contains no such component.
    """
    import re
    match = re.search(r'[\\/](DR\d)[\\/]', filepath, re.IGNORECASE)
    if match:
        # Normalise so the result always matches the DIALECT_REGIONS keys.
        return match.group(1).upper()
    return None

# Re-collect pronunciations with file path info for dialect tracking
# lower-cased word -> list of dicts with the keys 'phones' (tuple of labels),
# 'dialect' (region code or None when the path has none) and 'file' (source path).
word_pronunciations_detailed = defaultdict(list)

# This walks the same *.WRD files as the earlier word_pronunciations pass,
# so every label file is read a second time here.
for wrd_file in glob.glob(f"{BASE_PATH}/**/*.WRD", recursive=True):
    dialect = get_dialect_from_path(wrd_file)
    phonetic_words = get_phonetic_words(wrd_file)
    if phonetic_words:
        for item in phonetic_words:
            word = item["word"].lower()
            phones = tuple(item["phones"])
            word_pronunciations_detailed[word].append({
                'phones': phones,
                'dialect': dialect,
                'file': wrd_file
            })

# Count by dialect
# One increment per word token, keyed by region code (None included).
dialect_counts = Counter()
for word, prons in word_pronunciations_detailed.items():
    for p in prons:
        dialect_counts[p['dialect']] += 1

print("Pronunciations by dialect region:")
# Only the regions listed in DIALECT_REGIONS are printed; tokens whose
# dialect is None are counted above but not shown.
for dr, name in DIALECT_REGIONS.items():
    print(f"  {dr} ({name}): {dialect_counts.get(dr, 0)}")

In [None]:
# Context-sensitive transformation extraction with dialect tracking
from collections import defaultdict

# Data structures for context-sensitive rules
# Key: (ref_phone, context_str) -> Counter of outcomes observed for that
# reference phone in that context.
context_substitutions = defaultdict(Counter)  # (ref_phone, context) -> {actual_phone: count}
context_deletions = defaultdict(Counter)       # (ref_phone, context) -> {'DEL': count}
context_matches = defaultdict(Counter)         # (ref_phone, context) -> {ref_phone: count}

# Dialect-specific tracking
# Key: dialect -> (ref_phone, context) -> Counter, with the same inner layout
# as the corpus-wide tables above.  The nested defaultdicts create missing
# levels on first access.
dialect_substitutions = defaultdict(lambda: defaultdict(Counter))
dialect_deletions = defaultdict(lambda: defaultdict(Counter))
dialect_matches = defaultdict(lambda: defaultdict(Counter))

def align_with_positions(ref, hyp):
    """
    Minimum-edit alignment that also reports where each step sits in ``ref``.

    Returns a left-to-right list of (op, ref_idx, ref_phone, hyp_phone):
    'match' and 'sub' carry both phones, 'del' has hyp_phone None, and
    'ins' has both ref_idx and ref_phone set to None.
    """
    n_ref, n_hyp = len(ref), len(hyp)

    # cost[r][h] is the edit distance between ref[:r] and hyp[:h].
    cost = [list(range(n_hyp + 1))]
    for r in range(1, n_ref + 1):
        above = cost[r - 1]
        row = [r]
        for h in range(1, n_hyp + 1):
            if ref[r - 1] == hyp[h - 1]:
                row.append(above[h - 1])
            else:
                row.append(1 + min(above[h], row[h - 1], above[h - 1]))
        cost.append(row)

    # Backtrace, preferring match, then substitution, deletion, insertion.
    trace = []
    r, h = n_ref, n_hyp
    while r or h:
        diagonal_ok = r > 0 and h > 0
        if diagonal_ok and ref[r - 1] == hyp[h - 1]:
            trace.append(('match', r - 1, ref[r - 1], hyp[h - 1]))
            r, h = r - 1, h - 1
        elif diagonal_ok and cost[r][h] == cost[r - 1][h - 1] + 1:
            trace.append(('sub', r - 1, ref[r - 1], hyp[h - 1]))
            r, h = r - 1, h - 1
        elif r > 0 and cost[r][h] == cost[r - 1][h] + 1:
            trace.append(('del', r - 1, ref[r - 1], None))
            r -= 1
        else:
            trace.append(('ins', None, None, hyp[h - 1]))
            h -= 1

    trace.reverse()
    return trace

# Process all words with context and dialect info
# For every observed pronunciation of a dictionary word, align it against the
# dictionary form and tally each reference phone's outcome, keyed by the
# phone and its V/C/# context, both corpus-wide and per dialect.
if timit_dict:
    for word, pron_list in word_pronunciations_detailed.items():
        if word not in timit_dict:
            continue
        
        dict_pron = timit_dict[word]
        
        for pron_info in pron_list:
            actual_pron = pron_info['phones']
            dialect = pron_info['dialect']  # may be None when the path had no region
            
            alignment = align_with_positions(dict_pron, actual_pron)
            
            for op, ref_idx, ref_phone, actual_phone in alignment:
                if ref_idx is None:  # insertion - no reference context
                    continue
                
                # Get context from reference pronunciation
                # (neighbours are taken from the dictionary form, not from
                # what was actually spoken)
                left, right = get_context(dict_pron, ref_idx)
                ctx = context_to_str(left, right)
                
                if op == 'match':
                    context_matches[(ref_phone, ctx)][ref_phone] += 1
                    dialect_matches[dialect][(ref_phone, ctx)][ref_phone] += 1
                elif op == 'sub':
                    context_substitutions[(ref_phone, ctx)][actual_phone] += 1
                    dialect_substitutions[dialect][(ref_phone, ctx)][actual_phone] += 1
                elif op == 'del':
                    # Deletions are stored under the fixed inner key 'DEL'.
                    context_deletions[(ref_phone, ctx)]['DEL'] += 1
                    dialect_deletions[dialect][(ref_phone, ctx)]['DEL'] += 1

    print(f"Context-sensitive analysis complete")
    print(f"Unique (phone, context) pairs with substitutions: {len(context_substitutions)}")
    print(f"Unique (phone, context) pairs with deletions: {len(context_deletions)}")

In [None]:
# Display context-sensitive rules
def compute_context_rules(context_matches, context_substitutions, context_deletions, 
                          min_count=5, min_rate=2.0):
    """Derive transformation rules per (phone, context) pair.

    Each argument maps ``(phone, context)`` to a Counter of outcomes.  For
    every pair, the total is the sum of all match, substitution and deletion
    counts.  Pairs with a total below ``min_count`` are ignored.  A
    substitution target, or the deletion outcome, becomes a rule when its
    count is at least ``min_count`` and its share of the total, in percent,
    is at least ``min_rate``.

    Returns a dict mapping ``(phone, context)`` to
    ``{'total': int, 'transforms': [(op, target, count, rate), ...]}`` with
    the transforms ordered by descending count; ``op`` is 'sub' or 'del' and
    ``target`` is None for deletions.  Pairs without any rule are omitted.
    """
    no_counts = Counter()

    pairs = set(context_matches.keys())
    pairs.update(context_substitutions.keys())
    pairs.update(context_deletions.keys())

    rules = {}
    for phone, ctx in pairs:
        key = (phone, ctx)
        sub_counts = context_substitutions.get(key, no_counts)
        deleted = sum(context_deletions.get(key, no_counts).values())
        matched = sum(context_matches.get(key, no_counts).values())

        total = matched
        total += sum(sub_counts.values())
        total += deleted
        if total < min_count:
            continue

        found = []
        for target, count in sub_counts.items():
            if count < min_count:
                continue
            share = count / total * 100
            if share >= min_rate:
                found.append(('sub', target, count, share))

        if deleted >= min_count:
            share = deleted / total * 100
            if share >= min_rate:
                found.append(('del', None, deleted, share))

        if not found:
            continue

        rules[key] = {
            'total': total,
            'transforms': sorted(found, key=lambda entry: -entry[2]),
        }

    return rules

# Corpus-wide rules with the function's default thresholds
# (min_count=5, min_rate=2.0), matching the header printed below.
ctx_rules = compute_context_rules(context_matches, context_substitutions, context_deletions)

# Display organized by phone
print("=== Context-Sensitive Rules (min 5 occurrences, min 2% rate) ===\n")

# Group by phone
# phone -> list of (context string, rule data) pairs
by_phone = defaultdict(list)
for (phone, ctx), data in ctx_rules.items():
    by_phone[phone].append((ctx, data))

for phone in sorted(by_phone.keys()):
    print(f"\n{phone}:")
    # Contexts with the most observations are listed first.
    for ctx, data in sorted(by_phone[phone], key=lambda x: -x[1]['total']):
        print(f"  in context {ctx} (n={data['total']}):")
        for op, target, count, rate in data['transforms']:
            if op == 'del':
                print(f"    -> ∅ (delete): {count} ({rate:.1f}%)")
            else:
                print(f"    -> {target}: {count} ({rate:.1f}%)")

In [None]:
# Analyze dialect-specific transformation patterns
def compute_dialect_rules(dialect, min_count=3, min_rate=2.0):
    """Run compute_context_rules() on the tallies of one dialect.

    The per-dialect match, substitution and deletion tables are looked up in
    the module-level dialect_matches / dialect_substitutions /
    dialect_deletions mappings and forwarded together with the thresholds.
    """
    return compute_context_rules(
        dialect_matches[dialect],
        dialect_substitutions[dialect],
        dialect_deletions[dialect],
        min_count,
        min_rate,
    )

# Compare transformation rates across dialects for interesting phones
# Each entry is a (reference phone, context string) pair to report on.
interesting_transforms = [
    ('t', 'V_V'),   # t-flapping between vowels
    ('d', 'V_V'),   # d-flapping
    ('r', 'V_#'),   # r-dropping word-finally after vowel
    ('ih', 'C_C'),  # vowel reduction
    ('ae', 'C_C'),  # vowel shifts
]

print("=== Dialect Comparison for Select Transformations ===\n")

for phone, ctx in interesting_transforms:
    print(f"\n{phone} in context {ctx}:")
    
    for dr in sorted(DIALECT_REGIONS.keys()):
        # Indexing the nested defaultdicts creates an empty table for a
        # region that has no data yet; the .get() calls below do not.
        d_matches = dialect_matches[dr]
        d_subs = dialect_substitutions[dr]
        d_dels = dialect_deletions[dr]
        
        # Total observations of this phone in this context for the region.
        total = sum(d_matches.get((phone, ctx), Counter()).values())
        total += sum(d_subs.get((phone, ctx), Counter()).values())
        total += sum(d_dels.get((phone, ctx), Counter()).values())
        
        # Too few observations to report a meaningful percentage.
        if total < 5:
            continue
        
        # Get top transformations
        # (the three most frequent substitutions, each shown only at >= 1%)
        transforms = []
        for target, count in d_subs.get((phone, ctx), Counter()).most_common(3):
            rate = count / total * 100
            if rate >= 1.0:
                transforms.append(f"{target}:{rate:.0f}%")
        
        del_count = sum(d_dels.get((phone, ctx), Counter()).values())
        if del_count > 0:
            rate = del_count / total * 100
            if rate >= 1.0:
                transforms.append(f"∅:{rate:.0f}%")
        
        # Regions with nothing above the 1% cut-off are not printed at all.
        if transforms:
            print(f"  {dr} ({DIALECT_REGIONS[dr]}): n={total}, {', '.join(transforms)}")

In [None]:
# Syllabic consonant analysis
# Syllabics (em, en, el, eng) typically arise from vowel + consonant -> syllabic
# e.g., "button" /b ah t ax n/ -> [b ah q en] (schwa + n -> syllabic n)
# The alignment used below pairs phones one-to-one, so each recorded pattern
# is the single dictionary phone that lined up with the syllabic.

# syllabic phone -> (vowel, consonant) it is assumed to stand for
SYLLABIC_MAPPINGS = {
    'em': ('ax', 'm'),  # schwa + m -> syllabic m
    'en': ('ax', 'n'),  # schwa + n -> syllabic n  
    'el': ('ax', 'l'),  # schwa + l -> syllabic l
    'eng': ('ax', 'ng'), # schwa + ng -> syllabic ng
    'axr': ('ax', 'r'),  # can also be er -> axr
}

# Track syllabic realizations
# syllabic phone -> Counter of dictionary-phone tuples it was aligned with
syllabic_patterns = defaultdict(Counter)

if timit_dict:
    for word, pron_list in word_pronunciations_detailed.items():
        if word not in timit_dict:
            continue
        
        dict_pron = timit_dict[word]
        
        for pron_info in pron_list:
            actual_pron = pron_info['phones']
            
            # Align only pronunciations that contain a syllabic, and do it
            # once per pronunciation rather than once per syllabic phone.
            if not any(phone in SYLLABIC_MAPPINGS for phone in actual_pron):
                continue
            alignment = align_with_positions(dict_pron, actual_pron)
            
            # Walk the alignment itself so every syllabic occurrence is
            # counted exactly once, against the dictionary phone at its own
            # position.  Matching by label would merge and double-count
            # words that contain the same syllabic twice.
            for op, ref_idx, ref_phone, hyp_phone in alignment:
                # Inserted syllabics have no dictionary phone and are skipped.
                if hyp_phone in SYLLABIC_MAPPINGS and ref_phone:
                    syllabic_patterns[hyp_phone][(ref_phone,)] += 1

print("=== Syllabic Consonant Patterns ===")
print("(What dictionary sequences become syllabic consonants)\n")

for syllabic in sorted(syllabic_patterns.keys()):
    # Take the consonant from the mapping; the last character of 'eng'
    # would give 'g' instead of 'ng'.
    print(f"\n{syllabic} (syllabic {SYLLABIC_MAPPINGS[syllabic][1]}):")
    for pattern, count in syllabic_patterns[syllabic].most_common(10):
        print(f"  {' + '.join(pattern)} -> {syllabic}: {count}")

In [None]:
# Export comprehensive rules to JSON
import json

def convert_rules_to_arpabet_format(ctx_rules, syllabic_patterns):
    """Convert all rules to ARPABET format for CMUdict application.

    NOTE: this function reads the module-level TIMIT_TO_ARPABET mapping,
    DIALECT_REGIONS and compute_dialect_rules() when it is called.  In this
    file TIMIT_TO_ARPABET is assigned in a later cell, so that cell has to
    be executed before this function is invoked.

    Args:
        ctx_rules: Output of compute_context_rules():
            ``(phone, context) -> {'total', 'transforms'}``.
        syllabic_patterns: Mapping of syllabic phone to a Counter of
            dictionary-phone tuples.

    Returns:
        A JSON-serialisable dict with three sections:
        ``context_rules`` (``"PHONE/context"`` -> ``{'total', 'transforms'}``
        where each transform is ``{'target', 'count', 'rate'}`` and a
        ``None`` target means deletion), ``syllabic_rules``
        (``"syllabic_X"`` -> list of ``{'source', 'count'}``) and
        ``dialect_weights`` (region code -> ``{'name', 'rule_count'}``).
    """
    
    export = {
        'context_rules': {},
        'syllabic_rules': {},
        'dialect_weights': {}
    }
    
    # Context-sensitive rules
    # Several TIMIT phones share one ARPABET symbol (e.g. 'ax' and 'ah' both
    # map to 'AH'), so rules are accumulated per exported key.  Assigning
    # the key directly would let the last source silently overwrite the
    # others.
    merged = {}  # key -> {'total': int, 'counts': {target or None: count}}
    for (phone, ctx), data in ctx_rules.items():
        src = TIMIT_TO_ARPABET.get(phone, phone.upper())
        if not src:
            # Source phone has no ARPABET counterpart (empty mapping).
            continue
        
        entry = merged.setdefault(f"{src}/{ctx}", {'total': 0, 'counts': {}})
        entry['total'] += data['total']
        for op, target, count, _rate in data['transforms']:
            if op == 'del':
                tgt = None
            else:
                tgt = TIMIT_TO_ARPABET.get(target, target.upper())
                # Drop targets without a counterpart and identity mappings.
                if not tgt or tgt == src:
                    continue
            entry['counts'][tgt] = entry['counts'].get(tgt, 0) + count
    
    for key, entry in merged.items():
        if not entry['counts']:
            continue
        total = entry['total']
        # Rates are recomputed against the merged total; for a key with a
        # single source this equals the rate stored in ctx_rules.
        # NOTE(review): sources whose rules were all below threshold are not
        # present in ctx_rules, so their occurrences are not in this total.
        ranked = sorted(entry['counts'].items(), key=lambda kv: -kv[1])
        export['context_rules'][key] = {
            'total': total,
            'transforms': [
                {
                    'target': tgt,
                    'count': count,
                    'rate': round(count / total * 100, 2) if total else 0.0
                }
                for tgt, count in ranked
            ]
        }
    
    # Syllabic rules (sequences that become syllabic consonants)
    for syllabic, patterns in syllabic_patterns.items():
        tgt = TIMIT_TO_ARPABET.get(syllabic, syllabic.upper())
        syllabic_key = f"syllabic_{tgt}"
        export['syllabic_rules'][syllabic_key] = []
        
        # Only the five most frequent source patterns are exported.
        for pattern, count in patterns.most_common(5):
            src_pattern = [TIMIT_TO_ARPABET.get(p, p.upper()) for p in pattern]
            src_pattern = [p for p in src_pattern if p]  # filter empties
            if src_pattern:
                export['syllabic_rules'][syllabic_key].append({
                    'source': src_pattern,
                    'count': count
                })
    
    # Dialect weights (number of rules that pass the thresholds per dialect)
    for dr in DIALECT_REGIONS:
        dialect_rules = compute_dialect_rules(dr, min_count=3, min_rate=1.0)
        if dialect_rules:
            export['dialect_weights'][dr] = {
                'name': DIALECT_REGIONS[dr],
                'rule_count': len(dialect_rules)
            }
    
    return export

# Build the export structure from the corpus-wide context rules and the
# syllabic tallies, then persist it as JSON in the working directory.
comprehensive_rules = convert_rules_to_arpabet_format(ctx_rules, syllabic_patterns)

with open("timit_comprehensive_rules.json", "w") as f:
    json.dump(comprehensive_rules, f, indent=2)

# Summary: number of entries in each of the three exported sections.
print(f"Exported comprehensive rules:")
print(f"  Context-sensitive rules: {len(comprehensive_rules['context_rules'])}")
print(f"  Syllabic patterns: {len(comprehensive_rules['syllabic_rules'])}")
print(f"  Dialect regions: {len(comprehensive_rules['dialect_weights'])}")

In [None]:
# Apply context-sensitive rules to generate pronunciation variants
def generate_context_variants(pronunciation, rules, max_variants=20):
    """
    Generate pronunciation variants using context-sensitive rules.

    Exactly one rule is applied per variant.  For every phone, the stress
    digits are stripped and the key ``"<PHONE>/<left>_<right>"`` is looked
    up in ``rules['context_rules']``, where left/right are 'V', 'C' or '#'
    (sequence boundary).  A ``None`` target deletes the phone; any other
    target replaces it and keeps the original stress digits.

    Args:
        pronunciation: Sequence of ARPABET phones, optionally with stress
            digits (e.g. ``['B', 'AH1', 'T', 'ER0']``).
        rules: Dict with a ``'context_rules'`` section as produced by
            convert_rules_to_arpabet_format().
        max_variants: Upper bound on the number of returned variants.  The
            unmodified pronunciation is always included.

    Returns:
        A list of tuples.  The unmodified pronunciation comes first and the
        variants follow in the order they were generated (by phone position,
        then rule order), without duplicates.
    """
    # Define ARPABET phone classes
    ARPABET_VOWELS = {'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER', 'EY', 
                      'IH', 'IY', 'OW', 'OY', 'UH', 'UW'}
    
    def strip_stress(phone):
        """Remove stress digits from an ARPABET phone."""
        return ''.join(c for c in phone if not c.isdigit())
    
    def classify(phone):
        """Return 'V' for vowels and 'C' for everything else."""
        return 'V' if strip_stress(phone) in ARPABET_VOWELS else 'C'
    
    base = tuple(pronunciation)
    # A dict keeps insertion order, so the result is deterministic and the
    # base form stays first; a set would return variants in arbitrary order.
    variants = {base: None}
    last = len(base) - 1
    
    for i, phone in enumerate(base):
        # Check the cap before generating so the result never exceeds it,
        # including for max_variants <= 1.
        if len(variants) >= max_variants:
            break
        
        phone_base = strip_stress(phone)
        stress = ''.join(c for c in phone if c.isdigit())
        left = classify(base[i - 1]) if i > 0 else '#'
        right = classify(base[i + 1]) if i < last else '#'
        
        rule_data = rules['context_rules'].get(f"{phone_base}/{left}_{right}")
        if rule_data is None:
            continue
        
        for transform in rule_data['transforms']:
            if len(variants) >= max_variants:
                break
            target = transform['target']
            if target is None:
                # Deletion
                new_pron = base[:i] + base[i + 1:]
            else:
                # Substitution - preserve stress
                new_pron = base[:i] + (target + stress,) + base[i + 1:]
            variants[new_pron] = None
    
    return list(variants)

# Example: "butter" with context-sensitive t-flapping
# The T sits between two vowels here, so any exported "T/V_V" rule applies.
example_pron = ['B', 'AH1', 'T', 'ER0']
print(f"Base pronunciation: {' '.join(example_pron)}")
print("\nContext-sensitive variants:")
for var in generate_context_variants(example_pron, comprehensive_rules):
    print(f"  {' '.join(var)}")

# Example: "button" with syllabic n
# Only the 'context_rules' section is consulted by
# generate_context_variants(); the exported syllabic rules are not applied.
example_pron2 = ['B', 'AH1', 'T', 'AH0', 'N']
print(f"\nBase pronunciation: {' '.join(example_pron2)}")
print("Context-sensitive variants:")
for var in generate_context_variants(example_pron2, comprehensive_rules):
    print(f"  {' '.join(var)}")

In [None]:
# Display most common transformations
def _reference_total(phone):
    """Count every aligned occurrence of ``phone`` on the reference side.

    Sums its matches, its deletions and all substitutions whose source is
    ``phone``.  This is the same denominator compute_transformation_rules()
    uses, so the percentages printed here agree with the rule tables.
    """
    return (matches[phone] + deletions[phone]
            + sum(c for (r, _), c in substitutions.items() if r == phone))


print("=== Top 20 Substitutions ===")
for (ref, actual), count in substitutions.most_common(20):
    # Rate: share of all reference occurrences of this phone (matched,
    # substituted or deleted) that were realised as `actual`.  Deletions are
    # part of the denominator, exactly as for the deletion rates below.
    total_occurrences = _reference_total(ref)
    rate = count / total_occurrences * 100 if total_occurrences > 0 else 0
    print(f"  {ref} -> {actual}: {count} ({rate:.1f}%)")

print("\n=== Top 20 Deletions ===")
for phone, count in deletions.most_common(20):
    total_occurrences = _reference_total(phone)
    rate = count / total_occurrences * 100 if total_occurrences > 0 else 0
    print(f"  {phone} deleted: {count} ({rate:.1f}%)")

print("\n=== Top 20 Insertions ===")
# Insertions have no reference phone, so only raw counts are shown.
for phone, count in insertions.most_common(20):
    print(f"  {phone} inserted: {count}")

In [None]:
# Build transformation rules with probabilities
# Format suitable for applying to CMUdict

def compute_transformation_rules(matches, substitutions, deletions, min_count=5, min_rate=1.0):
    """
    Compute context-free transformation rules from the collected statistics.

    Args:
        matches: Mapping ``phone -> count`` of unchanged reference phones.
        substitutions: Mapping ``(ref_phone, actual_phone) -> count``.
        deletions: Mapping ``ref_phone -> count``.
        min_count: Minimum count a single transformation needs to be kept.
        min_rate: Minimum share (in percent) of the phone's total reference
            occurrences a transformation needs to be kept.

    Returns:
        Dict ``phone -> [(target, count, rate), ...]`` sorted by descending
        count, where ``target`` is the replacement phone or ``None`` for a
        deletion.  Phones without any qualifying transformation are omitted.
    """
    # Group substitutions by their source phone once, instead of scanning the
    # whole substitution table again for every reference phone.
    subs_by_ref = {}
    for (ref, actual), count in substitutions.items():
        subs_by_ref.setdefault(ref, []).append((actual, count))
    
    # Get all phones that appear in the reference
    all_ref_phones = set(matches.keys())
    all_ref_phones.update(r for r, _ in substitutions.keys())
    all_ref_phones.update(deletions.keys())
    
    rules = {}
    for phone in all_ref_phones:
        phone_subs = subs_by_ref.get(phone, [])
        del_count = deletions.get(phone, 0)
        
        # Total occurrences of this phone in reference.  .get() keeps this
        # working when `matches` is a plain dict rather than a Counter.
        total = matches.get(phone, 0) + del_count + sum(c for _, c in phone_subs)
        if total == 0:
            continue
        
        transformations = []
        
        # Add substitutions
        for actual, count in phone_subs:
            if count >= min_count:
                rate = count / total * 100
                if rate >= min_rate:
                    transformations.append((actual, count, rate))
        
        # Add deletions
        if del_count >= min_count:
            rate = del_count / total * 100
            if rate >= min_rate:
                transformations.append((None, del_count, rate))
        
        if transformations:
            # Sort by count descending
            transformations.sort(key=lambda x: -x[1])
            rules[phone] = transformations
    
    return rules

# Context-free rules with the function's default thresholds
# (min_count=5, min_rate=1.0), matching the header printed below.
rules = compute_transformation_rules(matches, substitutions, deletions)

print("=== Transformation Rules (min 5 occurrences, min 1% rate) ===")
# Phones are listed alphabetically; a target of None denotes deletion.
for phone, transforms in sorted(rules.items()):
    print(f"\n{phone}:")
    for target, count, rate in transforms:
        if target is None:
            print(f"  -> ∅ (delete): {count} ({rate:.1f}%)")
        else:
            print(f"  -> {target}: {count} ({rate:.1f}%)")

In [None]:
# TIMIT to ARPABET (CMUdict) phone mapping
# TIMIT uses a slightly different phoneset than CMUdict
# Keys are lower-case TIMIT labels, values upper-case ARPABET symbols without
# stress digits.  The mapping is many-to-one (e.g. 'ax' and 'ah' both give
# 'AH'), and an empty string marks a phone with no ARPABET counterpart;
# the code using this table skips such entries.
TIMIT_TO_ARPABET = {
    # Vowels - TIMIT often has more distinctions
    'ax': 'AH',      # schwa
    'ix': 'IH',      # reduced high front (often schwa-like)
    'ux': 'UW',      # reduced high back
    'axr': 'ER',     # schwa + r
    'ax-h': 'AH',    # breathy schwa
    'em': 'M',       # syllabic m (CMU doesn't have this)
    'en': 'N',       # syllabic n (CMU doesn't have this)  
    'eng': 'NG',     # syllabic ng
    'el': 'L',       # syllabic l (CMU doesn't have this)
    'nx': 'N',       # flap (alveolar nasal)
    'dx': 'D',       # flap (often realized as D or T)
    'q': '',         # glottal stop (not in CMU)
    'hv': 'HH',      # voiced h
    # Direct mappings (lowercase to uppercase)
    'aa': 'AA', 'ae': 'AE', 'ah': 'AH', 'ao': 'AO', 'aw': 'AW',
    'ay': 'AY', 'eh': 'EH', 'er': 'ER', 'ey': 'EY', 'ih': 'IH',
    'iy': 'IY', 'ow': 'OW', 'oy': 'OY', 'uh': 'UH', 'uw': 'UW',
    'b': 'B', 'ch': 'CH', 'd': 'D', 'dh': 'DH', 'f': 'F', 'g': 'G',
    'hh': 'HH', 'jh': 'JH', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
    'ng': 'NG', 'p': 'P', 'r': 'R', 's': 'S', 'sh': 'SH', 't': 'T',
    'th': 'TH', 'v': 'V', 'w': 'W', 'y': 'Y', 'z': 'Z', 'zh': 'ZH',
}

def timit_to_arpabet(timit_phones):
    """Translate a TIMIT phone sequence into ARPABET (CMUdict) symbols.

    Each phone is looked up in TIMIT_TO_ARPABET; labels missing from the
    table fall back to their upper-cased form.  Phones that map to an empty
    string (like the glottal stop) are left out of the result.

    Returns a tuple of ARPABET symbols.
    """
    converted = (TIMIT_TO_ARPABET.get(label, label.upper()) for label in timit_phones)
    return tuple(symbol for symbol in converted if symbol)

# Convert rules to ARPABET
# ARPABET source phone -> list of (target, count, rate); target None = deletion.
# NOTE(review): TIMIT_TO_ARPABET is many-to-one (e.g. 'ax' and 'ah' both map
# to 'AH'), so one list can collect entries from several TIMIT phones.  Each
# entry keeps the rate computed against its own TIMIT phone's total, and the
# same target may appear more than once.
arpabet_rules = {}
for phone, transforms in rules.items():
    src = TIMIT_TO_ARPABET.get(phone, phone.upper())
    if not src:
        # Source phone has no ARPABET counterpart (empty mapping).
        continue
    if src not in arpabet_rules:
        arpabet_rules[src] = []
    for target, count, rate in transforms:
        if target is None:
            arpabet_rules[src].append((None, count, rate))
        else:
            tgt = TIMIT_TO_ARPABET.get(target, target.upper())
            if tgt and tgt != src:  # Don't add identity mappings
                arpabet_rules[src].append((tgt, count, rate))

print("=== Rules in ARPABET format ===")
for phone, transforms in sorted(arpabet_rules.items()):
    # A source can end up with an empty list when all of its targets were
    # dropped above; such sources are not printed.
    if transforms:
        print(f"\n{phone}:")
        for target, count, rate in transforms:
            if target is None:
                print(f"  -> ∅ (delete): {count} ({rate:.1f}%)")
            else:
                print(f"  -> {target}: {count} ({rate:.1f}%)")

In [None]:
# Export rules to JSON for later use
import json

# JSON-friendly form of arpabet_rules: each (target, count, rate) tuple
# becomes a dict, with the rate rounded to two decimals.  Sources whose
# transform list is empty are left out.
export_rules = {}
for phone, transforms in arpabet_rules.items():
    if transforms:
        export_rules[phone] = [
            {"target": t, "count": c, "rate": round(r, 2)} 
            for t, c, r in transforms
        ]

# Written to the current working directory; a None target is serialised as null.
with open("timit_transformation_rules.json", "w") as f:
    json.dump(export_rules, f, indent=2)
    
print(f"Exported {len(export_rules)} phone rules to timit_transformation_rules.json")

In [None]:
# Example: Generate pronunciation variants for a CMUdict entry
def generate_variants(pronunciation, rules, max_variants=10):
    """
    Generate pronunciation variants by applying transformation rules.
    Uses a simple approach: apply one rule at a time to generate variants.

    Args:
        pronunciation: Sequence of ARPABET phones, optionally with stress
            digits (e.g. ``['W', 'AO1', 'T', 'ER0']``).
        rules: Mapping of stress-free phone to a list of dicts that each
            have a ``"target"`` key; a ``None`` target deletes the phone.
        max_variants: Upper bound on the number of returned variants.  The
            unmodified pronunciation is always included.

    Returns:
        A list of tuples.  The unmodified pronunciation comes first and the
        variants follow in the order they were generated (by phone position,
        then rule order), without duplicates.
    """
    base = tuple(pronunciation)
    # A dict keeps insertion order, so the result is deterministic and the
    # base form stays first; a set would return variants in arbitrary order.
    variants = {base: None}
    
    for i, phone in enumerate(base):
        # Check the cap before generating so the result never exceeds it,
        # including for max_variants <= 1.
        if len(variants) >= max_variants:
            break
        
        # Strip stress markers for lookup
        phone_base = ''.join(c for c in phone if not c.isdigit())
        if phone_base not in rules:
            continue
        
        # Preserve stress marker if present
        stress = ''.join(c for c in phone if c.isdigit())
        for rule in rules[phone_base]:
            if len(variants) >= max_variants:
                break
            target = rule["target"]
            if target is None:
                # Deletion
                new_pron = base[:i] + base[i + 1:]
            else:
                new_pron = base[:i] + (target + stress,) + base[i + 1:]
            variants[new_pron] = None
    
    return list(variants)

# Example with a word
# Applies the exported context-free rules (export_rules) one at a time and
# prints every resulting variant, including the unmodified pronunciation.
example_pron = ['W', 'AO1', 'T', 'ER0']  # "water" in CMUdict format
print(f"Base pronunciation: {' '.join(example_pron)}")
print("Variants:")
for var in generate_variants(example_pron, export_rules):
    print(f"  {' '.join(var)}")