In [None]:
def get_tuples(filename):
    """Read a three-column annotation file into ``(start, end, label)`` tuples.

    Each useful line holds exactly three whitespace-separated fields: two
    integers followed by a label.  Lines with any other number of fields
    (including blank lines) are skipped.

    Args:
        filename: Path of the annotation file to read.

    Returns:
        A list of ``(int, int, str)`` tuples in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a three-field line has a non-integer first or
            second field.
    """
    entries = []
    with open(filename) as wf:
        for line in wf:
            # split() without an argument tolerates tabs and runs of spaces;
            # split(" ") would produce empty fields and drop such lines.
            parts = line.split()
            if len(parts) != 3:
                continue
            start, end, label = parts
            entries.append((int(start), int(end), label))
    return entries

def filter_junk(phones):
    """Return the segments of *phones* that should be kept, in order.

    A segment is an indexable ``(start, end, label)`` record.  Segments whose
    label ends in ``"cl"`` or equals ``"epi"``, ``"pau"`` or ``"h#"`` are
    dropped.  A segment labelled ``"ax-h"`` is replaced by a new tuple with
    the same times and the label ``"ax"``; every other segment is passed
    through unchanged.
    """
    skipped_labels = ("epi", "pau", "h#")
    kept = []
    for segment in phones:
        label = segment[2]
        if label.endswith("cl") or label in skipped_labels:
            continue
        if label == "ax-h":
            segment = (segment[0], segment[1], "ax")
        kept.append(segment)
    return kept

def get_phonetic_words(filename):
    """Pair each word of an utterance with the phones that fall inside it.

    Args:
        filename: Path ending in ``.WRD`` or ``.PHN``.  The sibling file is
            located by swapping that extension.

    Returns:
        ``None`` if *filename* has neither extension.  Otherwise a list with
        one dict per word, in file order, with keys ``"start"``, ``"end"``,
        ``"word"`` and ``"phones"`` (a list of phone labels).

    Raises:
        OSError: If either of the two files cannot be opened.
    """
    word_ext, phone_ext = ".WRD", ".PHN"
    # Swap only the trailing extension.  str.replace would also rewrite any
    # earlier ".WRD"/".PHN" that happens to occur in a directory name.
    if filename.endswith(word_ext):
        wordfile = filename
        phonfile = filename[:-len(word_ext)] + phone_ext
    elif filename.endswith(phone_ext):
        phonfile = filename
        wordfile = filename[:-len(phone_ext)] + word_ext
    else:
        return None

    words = get_tuples(wordfile)
    phones = filter_junk(get_tuples(phonfile))

    def in_word(phone, word):
        # True when the phone interval lies entirely within the word interval
        return (phone[0] >= word[0]) and (phone[1] <= word[1])

    merged = []

    # j walks the phone list once; it is never reset between words
    j = 0
    for word in words:
        current = {
            "start": word[0],
            "end": word[1],
            "word": word[2],
            "phones": []
        }
        while j < len(phones):
            phone = phones[j]
            if in_word(phone, word):
                current["phones"].append(phone[2])
                j += 1
            elif phone[0] >= word[1]:
                # Phone starts at or after word end - leave it for the next word
                break
            else:
                # Phone starts before the word ends but is not contained in it
                # (e.g. it straddles a boundary) - it is dropped for good
                j += 1
        merged.append(current)

    return merged

In [None]:
# Sanity check: print every word of one sample utterance with its phones
result = get_phonetic_words(
    "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data/TRAIN/DR1/FCJF0/SA1.WRD"
)
for item in result:
    # One line per word, formatted as "<word>: <space-separated phones>"
    print(f"{item['word']}: {' '.join(item['phones'])}")

In [None]:
import glob
from collections import defaultdict

# Walk every .WRD file below BASE_PATH and group the observed phone sequences
# by lower-cased word.  Each occurrence is kept, so repeated pronunciations
# appear multiple times in a word's list.
BASE_PATH = "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/data"
word_pronunciations = defaultdict(list)

for wrd_file in glob.glob(f"{BASE_PATH}/**/*.WRD", recursive=True):
    phonetic_words = get_phonetic_words(wrd_file)
    if not phonetic_words:
        # None or an empty list: nothing to record for this file
        continue
    for item in phonetic_words:
        word_pronunciations[item["word"].lower()].append(tuple(item["phones"]))

print(f"Collected pronunciations for {len(word_pronunciations)} unique words")

In [None]:
# Load TIMIT dictionary
def load_timit_dict(filename, strip_stress=True):
    """Parse a TIMITDIC.TXT-style lexicon: one ``word /p r o n/`` per line.

    Blank lines, lines starting with ``;`` and lines without a ``/`` are
    ignored.  Words are lower-cased.  If a word occurs more than once, the
    last entry wins.

    Args:
        filename: Path of the dictionary file.
        strip_stress: When true (default), trailing digits are removed from
            every phone.  NOTE(review): TIMITDIC.TXT is understood to mark
            lexical stress with a digit on vowels (e.g. ``ey1``) - confirm
            against the file header.  Transcribed phone labels carry no such
            digit, so leaving it in place makes every stressed vowel look
            like a substitution when the two are aligned.  Pass ``False`` to
            keep the phones exactly as written.

    Returns:
        A dict mapping word -> tuple of phone strings.

    Raises:
        OSError: If the file cannot be opened (``FileNotFoundError`` when it
            does not exist).
    """
    timit_dict = {}
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith(";"):
                continue
            # Format: word /p r o n u n c i a t i o n/
            if "/" not in line:
                continue
            word_part, _, remainder = line.partition("/")
            # Keep only the text up to the closing slash, so anything that
            # follows it cannot leak into the pronunciation.
            phones = remainder.split("/", 1)[0].split()
            if strip_stress:
                phones = [p.rstrip("0123456789") for p in phones]
            timit_dict[word_part.strip().lower()] = tuple(p for p in phones if p)
    return timit_dict

# Candidate locations of the dictionary file, tried in the order listed
dict_paths = [
    f"{BASE_PATH}/TIMITDIC.TXT",
    f"{BASE_PATH}/../TIMITDIC.TXT",
    "/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/TIMITDIC.TXT",
]

timit_dict = None
for path in dict_paths:
    try:
        timit_dict = load_timit_dict(path)
    except FileNotFoundError:
        # Not at this location - try the next candidate
        continue
    else:
        print(f"Loaded dictionary from {path}: {len(timit_dict)} entries")
        break

if timit_dict is None:
    # No candidate existed: show the dataset root to help find the file
    print("TIMIT dictionary not found - listing available files...")
    import os
    for item in os.listdir("/kaggle/input/darpa-timit-acousticphonetic-continuous-speech/"):
        print(f"  {item}")

In [None]:
def align_sequences(ref, hyp):
    """
    Align two sequences by unit-cost edit distance.

    Returns the edit script from ``ref`` to ``hyp`` as a list of 3-tuples in
    left-to-right order:
    ('match', r, h), ('sub', r, h), ('del', r, None) or ('ins', None, h).
    When several optimal scripts exist, the backtrace prefers match, then
    substitution, then deletion, then insertion at each step.
    """
    n_ref, n_hyp = len(ref), len(hyp)

    # cost[r][h] = edit distance between ref[:r] and hyp[:h]
    cost = [list(range(n_hyp + 1))]
    for r in range(1, n_ref + 1):
        above = cost[-1]
        row = [r] + [0] * n_hyp
        for h in range(1, n_hyp + 1):
            if ref[r - 1] == hyp[h - 1]:
                row[h] = above[h - 1]
            else:
                # deletion, insertion, substitution
                row[h] = 1 + min(above[h], row[h - 1], above[h - 1])
        cost.append(row)

    # Walk back from the bottom-right corner, collecting steps in reverse
    steps = []
    r, h = n_ref, n_hyp
    while r or h:
        can_go_diagonal = r > 0 and h > 0
        if can_go_diagonal and ref[r - 1] == hyp[h - 1]:
            steps.append(('match', ref[r - 1], hyp[h - 1]))
            r, h = r - 1, h - 1
        elif can_go_diagonal and cost[r][h] == cost[r - 1][h - 1] + 1:
            steps.append(('sub', ref[r - 1], hyp[h - 1]))
            r, h = r - 1, h - 1
        elif r > 0 and cost[r][h] == cost[r - 1][h] + 1:
            steps.append(('del', ref[r - 1], None))
            r -= 1
        else:
            steps.append(('ins', None, hyp[h - 1]))
            h -= 1

    steps.reverse()
    return steps

# Demonstrate the aligner on a short hand-written pair of phone sequences
ref = ('dh', 'ax', 's', 't', 'ao', 'r')
hyp = ('dh', 'ix', 's', 'ao', 'r')
print("Reference:", ref)
print("Hypothesis:", hyp)
print("Alignment:")
example_ops = align_sequences(ref, hyp)
for op in example_ops:
    # Each op is an (operation, ref_item, hyp_item) tuple
    print(f"  {op}")

In [None]:
# Extract transformations across the corpus
from collections import Counter

# Tallies filled from the alignments below
substitutions = Counter()  # (ref_phone, actual_phone) -> count
deletions = Counter()      # ref_phone -> count
insertions = Counter()     # actual_phone -> count
matches = Counter()        # phone -> count (for computing rates)

words_analyzed = 0      # counts every observed pronunciation (tokens)
words_not_in_dict = 0   # counts distinct words missing from the dictionary

if not timit_dict:
    print("Cannot analyze - dictionary not loaded")
else:
    for word, pronunciations in word_pronunciations.items():
        if word not in timit_dict:
            words_not_in_dict += 1
            continue

        dict_pron = timit_dict[word]

        # Align the dictionary form against each observed pronunciation
        for actual_pron in pronunciations:
            words_analyzed += 1
            for op, ref_phone, actual_phone in align_sequences(dict_pron, actual_pron):
                if op == 'sub':
                    substitutions[(ref_phone, actual_phone)] += 1
                elif op == 'ins':
                    insertions[actual_phone] += 1
                elif op == 'del':
                    deletions[ref_phone] += 1
                elif op == 'match':
                    matches[ref_phone] += 1

    print(f"Words analyzed: {words_analyzed}")
    print(f"Words not in dictionary: {words_not_in_dict}")
    print(f"\nUnique substitution types: {len(substitutions)}")
    print(f"Unique deletions: {len(deletions)}")
    print(f"Unique insertions: {len(insertions)}")

In [None]:
# Display most common transformations

# Total substitution count per reference phone, summed once up front so the
# loops below do not rescan every substitution for each printed row.
sub_totals = {}
for (ref, _), count in substitutions.items():
    sub_totals[ref] = sub_totals.get(ref, 0) + count

print("=== Top 20 Substitutions ===")
for (ref, actual), count in substitutions.most_common(20):
    # Rate = share of ALL reference occurrences of this phone (matched,
    # substituted or deleted) that show this substitution.  Deletions are part
    # of the denominator so the figure agrees with the deletion rate below and
    # with compute_transformation_rules.
    total_occurrences = matches[ref] + deletions[ref] + sub_totals.get(ref, 0)
    rate = count / total_occurrences * 100 if total_occurrences > 0 else 0
    print(f"  {ref} -> {actual}: {count} ({rate:.1f}%)")

print("\n=== Top 20 Deletions ===")
for phone, count in deletions.most_common(20):
    total_occurrences = matches[phone] + deletions[phone] + sub_totals.get(phone, 0)
    rate = count / total_occurrences * 100 if total_occurrences > 0 else 0
    print(f"  {phone} deleted: {count} ({rate:.1f}%)")

print("\n=== Top 20 Insertions ===")
# Insertions have no reference phone, so only raw counts are shown
for phone, count in insertions.most_common(20):
    print(f"  {phone} inserted: {count}")

In [None]:
# Build transformation rules with probabilities
# Format suitable for applying to CMUdict

def compute_transformation_rules(matches, substitutions, deletions, min_count=5, min_rate=1.0):
    """
    Turn alignment tallies into per-phone transformation rules.

    Args:
        matches: Mapping ref_phone -> number of times it was matched.
        substitutions: Mapping (ref_phone, actual_phone) -> count.
        deletions: Mapping ref_phone -> number of times it was deleted.
        min_count: A transformation is kept only if it was seen at least this
            many times.
        min_rate: A transformation is kept only if its rate, in percent of all
            reference occurrences of the phone, is at least this value.

    Returns:
        dict: ref_phone -> list of ``(target, count, rate)`` tuples sorted by
        count descending, where ``target`` is the replacement phone or
        ``None`` for a deletion and ``rate`` is a percentage (0-100).  Phones
        with no surviving transformation are omitted.

    Any mapping with ``.get``/``.items`` works; Counters are not required.
    """
    # Group substitutions by reference phone in a single pass instead of
    # rescanning the whole table once per phone.
    subs_by_ref = {}
    for (ref, actual), count in substitutions.items():
        subs_by_ref.setdefault(ref, []).append((actual, count))

    # Every phone that appears on the reference side of any tally
    all_ref_phones = set(matches) | set(subs_by_ref) | set(deletions)

    rules = {}
    for phone in all_ref_phones:
        phone_subs = subs_by_ref.get(phone, [])
        del_count = deletions.get(phone, 0)

        # .get keeps plain dicts working for phones that were never matched
        total = matches.get(phone, 0) + del_count + sum(c for _, c in phone_subs)
        if total == 0:
            continue

        # Substitutions first, deletion last: the stable sort below keeps this
        # order among candidates with equal counts.
        candidates = list(phone_subs)
        candidates.append((None, del_count))

        transformations = []
        for target, count in candidates:
            if count < min_count:
                continue
            rate = count / total * 100
            if rate >= min_rate:
                transformations.append((target, count, rate))

        if transformations:
            transformations.sort(key=lambda x: -x[1])
            rules[phone] = transformations

    return rules

# Build the rules with the default thresholds and list them phone by phone
rules = compute_transformation_rules(matches, substitutions, deletions)

print("=== Transformation Rules (min 5 occurrences, min 1% rate) ===")
for phone, transforms in sorted(rules.items()):
    print(f"\n{phone}:")
    for target, count, rate in transforms:
        # A target of None stands for deletion of the phone
        if target is not None:
            print(f"  -> {target}: {count} ({rate:.1f}%)")
        else:
            print(f"  -> ∅ (delete): {count} ({rate:.1f}%)")

In [None]:
# TIMIT to ARPABET (CMUdict) phone mapping
# TIMIT uses a slightly different phoneset than CMUdict
#
# Keys are lower-case TIMIT labels; values are upper-case ARPABET symbols
# without stress digits.  The table is many-to-one (for example 'ax' and 'ah'
# both map to 'AH', 'ix' and 'ih' both map to 'IH'), so it cannot be inverted.
# An empty-string value means "no ARPABET equivalent": timit_to_arpabet skips
# such phones.  Labels missing from the table are upper-cased by the callers.
TIMIT_TO_ARPABET = {
    # Vowels - TIMIT often has more distinctions
    'ax': 'AH',      # schwa
    'ix': 'IH',      # reduced high front (often schwa-like)
    'ux': 'UW',      # reduced high back
    'axr': 'ER',     # schwa + r
    'ax-h': 'AH',    # breathy schwa
    'em': 'M',       # syllabic m (CMU doesn't have this)
    'en': 'N',       # syllabic n (CMU doesn't have this)
    'eng': 'NG',     # syllabic ng
    'el': 'L',       # syllabic l (CMU doesn't have this)
    'nx': 'N',       # flap (alveolar nasal)
    'dx': 'D',       # flap (often realized as D or T)
    'q': '',         # glottal stop (not in CMU)
    'hv': 'HH',      # voiced h
    # Direct mappings (lowercase to uppercase)
    'aa': 'AA', 'ae': 'AE', 'ah': 'AH', 'ao': 'AO', 'aw': 'AW',
    'ay': 'AY', 'eh': 'EH', 'er': 'ER', 'ey': 'EY', 'ih': 'IH',
    'iy': 'IY', 'ow': 'OW', 'oy': 'OY', 'uh': 'UH', 'uw': 'UW',
    'b': 'B', 'ch': 'CH', 'd': 'D', 'dh': 'DH', 'f': 'F', 'g': 'G',
    'hh': 'HH', 'jh': 'JH', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
    'ng': 'NG', 'p': 'P', 'r': 'R', 's': 'S', 'sh': 'SH', 't': 'T',
    'th': 'TH', 'v': 'V', 'w': 'W', 'y': 'Y', 'z': 'Z', 'zh': 'ZH',
}

def timit_to_arpabet(timit_phones):
    """Map a sequence of TIMIT phone labels to a tuple of ARPABET symbols.

    Each label is looked up in ``TIMIT_TO_ARPABET``; labels not in the table
    are upper-cased.  Phones whose mapping is the empty string (like the
    glottal stop) are left out of the result.
    """
    converted = (TIMIT_TO_ARPABET.get(label, label.upper()) for label in timit_phones)
    return tuple(symbol for symbol in converted if symbol)

# Re-key the TIMIT rules by ARPABET symbol, mapping targets the same way.
# NOTE(review): several TIMIT phones share one ARPABET symbol (e.g. 'ax' and
# 'ah' both become 'AH'), so one key can collect entries from multiple source
# phones.  Their rates were computed against different totals, and the same
# target may appear more than once.
arpabet_rules = {}
for phone, transforms in rules.items():
    src = TIMIT_TO_ARPABET.get(phone, phone.upper())
    if not src:
        # Source phone has no ARPABET equivalent
        continue
    bucket = arpabet_rules.setdefault(src, [])
    for target, count, rate in transforms:
        if target is None:
            bucket.append((None, count, rate))
            continue
        tgt = TIMIT_TO_ARPABET.get(target, target.upper())
        # Drop targets with no equivalent and rules that collapse to identity
        if tgt and tgt != src:
            bucket.append((tgt, count, rate))

print("=== Rules in ARPABET format ===")
for phone, transforms in sorted(arpabet_rules.items()):
    if not transforms:
        continue
    print(f"\n{phone}:")
    for target, count, rate in transforms:
        if target is not None:
            print(f"  -> {target}: {count} ({rate:.1f}%)")
        else:
            print(f"  -> ∅ (delete): {count} ({rate:.1f}%)")

In [None]:
# Export rules to JSON for later use
import json

# Phones with an empty rule list are left out; rates are rounded to two
# decimals and a deletion target (None) is written by json as null.
export_rules = {
    phone: [
        {"target": t, "count": c, "rate": round(r, 2)}
        for t, c, r in transforms
    ]
    for phone, transforms in arpabet_rules.items()
    if transforms
}

with open("timit_transformation_rules.json", "w") as f:
    json.dump(export_rules, f, indent=2)

print(f"Exported {len(export_rules)} phone rules to timit_transformation_rules.json")

In [None]:
# Example: Generate pronunciation variants for a CMUdict entry
def generate_variants(pronunciation, rules, max_variants=10, vowels=None):
    """
    Generate pronunciation variants by applying one rule at a time.

    Args:
        pronunciation: Sequence of phone strings, optionally ending in stress
            digits (CMUdict style, e.g. ``'AO1'``).
        rules: Mapping from a stress-free phone to a list of dicts that each
            have a ``"target"`` key: a replacement phone, or ``None`` to
            delete the phone.
        max_variants: Upper bound on the number of pronunciations returned.
            The base pronunciation is always returned, even when this is
            smaller than 1.
        vowels: Optional collection of stress-free symbols that may carry a
            stress digit.  Defaults to the ARPABET vowels.

    Returns:
        A list of tuples.  The base pronunciation comes first, followed by the
        distinct variants in the order they were generated (left to right
        through the word, rules in list order).  A deletion that would leave
        nothing is never emitted.
    """
    if vowels is None:
        # ARPABET vowels: in CMUdict only these carry a stress digit
        vowels = {
            "AA", "AE", "AH", "AO", "AW", "AY", "EH", "ER",
            "EY", "IH", "IY", "OW", "OY", "UH", "UW",
        }

    base = tuple(pronunciation)
    # A list plus a membership set gives a deterministic, duplicate-free
    # result; a bare set would return variants in arbitrary order.
    variants = [base]
    seen = {base}

    for i, phone in enumerate(base):
        # Strip stress markers for lookup
        phone_base = ''.join(c for c in phone if not c.isdigit())
        stress = ''.join(c for c in phone if c.isdigit())

        for rule in rules.get(phone_base, ()):
            # Check before adding so the limit is never exceeded
            if len(variants) >= max_variants:
                return variants

            target = rule["target"]
            if target is None:
                # Deletion
                candidate = base[:i] + base[i + 1:]
            elif target in vowels:
                # Vowel replacement inherits the original stress digit
                candidate = base[:i] + (target + stress,) + base[i + 1:]
            else:
                # A consonant must not pick up a stress digit (no 'R0')
                candidate = base[:i] + (target,) + base[i + 1:]

            # Skip empty pronunciations and anything already produced
            if not candidate or candidate in seen:
                continue
            seen.add(candidate)
            variants.append(candidate)

    return variants

# Worked example: "water" in CMUdict format, expanded with the exported rules
example_pron = "W AO1 T ER0".split()
print(f"Base pronunciation: {' '.join(example_pron)}")
print("Variants:")
example_variants = generate_variants(example_pron, export_rules)
for var in example_variants:
    print(f"  {' '.join(var)}")