In [1]:
with open("train.txt", "r", encoding="utf-8") as f: 
    lines = f.readlines()
    
lines = [line.strip() for line in lines]

In [2]:
clean_lines = []

for line in lines:
    if not line.strip():
        continue  # remove empty lines
    if line.startswith("=") and line.endswith("="):
        continue  # remove titles
    clean_lines.append(line)

clean_lines[:3]

['MANILA - Isang task force ang binuo ng ng National Capital Region Police Office (NCRPO) nitong Lunes upang tugisin ang suspek sa pagpatay ng dalawang opisyal ng Commission on Elections (Comelec). Ang suspek ay napabalitang nakatakas kamakailan mula sa piitan ng isang kampo ng pulisya.',
 'Ayon kay Chief Superintendent Eric Javier, NCRPO officer-in-charge, nilikha ang Task Force Ampatuan upang hanapin si PO2 Basser Ampatuan. Ngunit pinabulaanan ni Javier na nakatakas ang suspek, dahil di naman daw ito nake-ditene.',
 'Si Chief Inspector Agapito Quimson, ang namumuno sa task force, ayon kay Javier.']

In [3]:
import re
import unicodedata

# Set to store unique words
all_words_set = set()

# Regex to match Tagalog-character words (letters + hyphen)
word_pattern = re.compile(r"[a-zA-ZñÑáéíóúÁÉÍÓÚ\-]+")

for line in clean_lines:
    # Normalize Unicode to NFC
    line = unicodedata.normalize("NFC", line)
    
    # Standardize whitespace
    line = re.sub(r"\s+", " ", line).strip()
    
    # Separate punctuation from words
    line = re.sub(r"([.,!?;:()\"“”‘’])", r" \1 ", line)
    
    # Find all candidate words
    words = word_pattern.findall(line)
    
    for word in words:
        word_clean = word.strip()
    
        # Skip words with any uppercase (proper nouns, acronyms, etc.)
        if any(c.isupper() for c in word_clean):
            continue
        
        # Skip short or very long words
        if not (3 <= len(word_clean) <= 25):
            continue
        
        all_words_set.add(word_clean)

# sorted() ensures deterministic order every run (sets are unordered)
all_words = sorted(all_words_set)
print(f"Number of filtered words: {len(all_words)}")

Number of filtered words: 227175


In [4]:
import random

# all_words is sorted for deterministic input; seed fixes the random selection
random.seed(42)

if len(all_words) > 10000:
    candidates_10k = random.sample(all_words, 10000)
else:
    candidates_10k = all_words

len(candidates_10k)

10000

In [5]:
# ── MORPHEME COVERAGE PRE-CHECK (run on candidates_10k before CSV export) ──
import re

prefixes    = ['mag', 'nag', 'pag', 'mang', 'nang', 'sang', 'tag', 'ma', 'na', 'ka', 'pa', 'i']
suffixes    = ['in', 'an', 'han', 'hin', 'ng']
infixes     = ['um', 'in']
circumfixes = [('pag', 'in'), ('pag', 'an'), ('ka', 'an'), ('ma', 'an')]
reduplication_pattern = re.compile(r'^(\w{2,})-\1$')

prefix_candidates, suffix_candidates = [], []
infix_candidates, circumfix_candidates, reduplication_candidates = [], [], []

for word in candidates_10k:
    if reduplication_pattern.match(word):
        reduplication_candidates.append(word)
        continue

    is_circumfix = False
    for (pre, suf) in circumfixes:
        if word.startswith(pre) and word.endswith(suf) and len(word) > len(pre) + len(suf):
            circumfix_candidates.append(word)
            is_circumfix = True
            break
    if is_circumfix:
        continue

    if any(word.startswith(p) for p in prefixes):
        prefix_candidates.append(word)
    if any(word.endswith(s) for s in suffixes):
        suffix_candidates.append(word)
    for inf in infixes:
        idx = word.find(inf)
        if 0 < idx <= 3:
            infix_candidates.append(word)
            break

targets = {'Prefix': 1260, 'Suffix': 980, 'Infix': 595, 'Circumfix': 560, 'Reduplication': 105}
counts  = {
    'Prefix': len(prefix_candidates), 'Suffix': len(suffix_candidates),
    'Infix': len(infix_candidates), 'Circumfix': len(circumfix_candidates),
    'Reduplication': len(reduplication_candidates),
}

print('── Morpheme Coverage Pre-check (on 10k pool) ──')
print(f'{"Type":<15} {"Candidates":<12} {"Target":<10} {"Ratio":<8} Status')
print('-' * 58)
for morpheme, target in targets.items():
    count = counts[morpheme]
    ratio = count / target
    status = 'OK' if ratio >= 1.9 else 'LOW — expand source text'
    print(f'{morpheme:<15} {count:<12} {target:<10} {ratio:<8.1f} {status}')

print('\nNote: counts overlap (a word can be both prefix + suffix candidate).')
print('This is a rough headroom check only — manual sorting is your quality gate.')

── Morpheme Coverage Pre-check (on 10k pool) ──
Type            Candidates   Target     Ratio    Status
----------------------------------------------------------
Prefix          4436         1260       3.5      OK
Suffix          2077         980        2.1      OK
Infix           1229         595        2.1      OK
Circumfix       264          560        0.5      LOW — expand source text
Reduplication   23           105        0.2      LOW — expand source text

Note: counts overlap (a word can be both prefix + suffix candidate).
This is a rough headroom check only — manual sorting is your quality gate.


In [6]:
# ── TOP-UP: Build new pool from scratch to guarantee morpheme coverage ──
# Single seed at the top — all_words is sorted so shuffles are fully deterministic
random.seed(42)

# Helper functions
def is_suffix(w): return any(w.endswith(s) for s in suffixes)
def is_infix(w):  return any(0 < w.find(inf) <= 3 for inf in infixes)
def is_circ(w):   return any(w.startswith(p) and w.endswith(s) and len(w) > len(p)+len(s) for p,s in circumfixes)
def is_redup(w):  return bool(reduplication_pattern.match(w))

# Step 1: Collect all candidates per type from sorted all_words
pool_suffix = [w for w in all_words if is_suffix(w)]
pool_infix  = [w for w in all_words if is_infix(w)]
pool_circ   = [w for w in all_words if is_circ(w)]
pool_redup  = [w for w in all_words if is_redup(w)]

# Step 2: Shuffle each pool (deterministic because all_words is sorted + seed is set)
random.shuffle(pool_suffix)
random.shuffle(pool_infix)
random.shuffle(pool_circ)
random.shuffle(pool_redup)

# Step 3: Take 2x target from each type, deduplicate
guaranteed = list(dict.fromkeys(
    pool_suffix[:(980*2)] +
    pool_infix[:(595*2)] +
    pool_circ[:(560*2)] +
    pool_redup[:(105*2)]
))
print(f"Guaranteed morpheme words (deduped): {len(guaranteed)}")

# Step 4: Fill remaining slots from all_words excluding guaranteed
guaranteed_set = set(guaranteed)
remaining_pool = [w for w in all_words if w not in guaranteed_set]
random.shuffle(remaining_pool)

slots_left = 10000 - len(guaranteed)
filler = remaining_pool[:slots_left]

# Step 5: Combine and shuffle
candidates_10k = guaranteed + filler
random.shuffle(candidates_10k)

print(f"Final pool size: {len(candidates_10k)}")

Guaranteed morpheme words (deduped): 4416
Final pool size: 10000


In [7]:
# ── RE-CHECK after top-up ──
prefix_candidates, suffix_candidates = [], []
infix_candidates, circumfix_candidates, reduplication_candidates = [], [], []

for word in candidates_10k:
    if reduplication_pattern.match(word):
        reduplication_candidates.append(word)
        continue

    is_circumfix = False
    for (pre, suf) in circumfixes:
        if word.startswith(pre) and word.endswith(suf) and len(word) > len(pre) + len(suf):
            circumfix_candidates.append(word)
            is_circumfix = True
            break
    if is_circumfix:
        continue

    if any(word.startswith(p) for p in prefixes):
        prefix_candidates.append(word)
    if any(word.endswith(s) for s in suffixes):
        suffix_candidates.append(word)
    for inf in infixes:
        idx = word.find(inf)
        if 0 < idx <= 3:
            infix_candidates.append(word)
            break

counts = {
    'Prefix': len(prefix_candidates), 'Suffix': len(suffix_candidates),
    'Infix': len(infix_candidates), 'Circumfix': len(circumfix_candidates),
    'Reduplication': len(reduplication_candidates),
}

print('── Re-check after top-up ──')
print(f'{"Type":<15} {"Candidates":<12} {"Target":<10} {"Ratio":<8} Status')
print('-' * 58)
for morpheme, target in targets.items():
    count = counts[morpheme]
    ratio = count / target
    status = 'OK' if ratio >= 1.9 else 'LOW — still need more'
    print(f'{morpheme:<15} {count:<12} {target:<10} {ratio:<8.1f} {status}')

print(f'\nFinal pool size: {len(candidates_10k)}')

── Re-check after top-up ──
Type            Candidates   Target     Ratio    Status
----------------------------------------------------------
Prefix          3473         1260       2.8      OK
Suffix          3256         980        3.3      OK
Infix           2181         595        3.7      OK
Circumfix       1447         560        2.6      OK
Reduplication   234          105        2.2      OK

Final pool size: 10000


In [8]:
import pandas as pd

# Convert list to DataFrame (one word per row)
df = pd.DataFrame(candidates_10k, columns=["word"])

# Save to CSV
df.to_csv("candidates_10k.csv", index=False, encoding="utf-8")