In [1]:
from pathlib import Path
from datetime import datetime

In [2]:
now = datetime.now()
now

datetime.datetime(2024, 3, 22, 23, 19, 40, 57029)

In [3]:
def flatten_list(nested_list):
    result = []
    for element in nested_list:
        if isinstance(element, list):
            result.extend(flatten_list(element))
        else:
            result.append(element)
    return result

In [4]:
dbp_path = sorted(Path("./source/dbp_clean/").glob("*.list"))
dbp_path

[PosixPath('source/dbp_clean/johor.list'),
 PosixPath('source/dbp_clean/kedah.list'),
 PosixPath('source/dbp_clean/kelantan.list'),
 PosixPath('source/dbp_clean/melaka.list'),
 PosixPath('source/dbp_clean/negeri.list'),
 PosixPath('source/dbp_clean/pahang.list')]

In [5]:
single_path = Path("./source/single.list")
common_path = Path("./source/common.list")
syllable_path = Path("./source/syllable.list")

In [6]:
paths = [syllable_path,single_path,common_path] + dbp_path
paths

[PosixPath('source/syllable.list'),
 PosixPath('source/single.list'),
 PosixPath('source/common.list'),
 PosixPath('source/dbp_clean/johor.list'),
 PosixPath('source/dbp_clean/kedah.list'),
 PosixPath('source/dbp_clean/kelantan.list'),
 PosixPath('source/dbp_clean/melaka.list'),
 PosixPath('source/dbp_clean/negeri.list'),
 PosixPath('source/dbp_clean/pahang.list')]

In [7]:
contents = []
for path in paths:
    with open(path, 'r') as file:
        content = file.read()
        contents.append(content)

In [8]:
contents = flatten_list(contents)

In [9]:
contentss = "\n".join(contents).split("\n")
contentss[-10:]

['kelestarian lingkungan|K EH L EH S T AA R IY AA N L IH NG K UW NG AA N',
 'pemanasan global|P EH M AA N AA S AA N G L OW B AA L',
 'energi terbarukan|EH N ER JH IY T ER B AA R UW K AA N',
 'pengelolaan sumber daya|P EH NG EH L OW L AA N S UW M B ER D AA Y AA',
 'teknologi hijau|T EH K N OW L OW G IY HH IY JH UW',
 'realitas maya|R IY AA L IH T AA S M AA Y AA',
 'kecerdasan buatan|K EH CH ER D AA S AA N B UW AA T AA N',
 'media digital|M IY D IY AA D IH JH IH T AA L',
 'jaringan tanpa wayar|J AA R IH NG AA N T AA N P AA W AA Y AA R',
 'pemasaran online|P EH M AA S AA R AA N OW N L AY N']

In [10]:
# Create a dictionary to store unique text entries and their associated ARPABET representations
unique_entries = {}

# Iterate through each entry and update the dictionary with only the first occurrence of each text entry
for entry in contentss:
    if not entry:
        continue
    text, arpabet = entry.split('|')
    if text not in unique_entries:
        unique_entries[text] = arpabet

# Convert the unique dictionary entries back to a list of strings
unique_list = [f"{text.strip()}|{arpabet.strip()}" for text, arpabet in unique_entries.items()]

# Print the unique entries
print(unique_list)

['a|AH0', 'al|AH0 L', 'am|AH0 M', 'an|AH0 N', 'ang|AH0 NG', 'ap|AH0 P', 'ar|AH0 R', 'as|AH0 S', 'at|AH0 T', 'au|AW1', 'b|B', 'ba|B AA1', 'bab|B AE1 B', 'bah|B AA1', 'bai|B AY1', 'bak|B AE1 K', 'bal|B AE1 L', 'ban|B AE1 N', 'bang|B AE1 NG', 'bar|B AA1 R', 'bas|B AE1 S', 'bau|B AW1', 'be|B IY1', 'beh|B EH1', 'bek|B EH1 K', 'ben|B EH1 N', 'beng|B EH1 NG', 'ber|B ER', 'bes|B EH1 S', 'bi|B IH', 'big|B IH G', 'bih|B IH', 'bin|B IH N', 'bing|B IH NG', 'bio|B IH OW1', 'bit|B IH1 T', 'bloc|B L AA1 K', 'bo|B OW1', 'boh|B OW1 HH', 'bok|B OW1 K', 'bon|B OW1 N', 'bong|B OW1 NG', 'bot|B OW1 T', 'bu|B UW1', 'bua|B UW1 AH0', 'buah|B UW1 AH0 HH', 'buat|B UW1 AH0 T', 'bun|B UW1 N', 'but|B UW1 T', 'c|CH', 'ca|CH AH0', 'cak|CH AH0 K', 'cam|CH AH0 M', 'cang|CH AH0 NG', 'cap|CH AH0 P', 'ce|CH EY0', 'cem|CH EY0 M', 'cer|CH ER', 'chi|CH IY1', 'ci|CH IH', 'cik|CH IH K', 'cil|CH IH L', 'cin|CH IH N', 'cing|CH IH NG', 'clo|CH L OW1', 'co|CH OW1', 'cok|CH OW1 K', 'com|CH OW1 M', 'cot|CH OW1 T', 'crypto|K R IH1 P 

In [11]:
# Open a new file for writing the combined contents
with open('combine.rep', 'w+') as combined_file:
    
    combined_file.write(f"## Date:  {now.date()}\n")
    combined_file.write("## Generated by DSMO\n\n")
    for data in unique_list:
        combined_file.write(data.replace("|",'  ') + '\n')  # Adding a newline as a delimiter