### Load Data

In [None]:
%%capture
!pip install datasets

In [None]:
import json
import os

import pandas as pd
from datasets import load_dataset, Dataset, concatenate_datasets

Bilingual Dictionary

In [None]:
PATH_DATA = "https://raw.githubusercontent.com/joanitolopo/instruction-tuning-mkn/refs/heads/main/app/data.json?token=GHSAT0AAAAAACNOVZ2MHTV4WXYU3L7TZC3SZY7XKVA"
!wget $PATH_DATA -O data.json

In [None]:
# Parse the bilingual dictionary fetched into data.json. The encoding is pinned
# to UTF-8 so the result does not depend on the platform's default encoding.
with open("data.json", "r", encoding="utf-8") as dictionary_file:
    dictionary_data = json.load(dictionary_file)

Panlex Meanings

In [None]:
# Load the 'mkn' and 'eng' configurations of the PanLex meanings dataset.
ds_mkn, ds_eng = (
    load_dataset('cointegrated/panlex-meanings', name=lang_code, split='train')
    for lang_code in ('mkn', 'eng')
)
df_mkn, df_eng = ds_mkn.to_pandas(), ds_eng.to_pandas()

# Pair entries that share a meaning id, then keep one row per (mkn, eng) text pair.
df_mkn_eng = pd.merge(df_mkn, df_eng, on='meaning', suffixes=['_mkn', '_eng'])
df_mkn_eng = df_mkn_eng.drop_duplicates(subset=['txt_mkn', 'txt_eng'])

# Show a handful of random pairs as a quick sanity check.
preview_columns = ['txt_mkn', 'txt_eng', 'langvar_uid_mkn']
df_mkn_eng.sample(5)[preview_columns]

README.md:   0%|          | 0.00/69.8k [00:00<?, ?B/s]

data/mkn.tsv:   0%|          | 0.00/234k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4934 [00:00<?, ? examples/s]

eng.tsv:   0%|          | 0.00/786M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15886665 [00:00<?, ? examples/s]

Unnamed: 0,txt_mkn,txt_eng,langvar_uid_mkn
4653,kasi idop,stimulate,mkn-000
3161,garui,dense,mkn-000
1151,bangsa,like,mkn-000
7414,kasi brenti,restrain,mkn-000
376,anggor,grapevine,mkn-000


BibleNLP

In [None]:
# Load the eng-mkn configuration of the BibleNLP corpus, concatenating the
# train, validation and test splits into a single Dataset.
biblenlp_mkn = load_dataset(
    "davidstap/biblenlp-corpus-mmteb",
    "eng-mkn",
    split="train+validation+test",
    trust_remote_code=True,
)
biblenlp_mkn

Dataset({
    features: ['eng', 'mkn'],
    num_rows: 1229
})

Bhinneka Korpus

In [None]:
# URL of the Bhinneka Korpus parallel spreadsheet (read with pandas in the next cell).
bhinneka_path = "https://github.com/joanitolopo/bhinneka-korpus/raw/refs/heads/main/parallel.xlsx"

In [None]:
# Read the spreadsheet (first column is the index), keep only the Indonesian and
# Kupang Malay columns, renumber the rows from zero and wrap the result as a Dataset.
bhinneka_frame = pd.read_excel(bhinneka_path, index_col=0)
bhinneka_pairs = bhinneka_frame[["ind", "mkn"]].reset_index(drop=True)
bhinneka_korpus = Dataset.from_pandas(bhinneka_pairs)
bhinneka_korpus

Dataset({
    features: ['ind', 'mkn'],
    num_rows: 4000
})

Tapaleuk

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

In [None]:
# Precompile regex patterns for efficiency
# TX_PATTERN captures the text that follows a `\tx ` marker up to the newline that
# ends that line (a trailing newline is required for a match).
TX_PATTERN = re.compile(r'\\tx (.*?)\n')
# FT_PATTERN captures the text that follows a `\ft ` marker up to, but excluding, the
# first literal period on the same line. An `\ft` line with no period does not match,
# and text after the first period is not captured.
# NOTE(review): this may drop or truncate translations — confirm it is intended.
FT_PATTERN = re.compile(r'\\ft (.*?)\.')

def fetch_url(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would stall
            the thread-pool workers that call this helper.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        ``None`` when the request fails or the server returns an error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this page.
        print(f"Error fetching URL {url}: {e}")
        return None

def extract_href_list(soup):
    """Collect absolute item URLs from a collection listing page.

    Args:
        soup: Parsed listing page, or a falsy value when the fetch failed.

    Returns:
        A list of ``https://archive.mpi.nl``-prefixed URLs, one per
        ``views-field views-field-fgs-label-s`` div whose first anchor has an
        ``href``. Divs without an anchor, and anchors without an ``href``, are
        skipped so that no ``...nlNone`` URL is ever produced.
    """
    if not soup:
        return []

    href_list = []
    for div in soup.find_all("div", class_="views-field views-field-fgs-label-s"):
        anchor = div.find("a")  # looked up once per div
        href = anchor.get("href") if anchor else None
        if href:
            href_list.append(f"https://archive.mpi.nl{href}")
    return href_list

def extract_tbt_links(href_list):
    """Resolve each item page to the URL linked from its caption block.

    Every URL in ``href_list`` is fetched in a thread pool. The first anchor
    inside the page's ``flat-compound-caption`` div supplies the link.

    Args:
        href_list: Absolute URLs of item pages.

    Returns:
        A list of ``https://archive.mpi.nl``-prefixed URLs in the order of
        ``href_list``. Pages that cannot be fetched, or that lack the caption
        div, its anchor or the anchor's ``href``, are logged and omitted.
    """
    def process_href(href):
        """Fetch one item page and return its caption link, or None."""
        soup = fetch_url(href)
        if soup:
            caption_div = soup.find("div", class_="flat-compound-caption")
            link = caption_div.find('a') if caption_div else None
            target = link.get('href') if link else None
            if target:
                return f"https://archive.mpi.nl{target}"
        # Log the missing case
        print(f"Missing 'flat-compound-caption' or link for href: {href}")
        return None

    with ThreadPoolExecutor() as executor:
        # executor.map keeps input order; filter(None, ...) drops the failures.
        return list(filter(None, executor.map(process_href, href_list)))

def parse_segments(tbt_links):
    """Parse MKN and ENG segments from the TBT links.

    Each link is fetched in a thread pool. Every ``<p>`` inside the page's
    ``plain-text`` div is searched with ``TX_PATTERN`` (Kupang Malay line) and
    ``FT_PATTERN`` (English line); a paragraph where a pattern does not match
    contributes an empty string for that side, so the two sides stay aligned.

    Args:
        tbt_links: URLs of the text pages to parse.

    Returns:
        A 4-tuple ``(mkn_segments, eng_segments, mkn_documents, eng_documents)``.
        The first two are flat, pairwise-aligned lists over all pages. The last
        two hold one space-joined string per input link (an empty string when a
        page could not be fetched or has no ``plain-text`` div).
    """
    mkn_segment_sentence, eng_segment_sentence = [], []
    mkn_sentence, eng_sentence = [], []

    def process_tbt_link(tbt_link):
        """Return the (mkn, eng) segment lists of a single page."""
        soup = fetch_url(tbt_link)
        if not soup:
            return [], []

        # Some pages carry no transcript container; treat them as empty
        # instead of failing with AttributeError on None.find_all.
        plain_text_div = soup.find("div", class_="plain-text")
        if not plain_text_div:
            return [], []

        text_segments = plain_text_div.find_all("p")
        temp_mkn, temp_eng = [], []

        for segment in text_segments:
            tx_match = TX_PATTERN.search(segment.text)
            ft_match = FT_PATTERN.search(segment.text)

            tx_text = tx_match.group(1).strip() if tx_match else ""
            ft_text = ft_match.group(1).strip() if ft_match else ""

            # Collapse internal runs of whitespace to single spaces.
            tx_text = re.sub(r'\s+', ' ', tx_text)
            ft_text = re.sub(r'\s+', ' ', ft_text)

            temp_mkn.append(tx_text)
            temp_eng.append(ft_text)

        return temp_mkn, temp_eng

    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_tbt_link, tbt_links))

    for temp_mkn, temp_eng in results:
        mkn_segment_sentence.extend(temp_mkn)
        eng_segment_sentence.extend(temp_eng)
        mkn_sentence.append(" ".join(temp_mkn))
        eng_sentence.append(" ".join(temp_eng))

    return mkn_segment_sentence, eng_segment_sentence, mkn_sentence, eng_sentence

In [None]:
def main(base_url, pages=5):
    """Scrape ``pages`` listing pages of a collection and gather all its text.

    Page 0 is ``base_url`` itself; page ``i > 0`` is ``{base_url}?page={i}``.
    Example collection URL:
    https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5ACF_5

    Args:
        base_url: URL of the first listing page of the collection.
        pages: Number of listing pages to walk.

    Returns:
        A 4-tuple of lists accumulated over all pages, in the order returned by
        ``parse_segments``: MKN segments, ENG segments, MKN documents, ENG documents.
    """
    collected = ([], [], [], [])

    for page_no in range(pages):
        page_url = f"{base_url}?page={page_no}" if page_no != 0 else base_url
        print(f"=== Parsing page {page_no} ===")

        # listing page -> item pages -> text pages -> parsed segments
        listing = fetch_url(page_url)
        item_links = extract_href_list(listing)
        text_links = extract_tbt_links(item_links)
        page_mkn_seg, page_eng_seg, page_mkn_doc, page_eng_doc = parse_segments(text_links)

        page_parts = (page_mkn_seg, page_eng_seg, page_mkn_doc, page_eng_doc)
        for bucket, part in zip(collected, page_parts):
            bucket.extend(part)

        print(f"=== Finished parsing page {page_no} ===")

    return collected

In [None]:
# Scrape the Tapaleuk collection (five listing pages by default).
(
    mkn_segment_sentence_full,
    eng_segment_sentence_full,
    mkn_sentence_full,
    eng_sentence_full,
) = main("https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5ACF_5")

# Sanity check: the two segment lists must have equal length, as must the two
# document lists. This is the expression behind the recorded 4-tuple output.
len(mkn_segment_sentence_full), len(eng_segment_sentence_full), len(mkn_sentence_full), len(eng_sentence_full)

(4708, 4708, 84, 84)

In [None]:
# Segment-level Tapaleuk pairs: one row per aligned (mkn, eng) segment.
df = pd.DataFrame.from_dict({
    'mkn_segment_sentence_full': mkn_segment_sentence_full,
    'eng_segment_sentence_full': eng_segment_sentence_full,
})

# Persist without the row index so the file holds just the two text columns.
df.to_csv('tapaleuk_segment.csv', index=False)

In [None]:
# Document-level Tapaleuk pairs: one row per text, segments joined with spaces.
df = pd.DataFrame.from_dict({
    'mkn_sentence_full': mkn_sentence_full,
    'eng_sentence_full': eng_sentence_full,
})

# Persist without the row index so the file holds just the two text columns.
df.to_csv('tapaleuk_full_sentence.csv', index=False)

In [None]:
# Reload the segment-level Tapaleuk pairs from the CSV written above.
df = pd.read_csv("tapaleuk_segment.csv")

[Jakarta Field Station](https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5AD1_C)

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor

# Precompile regex patterns for efficiency
# TX_PATTERN: text after a `\tx ` marker, up to the newline ending that line.
TX_PATTERN = re.compile(r'\\tx (.*?)\n')
# FT_PATTERN: text after a `\ft ` marker, up to (excluding) the first literal period
# on the same line; an `\ft` line that contains no period does not match at all.
# NOTE(review): presumably a cause of the many empty English segments — confirm.
FT_PATTERN = re.compile(r'\\ft (.*?)\.')


def fetch_url(url, timeout=30):
    """Fetch the content of a URL and return the BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so that an
            unresponsive server cannot block a worker thread forever.

    Returns:
        The page parsed with ``html.parser``, or ``None`` if the request raised
        or the response had an HTTP error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        # Deliberately broad: a single bad page must not abort the whole scrape.
        print(f"Error fetching URL {url}: {e}")
        return None


def extract_href_list(soup):
    """Extract href links from the specified div class.

    Args:
        soup: Parsed listing page, or a falsy value when the fetch failed.

    Returns:
        Absolute ``https://archive.mpi.nl`` URLs taken from the first anchor of
        each ``views-field views-field-fgs-label-s`` div. Divs with no anchor
        and anchors with no ``href`` are skipped, which avoids emitting a
        malformed ``...nlNone`` URL.
    """
    if not soup:
        return []

    href_list = []
    for div in soup.find_all("div", class_="views-field views-field-fgs-label-s"):
        a_tag = div.find("a")  # single lookup per div
        href = a_tag.get("href") if a_tag else None
        if href:
            href_list.append(f"https://archive.mpi.nl{href}")
    return href_list


def extract_tbt_links(href_list):
    """Resolve item pages to the link held by their second compound child.

    Each URL in ``href_list`` is fetched in a thread pool. On every page the
    ``flat-compound-child`` divs are collected and the first anchor of the
    second one supplies the link. Pages that fail to load, have fewer than two
    such divs, or have no anchor there are dropped without a message.

    Returns:
        ``https://archive.mpi.nl``-prefixed URLs, in the order of ``href_list``.
    """
    def process_href(href):
        soup = fetch_url(href)
        if not soup:
            return None

        children = soup.find_all("div", class_="flat-compound-child")
        if len(children) < 2:  # the link lives in the second child
            return None

        anchor = children[1].find('a')
        if not anchor:
            return None
        return f"https://archive.mpi.nl{anchor.get('href')}"

    with ThreadPoolExecutor() as executor:
        resolved = executor.map(process_href, href_list)
        tbt_links = [link for link in resolved if link]

    return tbt_links


def parse_segments(tbt_links):
    """Extract aligned Kupang Malay / English text from the given text pages.

    Pages are fetched in a thread pool. For every ``<p>`` in a page's
    ``plain-text`` div, ``TX_PATTERN`` yields the MKN text and ``FT_PATTERN``
    the ENG text; a side whose pattern does not match becomes an empty string.

    Returns:
        ``(mkn_segments, eng_segments, mkn_documents, eng_documents)``: two flat
        segment lists over all pages, followed by two lists holding one
        space-joined string per input link (empty for pages that failed to load
        or have no ``plain-text`` div).
    """
    def _capture(pattern, text):
        # First capture group of the pattern, trimmed, whitespace runs collapsed.
        found = pattern.search(text)
        captured = found.group(1).strip() if found else ""
        return re.sub(r'\s+', ' ', captured)

    def process_tbt_link(tbt_link):
        soup = fetch_url(tbt_link)
        if not soup:
            return [], []

        plain_text_div = soup.find("div", class_="plain-text")
        if not plain_text_div:
            return [], []

        paragraphs = plain_text_div.find_all("p")
        mkn_lines = [_capture(TX_PATTERN, paragraph.text) for paragraph in paragraphs]
        eng_lines = [_capture(FT_PATTERN, paragraph.text) for paragraph in paragraphs]
        return mkn_lines, eng_lines

    with ThreadPoolExecutor() as executor:
        per_document = list(executor.map(process_tbt_link, tbt_links))

    # Flat, segment-level view across all documents.
    mkn_segment_sentence = [line for mkn_lines, _ in per_document for line in mkn_lines]
    eng_segment_sentence = [line for _, eng_lines in per_document for line in eng_lines]
    # Document-level view: one joined string per link.
    mkn_sentence = [" ".join(mkn_lines) for mkn_lines, _ in per_document]
    eng_sentence = [" ".join(eng_lines) for _, eng_lines in per_document]

    return mkn_segment_sentence, eng_segment_sentence, mkn_sentence, eng_sentence


def main(base_url, pages=5):
    """Walk the listing pages of a collection and gather all parsed text.

    Page 0 is ``base_url`` itself; page ``i > 0`` is ``{base_url}?page={i}``.

    Args:
        base_url: URL of the first listing page of the collection.
        pages: Number of listing pages to walk. Defaults to 5, the value that
            was previously hard-coded, so existing calls behave the same.

    Returns:
        A 4-tuple of lists accumulated over all pages: MKN segments, ENG
        segments, MKN documents, ENG documents.
    """
    mkn_segment_sentence_full, eng_segment_sentence_full = [], []
    mkn_sentence_full, eng_sentence_full = [], []

    for i in range(pages):
        url = base_url if i == 0 else f"{base_url}?page={i}"
        print(f"=== Parsing page {i} ===")

        # listing page -> item pages -> text pages -> parsed segments
        soup = fetch_url(url)
        href_list = extract_href_list(soup)
        tbt_links = extract_tbt_links(href_list)

        mkn_segment_sentence, eng_segment_sentence, mkn_sentence, eng_sentence = parse_segments(tbt_links)

        mkn_segment_sentence_full.extend(mkn_segment_sentence)
        eng_segment_sentence_full.extend(eng_segment_sentence)
        mkn_sentence_full.extend(mkn_sentence)
        eng_sentence_full.extend(eng_sentence)

        print(f"=== Finished parsing page {i} ===")

    return mkn_segment_sentence_full, eng_segment_sentence_full, mkn_sentence_full, eng_sentence_full

# Scrape the Jakarta Field Station collection.
base_url = "https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5AD1_C"
(
    mkn_segment_sentence_full,
    eng_segment_sentence_full,
    mkn_sentence_full,
    eng_sentence_full,
) = main(base_url)

=== Parsing page 0 ===
=== Finished parsing page 0 ===
=== Parsing page 1 ===
=== Finished parsing page 1 ===
=== Parsing page 2 ===
=== Finished parsing page 2 ===
=== Parsing page 3 ===
=== Finished parsing page 3 ===
=== Parsing page 4 ===
=== Finished parsing page 4 ===


In [None]:
# Segment-level Jakarta Field Station pairs: one row per aligned segment.
df = pd.DataFrame.from_dict({
    'mkn_segment_sentence_full': mkn_segment_sentence_full,
    'eng_segment_sentence_full': eng_segment_sentence_full,
})

# Persist without the row index so the file holds just the two text columns.
df.to_csv('jakarta_field_station_segment.csv', index=False)

In [None]:
# Document-level Jakarta Field Station pairs: one row per text, segments joined
# with spaces.
df = pd.DataFrame({
    'mkn_sentence_full': mkn_sentence_full,
    'eng_sentence_full': eng_sentence_full
})

# Save to CSV. The file name follows the 'jakarta_field_station_*' pattern of the
# segment file and matches the name the embedding section reads back.
df.to_csv('jakarta_field_station_full_sentence.csv', index=False)

In [None]:
# Reload the segment-level pairs and count missing values per column.
# NOTE(review): segments with no pattern match are stored as empty strings, which
# presumably come back from read_csv as NaN — that would explain the counts below.
df = pd.read_csv("jakarta_field_station_segment.csv")
df.isna().sum()

Unnamed: 0,0
mkn_segment_sentence_full,77
eng_segment_sentence_full,31801


In [None]:
# Project folder used for reading the scraped CSV files.
# NOTE(review): this is an absolute Google Drive path and no visible cell mounts
# the drive — presumably it is mounted manually; confirm before a fresh run.
PATH = "/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn"

In [None]:
# Add the scraped datasets (segment-level CSVs for both collections).
# NOTE(review): earlier cells write these CSVs to the working directory, while they
# are read from PATH here — presumably they were copied there by hand; confirm.
jfs_data = pd.read_csv(f"{PATH}/jakarta_field_station_segment.csv")
tapeleuk_data = pd.read_csv(f"{PATH}/tapaleuk_segment.csv")

In [None]:
# Keep only rows where both the Kupang Malay and the English segment are present.
jfs_data = jfs_data.dropna()
tapeleuk_data = tapeleuk_data.dropna()

In [None]:
# Convert both frames to Hugging Face Datasets. The index is reset first so the
# gaps left by dropna are renumbered from zero.
jfs_dataset = Dataset.from_pandas(jfs_data.reset_index(drop=True))
tapeleuk_dataset = Dataset.from_pandas(tapeleuk_data.reset_index(drop=True))

### Translate English Data to Indonesian using NLLB

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
from huggingface_hub import login
# login()

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Load the NLLB-200 distilled 1.3B checkpoint: tokenizer plus seq2seq model, with
# the model moved to the device selected above.
checkpoint = "facebook/nllb-200-distilled-1.3B"
tokenizer_nllb = AutoTokenizer.from_pretrained(checkpoint)
model_nllb = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# Language codes passed to the pipeline: English source, Indonesian target.
source_lang = "eng_Latn"
target_lang = "ind_Latn"

# Translation pipeline built on the already-loaded NLLB model and tokenizer.
translator = pipeline(
    "translation",
    model=model_nllb,
    tokenizer=tokenizer_nllb,
    src_lang=source_lang,
    tgt_lang=target_lang,
    max_length=400,
    device=device,
    batch_size=128,
)

# translated_texts = []
# for i, example in enumerate(jfs_dataset):
#     translated_text = translator(jfs_dataset['eng_segment_sentence_full'][i])[0]["translation_text"]
#     translated_texts.append(translated_text)
#     print(f"\rProgress: {i+1}/{len(jfs_dataset)}", end=" ")

Device set to use cuda


In [None]:
# Translate every English Tapaleuk segment to Indonesian in one pipeline call.
# NOTE(review): `translations` is reassigned by the batch loop further down and no
# visible cell stores this result — confirm whether it should be kept.
translations = translator(tapeleuk_dataset['eng_segment_sentence_full'], max_length=400)

In [None]:
# Number of translated segments (one result per input segment is expected).
len(translations)

3351

In [None]:
# Batch translation for efficiency
batch_size = 1000
translated_texts = []

# Look the English column up once, outside the loop, instead of indexing the
# Dataset by column name on every iteration.
eng_segments = jfs_dataset['eng_segment_sentence_full']
total_segments = len(eng_segments)

for i in range(0, total_segments, batch_size):
    batch = eng_segments[i:i + batch_size]
    translations = translator(batch, max_length=400)
    translated_texts.extend([t["translation_text"] for t in translations])
    # `\r` rewrites the same output line so progress does not flood the cell.
    print(f"\rProgress: {min(i + batch_size, total_segments)}/{total_segments}", end=" ")

print("\nTranslation completed.")

In [None]:
# Attach the Indonesian translations as an 'ind' column and drop the English column.
# NOTE(review): jfs_dataset is built from jakarta_field_station_segment.csv, whose
# columns are 'mkn_segment_sentence_full' and 'eng_segment_sentence_full', so it has
# no 'eng' column to remove. The recorded output (features ['mkn', 'ind'], 1229 rows)
# matches biblenlp_mkn instead — presumably this cell was last run on that dataset.
# Confirm which dataset is intended before relying on this cell.
jfs_dataset = jfs_dataset.add_column("ind", translated_texts)
jfs_dataset = jfs_dataset.remove_columns(['eng'])
jfs_dataset

Dataset({
    features: ['mkn', 'ind'],
    num_rows: 1229
})

In [None]:
# Stack the Bhinneka Korpus pairs and the BibleNLP pairs into one parallel corpus.
# NOTE(review): as loaded earlier, biblenlp_mkn has features ['eng', 'mkn'] while
# bhinneka_korpus has ['ind', 'mkn']. The recorded result (['ind', 'mkn'], 5229 rows)
# presumably relied on a translated copy of biblenlp_mkn from an earlier kernel
# state — confirm and make that step explicit.
parallel_data = concatenate_datasets([bhinneka_korpus, biblenlp_mkn])
parallel_data

Dataset({
    features: ['ind', 'mkn'],
    num_rows: 5229
})

In [None]:
# 80/20 train/test split. Both the shuffle and the split are seeded, so the
# published split can be regenerated exactly.
# Despite its name, `parallel_df` is a DatasetDict, not a DataFrame.
parallel_df = parallel_data.shuffle(seed=42).train_test_split(test_size=0.2, seed=42)
parallel_df

DatasetDict({
    train: Dataset({
        features: ['ind', 'mkn'],
        num_rows: 4183
    })
    test: Dataset({
        features: ['ind', 'mkn'],
        num_rows: 1046
    })
})

In [None]:
# Publish the split to the Hugging Face Hub.
# The access token is a credential and must never be stored in the notebook. It is
# read from the HF_TOKEN environment variable; when that is unset, None is passed
# and the library falls back to the credentials saved by `huggingface_hub.login()`.
# The token that was previously committed here should be revoked.
parallel_df.push_to_hub("joanitolopo/parallel-data-mkn",
                        token=os.environ.get("HF_TOKEN"))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/joanitolopo/parallel-data-mkn/commit/9ed46e161e63a27bebd50f487fbd4c0bad2cd4c1', commit_message='Upload dataset', commit_description='', oid='9ed46e161e63a27bebd50f487fbd4c0bad2cd4c1', pr_url=None, pr_revision=None, pr_num=None)

### Train Melayu Kupang (Kupang Malay) Word Embeddings

In [None]:
from gensim.models import FastText
from nltk.tokenize import word_tokenize
import nltk
import re
# Fetch the NLTK 'punkt' and 'punkt_tab' resources (presumably the tokenizer data
# required by word_tokenize below — confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Project folder holding the corpora and receiving the trained model. It is
# redefined here so this section can run without the translation section.
# NOTE(review): absolute Google Drive path; no visible cell mounts the drive.
PATH = "/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn"

In [None]:
# Add the corpora used to train the embeddings.
# parallel_mkn is consumed by the next cell, so it must actually be loaded here;
# both splits of the published parallel corpus are concatenated.
parallel_mkn = load_dataset("joanitolopo/parallel-data-mkn", split="train+test")
# a: tapaleuk_new.csv (its 'text' column is used later)
a = pd.read_csv(f"{PATH}/tapaleuk_new.csv")
# b / c: document-level Kupang Malay text of the two scraped collections
b = pd.read_csv(f"{PATH}/jakarta_field_station_full_sentence.csv")['mkn_sentence_full']
c = pd.read_csv(f"{PATH}/tapaleuk_full_sentence.csv")['mkn_sentence_full']
# d: 'train' split of the FineWeb-2 'mkn_Latn_removed' configuration
d = load_dataset("HuggingFaceFW/fineweb-2", name="mkn_Latn_removed")['train']
# e: puisi.csv, read as latin1 (presumably a poetry collection — confirm)
e = pd.read_csv(f"{PATH}/puisi.csv", encoding='latin1')

In [None]:
# Prepare data for FastText: one flat list of raw Kupang Malay texts.
fasttext_data = []

# Extract from parallel_mkn
fasttext_data.extend(parallel_mkn['mkn'])

# Extract from dataset a
fasttext_data.extend(a['text'].dropna().tolist())

# Extract from dataset b
fasttext_data.extend(b.dropna().tolist())

# Extract from dataset c
fasttext_data.extend(c.dropna().tolist())

# Extract from dataset d
fasttext_data.extend(d['text'])

# Extract from dataset e. Missing values are dropped like for the other pandas
# sources: a NaN entry is a float and would make the regex cleaning step fail.
fasttext_data.extend(e['text'].dropna().tolist())

In [None]:
# Cleaning steps applied to every text, in this exact order.
_CLEANING_STEPS = [
    (r"\[(.*?)\]", r"\1"),          # drop square brackets, keep their content
    (r'\d+', ''),                   # remove digit runs
    (r'\.+', '.'),                  # collapse repeated periods into one
    (r'\b[x]+[\.\s]*\b', ''),       # remove runs of 'x' plus trailing periods/whitespace
    (r'\((.*?)\)', r'\1'),          # drop parentheses, keep their content
    (r'\s+', ' '),                  # collapse whitespace runs to a single space
]


def _clean_text(text):
    """Apply every cleaning step in order and trim the result."""
    for pattern, replacement in _CLEANING_STEPS:
        text = re.sub(pattern, replacement, text)
    return text.strip()


fasttext_data = [_clean_text(data) for data in fasttext_data]

In [None]:
# Tokenize each cleaned text into a list of tokens, the input format passed to
# FastText below.
sentences = [word_tokenize(data) for data in fasttext_data]

In [None]:
# Train FastText embeddings on the tokenized corpus: 100-dimensional vectors,
# context window of 5, min_count=2, 4 worker threads.
# NOTE(review): no seed is passed and several workers are used, so repeated runs
# presumably give slightly different vectors — confirm if reproducibility matters.
fasttext_model = FastText(sentences=sentences, vector_size=100,
                          window=5, min_count=2, workers=4)

# Keyed-vectors view of the trained model, used for similarity queries.
w2v = fasttext_model.wv

In [None]:
# Spot-check the embeddings: nearest neighbours of the word "sonde".
w2v.similar_by_word("sonde")

[('sonde.', 0.9954634308815002),
 ('sonde-sonde', 0.9940887093544006),
 ("'sonde", 0.9837379455566406),
 ('sond', 0.9799175262451172),
 ('sony', 0.9519493579864502),
 ('sono', 0.9480435252189636),
 ('sonu', 0.9438668489456177),
 ('sone', 0.9414058327674866),
 ('sonto', 0.9275649189949036),
 ('sonba', 0.9207874536514282)]

In [None]:
# Persist the trained model under the project folder.
# NOTE(review): `save` presumably writes gensim's own format rather than the
# Facebook fastText binary the `.bin` extension suggests — confirm, and reload it
# with the matching gensim loader.
fasttext_model.save(f"{PATH}/fasttext_model_100.bin")

# Do not open (archived scratch code)

In [None]:
# mkn_segment_sentence_full = []
# eng_segment_sentence_full = []
# mkn_sentence_full = []
# eng_sentence_full = []

# for i in range(5):
#   if i == 0:
#     url = "https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5ACF_5"
#   else:
#     url = f"https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5ACF_5?page={i}"

#   print("===Parsing page", i, "===")

#   response = requests.get(url)
#   soup = BeautifulSoup(response.text, "html.parser")
#   field = soup.find_all("div", class_="views-field views-field-fgs-label-s")

#   href_list = []
#   for div in field:
#     a_tag = div.find("a")
#     if a_tag:
#       href = a_tag.get("href")
#       if href:
#           href_list.append(f"https://archive.mpi.nl{href}")

#   tbt_links = []
#   for href in href_list:
#     response = requests.get(href)
#     soup = BeautifulSoup(response.text, "html.parser")
#     links = soup.find("div", class_="flat-compound-caption").find('a').get('href')
#     tbt_links.append(f"https://archive.mpi.nl{links}")

#   mkn_segment_sentence = []
#   eng_segment_sentence = []
#   mkn_sentence = []
#   eng_sentence = []
#   for tbt_link in tbt_links:
#     response = requests.get(tbt_link)
#     soup = BeautifulSoup(response.text, "html.parser")
#     text = soup.find("div", class_="plain-text").find_all("p")
#     temp_mkn = []
#     temp_eng = []
#     for segments in text:
#       # Extract the relevant parts
#       tx_match = re.search(r'\\tx (.*?)\n', segments.text)
#       ft_match = re.search(r'\\ft (.*?)\.', segments.text)

#       # Get the extracted parts
#       tx_text = tx_match.group(1).strip() if tx_match else ""
#       ft_text = ft_match.group(1).strip() if ft_match else ""

#       tx_text = re.sub(r'\s+', ' ', tx_text).strip()
#       ft_text = re.sub(r'\s+', ' ', ft_text).strip()

#       # Combine the results
#       mkn_segment_sentence.append(tx_text)
#       eng_segment_sentence.append(ft_text)
#       temp_mkn.append(tx_text)
#       temp_eng.append(ft_text)

#     mkn_sentence.append(" ".join(temp_mkn))
#     eng_sentence.append(" ".join(temp_eng))

#   mkn_segment_sentence_full.append(mkn_segment_sentence)
#   eng_segment_sentence_full.append(eng_segment_sentence)
#   mkn_sentence_full.append(mkn_sentence)
#   eng_sentence_full.append(eng_sentence)

#   print("===Finished parsing page", i, "===")

In [None]:
# url = "https://archive.mpi.nl/tla/islandora/object/tla%3A1839_00_0000_0000_0022_5ACF_5"
# response = requests.get(url)
# soup = BeautifulSoup(response.text, "html.parser")
# field = soup.find_all("div", class_="views-field views-field-fgs-label-s")

# href_list = []
# for div in field:
#   a_tag = div.find("a")
#   if a_tag:
#     href = a_tag.get("href")
#     if href:
#         href_list.append(f"https://archive.mpi.nl{href}")

# tbt_links = []
# for href in href_list:
#   response = requests.get(href)
#   soup = BeautifulSoup(response.text, "html.parser")
#   links = soup.find("div", class_="flat-compound-caption").find('a').get('href')
#   tbt_links.append(f"https://archive.mpi.nl{links}")

# mkn_segment_sentence = []
# eng_segment_sentence = []
# mkn_full_sentence = []
# eng_full_sentence = []
# for tbt_link in tbt_links:
#   response = requests.get(tbt_link)
#   soup = BeautifulSoup(response.text, "html.parser")
#   text = soup.find("div", class_="plain-text").find_all("p")
#   temp_mkn = []
#   temp_eng = []
#   for segments in text:
#     # Extract the relevant parts
#     tx_match = re.search(r'\\tx (.*?)\n', segments.text)
#     ft_match = re.search(r'\\ft (.*?)\.', segments.text)

#     # Get the extracted parts
#     tx_text = tx_match.group(1).strip() if tx_match else ""
#     ft_text = ft_match.group(1).strip() if ft_match else ""

#     tx_text = re.sub(r'\s+', ' ', tx_text).strip()
#     ft_text = re.sub(r'\s+', ' ', ft_text).strip()

#     # Combine the results
#     mkn_segment_sentence.append(tx_text)
#     eng_segment_sentence.append(ft_text)
#     temp_mkn.append(tx_text)
#     temp_eng.append(ft_text)

#   mkn_full_sentence.append(" ".join(temp_mkn))
#   eng_full_sentence.append(" ".join(temp_eng))