In [8]:
# Mount Google Drive at /content/drive so later cells can read files stored there (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset

In [1]:
%%capture
# Install the `datasets` package; the %%capture cell magic hides pip's output.
!pip install datasets

In [2]:
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from datasets import load_from_disk
import pandas as pd
import re

In [None]:
# Root folder of the project on the mounted Google Drive; the corpus loaders below append "/dataset/..." to it.
# NOTE(review): later cells read from "/content/drive/MyDrive/instruction-tuning-mkn/..." instead — confirm which root is current.
PATH = "/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn"

In [None]:
# Load the three corpora saved on disk, then stack them into one parallel corpus.
parallel_dataset = load_from_disk(f"{PATH}/dataset/parallel")
jfs_dataset = load_from_disk(f"{PATH}/dataset/jfs_dataset")
tapeleuk_dataset = load_from_disk(f"{PATH}/dataset/tapeleuk_dataset")
corpus_parts = [parallel_dataset, jfs_dataset, tapeleuk_dataset]
parallel_data = concatenate_datasets(corpus_parts)

## Preprocessing

In [None]:
# Display the schema and row count of the concatenated corpus.
parallel_data

Dataset({
    features: ['ind', 'mkn'],
    num_rows: 50861
})

In [None]:
# Discard every pair in which either side contains the literal marker 'xxx'.
def _is_free_of_xxx(example):
  """Return True when neither the 'mkn' nor the 'ind' text contains 'xxx'."""
  return not ('xxx' in example['mkn'] or 'xxx' in example['ind'])

filtered_parallel_data = parallel_data.filter(_is_free_of_xxx)
print(f"Filtered dataset size: {len(filtered_parallel_data)}")

Filtered dataset size: 50259


In [None]:
def remove_extra_dots(text):
  """Collapse every run of two or more consecutive dots in `text` into a single dot."""
  dot_run = re.compile(r'\.{2,}')
  return dot_run.sub('.', text)

# Collapse dot runs on both sides ('mkn' and 'ind') of every sentence pair.
def _collapse_dots_in_pair(example):
  """Return the pair's 'mkn' and 'ind' texts after passing each through remove_extra_dots."""
  cleaned = {}
  for column in ('mkn', 'ind'):
    cleaned[column] = remove_extra_dots(example[column])
  return cleaned

filtered_parallel_data = filtered_parallel_data.map(_collapse_dots_in_pair)

In [None]:
def filter_short_sentences(example):
  """Return True only when both the 'ind' and the 'mkn' text are longer than one character."""
  return all(len(example[column]) > 1 for column in ('ind', 'mkn'))

In [None]:
# Keep only the pairs accepted by filter_short_sentences, then display the resulting dataset.
filtered_parallel_data = filtered_parallel_data.filter(filter_short_sentences)
filtered_parallel_data

Filter:   0%|          | 0/50259 [00:00<?, ? examples/s]

Dataset({
    features: ['ind', 'mkn'],
    num_rows: 48045
})

In [None]:
# Remove duplicate rows (identical 'ind'/'mkn' pairs); the set of columns is unchanged.
# Dataset.to_pandas() converts the whole table at once instead of iterating it row by row,
# and the chained calls avoid in-place mutation so the cell can be re-run safely.
df = (
    filtered_parallel_data
    .to_pandas()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Convert back to Hugging Face Dataset
filtered_parallel_data = Dataset.from_pandas(df)
filtered_parallel_data

Dataset({
    features: ['ind', 'mkn'],
    num_rows: 43922
})

In [None]:
# Hold out 20% of the pairs as the test split.
# train_test_split shuffles again by default, so it needs its own seed: without one the
# split changes on every run even though the preceding shuffle is seeded.
parallel_df = filtered_parallel_data.shuffle(seed=42).train_test_split(test_size=0.2, seed=42)
parallel_df

DatasetDict({
    train: Dataset({
        features: ['ind', 'mkn'],
        num_rows: 35137
    })
    test: Dataset({
        features: ['ind', 'mkn'],
        num_rows: 8785
    })
})

In [None]:
# Upload the train/test splits to the Hugging Face Hub as a private dataset repository.
parallel_df.push_to_hub("joanitolopo/KupangMalay-ParallelCorpus-v1", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/437 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/joanitolopo/KupangMalay-ParallelCorpus-v1/commit/821b9d7d67d100815422d0d43ec1a71229ed7552', commit_message='Upload dataset', commit_description='', oid='821b9d7d67d100815422d0d43ec1a71229ed7552', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/joanitolopo/KupangMalay-ParallelCorpus-v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='joanitolopo/KupangMalay-ParallelCorpus-v1'), pr_revision=None, pr_num=None)

## Baseline Prompt

In [None]:
# Reload the published corpus from the Hub. This rebinds `parallel_data`, which earlier held
# the concatenated Dataset, to the train/test DatasetDict used by the cells below.
parallel_data = load_dataset("joanitolopo/KupangMalay-ParallelCorpus-v1")

README.md:   0%|          | 0.00/428 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.07M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/510k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/35350 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8838 [00:00<?, ? examples/s]

In [None]:
# Display the splits and row counts of the corpus loaded from the Hub.
parallel_data

DatasetDict({
    train: Dataset({
        features: ['ind', 'mkn'],
        num_rows: 35350
    })
    test: Dataset({
        features: ['ind', 'mkn'],
        num_rows: 8838
    })
})

In [None]:
import random

In [None]:
# Prompt templates adopted from https://github.com/IndoNLP/cendol/blob/main/evaluation/prompt_utils.py
# Each template is filled with str.format using the SOURCE, TARGET and INPUT placeholders.
prompt_instruction = ['Terjemahkan teks berikut dari bahasa {SOURCE} ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
                      '{INPUT}\nTerjemahkan teks di atas dari bahasa {SOURCE} ke bahasa {TARGET}.',
                      'Teks dalam bahasa {SOURCE}: {INPUT}\nApa terjemahannya dalam bahasa {TARGET}?',
                      'Terjemahkan teks bahasa {SOURCE} berikut ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
                      'Teks dalam bahasa {SOURCE}: {INPUT}\nTeks dalam bahasa {TARGET}:'
                      ]

In [None]:
# parallel_20 = parallel_data.train_test_split(test_size=0.2, shuffle=True, seed=42)
# parallel_40 = parallel_data.train_test_split(test_size=0.4, shuffle=True, seed=42)
# parallel_60 = parallel_data.train_test_split(test_size=0.6, shuffle=True, seed=42)

# print("Scenario 1: ", parallel_20.shape)
# print("Scenario 2: ", parallel_40.shape)
# print("Scenario 3: ", parallel_60.shape)

Scenario 1:  {'train': (40688, 2), 'test': (10173, 2)}
Scenario 2:  {'train': (30516, 2), 'test': (20345, 2)}
Scenario 3:  {'train': (20344, 2), 'test': (30517, 2)}


In [None]:
def build_baseline_records(split, templates, rng):
  """Turn every sentence pair of `split` into one Indonesian -> Kupang Malay instruction record.

  :param split: Iterable of rows exposing the 'ind' and 'mkn' fields.
  :param templates: Prompt templates with {SOURCE}, {TARGET} and {INPUT} placeholders.
  :param rng: random.Random instance used to pick one template per row.
  :return: List of dicts with the keys 'prompt', 'input' and 'output'.
  """
  records = []
  for data in split:
    prompt = rng.choice(templates)
    instruction = prompt.format(SOURCE="Indonesia", TARGET="Melayu Kupang", INPUT=data["ind"])
    # NOTE(review): here 'prompt' holds the unfilled template and 'input' the filled-in
    # instruction, while process_data further down stores the raw sentence under 'input'
    # and the instruction under 'prompt' — confirm which schema the training code expects.
    records.append({
        "prompt": prompt,
        "input": instruction,
        "output": data["mkn"]
    })
  return records

# A dedicated seeded generator makes the template assignment reproducible across runs
# without touching the global `random` state used by other cells.
baseline_rng = random.Random(42)
instruction_data_train = build_baseline_records(parallel_data['train'], prompt_instruction, baseline_rng)
instruction_data_test = build_baseline_records(parallel_data['test'], prompt_instruction, baseline_rng)

In [None]:
# Wrap the two record lists as Datasets and bundle them into a train/test DatasetDict.
train_dataset = Dataset.from_list(instruction_data_train)
test_dataset = Dataset.from_list(instruction_data_test)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'input', 'output'],
        num_rows: 35350
    })
    test: Dataset({
        features: ['prompt', 'input', 'output'],
        num_rows: 8838
    })
})

In [None]:
# Upload the baseline instruction dataset to the Hugging Face Hub as a private repository.
dataset.push_to_hub("joanitolopo/KupangMalay-InstructCorpus-v1", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/joanitolopo/KupangMalay-InstructCorpus-v1/commit/bf25a7ec3d8621de4548bbc1f300a8dc5ee21e4e', commit_message='Upload dataset', commit_description='', oid='bf25a7ec3d8621de4548bbc1f300a8dc5ee21e4e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/joanitolopo/KupangMalay-InstructCorpus-v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='joanitolopo/KupangMalay-InstructCorpus-v1'), pr_revision=None, pr_num=None)

## Instruksional Linguistik

In [3]:
%%capture
# Install KeyBERT (used below for keyword extraction); the %%capture cell magic hides pip's output.
!pip install keybert

In [4]:
import random
import re
import string
from difflib import SequenceMatcher
from keybert import KeyBERT
from transformers import pipeline
from gensim.models import FastText
import pandas as pd
from tqdm import tqdm

In [9]:
# Previous location of the dictionary file, kept for reference:
# import json
# # Open the file and load the JSON data
# with open('/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/source/data.json', 'r', encoding='utf-8') as file:
#   bilingual_dict = json.load(file)

import json
# Load the bilingual dictionary from Drive. Later cells read each entry as
# entry['word']['source'], entry['word']['target'] and entry['sentences']['source'].
with open('/content/drive/MyDrive/instruction-tuning-mkn/source/data.json', 'r', encoding='utf-8') as file:
  bilingual_dict = json.load(file)

In [10]:
# One (target word, source word) pair per dictionary entry; the target word is picked
# at random from the entry's list of targets.
dictionary = [
    (random.choice(entry['word']['target']), entry['word']['source'])
    for entry in bilingual_dict
]

# extract mkn_words: the second element of every pair, in dictionary order
mkn_words = [source_word for _, source_word in dictionary]

def extract_phonetic_rules(dictionary):
  """Collect character substitutions observed between paired words.

  Each pair is compared position by position (stopping at the shorter word).
  Wherever the two characters differ, the second word's character is recorded
  as a replacement for the first word's character.

  :param dictionary: Iterable of (ind_word, kup_word) pairs.
  :return: Dict mapping a character to the list of distinct replacement
      characters, in order of first appearance.
  """
  rules = {}
  for ind_word, kup_word in dictionary:
    for ind_char, kup_char in zip(ind_word, kup_word):
      if ind_char == kup_char:
        continue
      replacements = rules.setdefault(ind_char, [])
      if kup_char not in replacements:
        replacements.append(kup_char)
  return rules

# Derive the character substitution table from the word pairs built above.
phonetic_rules = extract_phonetic_rules(dictionary)

In [11]:
# Previous model location, kept for reference:
# w2v = FastText.load("/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/embedding/fastText/fasttext_model_100.bin").wv
# Load the saved gensim FastText model and keep only its `.wv` attribute.
w2v = FastText.load("/content/drive/MyDrive/instruction-tuning-mkn/embedding/fastText/fasttext_model_100.bin").wv

In [14]:
# Load the published parallel corpus from the Hugging Face Hub for the instruction generation below.
parallel_data = load_dataset("joanitolopo/KupangMalay-ParallelCorpus-v1")

train-00000-of-00001.parquet:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/518k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/35137 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8785 [00:00<?, ? examples/s]

In [15]:
class InstructionGenerator:
  """Builds context-enriched Indonesian -> Kupang Malay translation prompts.

  Every ``generate_*_inst`` method formats a randomly chosen base translation
  prompt and wraps it with extra context: example sentences, nearest words
  from the embedding model, similar-sounding words, or labelled word groups.
  """

  def __init__(self, bilingual_dict, w2v_model, phonetic_rules):
    """
    Initializes the InstructionGenerator.

    :param bilingual_dict: List of dictionary entries. Each entry is read as
        entry['word']['source'] (Kupang Malay word), entry['word']['target']
        (collection of Indonesian words) and entry['sentences']['source']
        (example sentences).
    :param w2v_model: Object exposing ``most_similar(word, topn=...)`` that
        returns a list of (word, score) tuples.
    :param phonetic_rules: Dict mapping a character to a list of replacement characters.
    """
    self.bilingual_dict = bilingual_dict
    self.w2v_model = w2v_model
    self.phonetic_rules = phonetic_rules
    # Base translation prompts; filled with SOURCE, TARGET and INPUT.
    self.prompt_instruction = ['Terjemahkan teks berikut dari bahasa {SOURCE} ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
                               '{INPUT}\nTerjemahkan teks di atas dari bahasa {SOURCE} ke bahasa {TARGET}.',
                               'Teks dalam bahasa {SOURCE}: {INPUT}\nApa terjemahannya dalam bahasa {TARGET}?',
                               'Terjemahkan teks bahasa {SOURCE} berikut ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
                               'Teks dalam bahasa {SOURCE}: {INPUT}\nTeks dalam bahasa {TARGET}:']
    # Wrapper prompts; CONTEXT is the generated context and INSTRUCTION a filled base prompt.
    self.contextual_promp = ["Diberikan kalimat dalam bahasa {TARGET} :\n{CONTEXT}\n\n{INSTRUCTION}"]
    self.semantic_promp = ["Diberikan sinonim dalam bahasa {SOURCE} dan {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]
    self.keyword_prompt = ["Diberikan kalimat dalam bahasa {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]
    self.list_group_label_prompt = ["Diberikan kategori kata dalam bahasa {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]

  def random_word_prob(self):
    """
    Draws five random dictionary words, each paired with a random score in [0, 1).

    Serves as the fallback when a sentence has no usable keyword.

    :return: List of five (word, score) tuples.
    """
    keywords = []
    for _ in range(5):
      # Read from the instance's own dictionary rather than a notebook-level global,
      # so the class works with whatever dictionary it was constructed with.
      word = random.choice(self.bilingual_dict)['word']['source']
      score = random.random()
      keywords.append((word, score))

    return keywords

  def get_special_word(self, sentence, keywords, top_n=5, diversity=0.7, n_highest=1):
    """
    Picks the highest-scoring keywords of a sentence.

    Keywords equal to the first number found in the sentence, and keywords that
    are punctuation, are dropped. When nothing usable remains, random dictionary
    words are used instead.

    :param sentence: Input sentence; only searched for its first run of digits.
    :param keywords: List of (word, score) tuples for the sentence; may be empty or None.
    :param top_n: Unused; kept for backward compatibility.
    :param diversity: Unused; kept for backward compatibility.
    :param n_highest: Number of highest-scored keywords to return.
    :return: List of at most n_highest (word, score) tuples, best score first.
    """
    if not keywords:
      keywords = self.random_word_prob()

    # The number goes through int() before comparison, so "007" is compared as "7".
    # A non-string sentence simply yields no number instead of raising.
    match = re.search(r'\d+', sentence) if isinstance(sentence, str) else None
    num_in_sent = int(match.group()) if match else ""

    filtered_keywords = [
        kw for kw in keywords
        if kw[0] != str(num_in_sent) and kw[0] not in string.punctuation
    ]

    if not filtered_keywords:
      filtered_keywords = self.random_word_prob()

    return sorted(filtered_keywords, key=lambda x: x[1], reverse=True)[:n_highest]

  def find_paralel_ind2mkn(self, words):
    """
    Finds the Kupang Malay counterpart of the first Indonesian word that has a dictionary entry.

    :param words: Non-empty list of (word, score) tuples, tried in order.
    :return: The 'source' word of the first matching entry, or the first input
        word itself when no entry matches.
    """
    for word in words:
      for entry in self.bilingual_dict:
        if word[0] in entry['word']['target']:
          return entry['word']['source']
    return words[0][0]

  def find_parallel_mkn2ind(self, nearest_words):
    """
    Pairs each Kupang Malay word with an Indonesian word from the dictionary.

    :param nearest_words: List of (word, score) tuples.
    :return: List of (indonesian_word, kupang_word) tuples. The Indonesian word is
        a random choice among the matching entry's targets; a word without an
        entry is paired with itself.
    """
    result = []
    for word in nearest_words:
      for entry in self.bilingual_dict:
        if word[0] == entry['word']['source']:
          word_ind = random.choice(entry['word']['target'])
          result.append((word_ind, word[0]))
          break
      else:
        # No entry matched: fall back to the word itself on both sides.
        result.append((word[0], word[0]))

    return result

  def get_example_sentences(self, word, n):
    """
    Retrieves example sentences, preferring those that contain the given word.

    :param word: Word looked up as a whitespace-separated token of each sentence.
    :param n: Number of sentences to retrieve.
    :return: List of up to n sentences. When fewer than n sentences contain the
        word, all of them are returned first, padded with randomly chosen
        sentences that do not contain it (as many as are available).
    """
    sentences = [
        sentence
        for entry in self.bilingual_dict
        for sentence in entry['sentences']['source']
    ]
    related_sentences = [s for s in sentences if word in s.split()]

    if not related_sentences:
      return random.sample(sentences, min(n, len(sentences)))

    if len(related_sentences) < n:
      # Pad only from sentences not already selected, and never request more than
      # exist: random.sample raises ValueError on an oversized request.
      fillers = [s for s in sentences if word not in s.split()]
      missing = min(n - len(related_sentences), len(fillers))
      return related_sentences + random.sample(fillers, missing)

    return random.sample(related_sentences, n)

  def phonetic_representation(self, word):
    """
    Converts a word to its phonetic representation using self.phonetic_rules.

    The word is lower-cased, then every (source char, replacement char) rule is
    applied in order to the running result, so a character produced by one rule
    can be rewritten again by a later rule.

    :param word: Input word.
    :return: Phonetic representation of the word.
    """
    phonetic_word = word.lower()
    for src_char, trg_chars in self.phonetic_rules.items():
      for trg_char in trg_chars:
        phonetic_word = phonetic_word.replace(src_char, trg_char)

    return phonetic_word

  def get_similar_sounding_words(self, word, n_similar):
    """
    Retrieves dictionary words whose phonetic form resembles that of `word`.

    A candidate qualifies when the SequenceMatcher ratio between the two
    phonetic representations is above 0.7.

    :param word: Input word.
    :param n_similar: Maximum number of similar words to return.
    :return: The first n_similar qualifying 'source' words, in dictionary order.
    """
    candidates = [entry['word']['source'] for entry in self.bilingual_dict]
    word_phonetic = self.phonetic_representation(word)
    similar_words = [
        candidate
        for candidate in candidates
        if SequenceMatcher(None, word_phonetic, self.phonetic_representation(candidate)).ratio() > 0.7
    ]

    return similar_words[:n_similar]

  def generate_contextual_inst(self, sentence, w_star, n=2):
    """
    Generates a contextual instruction: example sentences followed by a translation prompt.

    :param sentence: Input sentence.
    :param w_star: List of (word, score) keywords representing the sentence.
    :param n: Number of example sentences to include.
    :return: Contextual instruction as a formatted string.
    """
    w_mkn_star = self.find_paralel_ind2mkn(w_star)
    example_sentences = self.get_example_sentences(w_mkn_star, n)
    context = "\n".join([f"- {sent}" for sent in example_sentences])

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.contextual_promp).format(TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)
    return final_prompt

  def generate_semantic_inst(self, sentence, w_star, n=2):
    """
    Generates a semantic instruction: nearest-word pairs followed by a translation prompt.

    :param sentence: Input sentence.
    :param w_star: List of (word, score) keywords representing the sentence.
    :param n: Number of nearest words to request from the embedding model.
    :return: Semantic instruction as a formatted string.
    """
    w_mkn_star = self.find_paralel_ind2mkn(w_star)
    nearest_words = self.w2v_model.most_similar(w_mkn_star, topn=n)
    # The keyword itself is listed after its neighbours.
    nearest_words.append((w_mkn_star, 1))
    paralel_n_words = self.find_parallel_mkn2ind(nearest_words)
    context = "\n".join([f"- {src}:{trg}" for src, trg in paralel_n_words])

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.semantic_promp).format(SOURCE="Indonesia", TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)
    return final_prompt

  def generate_keyword_inst(self, sentence, w_star, n_similar=5):
    """
    Generates a keyword instruction: one example sentence per similar-sounding word,
    followed by a translation prompt.

    :param sentence: Input sentence.
    :param w_star: List of (word, score) keywords representing the sentence.
    :param n_similar: Number of similar-sounding words to include.
    :return: Keyword instruction as a formatted string.
    """
    w_mkn_star = self.find_paralel_ind2mkn(w_star)
    similar_words = self.get_similar_sounding_words(w_mkn_star, n_similar)
    sentences = []
    for word in similar_words:
      sentences.extend(self.get_example_sentences(word, n=1))
    context = "\n".join(sentences)

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.keyword_prompt).format(TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)

    return final_prompt

  def generate_list_group_label_inst(self, sentence, w_stars, n_nearest=2):
    """
    Generates a list-group-label instruction: one labelled word group per keyword,
    followed by a translation prompt.

    :param sentence: Input sentence.
    :param w_stars: List of (word, score) keywords representing the sentence.
    :param n_nearest: Number of nearest words requested per keyword.
    :return: List-group-label instruction as a formatted string.
    """
    # Each keyword is translated on its own, so every keyword yields one group.
    w_parallel = [(self.find_paralel_ind2mkn([word]), word[0]) for word in w_stars]

    label_lines = []
    for label_no, (mkn_word, _) in enumerate(w_parallel, start=1):
      nearest_words = self.w2v_model.most_similar(mkn_word, topn=n_nearest)
      nearest_words.append((mkn_word, 1))
      members = "".join(f" {member[0]}," for member in nearest_words)
      label_lines.append(f"Label {label_no}:{members}\n")
    context = "".join(label_lines)

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.list_group_label_prompt).format(SOURCE="Indonesia", TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)

    return final_prompt

In [16]:
# Build the generator from the dictionary, the FastText vectors and the phonetic rules loaded above.
generator = InstructionGenerator(bilingual_dict, w2v, phonetic_rules)

In [18]:
# extract keywords in batch first
# The IndoBERT feature-extraction pipeline is handed to KeyBERT as its embedding model.
indobertmodel = pipeline("feature-extraction", model="indobenchmark/indobert-large-p2")
keybert_model = KeyBERT(model=indobertmodel)
# NOTE(review): KeyBERT's documentation describes `diversity` as taking effect only together
# with use_mmr=True, which is not passed here — confirm whether MMR was intended.
keywords = keybert_model.extract_keywords(parallel_data['train']["ind"],
                                          keyphrase_ngram_range=(1, 1), top_n=5,
                                          diversity=0.7) # takes at least ~14 minutes

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [21]:
# Start from empty record lists; the train list is filled by process_data further down.
instruction_data_train = []
instruction_data_test = []

# Build the four instruction variants for every sentence pair
def process_data(parallel_data, generator, keywords):
  """Create contextual, semantic, keyword and list-group-label records for each pair.

  :param parallel_data: Iterable of rows exposing the 'ind' and 'mkn' fields.
  :param generator: InstructionGenerator used to build the prompts.
  :param keywords: Per-row keyword lists, indexed in the same order as
      `parallel_data`; keywords[idx] is passed to get_special_word for row idx.
  :return: List of dicts with the keys 'input', 'prompt' and 'output', four
      per row, in the order contextual, semantic, keyword, list-group-label.
  """
  instruction_data = []
  for idx, data in enumerate(tqdm(parallel_data, desc="Processing data")):
    src = data["ind"]
    trg = data['mkn']

    # Precompute instructions
    w_star = generator.get_special_word(src, keywords=keywords[idx], n_highest=5)
    # The contextual variant must come from generate_contextual_inst; it was previously
    # built with generate_keyword_inst, which only produced a second keyword prompt.
    contextual_inst = generator.generate_contextual_inst(src, w_star)
    semantic_inst = generator.generate_semantic_inst(src, w_star)
    keyword_inst = generator.generate_keyword_inst(src, w_star, n_similar=3)
    list_group_label_inst = generator.generate_list_group_label_inst(src, w_star, n_nearest=2)

    # Append the four records to the result list
    instruction_data.extend([
        {"input": src, "prompt": contextual_inst, "output": trg},
        {"input": src, "prompt": semantic_inst, "output": trg},
        {"input": src, "prompt": keyword_inst, "output": trg},
        {"input": src, "prompt": list_group_label_inst, "output": trg}
      ])
  return instruction_data

# Process the train split (the keywords above were extracted from this same split)
instruction_data_train = process_data(parallel_data['train'], generator, keywords)

Processing data:  13%|█▎        | 4473/35137 [14:09<1:37:06,  5.26it/s]


KeyboardInterrupt: 

In [None]:
# NOTE(review): stray literal, apparently left over from an interactive check; the cell only echoes the number.
1471

In [None]:
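# instruction_data_test = process_data(parallel_data['test'], generator)
# NOTE(review): process_data also requires a `keywords` argument, so keywords must first be
# extracted for the test split before this call can be re-enabled.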

In [None]:
# Wrap the record lists as Datasets and bundle them into a train/test DatasetDict.
# NOTE(review): instruction_data_test is still the empty list unless the test split has been
# processed, in which case the 'test' split is built from no records.
train_dataset = Dataset.from_list(instruction_data_train)
test_dataset = Dataset.from_list(instruction_data_test)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
dataset