In [None]:
# Instruction templates adapted from https://github.com/IndoNLP/cendol/blob/main/evaluation/prompt_utils.py
# Each template is filled with str.format using the {SOURCE}, {TARGET} and {INPUT}
# placeholders; the generate_* functions below pick one template at random.
prompt_instruction = ['Terjemahkan teks berikut dari bahasa {SOURCE} ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
                      '{INPUT}\nTerjemahkan teks di atas dari bahasa {SOURCE} ke bahasa {TARGET}.',
                      'Teks dalam bahasa {SOURCE}: {INPUT}\nApa terjemahannya dalam bahasa {TARGET}?',
                      'Terjemahkan teks bahasa {SOURCE} berikut ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
                      'Teks dalam bahasa {SOURCE}: {INPUT}\nTeks dalam bahasa {TARGET}:'
                      ]

In [None]:
import json
# Load the bilingual dictionary from the JSON file on the mounted Drive.
# Later cells iterate over it and read entry['word']['source'],
# entry['word']['target'] and entry['sentences'] from each element.
with open('/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/app/data.json', 'r', encoding='utf-8') as file:
  bilingual_dict = json.load(file)

## Representasi Kalimat

Given an Indonesian sentence, we want to extract its most important words (keywords). These keywords serve as a proxy for looking up the parallel words in the Kupang Malay dictionary. To obtain a meaningful representation, we use the `KeyBERT` technique with the `indobert-large-p2` model to compute the embeddings.

In [None]:
%%capture
!pip install keybert

In [None]:
from keybert import KeyBERT
from transformers.pipelines import pipeline
import re
import string

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Cache of KeyBERT extractors keyed by model name, so the IndoBERT weights are
# loaded once per kernel instead of on every get_special_word call.
_KEYBERT_CACHE = {}


def _get_keybert_model(model_name="indobenchmark/indobert-large-p2"):
  """Return a cached KeyBERT extractor backed by a feature-extraction pipeline."""
  if model_name not in _KEYBERT_CACHE:
    extractor = pipeline("feature-extraction", model=model_name)
    _KEYBERT_CACHE[model_name] = KeyBERT(model=extractor)
  return _KEYBERT_CACHE[model_name]


def get_special_word(sentence, top_n=5, diversity=0.7, n_highest=1):
  """Extract the highest-scoring single-word keywords of a sentence.

  :param sentence: Input sentence.
  :param top_n: Number of keywords requested from KeyBERT.
  :param diversity: Diversity value forwarded to KeyBERT.
  :param n_highest: Maximum number of keywords to return.
  :return: List of (keyword, score) tuples sorted by descending score, with the
           first number found in the sentence and punctuation-only keywords
           removed.
  """
  model = _get_keybert_model()

  # NOTE(review): the original author notes that the [CLS] token is used as the
  # vector representation -- confirm against the KeyBERT/pipeline backend.
  keywords_top_n = model.extract_keywords(sentence, keyphrase_ngram_range=(1, 1), top_n=top_n, diversity=diversity)
  sorted_keywords = sorted(keywords_top_n, key=lambda x: x[1], reverse=True)

  # First integer in the sentence, normalised through int() (e.g. "007" -> "7");
  # empty string when the sentence has no digits.
  number_match = re.search(r'\d+', sentence)
  num_in_sent = str(int(number_match.group())) if number_match else ""

  # Build a new list rather than calling list.remove() while iterating: removing
  # during iteration skips the element that follows each removed one.
  filtered_keywords = [
      keyword for keyword in sorted_keywords
      if keyword[0] != num_in_sent and keyword[0] not in string.punctuation
  ]

  return filtered_keywords[:n_highest]

In [None]:
# Quick check: request up to five keywords for a sample sentence and display
# the resulting (keyword, score) list as the cell output.
sentence = "Saya dan mba marina sedang mengerjakan tugas"
w_star = get_special_word(sentence, n_highest=5)
w_star

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('saya', 0.5631),
 ('tugas', 0.5412),
 ('mengerjakan', 0.5302),
 ('marina', 0.5135),
 ('mba', 0.5095)]

## Instruksi Berbasis Konteks

In [None]:
import random

In [None]:
# Perhatikan baik-baik lagi kodingan ini, khususnya di bagian while
# sepertinya ketika entry w_star pertama tidak ada di dictionary pertama
# maka tidak mencari di keseluruhan dictionary tapi langsung berganti kata  (idx += 1)
# seharusnya dicari dulu keseluruh data dictionary baru kalau tidak ada ganti katanya

# def find_parallel_word(w_star, bilingual_dict):
#   temp = ""
#   for entry in bilingual_dict:
#     idx = 0
#     while idx < len(w_star):
#       # if the available w_star not in the dictionary
#       # simple use the w_star as a w_mkn_star
#       if w_star[idx][0] in entry['word']['target']:
#         temp += entry['word']['source']
#         break
#       else:
#         # if the highest score not appeared in dictionary then
#         # check the second, third, and so on to find in the dictionary
#         idx += 1

#     if temp != "":
#       break

#   if temp == "":
#     return w_star[0][0]
#   else:
#     return temp

def find_parallel_word(w_star, bilingual_dict):
  """Return the dictionary 'source' word for the first keyword that has a match.

  Keywords are tried in the given order, and the whole dictionary is scanned for
  one keyword before moving to the next. A keyword matches an entry when its
  text (element 0) is contained in entry['word']['target']. If no keyword
  matches, the text of the first keyword is returned unchanged.
  """
  not_found = object()
  hit = next(
      (
          entry['word']['source']
          for keyword in w_star
          for entry in bilingual_dict
          if keyword[0] in entry['word']['target']
      ),
      not_found,
  )
  if hit is not_found:
    return w_star[0][0]
  return hit

In [None]:
# def get_example_sentences(w_mkn_star, bilingual_dict, n):
#   sentences = []
#   for entry in bilingual_dict:
#     # tambahkan kalimat yang tersedia dalam kamus
#     if entry["sentences"]["source"] != "":
#       sentences.extend(entry["sentences"]["source"])

#   # cari dan tambahkan kalimat yang mengandung kata penting dalam kalimat
#   list_relate_sentences = []
#   for sentence in sentences:
#     sentence_split = sentence.split(" ")
#     if w_mkn_star in sentence_split:
#       list_relate_sentences.append(sentence)

#   # if there is no sentence that contains kalimat penting, just get random sample
#   # from sentence even not contain w_mkn_star
#   if list_relate_sentences == []:
#     chose_sentences = random.sample(sentences, min(n, len(sentences)))
#   elif len(list_relate_sentences) < n: # if n_relate list sentence less than the max of n_sentenc
#     # add current relate list sentence with sample from sentences
#     chose_sentences = random.sample(list_relate_sentences, min(n, len(list_relate_sentences)))
#     chose_sentences.extend(random.sample(sentences, n-len(list_relate_sentences)))
#   else: # sampling sentence based on number of context example
#     chose_sentences = random.sample(list_relate_sentences, min(n, len(list_relate_sentences)))

#   return chose_sentences

def get_example_sentences(w_mkn_star, bilingual_dict, n):
  """Pick up to `n` example sentence pairs, preferring those that contain a word.

  :param w_mkn_star: Word to look for among the whitespace-separated tokens of
                     the source-side sentence.
  :param bilingual_dict: Iterable of entries exposing entry['sentences']['source']
                         and entry['sentences']['target'].
  :param n: Number of pairs wanted.
  :return: List of (source_sentence, target_sentence) tuples. Related pairs come
           first; if fewer than `n` exist, the rest is filled with randomly
           chosen unrelated pairs. Fewer than `n` pairs are returned when the
           dictionary does not hold enough sentences.
  """
  # Collect every (source, target) sentence pair in the dictionary.
  sentences = [
      pair
      for entry in bilingual_dict
      for pair in zip(entry['sentences']['source'], entry['sentences']['target'])
  ]

  # Split into pairs whose source sentence contains the word and the rest.
  related_sentences = []
  other_sentences = []
  for pair in sentences:
    if w_mkn_star in pair[0].split():
      related_sentences.append(pair)
    else:
      other_sentences.append(pair)

  # Enough related pairs: sample only from them.
  if len(related_sentences) >= n:
    return random.sample(related_sentences, n)

  # Otherwise keep all related pairs (shuffled) and fill up from the unrelated
  # ones only, so no pair is repeated, capped at what is available so that
  # random.sample never raises ValueError.
  chosen_sentences = random.sample(related_sentences, len(related_sentences))
  n_fill = min(n - len(related_sentences), len(other_sentences))
  chosen_sentences.extend(random.sample(other_sentences, n_fill))
  return chosen_sentences

In [None]:
def generate_contextual_inst(sentence, bilingual_dict, n=2):
  """Build a translation prompt preceded by example sentence pairs.

  :param sentence: Indonesian sentence to be translated.
  :param bilingual_dict: Bilingual dictionary passed on to the lookup helpers.
  :param n: Number of example pairs requested from get_example_sentences.
  :return: Prompt string made of the example pairs followed by a randomly
           chosen instruction template from `prompt_instruction`.
  """
  # Keyword extraction -> parallel dictionary word -> example pairs for that word.
  w_star = get_special_word(sentence, n_highest=5)
  w_mkn_star = find_parallel_word(w_star, bilingual_dict)
  example_sentences = get_example_sentences(w_mkn_star, bilingual_dict, n)

  # Each example is a (source, target) pair from the dictionary; per the sample
  # output in this notebook the source side is Melayu Kupang and the target side
  # is Indonesia. Print the Indonesia sentence first so the pair order matches
  # the "{SOURCE} dan {TARGET}" header used below.
  formated_example = "".join(f"- {sent[1]} : {sent[0]}\n" for sent in example_sentences)

  # Context wrapper placed in front of the instruction.
  prompt_context = ["Diberikan pasangan kalimat dalam bahasa {SOURCE} dan {TARGET} :\n{CONTEXT}\n\n{INSTRUCTION}"]

  instruction = random.choice(prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
  final_prompt = random.choice(prompt_context).format(TARGET="Melayu Kupang", SOURCE="Indonesia", CONTEXT=formated_example, INSTRUCTION=instruction)

  return final_prompt

In [None]:
# Demo: build a contextual prompt with three example pairs and print it.
sentence = "Saya tidak pernah memanggil anda dengan kata yang tidak senonoh."
data = generate_contextual_inst(sentence, bilingual_dict, n=3)
print(data)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Diberikan pasangan kalimat dalam bahasa Indonesia dan Melayu Kupang :
- Beta acu deng dia, tagal dia pinjam be pung doi, ma sonde tau kasi kambali. : Saya tidak memperdulikannya, karena dia telah meminjam uang saya, tetapi tidak pernah mengembalikannya.
- Lia do, te be pung oma bo'i su datang. : Lihat, Oma saya terkasih telah tiba.
- Malam minggu tu, be pung waktu pi apel beta pung nona. : Malam minggu adalah waktu saya mengunjungi pacar saya.


Saya tidak pernah memanggil anda dengan kata yang tidak senonoh.
Terjemahkan teks di atas dari bahasa Indonesia ke bahasa Melayu Kupang.


## Instruksi Berbasis Pemetaan Semantik

Tolong diperhatikan!!
Word2vec untuk melayu kupang coba diambil dari beberapa sumber lagi:
- Tapaleuk
- Keseluruhan Bible
- dll

In [None]:
from gensim.models import FastText
import pandas as pd

In [None]:
# Load the saved FastText model from Drive and keep only its `.wv` attribute,
# which the generate_* functions query through `most_similar`.
w2v = FastText.load("/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/fasttext_model_100.bin").wv

In [None]:
# def find_parallel_word_reverse(nearest_words, bilingual_dict):
#   words = []
#   for word in nearest_words:
#     temp = []
#     for entry in bilingual_dict:
#       if word[0] == entry['word']['source']:
#         src = random.choice(entry['word']['target'])
#         trg = word[0]
#         pairs = (src, trg)
#         temp.append(pairs)
#         break
#       else:
#         continue

#     if temp == []:
#       pairs = [(word[0], word[0])]
#       words.extend(pairs)
#     else:
#       words.extend(temp)

#   return words

def find_parallel_word_reverse(nearest_words, bilingual_dict):
  """Pair each word with a randomly chosen 'target' gloss from the dictionary.

  For every item in `nearest_words`, element 0 is compared with
  entry['word']['source'] and the first matching entry wins. A matched word
  yields (random gloss from entry['word']['target'], word); an unmatched word
  is paired with itself.
  """
  pairs = []
  for candidate in nearest_words:
    lookup = candidate[0]
    matched_entry = next(
        (entry for entry in bilingual_dict if lookup == entry['word']['source']),
        None,
    )
    if matched_entry is None:
      # Not in the dictionary: keep the word on both sides of the pair.
      pairs.append((lookup, lookup))
    else:
      pairs.append((random.choice(matched_entry['word']['target']), lookup))

  return pairs

In [None]:
def generate_semantic_inst(sentence, bilingual_dict, w2v, n=2):
  """Build a translation prompt preceded by a list of related word pairs.

  The parallel dictionary word for the sentence's keywords is looked up, its
  `n` nearest neighbours are fetched from `w2v.most_similar`, the word itself is
  appended with score 1, and every word is paired through
  find_parallel_word_reverse. The pairs are listed above a randomly chosen
  instruction template.
  """
  keywords = get_special_word(sentence, n_highest=5)
  anchor_word = find_parallel_word(keywords, bilingual_dict)

  # Neighbours from the embedding model, plus the anchor word itself at the end.
  neighbours = w2v.most_similar(anchor_word, topn=n)
  neighbours.append((anchor_word, 1))

  word_pairs = find_parallel_word_reverse(neighbours, bilingual_dict)

  # One "- left:right" line per pair.
  formated_example = "".join(f"- {sent[0]}:{sent[1]}\n" for sent in word_pairs)

  prompt_context = ["Diberikan sinonim dalam bahasa {SOURCE} dan {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]

  instruction = random.choice(prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
  return random.choice(prompt_context).format(SOURCE="Indonesia", TARGET="Melayu Kupang", CONTEXT=formated_example, INSTRUCTION=instruction)

In [None]:
# Demo: build a semantic prompt using three nearest neighbours and print it.
sentence = "Saya tidak pernah memanggil anda dengan kata yang tidak senonoh."
semant_prompt = generate_semantic_inst(sentence, bilingual_dict, w2v, n=3)
print(semant_prompt)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Diberikan sinonim dalam bahasa Indonesia dan Melayu Kupang:
- bermain:bermain
- bera:bera
- beres:beres
- aku:be


Terjemahkan teks berikut dari bahasa Indonesia ke bahasa Melayu Kupang.
Teks: Saya tidak pernah memanggil anda dengan kata yang tidak senonoh.
Terjemahan:


## Instruksi Berbasis Keyword

In [None]:
from gensim.models import FastText
import pandas as pd
import random
from difflib import SequenceMatcher

In [None]:
# Reload the FastText word vectors; same path as the earlier load cell, so this
# only matters when that cell has not been run in the current kernel.
w2v = FastText.load("/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/fasttext_model_100.bin").wv

In [None]:
# Flatten the bilingual dictionary into (Indonesia word, Melayu Kupang word)
# pairs. One gloss is drawn at random from entry['word']['target'] per entry, so
# the pairs (and anything derived from them) vary between runs unless `random`
# is seeded beforehand. The per-entry debug print was removed because it dumped
# the whole dictionary into the cell output.
dictionary = [
    (random.choice(entry['word']['target']), entry['word']['source'])
    for entry in bilingual_dict
]

# Second element of every pair, in dictionary order.
mkn_words = [mkn_word for _, mkn_word in dictionary]

In [None]:
def extract_phonetic_rules(dictionary):
  """Collect character substitutions observed between paired words.

  The two words of each pair are compared position by position (stopping at the
  shorter word). Whenever the characters differ, the second word's character is
  recorded under the first word's character.

  :param dictionary: Iterable of (word, word) pairs.
  :return: Dict mapping a character to the list of distinct characters seen in
           its place, in order of first appearance.
  """
  rules = {}
  for ind, kup in dictionary:
    for ind_char, kup_char in zip(ind, kup):
      if ind_char == kup_char:
        continue
      replacements = rules.setdefault(ind_char, [])
      if kup_char not in replacements:
        replacements.append(kup_char)
  return rules

def phonetic_representation(word, rules):
  """Lower-case a word and apply every substitution rule in sequence.

  Rules are applied in dict order, one `str.replace` per (character,
  replacement) combination, so the result of an earlier replacement can be
  rewritten again by a later rule.

  :param word: Input word.
  :param rules: Dict mapping a character to a list of replacement characters.
  :return: The transformed lower-cased word.
  """
  substitutions = [
      (ind_char, kup_char)
      for ind_char, kup_chars in rules.items()
      for kup_char in kup_chars
  ]

  phonetic_word = word.lower()
  for old, new in substitutions:
    phonetic_word = phonetic_word.replace(old, new)
  return phonetic_word

def get_similar_sounding_words(word, candidates, rules):
  """Return the candidates whose phonetic form is close to that of `word`.

  Both the word and each candidate are passed through phonetic_representation;
  a candidate is kept when the SequenceMatcher ratio between the two forms is
  strictly greater than 0.7. Candidates keep their input order.
  """
  word_phonetic = phonetic_representation(word, rules)

  def sounds_alike(candidate):
    candidate_phonetic = phonetic_representation(candidate, rules)
    return SequenceMatcher(None, word_phonetic, candidate_phonetic).ratio() > 0.7

  return [candidate for candidate in candidates if sounds_alike(candidate)]

# Derive the character substitution rules from the word pairs built above.
phonetic_rules = extract_phonetic_rules(dictionary)

In [None]:
def generate_keyword_inst(sentence, bilingual_dict, mkn_words, phonetic_rules, n_similar=5):
  """Build a translation prompt preceded by sentences for similar-sounding words.

  :param sentence: Indonesian sentence to be translated.
  :param bilingual_dict: Bilingual dictionary passed on to the lookup helpers.
  :param mkn_words: Candidate words compared phonetically with the keyword.
  :param phonetic_rules: Substitution rules used for the phonetic comparison.
  :param n_similar: Maximum number of similar-sounding words to use.
  :return: Prompt string made of one example sentence per similar word followed
           by a randomly chosen instruction template.
  """
  # Keyword extraction -> parallel dictionary word -> similar-sounding words.
  w_star = get_special_word(sentence, n_highest=5)
  w_mkn_star = find_parallel_word(w_star, bilingual_dict)
  similar_words = get_similar_sounding_words(w_mkn_star, mkn_words, phonetic_rules)[:n_similar]

  # One example per similar word. get_example_sentences returns
  # (source, target) tuples, which str.join cannot handle, so only the
  # source-side sentence is kept; plain strings are passed through unchanged.
  sentences = []
  for word in similar_words:
    for example in get_example_sentences(word, bilingual_dict, n=1):
      sentences.append(example if isinstance(example, str) else example[0])

  # Group the sentences into one block, one sentence per line.
  group_sentence = "\n".join(sentences)

  prompt_context = ["Diberikan kalimat dalam bahasa {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]

  instruction = random.choice(prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
  final_prompt = random.choice(prompt_context).format(TARGET="Melayu Kupang", CONTEXT=group_sentence, INSTRUCTION=instruction)

  return final_prompt

In [None]:
# Demo: build a keyword-based prompt with the default n_similar and print it.
sentence = "Saya sedang mengerjakan tugas dengan mba alfa dan mba marina."
data = generate_keyword_inst(sentence, bilingual_dict, mkn_words, phonetic_rules)
print(data)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Diberikan kalimat dalam bahasa Melayu Kupang:
Orang pintar su taro tangan ko sonto bekin bae itu orang saki.
Orang dong ada batasibu bekin bae jalan, te Pak Gub mau datang.
Beta son pake payung, andia ko be su itam marege.
Dia angka satu ana.
Bosong tanam bibit bamusu deng dong, sakarang bosong dapa dia pung boa su!

Teks dalam bahasa Indonesia: Saya sedang mengerjakan tugas dengan mba alfa dan mba marina.
Apa terjemahannya dalam bahasa Melayu Kupang?


## Instruksi Berbasis List-Group-Label

In [None]:
from gensim.models import FastText

In [None]:
# Reload the FastText word vectors; same path as the earlier load cells, so this
# only matters when those cells have not been run in the current kernel.
w2v = FastText.load("/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/fasttext_model_100.bin").wv

In [None]:
def generate_lgl_inst(sentence, bilingual_dict, w2v, n_group=2, n_nearest=2):
  """Build a translation prompt preceded by labelled groups of related words.

  Up to `n_group` keywords are extracted; each is mapped on its own to a
  dictionary word, and that word plus its `n_nearest` neighbours from
  `w2v.most_similar` form one "Label k" line. The lines are placed above a
  randomly chosen instruction template.
  """
  keywords = get_special_word(sentence, n_highest=n_group)

  # Resolve every keyword separately so each one gets its own group.
  anchors = [find_parallel_word([keyword], bilingual_dict) for keyword in keywords]

  # One group per anchor: neighbours first, the anchor itself (score 1) last.
  label_lines = []
  for label_no, anchor in enumerate(anchors, start=1):
    members = w2v.most_similar(anchor, topn=n_nearest)
    members.append((anchor, 1))
    key = f"Label {label_no}"
    listing = "".join(f" {member[0]}," for member in members)
    label_lines.append(f"{key}:" + listing + "\n")
  formated_example = "".join(label_lines)

  prompt_context = ["Diberikan kategori kata dalam bahasa {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]

  instruction = random.choice(prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
  return random.choice(prompt_context).format(TARGET="Melayu Kupang", CONTEXT=formated_example, INSTRUCTION=instruction)

In [None]:
# Demo: build a list-group-label prompt with four groups of one neighbour each
# and print it.
sentence = "Saya makan nasi dan minum susu dengan air teh"
lgl_prompt = generate_lgl_inst(sentence, bilingual_dict, w2v, n_group=4, n_nearest=1)
print(lgl_prompt)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Diberikan kategori kata dalam bahasa Melayu Kupang:
Label 1: paru, alas paru,
Label 2: pasi, nasi,
Label 3: paru, alas paru,
Label 4: bermain, be,


Saya makan nasi dan minum susu dengan air teh
Terjemahkan teks di atas dari bahasa Indonesia ke bahasa Melayu Kupang.


## Whole

In [None]:
import random
import re
import string
from difflib import SequenceMatcher
from keybert import KeyBERT
from transformers import pipeline
from gensim.models import FastText
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Flatten the bilingual dictionary into word pairs: a randomly chosen gloss from
# entry['word']['target'] first, entry['word']['source'] second.
dictionary = [
    (random.choice(entry['word']['target']), entry['word']['source'])
    for entry in bilingual_dict
]

# extract mkn_words (second element of every pair)
mkn_words = [pair[1] for pair in dictionary]

def extract_phonetic_rules(dictionary):
  """Map each character to the characters observed in its place in paired words.

  Every pair is walked position by position up to the shorter word's length;
  each mismatch adds the second word's character to the list kept for the first
  word's character, skipping characters already recorded.
  """
  rules = {}
  for ind, kup in dictionary:
    mismatches = ((a, b) for a, b in zip(ind, kup) if a != b)
    for ind_char, kup_char in mismatches:
      seen = rules.setdefault(ind_char, [])
      if kup_char not in seen:
        seen.append(kup_char)
  return rules

# Derive the character substitution rules that are handed to InstructionGenerator.
phonetic_rules = extract_phonetic_rules(dictionary)

In [None]:
class InstructionGenerator:
  """Builds Indonesia -> Melayu Kupang translation prompts with added context.

  Four prompt flavours are offered (contextual, semantic, keyword and
  list-group-label). Each extracts keywords from the input sentence with
  KeyBERT, maps them to the bilingual dictionary and prepends some context to a
  randomly chosen instruction template.
  """

  def __init__(self, bilingual_dict, w2v_model, phonetic_rules):
    """
    Initializes the InstructionGenerator.

    :param bilingual_dict: Iterable of entries exposing entry['word'] and entry['sentences'].
    :param w2v_model: Object with a `most_similar(word, topn=...)` method.
    :param phonetic_rules: A dictionary of phonetic transformation rules.
    """
    self.bilingual_dict = bilingual_dict
    self.w2v_model = w2v_model
    # The feature-extraction pipeline is created once here and reused by KeyBERT.
    self.indobertmodel = pipeline("feature-extraction", model="indobenchmark/indobert-large-p2")
    self.keybert_model = KeyBERT(model=self.indobertmodel)
    self.phonetic_rules = phonetic_rules

    # Instruction templates; filled with SOURCE, TARGET and INPUT.
    self.prompt_instruction = [
        'Terjemahkan teks berikut dari bahasa {SOURCE} ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
        '{INPUT}\nTerjemahkan teks di atas dari bahasa {SOURCE} ke bahasa {TARGET}.',
        'Teks dalam bahasa {SOURCE}: {INPUT}\nApa terjemahannya dalam bahasa {TARGET}?',
        'Terjemahkan teks bahasa {SOURCE} berikut ke bahasa {TARGET}.\nTeks: {INPUT}\nTerjemahan:',
        'Teks dalam bahasa {SOURCE}: {INPUT}\nTeks dalam bahasa {TARGET}:',
    ]
    # Context wrappers, one list per prompt flavour.
    self.contextual_promp = ["Diberikan kalimat dalam bahasa {TARGET} :\n{CONTEXT}\n\n{INSTRUCTION}"]
    self.semantic_promp = ["Diberikan sinonim dalam bahasa {SOURCE} dan {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]
    self.keyword_prompt = ["Diberikan kalimat dalam bahasa {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]
    self.list_group_label_prompt = ["Diberikan kategori kata dalam bahasa {TARGET}:\n{CONTEXT}\n\n{INSTRUCTION}"]

  def get_special_word(self, sentence, top_n=5, diversity=0.7, n_highest=1):
    """
    Extracts the most significant words from a sentence.

    :param sentence: Input sentence to process.
    :param top_n: Number of top keywords to extract.
    :param diversity: Diversity of the keywords.
    :param n_highest: Number of highest-ranked keywords to return.
    :return: List of (keyword, score) tuples sorted by descending score, without
             the first number found in the sentence and punctuation keywords.
    """
    keywords = self.keybert_model.extract_keywords(
        sentence, keyphrase_ngram_range=(1, 1), top_n=top_n, diversity=diversity
    )

    # First integer in the sentence, normalised through int(); empty string when
    # there is none. An explicit match check replaces the former bare `except`,
    # which would also have hidden unrelated errors.
    number_match = re.search(r'\d+', sentence)
    num_in_sent = str(int(number_match.group())) if number_match else ""

    filtered_keywords = [
        kw for kw in keywords
        if kw[0] != num_in_sent and kw[0] not in string.punctuation
    ]

    return sorted(filtered_keywords, key=lambda x: x[1], reverse=True)[:n_highest]

  def find_paralel_ind2mkn(self, words):
    """
    Finds the parallel Melayu Kupang word for an Indonesia word in the bilingual dictionary.

    :param words: List of (word, score) items to search for, tried in order.
    :return: entry['word']['source'] of the first entry whose 'target' contains
             a word, or the text of the first word when nothing matches.
    """
    for word in words:
      for entry in self.bilingual_dict:
        if word[0] in entry['word']['target']:
          return entry['word']['source']
    return words[0][0]

  def find_parallel_mkn2ind(self, nearest_words):
    """
    Finds the parallel Indonesia word for a list of Melayu Kupang words in the bilingual dictionary.

    :param nearest_words: List of tuples containing words and their similarity scores.
    :return: List of (random 'target' gloss, word) tuples; a word missing from
             the dictionary is paired with itself.
    """
    result = []
    for word in nearest_words:
      for entry in self.bilingual_dict:
        if word[0] == entry['word']['source']:
          word_ind = random.choice(entry['word']['target'])
          result.append((word_ind, word[0]))
          break
      else:
        # No entry matched this word.
        result.append((word[0], word[0]))

    return result

  def get_example_sentences(self, word, n):
    """
    Retrieves example sentences, preferring those that contain a given word.

    :param word: Word to look for among the whitespace-separated tokens of each sentence.
    :param n: Number of sentences to retrieve.
    :return: List of source-side sentences. Related sentences come first; if
             fewer than `n` exist, the rest is filled with randomly chosen
             unrelated sentences. Fewer than `n` sentences are returned when the
             dictionary does not hold enough of them.
    """
    sentences = [
        sentence
        for entry in self.bilingual_dict
        for sentence in entry['sentences']['source']
    ]

    related_sentences = []
    other_sentences = []
    for sentence in sentences:
      if word in sentence.split():
        related_sentences.append(sentence)
      else:
        other_sentences.append(sentence)

    if len(related_sentences) >= n:
      return random.sample(related_sentences, n)

    # Fill up from unrelated sentences only, so nothing is repeated, and cap the
    # request at what is available so that random.sample never raises ValueError.
    n_fill = min(n - len(related_sentences), len(other_sentences))
    return related_sentences + random.sample(other_sentences, n_fill)

  def phonetic_representation(self, word):
    """
    Converts a word to its phonetic representation using self.phonetic_rules.

    Rules are applied in sequence on the lower-cased word, so the result of an
    earlier replacement can be rewritten again by a later rule.

    :param word: Input word.
    :return: Phonetic representation of the word.
    """
    phonetic_word = word.lower()
    for src_char, trg_chars in self.phonetic_rules.items():
      for trg_char in trg_chars:
        phonetic_word = phonetic_word.replace(src_char, trg_char)

    return phonetic_word

  def get_similar_sounding_words(self, word, n_similar):
    """
    Retrieves dictionary words that sound similar based on the phonetic rules.

    A candidate is kept when the SequenceMatcher ratio between the two phonetic
    forms is strictly greater than 0.7.

    :param word: Input word.
    :param n_similar: Maximum number of similar words to retrieve.
    :return: List of similar-sounding words, in dictionary order.
    """
    candidates = [entry['word']['source'] for entry in self.bilingual_dict]
    word_phonetic = self.phonetic_representation(word)
    similar_words = [
        candidate
        for candidate in candidates
        if SequenceMatcher(None, word_phonetic, self.phonetic_representation(candidate)).ratio() > 0.7
    ]

    return similar_words[:n_similar]

  def generate_contextual_inst(self, sentence, n=2):
    """
    Generates a contextual instruction based on a sentence.

    :param sentence: Input sentence.
    :param n: Number of example sentences to include.
    :return: Contextual instruction as a formatted string.
    """
    w_star = self.get_special_word(sentence, n_highest=5)
    w_mkn_star = self.find_paralel_ind2mkn(w_star)
    example_sentences = self.get_example_sentences(w_mkn_star, n)
    context = "\n".join([f"- {sent}" for sent in example_sentences])

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.contextual_promp).format(TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)
    return final_prompt

  def generate_semantic_inst(self, sentence, n=2):
    """
    Generates a semantic instruction based on a sentence.

    :param sentence: Input sentence.
    :param n: Number of nearest words to include.
    :return: Semantic instruction as a formatted string.
    """
    w_star = self.get_special_word(sentence, n_highest=5)
    w_mkn_star = self.find_paralel_ind2mkn(w_star)
    nearest_words = self.w2v_model.most_similar(w_mkn_star, topn=n)
    # The looked-up word itself is listed last, with score 1.
    nearest_words.append((w_mkn_star, 1))
    paralel_n_words = self.find_parallel_mkn2ind(nearest_words)
    context = "\n".join([f"- {src}:{trg}" for src, trg in paralel_n_words])

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.semantic_promp).format(SOURCE="Indonesia", TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)
    return final_prompt

  def generate_keyword_inst(self, sentence, n_similar=5):
    """
    Generates a keyword-based instruction based on a sentence.

    :param sentence: Input sentence.
    :param n_similar: Number of similar-sounding words to include.
    :return: Keyword instruction as a formatted string.
    """
    w_star = self.get_special_word(sentence, n_highest=5)
    w_mkn_star = self.find_paralel_ind2mkn(w_star)
    similar_words = self.get_similar_sounding_words(w_mkn_star, n_similar)
    sentences = []
    for word in similar_words:
      sentences.extend(self.get_example_sentences(word, n=1))
    context = "\n".join(sentences)

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.keyword_prompt).format(TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)

    return final_prompt

  def generate_list_group_label_inst(self, sentence, n_group=2, n_nearest=2):
    """
    Generates a list-group-label instruction based on a sentence.

    :param sentence: Input sentence.
    :param n_group: Number of keywords, and therefore labelled groups, to build.
    :param n_nearest: Number of nearest words fetched for each group.
    :return: List-group-label instruction as a formatted string.
    """
    w_star = self.get_special_word(sentence, n_highest=n_group)

    # Resolve every keyword separately so each one gets its own group.
    w_parallel = [self.find_paralel_ind2mkn([word]) for word in w_star]

    # One "Label k: w1, w2, ...," line per group; the group word comes last.
    context = ""
    for idx, mkn_word in enumerate(w_parallel):
      nearest_words = self.w2v_model.most_similar(mkn_word, topn=n_nearest)
      nearest_words.append((mkn_word, 1))
      context += f"Label {idx+1}:"
      for word in nearest_words:
        context += f" {word[0]},"
      context += "\n"

    instruction = random.choice(self.prompt_instruction).format(SOURCE="Indonesia", INPUT=sentence, TARGET="Melayu Kupang")
    final_prompt = random.choice(self.list_group_label_prompt).format(SOURCE="Indonesia", TARGET="Melayu Kupang", CONTEXT=context, INSTRUCTION=instruction)

    return final_prompt

In [None]:
# Load the FastText word vectors handed to InstructionGenerator.
# NOTE(review): this path contains an `embedding/fastText/` subfolder that the
# earlier FastText.load cells in this notebook do not -- confirm which location
# is current.
w2v = FastText.load("/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn/embedding/fastText/fasttext_model_100.bin").wv

In [None]:
# Demo: create the generator (this loads the IndoBERT pipeline in __init__) and
# print a list-group-label prompt built with the default group settings.
generator = InstructionGenerator(bilingual_dict, w2v, phonetic_rules)
print(generator.generate_list_group_label_inst("Sudahkah kamu pergi ke pasar?"))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Diberikan kategori kata dalam bahasa Melayu Kupang:
Label 1: sudah, susa, sudahkah,
Label 2: pata, abiatar, pasar,


Terjemahkan teks bahasa Indonesia berikut ke bahasa Melayu Kupang.
Teks: Sudahkah kamu pergi ke pasar?
Terjemahan:
