Paper: [Improving Text Embeddings with Large Language Models](https://arxiv.org/pdf/2401.00368)

In [None]:
"""
Install the Google AI Python SDK

$ pip install google-generativeai

See the getting started guide for more information:
https://ai.google.dev/gemini-api/docs/get-started/python
"""
!pip install google-generativeai



In [None]:
!pip install faiss-cpu
!pip install -U FlagEmbedding



In [None]:
import os
from pprint import pprint

# SECURITY: never hard-code an API key in a notebook. Any key that was ever
# committed here must be treated as compromised and revoked.
# Reuse a key already present in the environment; otherwise prompt for it
# without echoing the value into the cell output.
if not os.environ.get("GEMINI_API_KEY"):
  from getpass import getpass
  os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")

In [None]:
import google.generativeai as genai

def gemini_generate(system_message):
  """Send one prompt to Gemini in a fresh chat session and return the reply text.

  Args:
    system_message: Prompt text, sent as the only message of a new chat
      (the chat is started with an empty history).

  Returns:
    The ``text`` attribute of the response to that message.

  Raises:
    KeyError: If the ``GEMINI_API_KEY`` environment variable is not set.
  """
  genai.configure(api_key=os.environ["GEMINI_API_KEY"])

  # Sampling / output options.
  # See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
  sampling = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
  )

  # Every listed harm category gets the "BLOCK_NONE" threshold.
  harm_categories = (
    "HARM_CATEGORY_HARASSMENT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_DANGEROUS_CONTENT",
  )
  unblocked = [
    {"category": name, "threshold": "BLOCK_NONE"} for name in harm_categories
  ]

  session = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=sampling,
    safety_settings=unblocked,
  ).start_chat(history=[])

  # The same settings are passed again on the message itself, as well as on the model.
  reply = session.send_message(
    system_message,
    safety_settings=unblocked,
    generation_config=sampling,
  )
  return reply.text

In [None]:
import ast

def convert_task_md_to_list(task_markdown):
  """Parse a model reply containing a Python list literal into a Python object.

  The reply may be wrapped in a Markdown code fence (```python ... ``` or a
  bare ``` ... ```). Only that surrounding fence is removed, so the word
  "python" or backticks appearing inside a task description are preserved.

  Args:
    task_markdown: Raw text returned by the model.

  Returns:
    The object produced by ``ast.literal_eval`` (a list of strings when the
    model followed the prompt).

  Raises:
    ValueError, SyntaxError: If the remaining text is not a valid Python literal.
  """
  import re

  # Drop an opening fence (with optional language tag) at the very start and
  # a closing fence at the very end; nothing in between is touched.
  unfenced = re.sub(
      r"^\s*```(?:python)?\s*|\s*```\s*$",
      "",
      task_markdown,
      flags=re.IGNORECASE,
  )

  # Newlines are removed as before so the literal is evaluated as one line.
  task_list = ast.literal_eval(unfenced.replace('\n', '').strip())

  return task_list

# Asymmetric tasks

## Text Retrieval Tasks (short-long)

In [None]:
text_retrieval_tasks_template = """
Brainstorm a list of potentially useful text retrieval tasks.
Here are a few examples for your reference:
- Retrieve relevant documents for a short keyword web search query that asks for weather information.
- Search for documents that answers a FAQ-style query on children’s nutrition.
Please adhere to the following guidelines:
- Specify what the query is, and what the desired documents are.
- Each retrieval task should cover a wide range of queries, and should not be too specific.
Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct
retrieval task in one sentence. Do not explain yourself or output anything else. Be creative!
"""

In [None]:
text_retrieval_tasks_markdown = gemini_generate(text_retrieval_tasks_template)

In [None]:
text_retrieval_tasks_markdown

'```python\n[\n"Retrieve scientific articles that describe the latest advancements in artificial intelligence.",\n"Search for legal documents related to intellectual property rights.",\n"Find recipes for healthy and delicious dishes based on user-provided dietary restrictions.",\n"Retrieve news articles about a specific event or topic from various sources.",\n"Discover research papers on the effectiveness of different teaching methods.",\n"Find customer reviews and feedback on a particular product or service.",\n"Search for historical documents that provide insights into a specific historical event.",\n"Retrieve scientific data and visualizations related to climate change.",\n"Find blog posts or articles that offer advice on personal development.",\n"Search for tutorials and guides on learning a new skill.",\n"Discover relevant resources for students studying a particular subject.",\n"Retrieve literary works that explore a specific theme or motif.",\n"Find code examples and solutions f

In [None]:
task_list = convert_task_md_to_list(text_retrieval_tasks_markdown)
pprint(task_list)

['Retrieve scientific articles that describe the latest advancements in '
 'artificial intelligence.',
 'Search for legal documents related to intellectual property rights.',
 'Find recipes for healthy and delicious dishes based on user-provided dietary '
 'restrictions.',
 'Retrieve news articles about a specific event or topic from various sources.',
 'Discover research papers on the effectiveness of different teaching methods.',
 'Find customer reviews and feedback on a particular product or service.',
 'Search for historical documents that provide insights into a specific '
 'historical event.',
 'Retrieve scientific data and visualizations related to climate change.',
 'Find blog posts or articles that offer advice on personal development.',
 'Search for tutorials and guides on learning a new skill.',
 'Discover relevant resources for students studying a particular subject.',
 'Retrieve literary works that explore a specific theme or motif.',
 'Find code examples and solutions for

In [None]:
task = task_list[-1]
query_type = "extremely long-tail" # ["extremely long-tail", "long-tail", "common"]
query_length = "less than 5 words" # ["less than 5 words", "5 to 15 words", "at least 10 words"]
clarity = "clear" # ["clear", "understandable with some effort", "ambiguous"]
num_words = 50 # [50, 100, 200, 300, 400, 500]
language = "Vietnamese"
difficulty = "high school" # ["high school", "college", "PhD"]


synthetic_sample_text_retrieval_template = f"""You have been assigned a retrieval task: {task}
Your mission is to write one text retrieval example for this task in JSON format. The JSON object must contain the following
keys:
- "user_query": a string, a random user search query specified by the retrieval task.
- "positive_document": a string, a relevant document for the user query.
- "hard_negative_document": a string, a hard negative document that only appears relevant to the query.
Please adhere to the following guidelines:
- The "user_query" should be {query_type}, {query_length}, {clarity}, and diverse in topic.
- All documents must be created independent of the query. Avoid copying the query verbatim. It’s acceptable if some parts of
the "positive_document" are not topically related to the query.
- All documents should be at least {num_words} words long.
- The "hard_negative_document" contains some useful information, but it should be less useful or comprehensive compared
to the "positive_document".
- Both the query and documents should be in {language}.
- Do not provide any explanation in any document on why it is relevant or not relevant to the query.
- Both the query and documents require {difficulty} level education to understand.
Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative!
"""

In [None]:
synthetic_sample_text_retrieval_template

'You have been assigned a retrieval task: Discover research papers on the impact of social media on society.\nYour mission is to write one text retrieval example for this task in JSON format. The JSON object must contain the following\nkeys:\n- "user_query": a string, a random user search query specified by the retrieval task.\n- "positive_document": a string, a relevant document for the user query.\n- "hard_negative_document": a string, a hard negative document that only appears relevant to the query.\nPlease adhere to the following guidelines:\n- The "user_query" should be extremely long-tail, less than 5 words, clear, and diverse in topic.\n- All documents must be created independent of the query. Avoid copying the query verbatim. It’s acceptable if some parts of\nthe "positive_document" are not topically related to the query.\n- All documents should be at least 50 words long.\n- The "hard_negative_document" contains some useful information, but it should be less useful or comprehen

In [None]:
text_retrieval_sample = gemini_generate(synthetic_sample_text_retrieval_template)

In [None]:
import json
import re

# The reply may be wrapped in a Markdown fence (```json ... ```). Strip only
# that surrounding fence: a blanket .replace("json", "") / .replace("```", "")
# would also delete those substrings inside the generated documents.
text_retrieval_sample_json_string = re.sub(
    r"^\s*```(?:json)?\s*|\s*```\s*$",
    "",
    text_retrieval_sample,
    flags=re.IGNORECASE,
).replace("\n", "").strip()

text_retrieval_sample_json = json.loads(text_retrieval_sample_json_string)

# ensure_ascii=False keeps non-ASCII text (e.g. Vietnamese) readable in the output.
print(json.dumps(text_retrieval_sample_json, indent=4, ensure_ascii=False))

{
    "user_query": "Ảnh hưởng mạng xã hội thanh thiếu niên",
    "positive_document": "Sự phổ biến của mạng xã hội đã tạo ra một môi trường mới cho thanh thiếu niên tương tác, kết nối và chia sẻ thông tin. Tuy nhiên, bên cạnh những lợi ích, việc sử dụng mạng xã hội cũng mang đến nhiều nguy cơ tiềm ẩn đối với thế hệ trẻ. Một trong những vấn đề đáng lo ngại là ảnh hưởng tiêu cực của mạng xã hội đến sức khỏe tâm thần của thanh thiếu niên. Việc tiếp xúc quá nhiều với thông tin tiêu cực, áp lực so sánh bản thân với người khác trên mạng, sự cô lập xã hội và nghiện mạng xã hội là những yếu tố chính gây ra trầm cảm, lo âu và các vấn đề về sức khỏe tâm thần khác. Bên cạnh đó, mạng xã hội còn là nơi phát sinh bạo lực mạng, bắt nạt trực tuyến và các hành vi vi phạm pháp luật khác, ảnh hưởng trực tiếp đến sự an toàn và hạnh phúc của thanh thiếu niên. Ngoài ra, việc tiếp xúc quá nhiều với mạng xã hội còn có thể làm giảm khả năng tập trung, sự chú ý và kỹ năng giao tiếp trực tiếp của các em.",
    

In [None]:
from itertools import product
import json
import random
import re
import time

import faiss
import numpy as np
from FlagEmbedding import BGEM3FlagModel

embedding_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

# Number of accepted samples to collect in this run.
n_samples = 200
dataset = []

# Width of a query vector; the encoder output is reshaped to (1, dimension).
dimension = 1024

# An index saved by a previous run is reused, so new queries are also
# de-duplicated against queries accepted earlier. Note that `dataset` itself
# is not persisted by this cell.
if os.path.exists('index_file'):
  index = faiss.read_index('index_file')
else:
  index = faiss.IndexFlatL2(dimension)

# A query is accepted only if the distance to its nearest neighbour in the
# index is greater than this value.
threshold = 0.3

sample_per_task = 10

# Attempts per sample before it is skipped, and consecutive task-list
# failures tolerated before the run stops (avoids hammering the API forever
# when e.g. the key is invalid).
max_sample_attempts = 3
max_task_failures = 5
retry_delay_seconds = 3

# Prompt placeholders are drawn uniformly at random from these options.
query_type_list = ["extremely long-tail", "long-tail", "common"]
query_length_list = ["less than 5 words", "5 to 15 words", "at least 10 words"]
clarity_list = ["clear", "understandable with some effort", "ambiguous"]
num_words_list = [50, 100, 200, 300, 400, 500]
# "German" is the language name ("Germany" is the country).
language_list = ["English", "Vietnamese", "German", "French"]
difficulty_list = ["high school", "college", "PhD"]

# Matches only a Markdown fence wrapping the whole reply (```json ... ```), so
# "json" or backticks inside the generated documents are left intact.
json_fence_pattern = re.compile(r"^\s*```(?:json)?\s*|\s*```\s*$", re.IGNORECASE)

consecutive_task_failures = 0

while len(dataset) < n_samples:
  # Step 1: brainstorm a fresh list of retrieval tasks.
  try:
    text_retrieval_tasks_markdown = gemini_generate(text_retrieval_tasks_template)
    task_list = convert_task_md_to_list(text_retrieval_tasks_markdown)
  except Exception as e:
    print('Error in generating tasks:', str(e))
    consecutive_task_failures += 1
    if consecutive_task_failures >= max_task_failures:
      print('Giving up after', consecutive_task_failures, 'consecutive task generation failures.')
      break
    time.sleep(retry_delay_seconds)
    continue
  consecutive_task_failures = 0

  for task in task_list:
    # Stop visiting further tasks once the target is reached; the break in
    # the inner loop only leaves that inner loop.
    if len(dataset) >= n_samples:
      break

    for _ in range(sample_per_task):
      query_type = random.choice(query_type_list)
      query_length = random.choice(query_length_list)
      clarity = random.choice(clarity_list)
      num_words = random.choice(num_words_list)
      language = random.choice(language_list)
      difficulty = random.choice(difficulty_list)
      synthetic_sample_text_retrieval_template = f"""You have been assigned a retrieval task: {task}
  Your mission is to write one text retrieval example for this task in JSON format. The JSON object must contain the following
  keys:
  - "user_query": a string, a random user search query specified by the retrieval task.
  - "positive_document": a string, a relevant document for the user query.
  - "hard_negative_document": a string, a hard negative document that only appears relevant to the query.
  Please adhere to the following guidelines:
  - The "user_query" should be {query_type}, {query_length}, {clarity}, and diverse in topic.
  - All documents must be created independent of the query. Avoid copying the query verbatim. It’s acceptable if some parts of
  the "positive_document" are not topically related to the query.
  - All documents should be at least {num_words} words long.
  - The "hard_negative_document" contains some useful information, but it should be less useful or comprehensive compared
  to the "positive_document".
  - Both the query and documents should be in {language}.
  - Do not provide any explanation in any document on why it is relevant or not relevant to the query.
  - Both the query and documents require {difficulty} level education to understand.
  Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative!
  """
      # Step 2: generate and parse one sample, retrying on any failure.
      for _attempt in range(max_sample_attempts):
        try:
          text_retrieval_sample = gemini_generate(synthetic_sample_text_retrieval_template)
          text_retrieval_sample_json_string = (
              json_fence_pattern.sub("", text_retrieval_sample).replace("\n", "").strip()
          )
          text_retrieval_sample_json = json.loads(text_retrieval_sample_json_string)
          user_query = text_retrieval_sample_json['user_query']
          if not isinstance(user_query, str):
            raise TypeError('"user_query" is not a string')
          break
        except Exception as e:
          print('Error in generating sample:', str(e))
          time.sleep(retry_delay_seconds)
      else:
        # Every attempt failed: skip this sample.
        continue

      # Step 3: embed the query and keep the sample only if it is not too
      # close to a query that is already in the index.
      user_query_embedding = embedding_model.encode([user_query],
                              batch_size=1,
                              max_length=512,
                              )['dense_vecs'].reshape(1, dimension).astype('float32')

      print('user query embedding shape', user_query_embedding.shape)

      if index.ntotal == 0:
        # Nothing to compare against yet, so the first query is always new.
        is_new_query = True
      else:
        distances, indices = index.search(user_query_embedding, 1)
        is_new_query = distances[0][0] > threshold

      if is_new_query:
        index.add(user_query_embedding)

        # Persist after every insertion so an interrupted run keeps its index.
        faiss.write_index(index, 'index_file')
        print("New vector added to the index.")
        dataset.append(text_retrieval_sample_json)
        if len(dataset) >= n_samples:
          break
      else:
        print("New vector is too similar to an existing vector and was not added.")





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Error in generating sample: Expecting ',' delimiter: line 1 column 1011 (char 1010)
Error in generating sample: Expecting ',' delimiter: line 1 column 519 (char 518)
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector added to the index.
Error in generating sample: Expecting property name enclosed in double quotes: line 1 column 5877 (char 5876)
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector added to the index.
user query embedding shape (1, 1024)
New vector is too similar to an existing vector and was not added.
user query embedding shape (1, 1024)
New vector added to the index.
user que

In [None]:
dataset

[{'user_query': 'Những phát hiện mới nhất về tác động của chế độ ăn uống giàu chất béo đến bệnh tiểu đường loại 2 ở người lớn tuổi?',
  'positive_document': 'Bệnh tiểu đường loại 2 là một bệnh mãn tính ảnh hưởng đến cách cơ thể bạn sử dụng đường glucose. Đường glucose là nguồn năng lượng chính của cơ thể bạn. Với bệnh tiểu đường loại 2, cơ thể bạn hoặc không sản xuất đủ insulin hoặc không sử dụng insulin một cách hiệu quả. Insulin là một hormone giúp glucose vào tế bào để cung cấp năng lượng. Khi có quá nhiều glucose trong máu, nó có thể dẫn đến nhiều vấn đề sức khỏe nghiêm trọng, bao gồm bệnh tim, đột quỵ, tổn thương thần kinh và mất thị lực. \n\nChế độ ăn uống đóng một vai trò quan trọng trong việc quản lý bệnh tiểu đường loại 2. Một chế độ ăn uống lành mạnh có thể giúp bạn kiểm soát lượng đường trong máu, giảm nguy cơ biến chứng và cải thiện sức khỏe tổng thể. \n\nMột chế độ ăn uống giàu chất béo có thể có tác động tiêu cực đến bệnh tiểu đường loại 2. Chất béo bão hòa và chất béo ch

## Text Classification Tasks (long-short)

In [None]:
text_classification_tasks_template = """Brainstorm a list of potentially useful text classification tasks.
Please adhere to the following guidelines:
- Tasks should cover a diverse range of domains and task types.
Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct
text classification task in one sentence. Do not explain yourself or output anything else. Be creative!"""

In [None]:
text_classification_tasks_markdown = gemini_generate(text_classification_tasks_template)

In [None]:
text_classification_tasks_markdown

'```python\n[\n"Categorize online reviews as positive, negative, or neutral.",\n"Identify the sentiment expressed in social media posts about a product launch.",\n"Classify news articles based on their topic (e.g., politics, sports, business).",\n"Determine the genre of a movie based on its plot summary.",\n"Classify emails as spam or legitimate.",\n"Identify the author of a document based on their writing style.",\n"Categorize scientific research papers based on their research area.",\n"Classify customer support tickets based on their urgency.",\n"Identify the intent of a user\'s query in a chatbot.",\n"Determine the political leaning of a news source.",\n"Classify job postings based on their required skills.",\n"Identify the emotional tone of a piece of text.",\n"Categorize legal documents based on their type (e.g., contract, will, complaint).",\n"Classify student essays based on their quality.",\n"Determine the target audience of a marketing campaign.",\n"Identify the source of a qu

In [None]:
text_classification_task_list = convert_task_md_to_list(text_classification_tasks_markdown)
pprint(text_classification_task_list)

['Categorize online reviews as positive, negative, or neutral.',
 'Identify the sentiment expressed in social media posts about a product '
 'launch.',
 'Classify news articles based on their topic (e.g., politics, sports, '
 'business).',
 'Determine the genre of a movie based on its plot summary.',
 'Classify emails as spam or legitimate.',
 'Identify the author of a document based on their writing style.',
 'Categorize scientific research papers based on their research area.',
 'Classify customer support tickets based on their urgency.',
 "Identify the intent of a user's query in a chatbot.",
 'Determine the political leaning of a news source.',
 'Classify job postings based on their required skills.',
 'Identify the emotional tone of a piece of text.',
 'Categorize legal documents based on their type (e.g., contract, will, '
 'complaint).',
 'Classify student essays based on their quality.',
 'Determine the target audience of a marketing campaign.',
 'Identify the source of a quote

In [None]:
task = text_classification_task_list[0]
num_words = "at least 50" # ["less than 10", "at least 10", "at least 50", "at least 100", "at least 200"]
language = "Vietnamese"
clarity = "understandable with some effort" # ["clear", "understandable with some effort", "ambiguous"]
difficulty = "high school" # ["high school", "college", "PhD"]

synthetic_sample_text_classification_template = f"""You have been assigned a text classification task: {task}
Your mission is to write one text classification example for this task in JSON format. The JSON object must contain the
following keys:
- "input_text": a string, the input text specified by the classification task.
- "label": a string, the correct label of the input text.
- "misleading_label": a string, an incorrect label that is related to the task.
Please adhere to the following guidelines:
- The "input_text" should be {num_words} words and diverse in expression.
- The "misleading_label" must be a valid label for the given task, but not as appropriate as the "label" for the "input_text".
- The values for all fields should be in {language}.
- Avoid including the values of the "label" and "misleading_label" fields in the "input_text", that would make the task too
easy.
- The "input_text" is {clarity} and requires {difficulty} level education to comprehend.
Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative!
"""

In [None]:
synthetic_sample_text_classification_template

'You have been assigned a text classification task: Categorize online reviews as positive, negative, or neutral.\nYour mission is to write one text classification example for this task in JSON format. The JSON object must contain the\nfollowing keys:\n- "input_text": a string, the input text specified by the classification task.\n- "label": a string, the correct label of the input text.\n- "misleading_label": a string, an incorrect label that is related to the task.\nPlease adhere to the following guidelines:\n- The "input_text" should be at least 50 words and diverse in expression.\n- The "misleading_label" must be a valid label for the given task, but not as appropriate as the "label" for the "input_text".\n- The values for all fields should be in Vietnamese.\n- Avoid including the values of the "label" and "misleading_label" fields in the "input_text", that would make the task too\neasy.\n- The "input_text" is understandable with some effort and requires high school level education 

In [None]:
text_classification_sample = gemini_generate(synthetic_sample_text_classification_template)

In [None]:
import json
import re

# The reply may be wrapped in a Markdown fence (```json ... ```). Strip only
# that surrounding fence: a blanket .replace("json", "") / .replace("```", "")
# would also delete those substrings inside the generated text and labels.
text_classification_sample_json_string = re.sub(
    r"^\s*```(?:json)?\s*|\s*```\s*$",
    "",
    text_classification_sample,
    flags=re.IGNORECASE,
).replace("\n", "").strip()

text_classification_sample_json = json.loads(text_classification_sample_json_string)

# ensure_ascii=False keeps non-ASCII text (e.g. Vietnamese) readable in the output.
print(json.dumps(text_classification_sample_json, indent=4, ensure_ascii=False))

{
    "input_text": "Tôi đã mua chiếc điện thoại này cách đây một tháng và tôi rất hài lòng với nó. Camera chụp ảnh đẹp, màn hình hiển thị sắc nét, và hiệu năng hoạt động mượt mà. Pin cũng rất bền, tôi có thể sử dụng cả ngày mà không cần sạc. Tuy nhiên, tôi thấy thiết kế của máy hơi trơn, dễ tuột khỏi tay. Nói chung, đây là một chiếc điện thoại đáng để mua.",
    "label": "Tích cực",
    "misleading_label": "Trung lập"
}


## Text Matching Tasks (short-short)

In [None]:
text_matching_tasks_template = """Brainstorm a list of text matching tasks where both the queries and the groundtruth documents are very short (one or two
sentences, even a short phrase).
Here are a few examples:
- Given a scientific paper title, retrieve the title of papers that cite the given paper.
- Match a word with its definition.
- Provided a notable person’s name, identify their occupation or achievement.
Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct
task in one sentence. Do not explain yourself or output anything else. Be creative!"""

In [None]:
text_matching_tasks_markdown = gemini_generate(text_matching_tasks_template)

In [None]:
text_matching_tasks_markdown

'```python\n[\n"Given a song title, identify the artist who performed it.",\n"Match a food ingredient with its culinary category (e.g., vegetable, spice).",\n"Given a brand name, find its associated product category.",\n"Identify the country a city belongs to.",\n"Match a movie title with its genre.",\n"Given a historical event, identify the year it occurred.",\n"Match a book title with its author.",\n"Given a common phrase, identify its meaning.",\n"Identify the capital city of a given country.",\n"Match a scientific term with its definition.",\n"Given a symptom, suggest a possible medical condition.",\n"Identify the species of a given animal.",\n"Match a type of weather with its associated season.",\n"Given a chemical formula, identify the compound name.",\n"Match a musical note with its frequency.",\n"Identify the instrument played in a given musical excerpt.",\n"Given a color, identify its complementary color.",\n"Match a famous landmark with its location.",\n"Given a piece of clot

In [None]:
text_matching_task_list = convert_task_md_to_list(text_matching_tasks_markdown)
pprint(text_matching_task_list)

['Given a song title, identify the artist who performed it.',
 'Match a food ingredient with its culinary category (e.g., vegetable, spice).',
 'Given a brand name, find its associated product category.',
 'Identify the country a city belongs to.',
 'Match a movie title with its genre.',
 'Given a historical event, identify the year it occurred.',
 'Match a book title with its author.',
 'Given a common phrase, identify its meaning.',
 'Identify the capital city of a given country.',
 'Match a scientific term with its definition.',
 'Given a symptom, suggest a possible medical condition.',
 'Identify the species of a given animal.',
 'Match a type of weather with its associated season.',
 'Given a chemical formula, identify the compound name.',
 'Match a musical note with its frequency.',
 'Identify the instrument played in a given musical excerpt.',
 'Given a color, identify its complementary color.',
 'Match a famous landmark with its location.',
 'Given a piece of clothing, identify

In [None]:
task = text_matching_task_list[0]

language = "Vietnamese"


synthetic_text_matching_template = f"""You have been assigned a text matching task: {task}
Your mission is to write one example for this task in JSON format. The JSON object must contain the following keys:
- "input": a string, a random input specified by the task.
- "positive_document": a string, a relevant document for the "input" according to the task.
Please adhere to the following guidelines:
- The values of all fields should be in {language}.
- Both the "input" and "positive_document" should be very short (a sentence or a phrase), avoid substantial word overlaps,
otherwise the task would be too easy.
- The "input" and "positive_document" should be independent of each other.
Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative!"""

In [None]:
synthetic_text_matching_template

'You have been assigned a text matching task: Given a song title, identify the artist who performed it.\nYour mission is to write one example for this task in JSON format. The JSON object must contain the following keys:\n- "input": a string, a random input specified by the task.\n- "positive_document": a string, a relevant document for the "input" according to the task.\nPlease adhere to the following guidelines:\n- The values of all fields should be in Vietnamese.\n- Both the "input" and "positive_document" should be very short (a sentence or a phrase), avoid substantial word overlaps,\notherwise the task would be too easy.\n- The "input" and "positive_document" should be independent of each other.\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!'

In [None]:
text_matching_sample = gemini_generate(synthetic_text_matching_template)

In [None]:
text_matching_sample

'{"input": "Bài hát \\"Em gái mưa\\" nổi tiếng.", "positive_document": "Hương Tràm là ca sĩ thể hiện ca khúc này."} \n'

In [None]:
import json
import re

# The reply may or may not be wrapped in a Markdown fence (```json ... ```).
# Strip only a surrounding fence: a blanket .replace("json", "") /
# .replace("```", "") would also delete those substrings inside the values.
text_matching_sample_json_string = re.sub(
    r"^\s*```(?:json)?\s*|\s*```\s*$",
    "",
    text_matching_sample,
    flags=re.IGNORECASE,
).replace("\n", "").strip()

text_matching_sample_json = json.loads(text_matching_sample_json_string)

# ensure_ascii=False keeps non-ASCII text (e.g. Vietnamese) readable in the output.
print(json.dumps(text_matching_sample_json, indent=4, ensure_ascii=False))

{
    "input": "Bài hát \"Em gái mưa\" nổi tiếng.",
    "positive_document": "Hương Tràm là ca sĩ thể hiện ca khúc này."
}


## Text Matching (long-long)

In [None]:
text_matching_tasks_template = """Brainstorm a list of text matching tasks where the queries are long documents.
Here are a few examples:
- Given a document that supports a debatable argument, find another document that contains opposite arguments.
- Provided a lengthy business proposal, retrieve competitive business strategies in the same industry.
Your output must always be a python list of strings only, with about 20 elements, and each element corresponds to a distinct
task in one sentence. Do not explain yourself or output anything else. Be creative!"""

In [None]:
text_matching_tasks_markdown = gemini_generate(text_matching_tasks_template)

In [None]:
text_matching_tasks_markdown

"```python\n[\n'Given a research paper, find related articles that provide opposing viewpoints.',\n'Identify news articles that report on the same event as a long-form investigative piece.',\n'Given a book manuscript, retrieve reviews and critiques from literary publications.',\n'Using a lengthy historical document, find contemporaneous sources that provide context.',\n'For a legal brief, locate relevant case law and legislation.',\n'Given a scientific research paper, find other papers that cite or are cited by it.',\n'Given a legal contract, find related contracts or legal precedents.',\n'For a news article about a political scandal, find related news articles from different political perspectives.',\n'Given a long-form interview, find other interviews with the same person.',\n'For a detailed product review, find other reviews that mention the same product features.',\n'Given a screenplay, find other screenplays with similar plot structures.',\n'Using a long-form blog post, find other

In [None]:
text_matching_task_list = convert_task_md_to_list(text_matching_tasks_markdown)
pprint(text_matching_task_list)

['Given a research paper, find related articles that provide opposing '
 'viewpoints.',
 'Identify news articles that report on the same event as a long-form '
 'investigative piece.',
 'Given a book manuscript, retrieve reviews and critiques from literary '
 'publications.',
 'Using a lengthy historical document, find contemporaneous sources that '
 'provide context.',
 'For a legal brief, locate relevant case law and legislation.',
 'Given a scientific research paper, find other papers that cite or are cited '
 'by it.',
 'Given a legal contract, find related contracts or legal precedents.',
 'For a news article about a political scandal, find related news articles '
 'from different political perspectives.',
 'Given a long-form interview, find other interviews with the same person.',
 'For a detailed product review, find other reviews that mention the same '
 'product features.',
 'Given a screenplay, find other screenplays with similar plot structures.',
 'Using a long-form blog po

In [None]:
task = text_matching_task_list[1]

language = "Vietnamese"


synthetic_text_matching_template = f"""You have been assigned a text matching task: {task}
Your mission is to write one example for this task in JSON format. The JSON object must contain the following keys:
- "input": a string, a random input specified by the task.
- "positive_document": a string, a relevant document for the "input" according to the task.
Please adhere to the following guidelines:
- The values of all fields should be in {language}.
- Both the "input" and "positive_document" should be long documents (at least 300 words), avoid substantial word overlaps,
otherwise the task would be too easy.
- The "input" and "positive_document" should be independent of each other.
Your output must always be a JSON object only, do not explain yourself or output anything else. Be creative!"""

In [None]:
synthetic_text_matching_template

'You have been assigned a text matching task: Identify news articles that report on the same event as a long-form investigative piece.\nYour mission is to write one example for this task in JSON format. The JSON object must contain the following keys:\n- "input": a string, a random input specified by the task.\n- "positive_document": a string, a relevant document for the "input" according to the task.\nPlease adhere to the following guidelines:\n- The values of all fields should be in Vietnamese.\n- Both the "input" and "positive_document" should be long documents (at least 300 words), avoid substantial word overlaps,\notherwise the task would be too easy.\n- The "input" and "positive_document" should be independent of each other.\nYour output must always be a JSON object only, do not explain yourself or output anything else. Be creative!'

In [None]:
text_matching_sample = gemini_generate(synthetic_text_matching_template)

In [None]:
text_matching_sample

'```json\n{\n"input": "Ngày 27 tháng 10 năm 2023, Ủy ban Chứng khoán Nhà nước (UBCKNN) đã công bố quyết định xử phạt vi phạm hành chính đối với Công ty Cổ phần Đầu tư và Phát triển Bất động sản An Gia (mã chứng khoán: AGG) với tổng số tiền phạt 3,5 tỷ đồng. Lý do phạt là do AGG đã vi phạm quy định về công bố thông tin, cụ thể là việc công bố thông tin không đầy đủ, không chính xác, không kịp thời về hoạt động của công ty. Cụ thể, UBCKNN đã phát hiện AGG đã không công bố đầy đủ thông tin về các dự án bất động sản mà công ty đang đầu tư, cũng như không công bố kịp thời các thông tin liên quan đến kết quả kinh doanh của công ty. Việc vi phạm này đã gây ảnh hưởng đến quyền lợi của các nhà đầu tư, đặc biệt là các nhà đầu tư nhỏ lẻ. Bên cạnh việc phạt tiền, UBCKNN cũng yêu cầu AGG phải khắc phục hậu quả bằng cách công bố đầy đủ, chính xác, kịp thời các thông tin về hoạt động của công ty trong thời gian tới. Việc xử phạt này cho thấy UBCKNN đang ngày càng tăng cường công tác thanh tra, giám s

In [None]:
import json
import re

# Gemini often wraps its answer in a Markdown code fence (```json ... ```).
# Strip only that leading/trailing fence. Blanket str.replace() calls would
# also delete the substrings "json" and "```" from the generated documents.
text_matching_sample_json_string = text_matching_sample.strip()
text_matching_sample_json_string = re.sub(
    r"^```(?:json)?\s*", "", text_matching_sample_json_string, flags=re.IGNORECASE
)
text_matching_sample_json_string = re.sub(
    r"\s*```$", "", text_matching_sample_json_string
).strip()

# strict=False lets the parser accept raw newlines (and other control
# characters) inside string values, so they are kept rather than removed
# up front, which would glue neighbouring words together.
text_matching_sample_json = json.loads(text_matching_sample_json_string, strict=False)

# ensure_ascii=False keeps the Vietnamese text readable instead of \uXXXX escapes.
print(json.dumps(text_matching_sample_json, indent=4, ensure_ascii=False))

{
    "input": "Ngày 27 tháng 10 năm 2023, Ủy ban Chứng khoán Nhà nước (UBCKNN) đã công bố quyết định xử phạt vi phạm hành chính đối với Công ty Cổ phần Đầu tư và Phát triển Bất động sản An Gia (mã chứng khoán: AGG) với tổng số tiền phạt 3,5 tỷ đồng. Lý do phạt là do AGG đã vi phạm quy định về công bố thông tin, cụ thể là việc công bố thông tin không đầy đủ, không chính xác, không kịp thời về hoạt động của công ty. Cụ thể, UBCKNN đã phát hiện AGG đã không công bố đầy đủ thông tin về các dự án bất động sản mà công ty đang đầu tư, cũng như không công bố kịp thời các thông tin liên quan đến kết quả kinh doanh của công ty. Việc vi phạm này đã gây ảnh hưởng đến quyền lợi của các nhà đầu tư, đặc biệt là các nhà đầu tư nhỏ lẻ. Bên cạnh việc phạt tiền, UBCKNN cũng yêu cầu AGG phải khắc phục hậu quả bằng cách công bố đầy đủ, chính xác, kịp thời các thông tin về hoạt động của công ty trong thời gian tới. Việc xử phạt này cho thấy UBCKNN đang ngày càng tăng cường công tác thanh tra, giám sát hoạt

# Symmetric tasks

## Monolingual STS

In [None]:
# Knobs for the monolingual STS prompt. The bracketed lists next to each value
# show the alternatives that can be swapped in.
language = "Vietnamese"
unit = "sentence"  # ["sentence", "phrase", "passage"]
difficulty = "high school"  # ["elementary school", "high school", "college"]
high_score = 4  # [4, 4.5, 5]
low_score = 2.5  # [2.5, 3, 3.5]

# Prompt text with named placeholders, filled in from the knobs above.
mono_sts_template = """Write a {unit} triple with varying semantic similarity scores in JSON format. The semantic similarity score ranges from 1 to
5, with 1 denotes least similar and 5 denotes most similar.
Please adhere to the following guidelines:
- The keys in JSON are "S1", "S2", and "S3", the values are all strings in {language}, do not add any other keys.
- There should be some word overlaps between all three {unit}s.
- The similarity score between S1 and S2 should be {high_score}.
- The similarity score between S1 and S3 should be {low_score}.
- The {unit}s require {difficulty} level education to understand and should be diverse in terms of topic and length.
Your output must always be a JSON object only with three keys "S1", "S2" and "S3", do not explain yourself or output
anything else. Be creative!""".format(
    unit=unit,
    language=language,
    high_score=high_score,
    low_score=low_score,
    difficulty=difficulty,
)

In [None]:
# Send the monolingual STS prompt to Gemini and keep the raw text of the reply.
mono_sts_sample = gemini_generate(mono_sts_template)

In [None]:
# Display the raw model reply (expected to be a JSON object with keys "S1", "S2", "S3").
mono_sts_sample

'{"S1": "Trong cuộc chiến tranh thế giới thứ hai, quân đội Đức Quốc xã đã sử dụng chiến lược tấn công chớp nhoáng để chiếm lĩnh lãnh thổ của các quốc gia châu Âu.", "S2": "Chiến lược tấn công chớp nhoáng, còn được gọi là Blitzkrieg, là một trong những yếu tố chính giúp quân Đức nhanh chóng tiến sâu vào lãnh thổ của các quốc gia láng giềng.", "S3": "Cuộc chiến tranh thế giới thứ hai đã để lại những hậu quả nặng nề cho nhân loại, bao gồm sự tàn phá của các thành phố, sự thiệt mạng của hàng triệu người và sự tàn phá của môi trường."}'

## Bitext Retrieval

In [None]:
# Knobs for the bitext retrieval prompt. The bracketed lists next to a value
# show the alternatives that can be swapped in.
src_lang = "English"  # language of "S1"
tgt_lang = "Vietnamese"  # language of "S2" and "S3"
unit = "phrase"  # ["sentence", "phrase", "passage"]
difficulty = "common knowledge"  # ["elementary school", "high school", "college"]
high_score = 5  # [4, 4.5, 5]
low_score = 2.5  # [2.5, 3, 3.5]

# Prompt text with named placeholders, filled in from the knobs above.
bitext_retrieval_template = """Write a {unit} triple with one {unit} in {src_lang} and two {unit}s in {tgt_lang} with varying translation qualities in JSON
format.
The triple is denotes as ("S1", "S2", "S3"). The translation quality score ranges from 1 to 5, with higher scores are better.
Please adhere to the following guidelines:
- The values of "S1" is a string in {src_lang}, the value of "S2" and "S3" are strings in {tgt_lang}.
- There should be some word overlaps between "S2" and "S3".
- The translation quality score of "S2" with respect to "S1" should be {high_score}.
- The translation quality score of "S3" with respect to "S1" should be {low_score}.
- "S3" should be grammatical and fluent, but contain some keyword or number translation errors, or miss some information,
or contain some redundant information.
- "S1" requires {difficulty} level education to understand and should be diverse in terms of topic and length.
Your output must always be a JSON object only with three keys "S1", "S2" and "S3", do not explain yourself or output
anything else. Be creative!""".format(
    unit=unit,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
    high_score=high_score,
    low_score=low_score,
    difficulty=difficulty,
)

In [None]:
# Send the bitext retrieval prompt to Gemini and keep the raw text of the reply.
bitext_retrieval_sample = gemini_generate(bitext_retrieval_template)

In [None]:
# Display the raw model reply (expected to be a JSON object with keys "S1", "S2", "S3").
bitext_retrieval_sample

'{"S1": "The 19th Amendment to the U.S. Constitution granted women the right to vote.", "S2": "Tu chính án thứ 19 của Hiến pháp Hoa Kỳ đã trao quyền bầu cử cho phụ nữ.", "S3": "Hiến pháp Mỹ đã thông qua Tu chính án thứ mười chín, cho phép phụ nữ có quyền bầu cử vào năm 1920."} \n'