In [1]:
# configuration for using Gemini API
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
gemini_token = user_secrets.get_secret("GEMINI_API")
genai.configure(api_key=gemini_token)

In [2]:
# instruction and format configuration
sys_instruction = "You are an expert bilingual translator specializing in Arabic and English, with advanced knowledge in medical and engineering fields"

generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "response_schema": content.Schema(
    type = content.Type.OBJECT,
    required = ["Arabic", "Literal", "Intended"],
    properties = {
      "Arabic": content.Schema(
        type = content.Type.STRING,
      ),
      "Literal": content.Schema(
        type = content.Type.STRING,
      ),
      "Intended": content.Schema(
        type = content.Type.STRING,
      ),
    },
  ),
  "response_mime_type": "application/json",
}

In [3]:
# initialize model
model = genai.GenerativeModel(
  model_name="gemini-2.0-flash-exp",
  generation_config=generation_config,
  system_instruction=sys_instruction,
)

# test
prompt = "Generate examples of slang, idioms, or technical phrases in Arabic that may lead to inaccurate translation into English. Include the Arabic phrase, its literal English translation, the intended meaning."
response = model.generate_content(prompt)
print(response.text)

{
    "Arabic": "يا زلمة",
    "Intended": "Dude, man",
    "Literal": "Oh, man (literally, oh, guy)"
}


In [4]:
# generate synthetic data
import json

Arabic = []
English = []

def generate_data(prompt):
    for i in range(10):
        response = model.generate_content(prompt)
        response = json.loads(response.text)
        Arabic.append(response['Arabic'])
        English.append(response['Intended'])

In [5]:
# create interval to avoid API quota limitiations
import time
time.sleep(65)

for _ in range(10):
    generate_data("prompt")
    time.sleep(65)

In [6]:
# print the result
print("*" * 50)
print("The lenth of the data:")
print(len(English))

print("*" * 50)
print("Example:")
print("English:", English[10])
print("Arabic:", Arabic[10])

**************************************************
The lenth of the data:
100
**************************************************
Example:
English: Hello
Arabic: مرحبا


In [7]:
# save as .csv file
import pandas as pd
df = pd.DataFrame({"English" : English, "Arabic" : Arabic})
df.to_csv("syntheticData.csv", index=False)