## Gemini API Initialization

In [1]:
# preconfiguration
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
from kaggle_secrets import UserSecretsClient
import json
import time
import pandas as pd


# Read the Gemini API key through the Kaggle secrets client rather than hard-coding it.
user_secrets = UserSecretsClient()
gemini_token = user_secrets.get_secret("GEMINI_API")  # secret is looked up under the label "GEMINI_API"
# Hand the key to the google.generativeai module before any model is created.
genai.configure(api_key=gemini_token)

In [2]:
# Gemini persona and structured-output settings for the slang/idiom dataset
sys_instruction = "You are an expert bilingual translator specializing in Arabic and English."

# The reply is a JSON object whose three fields are all mandatory plain strings.
_reply_fields = ["Arabic", "Literal", "Intended"]
_reply_schema = content.Schema(
    type=content.Type.OBJECT,
    required=_reply_fields,
    properties={
        field: content.Schema(type=content.Type.STRING)
        for field in _reply_fields
    },
)

generation_config = dict(
    temperature=1,
    top_p=0.95,
    response_schema=_reply_schema,
    response_mime_type="application/json",
)

## Creating Slang, Idioms Translation Dataset

In [3]:
# Build the Gemini client used to generate the slang/idiom examples
model = genai.GenerativeModel(
    system_instruction=sys_instruction,
    generation_config=generation_config,
    model_name="gemini-2.0-flash-exp",
)

# Smoke test: send the prompt once and print the raw JSON text of the reply
prompt = (
    "Generate examples of slang, idioms, or technical phrases in Arabic "
    "that may lead to inaccurate translation into English. "
    "Include the Arabic phrase, its literal English translation, "
    "the intended meaning."
)
response = model.generate_content(prompt)
print(response.text)

{
  "Arabic": "يا عمي",
  "Intended": "Dude",
  "Literal": "Oh my uncle"
}


In [4]:
# generate synthetic data
# Parallel lists: Arabic[i] is the phrase and English[i] its intended meaning.
Arabic = []
English = []

def generate_data(prompt, n_samples=10):
    """Request `n_samples` phrase records from Gemini and store them.

    Each reply is parsed as JSON; its "Arabic" value is appended to the
    module-level `Arabic` list and its "Intended" value to `English`.

    Args:
        prompt: Text passed to `model.generate_content` on every request.
        n_samples: Number of requests to issue. Defaults to 10, the batch
            size that used to be hard-coded.

    Raises:
        json.JSONDecodeError: If a reply is not valid JSON.
        KeyError: If a reply lacks the "Arabic" or "Intended" key.
    """
    for _ in range(n_samples):
        reply = json.loads(model.generate_content(prompt).text)
        # Read both fields before appending either one, so a reply with a
        # missing key cannot leave the two lists with different lengths.
        arabic_phrase, intended_meaning = reply['Arabic'], reply['Intended']
        Arabic.append(arabic_phrase)
        English.append(intended_meaning)

In [5]:
# Space the batches out to avoid API quota limitations: wait once before the
# first batch and again after every batch.
QUOTA_COOLDOWN_SECONDS = 65
N_BATCHES = 6

time.sleep(QUOTA_COOLDOWN_SECONDS)

for _ in range(N_BATCHES):
    generate_data(prompt)
    time.sleep(QUOTA_COOLDOWN_SECONDS)

In [6]:
# Report how many pairs were collected, then show the pair stored at index 10
divider = "*" * 50
sample_idx = 10

print(divider, "The lenth of the data:", len(English), sep="\n")

print(divider, "Example:", sep="\n")
print("English:", English[sample_idx])
print("Arabic:", Arabic[sample_idx])

**************************************************
The lenth of the data:
60
**************************************************
Example:
English: Shut up or get lost
Arabic: ياكل تبن


In [7]:
# Write the English/Arabic pairs to a CSV file, leaving out the row index
translation_columns = {"English": English, "Arabic": Arabic}
df = pd.DataFrame(data=translation_columns)
df.to_csv(path_or_buf="syntheticData_translation.csv", index=False)

## Creating Medical Domain Dataset

In [8]:
# load raw dataset
from datasets import load_dataset
import numpy as np

# Fetch the "train" split of the medical question/answer dataset
MEDICAL_DATASET_ID = "FreedomIntelligence/medical-o1-verifiable-problem"
medical_dataset = load_dataset(MEDICAL_DATASET_ID, split="train")

README.md:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

medical_o1_verifiable_problem.json:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40644 [00:00<?, ? examples/s]

In [9]:
# Keep only the first 100 records; the translation loop later in the notebook
# issues one request per remaining record.
# NOTE(review): slicing a `datasets.Dataset` presumably returns a dict of column
#   lists rather than a Dataset, so re-running this cell alone may fail - confirm.
N_MEDICAL_SAMPLES = 100
medical_dataset = medical_dataset[:N_MEDICAL_SAMPLES]

In [10]:
# Gemini persona and structured-output settings for the medical translations
sys_instruction = "You are an expert bilingual translator specializing in Arabic and English."

# The reply must be a JSON object holding one mandatory string, "Translation".
_translation_schema = {
    "type": "object",
    "required": ["Translation"],
    "properties": {"Translation": {"type": "string"}},
}

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "response_schema": _translation_schema,
    "response_mime_type": "application/json",
}

# Instruction placed in front of every sentence that is sent for translation
prompt = "Translate it into Arabic."

# Rebind `model` to a client configured with the translation schema
model = genai.GenerativeModel(
    system_instruction=sys_instruction,
    generation_config=generation_config,
    model_name="gemini-2.0-flash-exp",
)

In [11]:
# Arabic translations, collected in the order the requests are made
Translation = []

def translation(prompt):
    """Send `prompt` to Gemini and record the translated text.

    The reply is parsed as JSON and its "Translation" value is appended to
    the module-level `Translation` list. Nothing is returned.
    """
    reply = model.generate_content(prompt)
    parsed = json.loads(reply.text)
    Translation.append(parsed['Translation'])

In [12]:
# gathering data
# Translate every answer, pausing 65 seconds before each 10th request so that
# at most 10 calls are issued between waits.
for count, sentence in enumerate(medical_dataset['Ground-True Answer'], start=1):
    if count % 10 == 0:
        time.sleep(65)
    # Put a blank line between the instruction and the text to translate;
    # plain `prompt + sentence` fused them together with no separator.
    translation(prompt + "\n\n" + sentence)

In [13]:
# Assemble the question, English answer and Arabic translation columns,
# then write them to a CSV file without the row index
medical_columns = {
    "Question": medical_dataset['Open-ended Verifiable Question'],
    "English": medical_dataset['Ground-True Answer'][:100],
    "Arabic": Translation,
}
df = pd.DataFrame(medical_columns)
df.to_csv("syntheticData_medical.csv", index=False)