### Use Gemini to create synthetic data

In [None]:
from google import genai
import json
import pandas as pd
import numpy as np
# !gcloud auth application-default login

In [None]:
# Replace the placeholder with your own Google Cloud project ID before running.
PROJECT_ID = '$YOUR_PROJECT_ID'
# Gemini model name passed to each generate_content request.
model = "gemini-2.5-flash"

# Gen AI client configured to go through Vertex AI for the project above.
client = genai.Client(vertexai=True, project=PROJECT_ID, location="global")

In [None]:
# Instruction prompt sent unchanged on every generation request. The
# "Output Format" section must agree with the five fields described under
# "Field definitions", otherwise the model may drop one of them.
prompt = '''
## Task
You're a question generator, for matching unpopular acronyms.
Please Generate a sample Multiple-choice question, using information below.
The question is matching acronyms to full names, in various fields.
Please make unpopular meaning as an target.

## Field definitions
- "question" should define the industry and acronym you're asking.
- "input" must be full name options that can be chosen, with its choice numbers, which must be string.
    - For instance, "[1] Random Access Memory, [2] Read-Only Memory"
    - Do not include question.
- "target" must be the target, which must be a integer, indicating the right choice number.
- "likelihood" is a probability that in reality, the acronym is used for that meaning, and must be between 0~1.
- "reason" is the reason of the target.

## Output Format
- Output must be in JSON format
- Output must contain five keys : "question", "input", "target", "likelihood", "reason"

'''

In [None]:
# Send the same prompt 30 times and keep the text of every response that
# actually carries text.
response_list = []
for _ in range(30):
    response = client.models.generate_content(model=model, contents=[prompt])
    if (text := response.text) is not None:
        response_list.append(text)

In [None]:
def process_api_output(api_output_string):
    """Parse one raw model response into a Python object.

    The response may be bare JSON or JSON wrapped in a Markdown code fence
    (``` or ```json, with LF or CRLF line endings). Any opening fence line
    and the closing fence are removed before decoding.

    Args:
        api_output_string: Raw response text.

    Returns:
        The decoded JSON value, or None when the input is not a string or
        does not contain valid JSON (the error is printed, not raised).
    """
    try:
        json_text = api_output_string.strip()
        if json_text.startswith('```'):
            # Drop the whole opening fence line regardless of its language
            # tag, then peel off the closing fence.
            _, _, json_text = json_text.partition('\n')
            json_text = json_text.rstrip().removesuffix('```').strip()

        # Convert the JSON text into a Python object.
        return json.loads(json_text)

    except (json.JSONDecodeError, AttributeError, TypeError) as e:
        print(f"Error when parsing JSON : {e}")
        return None
    
# Parse every raw response and keep only those that decoded to a JSON object.
# process_api_output returns None on failure, and pandas cannot build rows
# from a list that mixes dicts with None (or other non-mapping values).
processed_response_list = [
    record
    for record in (process_api_output(x) for x in response_list)
    if isinstance(record, dict)
]
df = pd.DataFrame(processed_response_list)

In [None]:
# Preview the first three parsed records.
df.head(3)

In [None]:
# Seeded split: each row draws an integer in [0, 4); rows that draw 0 go to
# validation, the rest go to training.
np.random.seed(1001)
df['rand'] = np.random.randint(0, 4, len(df))
is_valid = df['rand'] == 0
df_train = df.loc[~is_valid].drop(columns='rand').reset_index(drop=True)
df_valid = df.loc[is_valid].drop(columns='rand').reset_index(drop=True)

In [None]:
# Show (rows, columns) of the training split, then the validation split.
print(df_train.shape, df_valid.shape, sep="\n")

In [None]:
# Write both splits as JSON Lines with identical serialization options.
jsonl_options = dict(orient='records', lines=True, force_ascii=False, indent=0)
df_train.to_json("./sample_data/train.jsonl", **jsonl_options)
df_valid.to_json("./sample_data/valid.jsonl", **jsonl_options)

### Upload to Cloud Storage

In [None]:
# Replace the placeholder with your own bucket name before running.
BUCKET_NAME = "$YOUR_BUCKET_NAME"
BUCKET_URI = "gs://" + BUCKET_NAME
# Location used when creating the bucket.
REGION = "us-central1"

In [None]:
# Create bucket
# ! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

In [None]:
# Upload to gcs
from google.cloud import storage

def upload_df_to_gcs_as_jsonl(df: pd.DataFrame, bucket_name: str, destination_blob_name: str):
    """Serialize ``df`` as JSON Lines and upload it to a Cloud Storage object.

    Args:
        df: Frame whose rows are written one JSON record per line.
        bucket_name: Bucket name handed to ``storage.Client().bucket``.
        destination_blob_name: Object path of the blob inside that bucket.
    """
    # Serialize first so nothing touches Cloud Storage if conversion fails.
    payload = df.to_json(orient='records', lines=True)

    # Client -> bucket -> blob handle for the destination object.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_string(payload, content_type='application/json')

    print(f"Uploaded : {bucket_name} / {destination_blob_name}")
    

In [None]:
# Destination object paths inside the bucket.
train_data_path = 'inputs/train.jsonl'
valid_data_path = 'inputs/valid.jsonl'

# Upload the training split, then the validation split.
upload_df_to_gcs_as_jsonl(
    df=df_train,
    bucket_name=BUCKET_NAME,
    destination_blob_name=train_data_path,
)
upload_df_to_gcs_as_jsonl(
    df=df_valid,
    bucket_name=BUCKET_NAME,
    destination_blob_name=valid_data_path,
)