### Use Gemini to create synthetic data

In [None]:
from google import genai
import json
import pandas as pd
import numpy as np
import json

In [10]:
# !gcloud auth application-default login

In [None]:
# Gemini model name passed to every generate_content request in this notebook.
model = "gemini-2.5-flash"
# Placeholder value: replace with your own Google Cloud project ID before running.
PROJECT_ID = '$YOUR_PROJECT_ID'

# google-genai client created with vertexai=True for the project above,
# using the "global" location.
# NOTE(review): presumably authenticates through application-default credentials — confirm.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location="global")

In [None]:
# Instruction prompt for the question generator. It describes the task, the meaning
# of each output field, and the required JSON output format. The key count in the
# "Output Format" section must match the number of keys listed there (five).
prompt = '''
## Task
You're a question generator, for matching unpopular acronyms.
Please Generate a sample Multiple-choice question, using information below.
The question is matching acronyms to full names, in various fields.
Please make unpopular meaning as an answer.

## Field definitions
- "question" should define the industry and acronym you're asking.
- "choices" must be full name options that can be chosen, with its choice numbers, which must be string.
    - For instance, "[1] Random Access Memory, [2] Read-Only Memory"
    - Do not include question.
- "answer" must be the answer, which must be a integer, indicating the right choice number.
- "likelihood" is a probability that in reality, the acronym is used for that meaning, and must be between 0~1.
- "reason" is the reason of the answer.

## Output Format
- Output must be in JSON format
- Output must contain five keys : "question", "choices", "answer", "likelihood", "reason"

'''

In [None]:
# Send the same prompt 50 times and keep the text of every reply that has one.
response_list = []
for _ in range(50):
    response = client.models.generate_content(model=model, contents=[prompt])
    # Replies without text are skipped rather than stored as None.
    if response.text is None:
        continue
    response_list.append(response.text)

In [19]:
def process_api_output(api_output_string):
    """Parse one raw model reply into a Python object.

    The reply may be bare JSON or JSON wrapped in a Markdown code fence
    (``` or ```json, any letter case, LF or CRLF line endings). The fence,
    if present, is removed before decoding.

    Args:
        api_output_string: Raw reply text returned by the model.

    Returns:
        The decoded JSON value, or None when the input is not a string or
        does not contain valid JSON (the error is printed, not raised).
    """
    try:
        json_text = api_output_string.strip()

        # Remove an opening fence and its optional "json" language tag.
        if json_text.startswith('```'):
            json_text = json_text[3:]
            if json_text[:4].lower() == 'json':
                json_text = json_text[4:]

        # Remove a closing fence.
        if json_text.endswith('```'):
            json_text = json_text[:-3]

        # json.loads ignores surrounding whitespace, so leftover "\n" / "\r\n" is harmless.
        return json.loads(json_text)

    except (json.JSONDecodeError, AttributeError, TypeError) as e:
        # AttributeError/TypeError cover non-string input such as None or bytes.
        print(f"Error when parsing JSON : {e}")
        return None
    
# Parse every raw reply and keep only those that decoded to a JSON object.
# process_api_output returns None on failure, and a None (or other non-dict value)
# mixed into a list of dicts cannot be turned into DataFrame rows.
processed_response_list = [
    parsed
    for parsed in map(process_api_output, response_list)
    if isinstance(parsed, dict)
]
df = pd.DataFrame(processed_response_list)

In [62]:
# Seeded random split: every row draws an integer in [0, 5). Rows that draw 0
# (about one in five) go to the validation set; all other rows go to training.
np.random.seed(1)
df['rand'] = np.random.randint(0, 5, len(df))
df_train = df.loc[df['rand'].gt(0)].drop(columns='rand').reset_index(drop=True)
df_valid = df.loc[df['rand'].eq(0)].drop(columns='rand').reset_index(drop=True)

In [63]:
# Show (rows, columns) of the training split, then of the validation split.
print(df_train.shape, df_valid.shape, sep='\n')

(40, 5)
(10, 5)


In [64]:
# Preview the first three rows of the training split.
df_train.head(3)

Unnamed: 0,question,choices,answer,likelihood,reason
0,"In the specialized field of molecular biology,...","[1] Redundant Array of Independent Disks, [2] ...",3,0.07,"""RNA-associated Degradation"" is a specific bio..."
1,In the field of advanced microscopy and nanosc...,"[1] Standard Nanoscale Optical Module, [2] Sca...",2,0.05,SNOM stands for Scanning Near-field Optical Mi...
2,In the field of networking and computer scienc...,"[1] System Yield Neutralizer, [2] Synchronous ...",3,0.08,While 'SYN' is most commonly known as the firs...


### Preprocess

In [65]:
def row_to_formatted_json(row):
    """Serialize one row to JSON and wrap it in a Markdown ```json code fence.

    Args:
        row: A pandas Series (e.g. one DataFrame row) whose index holds the keys.

    Returns:
        A string of the form "```json\\n<json>\\n```", where <json> is the row
        rendered with 2-space indentation and floats limited to 2 decimals.
    """
    body = row.to_json(orient='index', double_precision=2, indent=2)
    return f'```json\n{body}\n```'

# Columns that make up the expected model output; each row's values are rendered
# as a fenced JSON string and stored in a new 'target' column on both splits.
label_columns = ['answer', 'reason', 'likelihood']
for split_df in (df_train, df_valid):
    split_df['target'] = split_df[label_columns].apply(row_to_formatted_json, axis=1)

In [66]:
# Preview the first two training rows, now including the 'target' column.
df_train.head(2)

Unnamed: 0,question,choices,answer,likelihood,reason,target
0,"In the specialized field of molecular biology,...","[1] Redundant Array of Independent Disks, [2] ...",3,0.07,"""RNA-associated Degradation"" is a specific bio...","```json\n{\n ""answer"":3,\n ""reason"":""\""RNA-a..."
1,In the field of advanced microscopy and nanosc...,"[1] Standard Nanoscale Optical Module, [2] Sca...",2,0.05,SNOM stands for Scanning Near-field Optical Mi...,"```json\n{\n ""answer"":2,\n ""reason"":""SNOM st..."


In [67]:
def refine_string(text_data):
    """Extract a JSON object from text that may be wrapped in a ```json fence.

    Args:
        text_data: Text expected to hold a JSON object, optionally surrounded
            by a Markdown ```json ... ``` code fence.

    Returns:
        A tuple (data_dict, reason):
        - on success, the decoded dict and None;
        - on failure (non-string input, text that is not delimited by braces,
          or invalid JSON), an empty dict and the string 'Wrong JSON format'.
    """
    failure = ({}, 'Wrong JSON format')

    # None / NaN / other non-string values are reported as a format error
    # instead of raising AttributeError on .strip().
    if not isinstance(text_data, str):
        return failure

    cleaned_text = text_data.strip()
    if cleaned_text.startswith("```json"):
        cleaned_text = cleaned_text[len("```json"):].strip()
    if cleaned_text.endswith("```"):
        cleaned_text = cleaned_text[:-len("```")].strip()

    # Only a brace-delimited payload can be a JSON object.
    if not (cleaned_text.startswith('{') and cleaned_text.endswith('}')):
        return failure

    try:
        return json.loads(cleaned_text), None
    except json.JSONDecodeError:
        return failure

In [69]:
# Write both splits as JSON Lines (one record per line, non-ASCII text kept as-is).
jsonl_options = dict(orient='records', lines=True, force_ascii=False, indent=0)
df_train.to_json("./sample_data/train.jsonl", **jsonl_options)
df_valid.to_json("./sample_data/valid.jsonl", **jsonl_options)

### Upload to Cloud Storage

In [58]:
# Region referenced by the (commented-out) bucket-creation command.
REGION = "us-central1"
# Template placeholder kept for reference; the active value is set on the next line.
# BUCKET_NAME = "$YOUR_BUCKET_NAME"
# NOTE(review): hard-coded bucket name — replace with your own bucket before running.
BUCKET_NAME = "redstone_base_bucket"

# gs:// URI form of the bucket.
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [None]:
# Create bucket if needed
# ! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}

In [59]:
# Upload to gcs
from google.cloud import storage

def upload_df_to_gcs_as_jsonl(df: pd.DataFrame, bucket_name: str, destination_blob_name: str) -> None:
    """Serialize a DataFrame to JSON Lines and upload it to a Cloud Storage object.

    Args:
        df: Frame to upload; each row becomes one JSON record on its own line.
        bucket_name: Name of the target bucket (without the gs:// prefix).
        destination_blob_name: Object path inside the bucket.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    # force_ascii=False keeps non-ASCII text as real characters instead of \uXXXX
    # escapes, so the uploaded file matches the JSONL written locally by this notebook.
    jsonl_string = df.to_json(orient='records', lines=True, force_ascii=False)

    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    blob.upload_from_string(jsonl_string, content_type='application/json')

    print(f"Uploaded : {bucket_name} / {destination_blob_name}")
    

In [None]:
# Object paths inside the bucket for each split.
train_data_path = 'inputs/train.jsonl'
valid_data_path = 'inputs/valid.jsonl'

# Upload the training split first, then the validation split.
for split_df, blob_path in ((df_train, train_data_path), (df_valid, valid_data_path)):
    upload_df_to_gcs_as_jsonl(split_df, BUCKET_NAME, blob_path)