In [None]:
!pip install --quiet --upgrade google-cloud-aiplatform google-cloud-storage

In [None]:
from IPython.display import display
from IPython.display import Markdown
import textwrap


def to_markdown(text):
  """Render `text` as a Markdown block quote for notebook display.

  Bullet characters ('•') are rewritten as indented Markdown list markers
  ('  *'), then every line is prefixed with '> '. Returns an IPython
  `Markdown` object.
  """
  bulleted = text.replace('•', '  *')
  # The always-true predicate makes textwrap.indent prefix every line,
  # including blank ones, so the quote is not broken between paragraphs.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda line: True)
  return Markdown(quoted)

We load the TinyStories dataset from Hugging Face. We want the model to expand each short story to make it more engaging for children, and to produce a lesson drawn from the story at the end.

In [None]:
from datasets import load_dataset

# Load the TinyStories dataset by its Hugging Face Hub id; the 'train' split is sampled further down.
ds = load_dataset("roneneldan/TinyStories")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import sys

# Notebook-wide settings. The trailing `# @param` markers are form-field annotations; edit the values to match your project.
PROJECT_ID = "jkwng-vertex-playground"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
BUCKET = "jkwng-vertex-experiments" # @param {type:"string"}
# Fail fast when the project id is empty or still the "[your-project-id]" placeholder.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    raise ValueError("Please set your PROJECT_ID")


import vertexai

# Initialize the Vertex AI SDK with the chosen project and location.
vertexai.init(project=PROJECT_ID, location=LOCATION)

We shuffle the training split with a fixed seed and take the first 1000 stories as our sample.

In [None]:
# Shuffle the training split with a fixed seed (42).
shuffle_ds = ds['train'].shuffle(seed=42)
# Keep the first 1000 shuffled rows; later cells read the stories via sample_ds['text'].
sample_ds = shuffle_ds[:1000]

Here is a sample story.

In [None]:
# Display the first sampled story as a Markdown block quote.
to_markdown(sample_ds['text'][0])

> Tim and Mia like to play in the park. They see a big club on the ground. It is brown and long and heavy.
> 
> "Look, a club!" Tim says. "I can lift it!"
> 
> He tries to lift the club, but it is too tough. He falls down and drops the club.
> 
> "Ouch!" he says. "That hurt!"
> 
> Mia laughs. She is not mean, she just thinks it is funny.
> 
> "Let me try!" she says. "I can balance it!"
> 
> She picks up the club and puts it on her head. She walks slowly and carefully. She does not fall down.
> 
> "Wow!" Tim says. "You are good at balancing!"
> 
> "Thank you!" Mia says. "It is fun!"
> 
> They take turns balancing the club on their heads, arms, and legs. They have a lot of fun with the club. They are happy and proud. They are good friends.

Test out the system instruction and set up the response schema. We want the model to output the story and the lesson as separate properties of the JSON response so we can evaluate each of them individually later.

In [None]:
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part, SafetySetting, FinishReason

# Sampling parameters passed to GenerationConfig below.
max_tokens = 8192
temperature = 1
top_p = 0.95

# Structured output: a JSON object with two required string fields,
# so the story and the lesson can be evaluated separately.
response_schema = {
  "type": "object",
  "properties": {
    "story": {"type": "string"},
    "lesson": {"type": "string"},
  },
  "required": ["story", "lesson"],
}
generation_config = GenerationConfig(
    response_schema=response_schema,
    response_mime_type="application/json",
    max_output_tokens=max_tokens,
    top_p=top_p,
    temperature=temperature,
)

# Apply the OFF threshold to each of the four harm categories (same order as before).
safety_settings = [
    SafetySetting(
        category=harm_category,
        threshold=SafetySetting.HarmBlockThreshold.OFF,
    )
    for harm_category in (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
]

# System instruction shared by the online test call and every batch request.
system_prompt = """
You are a children\'s story editor. You will be shown a simple children\'s story that you will be asked to add
details to the story to make it more engaging for children.
"""

vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel("gemini-1.5-pro-002", system_instruction=[system_prompt])

Define the instruction prompt template.

In [None]:
# Instruction template; the story text is appended directly after the "Story:" line.
instr_prompt = """
Make the story more interesting. Do not add, remove, or change any details from the story.  Only add sentences to elaborate
on what is already written. **Do not change any of the character\'s dialog.** Include a lesson at the end that children can
learn from the story.

Story:
"""

# Full prompt for the first sampled story, printed for inspection.
prompt = f"{instr_prompt}{sample_ds['text'][0]}"
print(prompt)


Make the story more interesting. Do not add, remove, or change any details from the story.  Only add sentences to elaborate
on what is already written. **Do not change any of the character's dialog.** Include a lesson at the end that children can
learn from the story.

Story:
Tim and Mia like to play in the park. They see a big club on the ground. It is brown and long and heavy.

"Look, a club!" Tim says. "I can lift it!"

He tries to lift the club, but it is too tough. He falls down and drops the club.

"Ouch!" he says. "That hurt!"

Mia laughs. She is not mean, she just thinks it is funny.

"Let me try!" she says. "I can balance it!"

She picks up the club and puts it on her head. She walks slowly and carefully. She does not fall down.

"Wow!" Tim says. "You are good at balancing!"

"Thank you!" Mia says. "It is fun!"

They take turns balancing the club on their heads, arms, and legs. They have a lot of fun with the club. They are happy and proud. They are good friends.


In [None]:
# Single online request for the sample prompt, using the same generation
# config and safety settings that the batch requests are built from.
response = model.generate_content(
    [prompt],
    safety_settings=safety_settings,
    generation_config=generation_config,
)

In [None]:
import json

# The response was requested as application/json, so its text is parsed as JSON
# and pretty-printed to check the "story" / "lesson" structure.
resp_json = json.loads(response.text)
print(json.dumps(resp_json, indent=2))

{
  "story": "Tim and Mia, best friends since kindergarten, loved playing in their neighborhood park after school.  The park, filled with towering oak trees and colorful flowers, was their favorite place to explore and have adventures. One sunny afternoon, while playing hide-and-seek amongst the trees, they stumbled upon a big club lying on the ground. Its bark was a deep, rich brown, worn smooth by time and weather. The club was long, like a fallen branch, and surprisingly heavy. \"Look, a club!\" Tim exclaimed, his eyes wide with excitement. \"I can lift it!\"  Tim, always eager to show off his strength, bent down and grasped the heavy club. He pulled with all his might, his face turning red with effort, but the club wouldn't budge.  With a grunt, he lost his balance and tumbled to the ground, the club falling harmlessly beside him. \"Ouch!\" he cried, rubbing his bruised knee. \"That hurt!\" Mia giggled, not to be mean, but because Tim's fall was quite comical.  She couldn't help bu

Build out the batch request with the 1000 story samples from the dataset above.

In [None]:
# Cloud Storage handles for staging the batch-prediction input file.
from google.cloud import storage
import json

# Reuse the bucket configured in the setup cell rather than repeating the literal,
# so changing BUCKET there is enough to retarget the whole notebook.
bucket_name = BUCKET
stories_prefix = 'stories'
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
# Object path (inside the bucket) of the JSONL file holding one request per story.
output_path = f"{stories_prefix}/batch_input.jsonl"
output_blob = bucket.blob(output_path)

Write one request per story to a JSON Lines (JSONL) file in Cloud Storage to prepare the batch prediction input.

In [None]:
# Each line of the batch input is the JSON form of a GenerateContentRequest.
# The parts that do not depend on the story are built once, outside the loop.
request_common = {
    "system_instruction": {
      "parts": [{"text": system_prompt}],
    },
    "generation_config": generation_config.to_dict(),
    "safety_settings": [s.to_dict() for s in safety_settings],
}

lineCount = 0
with output_blob.open(mode='w') as f:
  for lineCount, story in enumerate(sample_ds['text'], start=1):
    # Reuse instr_prompt instead of a second copy of the template: the
    # post-processing step strips instr_prompt from the echoed request to
    # recover the original story, so the two must never drift apart.
    # The result is instruction + story + trailing newline.
    # A dedicated name keeps the `prompt` used by the online test call intact.
    story_prompt = f"{instr_prompt}{story}\n"

    item = {
        "id": str(lineCount - 1),  # zero-based id, matching the row position in sample_ds
        "request": {
            "contents": [
                {
                    "role": "user",
                    "parts": [{"text": story_prompt}],
                }
            ],
            **request_common,
        },
    }

    # to_dict() serializes the schema's `type` fields under the key `type_`
    # (type is a reserved key), so rename them in the serialized line.
    line = json.dumps(item).replace("\"type_\":", "\"type\":")
    f.write(f"{line}\n")

    if lineCount % 100 == 0:
      print(f"- Wrote {lineCount} lines to gs://{bucket_name}/{output_path} ...")

print(f"Wrote {lineCount} total lines to gs://{bucket_name}/{output_path}")

- Wrote 100 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 200 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 300 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 400 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 500 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 600 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 700 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 800 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 900 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
- Wrote 1000 lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl ...
Wrote 1000 total lines to gs://jkwng-vertex-experiments/stories/batch_input.jsonl


Batch Prediction

In [None]:
from vertexai.batch_prediction import BatchPredictionJob
from datetime import datetime
import time

# The timestamp suffix gives each run its own output prefix.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
output_uri_prefix = f"gs://{bucket_name}/{stories_prefix}/batch_output_{TIMESTAMP}"

job = BatchPredictionJob.submit(
    source_model="gemini-1.5-pro-002",
    input_dataset=f"gs://{bucket_name}/{output_path}",
    output_uri_prefix=output_uri_prefix
)

# output_uri_prefix is already a full gs:// URI; print it as-is instead of
# prepending the bucket a second time.
print(f"Writing to output: {output_uri_prefix}")
print(f"Job resource name: {job.resource_name}")
print(f"Model resource name: {job.model_name}")
print(f"Job state: {job.state.name}")

# Refresh the job until complete
POLL_INTERVAL_SECONDS = 5
POLLS_PER_STATUS_LINE = 6  # report the state once every 30 seconds
waitcount = 0
while not job.has_ended:
    time.sleep(POLL_INTERVAL_SECONDS)
    waitcount += 1
    job.refresh()
    if waitcount % POLLS_PER_STATUS_LINE == 0:
      print (f"after {waitcount * POLL_INTERVAL_SECONDS} seconds, job state is {job.state.name} ...")

# Check if the job succeeds
if job.has_succeeded:
    print(f"Job succeeded after {waitcount * POLL_INTERVAL_SECONDS} seconds. output: {job.output_location}")
else:
    print(f"Job failed: {job.error}")

INFO:vertexai.batch_prediction._batch_prediction:BatchPredictionJob created. Resource name: projects/205512073711/locations/us-central1/batchPredictionJobs/7486593743181578240
INFO:vertexai.batch_prediction._batch_prediction:To use this BatchPredictionJob in another session:
INFO:vertexai.batch_prediction._batch_prediction:job = batch_prediction.BatchPredictionJob('projects/205512073711/locations/us-central1/batchPredictionJobs/7486593743181578240')
INFO:vertexai.batch_prediction._batch_prediction:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/7486593743181578240?project=205512073711


Writing to output: gs://jkwng-vertex-experiments/gs://jkwng-vertex-experiments/stories/batch_output_20241103153139
Job resource name: projects/205512073711/locations/us-central1/batchPredictionJobs/7486593743181578240
Model resource name: publishers/google/models/gemini-1.5-pro-002
Job state: JOB_STATE_PENDING
after 30 seconds, job state is JOB_STATE_RUNNING ...
after 60 seconds, job state is JOB_STATE_RUNNING ...
after 90 seconds, job state is JOB_STATE_RUNNING ...
after 120 seconds, job state is JOB_STATE_RUNNING ...
after 150 seconds, job state is JOB_STATE_RUNNING ...
after 180 seconds, job state is JOB_STATE_RUNNING ...
after 210 seconds, job state is JOB_STATE_RUNNING ...
after 240 seconds, job state is JOB_STATE_RUNNING ...
Job succeeded after 265 seconds. output: gs://jkwng-vertex-experiments/stories/batch_output_20241103153139/prediction-model-2024-11-03T15:31:39.935840Z


In [None]:
# Keep the job's output location in its own variable for the preprocessing cell, and show it.
job_output_location = job.output_location
print(job_output_location)

gs://jkwng-vertex-experiments/stories/batch_output_20241103153139/prediction-model-2024-11-03T15:31:39.935840Z


Preprocess the prediction output into clean JSON objects that we can use for evaluation later.

Each JSON object has the following schema:
```
{
  "instruction": <prompt instruction>,
  "original": <original story>,
  "system_instruction": <system prompt>,
  "story": <AI generated story based on the original story>,
  "lesson": <AI generated lesson drawn from the story>,
}
```

In [None]:
# produce a dataset we can save for evaluation later
# To reprocess the output of an earlier job, set job_output_location by hand, e.g.:
#job_output_location = "gs://jkwng-vertex-experiments/stories/dataset.jsonl/20241103131100/prediction-model-2024-11-03T13:11:00.461891Z"

from google.cloud.storage.blob import Blob

output_blob = Blob.from_string(f"{job_output_location}/predictions.jsonl", storage_client)

all_data = []
skipped = 0  # rows whose response could not be turned into a story/lesson pair

with output_blob.open(mode='r') as f:
  for line in f:
    if not line.strip():
      continue  # tolerate blank lines instead of failing in json.loads

    output_data = json.loads(line)
    request = output_data['request']

    # The prompt sent to the model is echoed back in request.contents[0].parts[0].text
    prompt_orig = request['contents'][0]['parts'][0]['text']

    # A row with no candidates, no text part, or text that is not a JSON object
    # holding both keys is skipped (and counted) rather than aborting the run.
    try:
      candidate_text = output_data['response']['candidates'][0]['content']['parts'][0]['text']
      resp_obj = json.loads(candidate_text)
      story, lesson = resp_obj['story'], resp_obj['lesson']
    except (KeyError, IndexError, TypeError, ValueError) as err:
      skipped += 1
      print(f"Skipping record {output_data.get('id')}: unusable response ({err!r})")
      continue

    all_data.append({
        'instruction': instr_prompt,
        # Removing the instruction leaves the story text (followed by the
        # newline that the batch prompt placed after it).
        'original': prompt_orig.replace(instr_prompt, ""),
        'system_instruction': request['system_instruction']['parts'][0]['text'],
        'story': story,
        'lesson': lesson,
    })

if skipped:
  print(f"Skipped {skipped} of {skipped + len(all_data)} predictions with unusable responses")


# Write the cleaned records to Cloud Storage, one JSON object per line.
final_output_path = f"{stories_prefix}/stories_dataset.jsonl"
final_output = bucket.blob(final_output_path)

lineCount = 0
with final_output.open(mode='w') as f:
  for lineCount, record in enumerate(all_data, start=1):
    f.write(f"{json.dumps(record)}\n")

    # Progress message every 100 records.
    if lineCount % 100 == 0:
      print(f"- Wrote {lineCount} lines to gs://{bucket_name}/{final_output_path} ...")

print(f"Wrote {lineCount} total lines to gs://{bucket_name}/{final_output_path}")

- Wrote 100 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 200 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 300 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 400 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 500 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 600 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 700 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 800 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 900 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
- Wrote 1000 lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl ...
Wrote 1000 total lines to gs://jkwng-vertex-experiments/stories/stories_dataset.jsonl
